1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* @(#)hfs_readwrite.c 1.0
29 *
30 * (c) 1998-2001 Apple Inc. All Rights Reserved
31 *
32 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
33 *
34 */
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/fcntl.h>
40 #include <sys/stat.h>
41 #include <sys/buf.h>
42 #include <sys/proc.h>
43 #include <sys/kauth.h>
44 #include <sys/vnode.h>
45 #include <sys/uio.h>
46 #include <sys/vfs_context.h>
47 #include <sys/disk.h>
48 #include <sys/sysctl.h>
49 #include <sys/fsctl.h>
50 #include <sys/ubc.h>
51 #include <sys/fsevents.h>
52 #include <uuid/uuid.h>
53
54 #include <libkern/OSDebug.h>
55
56 #include <miscfs/specfs/specdev.h>
57
58 #include <sys/ubc.h>
59
60 #include <vm/vm_pageout.h>
61 #include <vm/vm_kern.h>
62
63 #include <IOKit/IOBSD.h>
64
65 #include <sys/kdebug.h>
66
67 #include "hfs.h"
68 #include "hfs_attrlist.h"
69 #include "hfs_endian.h"
70 #include "hfs_fsctl.h"
71 #include "hfs_quota.h"
72 #include "FileMgrInternal.h"
73 #include "BTreesInternal.h"
74 #include "hfs_cnode.h"
75 #include "hfs_dbg.h"
76
77 #if HFS_CONFIG_KEY_ROLL
78 #include "hfs_key_roll.h"
79 #endif
80
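/* True when size is a multiple of 4 KiB and no more than half of MAXPHYSIO. */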
81 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
82
83 enum {
84 MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */
85 };
86
87 /* from bsd/hfs/hfs_vfsops.c */
88 extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
89
90 /* from hfs_hotfiles.c */
91 extern int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid,
92 uint8_t forktype, uint32_t *pinned);
93
94 static int hfs_clonefile(struct vnode *, int, int, int);
95 static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
96 static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
97
98
99 /*
100 * Read data from a file.
101 */
102 int
103 hfs_vnop_read(struct vnop_read_args *ap)
104 {
105 /*
106 struct vnop_read_args {
107 struct vnodeop_desc *a_desc;
108 vnode_t a_vp;
109 struct uio *a_uio;
110 int a_ioflag;
111 vfs_context_t a_context;
112 };
113 */
114
115 uio_t uio = ap->a_uio;
116 struct vnode *vp = ap->a_vp;
117 struct cnode *cp;
118 struct filefork *fp;
119 struct hfsmount *hfsmp;
120 off_t filesize;
121 off_t filebytes;
122 off_t start_resid = uio_resid(uio);
123 off_t offset = uio_offset(uio);
124 int retval = 0;
125 int took_truncate_lock = 0;
126 int io_throttle = 0;
127 int throttled_count = 0;
128
129 /* Preflight checks */
130 if (!vnode_isreg(vp)) {
131 /* can only read regular files */
132 if (vnode_isdir(vp))
133 return (EISDIR);
134 else
135 return (EPERM);
136 }
137 if (start_resid == 0)
138 return (0); /* Nothing left to do */
139 if (offset < 0)
140 return (EINVAL); /* can't read from a negative offset */
141
142 #if SECURE_KERNEL
143 if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
144 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
145 /* Don't allow unencrypted I/O requests from user space */
146 return EPERM;
147 }
148 #endif
149
150 #if HFS_COMPRESSION
151 if (VNODE_IS_RSRC(vp)) {
152 if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
153 return 0;
154 }
155 /* otherwise read the resource fork normally */
156 } else {
157 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
158 if (compressed) {
159 retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
160 if (retval == 0 && !(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
161 (void) hfs_addhotfile(vp);
162 }
163 if (compressed) {
164 if (retval == 0) {
165 /* successful read, update the access time */
166 VTOC(vp)->c_touch_acctime = TRUE;
167
168 //
169 // compressed files are not traditional hot file candidates
170 // but they may be for CF (which ignores the ff_bytesread
171 // field)
172 //
173 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
174 VTOF(vp)->ff_bytesread = 0;
175 }
176 }
177 return retval;
178 }
179 /* otherwise the file was converted back to a regular file while we were reading it */
180 retval = 0;
181 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
182 int error;
183
184 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
185 if (error) {
186 return error;
187 }
188
189 }
190 }
191 #endif /* HFS_COMPRESSION */
192
193 cp = VTOC(vp);
194 fp = VTOF(vp);
195 hfsmp = VTOHFS(vp);
196
197 #if CONFIG_PROTECT
198 if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
199 goto exit;
200 }
201
202 #if HFS_CONFIG_KEY_ROLL
203 if (ISSET(ap->a_ioflag, IO_ENCRYPTED)) {
204 off_rsrc_t off_rsrc = off_rsrc_make(offset + start_resid,
205 VNODE_IS_RSRC(vp));
206
207 retval = hfs_key_roll_up_to(ap->a_context, vp, off_rsrc);
208 if (retval)
209 goto exit;
210 }
211 #endif // HFS_CONFIG_KEY_ROLL
212 #endif // CONFIG_PROTECT
213
214 /*
215 * If this read request originated from a syscall (as opposed to
216 * an in-kernel page fault or something), then set it up for
217 * throttle checks
218 */
219 if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
220 io_throttle = IO_RETURN_ON_THROTTLE;
221 }
222
223 read_again:
224
225 /* Protect against a size change. */
226 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
227 took_truncate_lock = 1;
228
229 filesize = fp->ff_size;
230 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
231
232 /*
233 * Check the file size. Note that per POSIX spec, we return 0 at
234 * file EOF, so attempting a read at an offset that is too big
235 * should just return 0 on HFS+. Since the return value was initialized
236 * to 0 above, we just jump to exit. HFS Standard has its own behavior.
237 */
238 if (offset > filesize) {
239 #if CONFIG_HFS_STD
240 if ((hfsmp->hfs_flags & HFS_STANDARD) &&
241 (offset > (off_t)MAXHFSFILESIZE)) {
242 retval = EFBIG;
243 }
244 #endif
245 goto exit;
246 }
247
248 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START,
249 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
250
251 retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle);
252
253 cp->c_touch_acctime = TRUE;
254
255 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END,
256 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
257
258 /*
259 * Keep track of blocks read
260 */
261 if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
262 int took_cnode_lock = 0;
263 off_t bytesread;
264
265 bytesread = start_resid - uio_resid(uio);
266
267 /* When ff_bytesread exceeds 32 bits, update it behind the cnode lock. */
268 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
269 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
270 took_cnode_lock = 1;
271 }
272 /*
273 * If this file hasn't been seen since the start of
274 * the current sampling period then start over.
275 */
276 if (cp->c_atime < hfsmp->hfc_timebase) {
277 struct timeval tv;
278
279 fp->ff_bytesread = bytesread;
280 microtime(&tv);
281 cp->c_atime = tv.tv_sec;
282 } else {
283 fp->ff_bytesread += bytesread;
284 }
285
286 if (!(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
287 //
288 // We don't add hotfiles for processes doing IO_EVTONLY I/O
289 // on the assumption that they're system processes such as
290 // mdworker which scan everything in the system (and thus
291 // do not represent user-initiated access to files)
292 //
293 (void) hfs_addhotfile(vp);
294 }
295 if (took_cnode_lock)
296 hfs_unlock(cp);
297 }
298 exit:
299 if (took_truncate_lock) {
300 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
301 }
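/* cluster_read returned EAGAIN because the I/O was throttled; wait out the throttle window, then retry from read_again. */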
302 if (retval == EAGAIN) {
303 throttle_lowpri_io(1);
304 throttled_count++;
305
306 retval = 0;
307 goto read_again;
308 }
309 if (throttled_count)
310 throttle_info_reset_window(NULL);
311 return (retval);
312 }
313
314 /*
315 * Ideally, this wouldn't be necessary; the cluster code should be
316 * able to handle this on the read-side. See <rdar://20420068>.
317 */
318 static errno_t hfs_zero_eof_page(vnode_t vp, off_t zero_up_to)
319 {
320 hfs_assert(VTOC(vp)->c_lockowner != current_thread());
321 hfs_assert(VTOC(vp)->c_truncatelockowner == current_thread());
322
323 struct filefork *fp = VTOF(vp);
324
325 if (!(fp->ff_size & PAGE_MASK_64) || zero_up_to <= fp->ff_size) {
326 // Nothing to do
327 return 0;
328 }
329
330 zero_up_to = MIN(zero_up_to, (off_t)round_page_64(fp->ff_size));
331
332 /* N.B. At present, @zero_up_to is not important because the cluster
333 code will always zero up to the end of the page anyway. */
334 return cluster_write(vp, NULL, fp->ff_size, zero_up_to,
335 fp->ff_size, 0, IO_HEADZEROFILL);
336 }
337
338 /*
339 * Write data to a file.
340 */
341 int
342 hfs_vnop_write(struct vnop_write_args *ap)
343 {
344 uio_t uio = ap->a_uio;
345 struct vnode *vp = ap->a_vp;
346 struct cnode *cp;
347 struct filefork *fp;
348 struct hfsmount *hfsmp;
349 kauth_cred_t cred = NULL;
350 off_t origFileSize;
351 off_t writelimit;
352 off_t bytesToAdd = 0;
353 off_t actualBytesAdded;
354 off_t filebytes;
355 off_t offset;
356 ssize_t resid;
357 int eflags;
358 int ioflag = ap->a_ioflag;
359 int retval = 0;
360 int lockflags;
361 int cnode_locked = 0;
362 int partialwrite = 0;
363 int do_snapshot = 1;
364 time_t orig_ctime=VTOC(vp)->c_ctime;
365 int took_truncate_lock = 0;
366 int io_return_on_throttle = 0;
367 int throttled_count = 0;
368
369 #if HFS_COMPRESSION
370 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
371 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
372 switch(state) {
373 case FILE_IS_COMPRESSED:
374 return EACCES;
375 case FILE_IS_CONVERTING:
376 /* if FILE_IS_CONVERTING, we allow writes but do not
377 bother with snapshots or else we will deadlock.
378 */
379 do_snapshot = 0;
380 break;
381 default:
382 printf("invalid state %d for compressed file\n", state);
383 /* fall through */
384 }
385 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
386 int error;
387
388 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
389 if (error != 0) {
390 return error;
391 }
392 }
393
394 if (do_snapshot) {
395 nspace_snapshot_event(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
396 }
397
398 #endif
399
400 #if SECURE_KERNEL
401 if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
402 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
403 /* Don't allow unencrypted I/O requests from user space */
404 return EPERM;
405 }
406 #endif
407
408 resid = uio_resid(uio);
409 offset = uio_offset(uio);
410
411 if (offset < 0)
412 return (EINVAL);
413 if (resid == 0)
414 return (E_NONE);
415 if (!vnode_isreg(vp))
416 return (EPERM); /* Can only write regular files */
417
418 cp = VTOC(vp);
419 fp = VTOF(vp);
420 hfsmp = VTOHFS(vp);
421
422 #if CONFIG_PROTECT
423 if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
424 goto exit;
425 }
426 #endif
427
428 eflags = kEFDeferMask; /* defer file block allocations */
429 #if HFS_SPARSE_DEV
430 /*
431 * When the underlying device is sparse and space
432 * is low (< 8MB), stop doing delayed allocations
433 * and begin doing synchronous I/O.
434 */
435 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
436 (hfs_freeblks(hfsmp, 0) < 2048)) {
437 eflags &= ~kEFDeferMask;
438 ioflag |= IO_SYNC;
439 }
440 #endif /* HFS_SPARSE_DEV */
441
442 if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) ==
443 (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
444 io_return_on_throttle = IO_RETURN_ON_THROTTLE;
445 }
446
447 again:
448 /*
449 * Protect against a size change.
450 *
451 * Note: If took_truncate_lock is true, then we previously got the lock shared
452 * but needed to upgrade to exclusive. So try getting it exclusive from the
453 * start.
454 */
455 if (ioflag & IO_APPEND || took_truncate_lock) {
456 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
457 }
458 else {
459 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
460 }
461 took_truncate_lock = 1;
462
463 /* Update UIO */
464 if (ioflag & IO_APPEND) {
465 uio_setoffset(uio, fp->ff_size);
466 offset = fp->ff_size;
467 }
468 if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
469 retval = EPERM;
470 goto exit;
471 }
472
473 cred = vfs_context_ucred(ap->a_context);
474 if (cred && suser(cred, NULL) != 0)
475 eflags |= kEFReserveMask;
476
477 origFileSize = fp->ff_size;
478 writelimit = offset + resid;
479
480 /*
481 * We may need an exclusive truncate lock for several reasons, all
482 * of which are because we may be writing to a (portion of a) block
483 * for the first time, and we need to make sure no readers see the
484 * prior, uninitialized contents of the block. The cases are:
485 *
486 * 1. We have unallocated (delayed allocation) blocks. We may be
487 * allocating new blocks to the file and writing to them.
488 * (A more precise check would be whether the range we're writing
489 * to contains delayed allocation blocks.)
490 * 2. We need to extend the file. The bytes between the old EOF
491 * and the new EOF are not yet initialized. This is important
492 * even if we're not allocating new blocks to the file. If the
493 * old EOF and new EOF are in the same block, we still need to
494 * protect that range of bytes until they are written for the
495 * first time.
496 *
497 * If we hold a shared lock and any of the above cases apply, we need to try to upgrade
498 * to an exclusive lock. If the upgrade fails, we will lose the shared
499 * lock, and will need to take the truncate lock again; the took_truncate_lock
500 * flag will still be set, causing us to try for an exclusive lock next time.
501 */
502 if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
503 ((fp->ff_unallocblocks != 0) ||
504 (writelimit > origFileSize))) {
505 if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
506 /*
507 * Lock upgrade failed and we lost our shared lock, try again.
508 * Note: we do not set took_truncate_lock=0 here. Leaving it
509 * set to 1 will cause us to try to get the lock exclusive.
510 */
511 goto again;
512 }
513 else {
514 /* Store the owner in the c_truncatelockowner field if we successfully upgrade */
515 cp->c_truncatelockowner = current_thread();
516 }
517 }
518
519 if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
520 goto exit;
521 }
522 cnode_locked = 1;
523
524 filebytes = hfs_blk_to_bytes(fp->ff_blocks, hfsmp->blockSize);
525
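/* If the write begins past the currently allocated bytes, make sure there is enough free space to cover the gap before going any further. */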
526 if (offset > filebytes
527 && (hfs_blk_to_bytes(hfs_freeblks(hfsmp, ISSET(eflags, kEFReserveMask)),
528 hfsmp->blockSize) < offset - filebytes)) {
529 retval = ENOSPC;
530 goto exit;
531 }
532
533 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START,
534 (int)offset, uio_resid(uio), (int)fp->ff_size,
535 (int)filebytes, 0);
536
537 /* If the write fits within the currently allocated blocks, we do not need to extend the file */
538 if (writelimit <= filebytes) {
539 goto sizeok;
540 }
541
542 bytesToAdd = writelimit - filebytes;
543
544 #if QUOTA
545 retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
546 cred, 0);
547 if (retval)
548 goto exit;
549 #endif /* QUOTA */
550
551 if (hfs_start_transaction(hfsmp) != 0) {
552 retval = EINVAL;
553 goto exit;
554 }
555
556 while (writelimit > filebytes) {
557 bytesToAdd = writelimit - filebytes;
558
559 /* Protect extents b-tree and allocation bitmap */
560 lockflags = SFL_BITMAP;
561 if (overflow_extents(fp))
562 lockflags |= SFL_EXTENTS;
563 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
564
565 /* Files that are changing size are not hot file candidates. */
566 if (hfsmp->hfc_stage == HFC_RECORDING) {
567 fp->ff_bytesread = 0;
568 }
569 retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
570 0, eflags, &actualBytesAdded));
571
572 hfs_systemfile_unlock(hfsmp, lockflags);
573
574 if ((actualBytesAdded == 0) && (retval == E_NONE))
575 retval = ENOSPC;
576 if (retval != E_NONE)
577 break;
578 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
579 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE,
580 (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
581 }
582 (void) hfs_update(vp, 0);
583 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
584 (void) hfs_end_transaction(hfsmp);
585
586 /*
587 * If we didn't grow the file enough, try a partial write.
588 * POSIX expects this behavior.
589 */
590 if ((retval == ENOSPC) && (filebytes > offset)) {
591 retval = 0;
592 partialwrite = 1;
593 uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
594 resid -= bytesToAdd;
595 writelimit = filebytes;
596 }
597 sizeok:
598 if (retval == E_NONE) {
599 off_t filesize;
600 off_t head_off;
601 int lflag;
602
603 if (writelimit > fp->ff_size) {
604 filesize = writelimit;
605 struct timeval tv;
606 rl_add(fp->ff_size, writelimit - 1 , &fp->ff_invalidranges);
607 microuptime(&tv);
608 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
609 } else
610 filesize = fp->ff_size;
611
612 lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
613
614 /*
615 * We no longer use IO_HEADZEROFILL or IO_TAILZEROFILL (except
616 * for one case below). For the regions that lie before the
617 * beginning and after the end of this write that are in the
618 * same page, we let the cluster code handle zeroing that out
619 * if necessary. If those areas are not cached, the cluster
620 * code will try to read those areas in, and in the case
621 * where those regions have never been written to,
622 * hfs_vnop_blockmap will consult the invalid ranges and then
623 * indicate that. The cluster code will zero out those areas.
624 */
625
626 head_off = trunc_page_64(offset);
627
628 if (head_off < offset && head_off >= fp->ff_size) {
629 /*
630 * The first page is beyond current EOF, so as an
631 * optimisation, we can pass IO_HEADZEROFILL.
632 */
633 lflag |= IO_HEADZEROFILL;
634 }
635
636 hfs_unlock(cp);
637 cnode_locked = 0;
638
639 /*
640 * We need to tell UBC the fork's new size BEFORE calling
641 * cluster_write, in case any of the new pages need to be
642 * paged out before cluster_write completes (which does happen
643 * in embedded systems due to extreme memory pressure).
644 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
645 * will be, so that it can pass that on to cluster_pageout, and
646 * allow those pageouts.
647 *
648 * We don't update ff_size yet since we don't want pageins to
649 * be able to see uninitialized data between the old and new
650 * EOF, until cluster_write has completed and initialized that
651 * part of the file.
652 *
653 * The vnode pager relies on the file size last given to UBC via
654 * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or
655 * ff_size (whichever is larger). NOTE: ff_new_size is always
656 * zero, unless we are extending the file via write.
657 */
658 if (filesize > fp->ff_size) {
659 retval = hfs_zero_eof_page(vp, offset);
660 if (retval)
661 goto exit;
662 fp->ff_new_size = filesize;
663 ubc_setsize(vp, filesize);
664 }
665 retval = cluster_write(vp, uio, fp->ff_size, filesize, head_off,
666 0, lflag | IO_NOZERODIRTY | io_return_on_throttle);
667 if (retval) {
668 fp->ff_new_size = 0; /* no longer extending; use ff_size */
669
670 if (retval == EAGAIN) {
671 /*
672 * EAGAIN indicates that we still have I/O to do, but
673 * that we now need to be throttled
674 */
675 if (resid != uio_resid(uio)) {
676 /*
677 * did manage to do some I/O before returning EAGAIN
678 */
679 resid = uio_resid(uio);
680 offset = uio_offset(uio);
681
682 cp->c_touch_chgtime = TRUE;
683 cp->c_touch_modtime = TRUE;
684 hfs_incr_gencount(cp);
685 }
686 if (filesize > fp->ff_size) {
687 /*
688 * we called ubc_setsize before the call to
689 * cluster_write... since we only partially
690 * completed the I/O, we need to
691 * re-adjust our idea of the filesize based
692 * on our interim EOF
693 */
694 ubc_setsize(vp, offset);
695
696 fp->ff_size = offset;
697 }
698 goto exit;
699 }
700 if (filesize > origFileSize) {
701 ubc_setsize(vp, origFileSize);
702 }
703 goto ioerr_exit;
704 }
705
706 if (filesize > origFileSize) {
707 fp->ff_size = filesize;
708
709 /* Files that are changing size are not hot file candidates. */
710 if (hfsmp->hfc_stage == HFC_RECORDING) {
711 fp->ff_bytesread = 0;
712 }
713 }
714 fp->ff_new_size = 0; /* ff_size now has the correct size */
715 }
716 if (partialwrite) {
717 uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
718 resid += bytesToAdd;
719 }
720
721 if (vnode_should_flush_after_write(vp, ioflag))
722 hfs_flush(hfsmp, HFS_FLUSH_CACHE);
723
724 ioerr_exit:
725 if (!cnode_locked) {
726 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
727 cnode_locked = 1;
728 }
729
730 if (resid > uio_resid(uio)) {
731 cp->c_touch_chgtime = TRUE;
732 cp->c_touch_modtime = TRUE;
733 hfs_incr_gencount(cp);
734
735 /*
736 * If we successfully wrote any data and we are not the superuser,
737 * we clear the setuid and setgid bits as a precaution against
738 * tampering.
739 */
740 if (cp->c_mode & (S_ISUID | S_ISGID)) {
741 cred = vfs_context_ucred(ap->a_context);
742 if (cred && suser(cred, NULL)) {
743 cp->c_mode &= ~(S_ISUID | S_ISGID);
744 }
745 }
746 }
747 if (retval) {
748 if (ioflag & IO_UNIT) {
749 (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
750 0, ap->a_context);
751 uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
752 uio_setresid(uio, resid);
753 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
754 }
755 } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio)))
756 retval = hfs_update(vp, 0);
757
758 /* Updating vcbWrCnt doesn't need to be atomic. */
759 hfsmp->vcbWrCnt++;
760
761 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END,
762 (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
763 exit:
764 if (retval && took_truncate_lock
765 && cp->c_truncatelockowner == current_thread()) {
766 fp->ff_new_size = 0;
767 rl_remove(fp->ff_size, RL_INFINITY, &fp->ff_invalidranges);
768 }
769
770 if (cnode_locked)
771 hfs_unlock(cp);
772
773 if (took_truncate_lock) {
774 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
775 }
776 if (retval == EAGAIN) {
777 throttle_lowpri_io(1);
778 throttled_count++;
779
780 retval = 0;
781 goto again;
782 }
783 if (throttled_count)
784 throttle_info_reset_window(NULL);
785 return (retval);
786 }
787
788 /* support for the "bulk-access" fcntl */
789
790 #define CACHE_LEVELS 16
791 #define NUM_CACHE_ENTRIES (64*16)
792 #define PARENT_IDS_FLAG 0x100
793
794 struct access_cache {
795 int numcached;
796 int cachehits; /* these two for statistics gathering */
797 int lookups;
798 unsigned int *acache;
799 unsigned char *haveaccess;
800 };
801
802 struct access_t {
803 uid_t uid; /* IN: effective user id */
804 short flags; /* IN: access requested (i.e. R_OK) */
805 short num_groups; /* IN: number of groups user belongs to */
806 int num_files; /* IN: number of files to process */
807 int *file_ids; /* IN: array of file ids */
808 gid_t *groups; /* IN: array of groups */
809 short *access; /* OUT: access info for each file (0 for 'has access') */
810 } __attribute__((unavailable)); // this structure is for reference purposes only
811
812 struct user32_access_t {
813 uid_t uid; /* IN: effective user id */
814 short flags; /* IN: access requested (i.e. R_OK) */
815 short num_groups; /* IN: number of groups user belongs to */
816 int num_files; /* IN: number of files to process */
817 user32_addr_t file_ids; /* IN: array of file ids */
818 user32_addr_t groups; /* IN: array of groups */
819 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
820 };
821
822 struct user64_access_t {
823 uid_t uid; /* IN: effective user id */
824 short flags; /* IN: access requested (i.e. R_OK) */
825 short num_groups; /* IN: number of groups user belongs to */
826 int num_files; /* IN: number of files to process */
827 user64_addr_t file_ids; /* IN: array of file ids */
828 user64_addr_t groups; /* IN: array of groups */
829 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
830 };
831
832
833 // these are the "extended" versions of the above structures
834 // note that it is crucial that they have a different size than
835 // the regular versions
836 struct ext_access_t {
837 uint32_t flags; /* IN: access requested (i.e. R_OK) */
838 uint32_t num_files; /* IN: number of files to process */
839 uint32_t map_size; /* IN: size of the bit map */
840 uint32_t *file_ids; /* IN: Array of file ids */
841 char *bitmap; /* OUT: hash-bitmap of interesting directory ids */
842 short *access; /* OUT: access info for each file (0 for 'has access') */
843 uint32_t num_parents; /* future use */
844 cnid_t *parents; /* future use */
845 } __attribute__((unavailable)); // this structure is for reference purposes only
846
847 struct user32_ext_access_t {
848 uint32_t flags; /* IN: access requested (i.e. R_OK) */
849 uint32_t num_files; /* IN: number of files to process */
850 uint32_t map_size; /* IN: size of the bit map */
851 user32_addr_t file_ids; /* IN: Array of file ids */
852 user32_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */
853 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
854 uint32_t num_parents; /* future use */
855 user32_addr_t parents; /* future use */
856 };
857
858 struct user64_ext_access_t {
859 uint32_t flags; /* IN: access requested (i.e. R_OK) */
860 uint32_t num_files; /* IN: number of files to process */
861 uint32_t map_size; /* IN: size of the bit map */
862 user64_addr_t file_ids; /* IN: array of file ids */
863 user64_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */
864 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
865 uint32_t num_parents;/* future use */
866 user64_addr_t parents;/* future use */
867 };
868
869
870 /*
871 * Perform a binary search for the given parent_id. Return value is
872 * the index if there is a match. If no_match_indexp is non-NULL it
873 * will be assigned the index at which to insert the item (even if no
874 * match was found).
875 */
876 static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
877 {
878 int index=-1;
879 unsigned int lo=0;
880
881 do {
882 unsigned int mid = ((hi - lo)/2) + lo;
883 unsigned int this_id = array[mid];
884
885 if (parent_id == this_id) {
886 hi = mid;
887 break;
888 }
889
890 if (parent_id < this_id) {
891 hi = mid;
892 continue;
893 }
894
895 if (parent_id > this_id) {
896 lo = mid + 1;
897 continue;
898 }
899 } while(lo < hi);
900
901 /* check if lo and hi converged on the match */
902 if (parent_id == array[hi]) {
903 index = hi;
904 }
905
906 if (no_match_indexp) {
907 *no_match_indexp = hi;
908 }
909
910 return index;
911 }
912
913
914 static int
915 lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
916 {
917 unsigned int hi;
918 int matches = 0;
919 int index, no_match_index;
920
921 if (cache->numcached == 0) {
922 *indexp = 0;
923 return 0; // table is empty, so insert at index=0 and report no match
924 }
925
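/* Defensive clamp so we never index past the allocated cache arrays. */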
926 if (cache->numcached > NUM_CACHE_ENTRIES) {
927 cache->numcached = NUM_CACHE_ENTRIES;
928 }
929
930 hi = cache->numcached - 1;
931
932 index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
933
934 /* if no existing entry found, find index for new one */
935 if (index == -1) {
936 index = no_match_index;
937 matches = 0;
938 } else {
939 matches = 1;
940 }
941
942 *indexp = index;
943 return matches;
944 }
945
946 /*
947 * Add a node to the access_cache at the given index (or do a lookup first
948 * to find the index if -1 is passed in). We currently do a replace rather
949 * than an insert if the cache is full.
950 */
951 static void
952 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
953 {
954 int lookup_index = -1;
955
956 /* need to do a lookup first if -1 passed for index */
957 if (index == -1) {
958 if (lookup_bucket(cache, &lookup_index, nodeID)) {
959 if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
960 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
961 cache->haveaccess[lookup_index] = access;
962 }
963
964 /* mission accomplished */
965 return;
966 } else {
967 index = lookup_index;
968 }
969
970 }
971
972 /* if the cache is full, do a replace rather than an insert */
973 if (cache->numcached >= NUM_CACHE_ENTRIES) {
974 cache->numcached = NUM_CACHE_ENTRIES-1;
975
976 if (index > cache->numcached) {
977 index = cache->numcached;
978 }
979 }
980
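/* Keep the cache sorted: if the new node id sorts after the entry currently at index, move the insertion point one slot to the right. */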
981 if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
982 index++;
983 }
984
985 if (index >= 0 && index < cache->numcached) {
986 /* only do bcopy if we're inserting */
987 bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
988 bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
989 }
990
991 cache->acache[index] = nodeID;
992 cache->haveaccess[index] = access;
993 cache->numcached++;
994 }
995
996
997 struct cinfo {
998 uid_t uid;
999 gid_t gid;
1000 mode_t mode;
1001 cnid_t parentcnid;
1002 u_int16_t recflags;
1003 };
1004
1005 static int
1006 snoop_callback(const cnode_t *cp, void *arg)
1007 {
1008 struct cinfo *cip = arg;
1009
1010 cip->uid = cp->c_uid;
1011 cip->gid = cp->c_gid;
1012 cip->mode = cp->c_mode;
1013 cip->parentcnid = cp->c_parentcnid;
1014 cip->recflags = cp->c_attr.ca_recflags;
1015
1016 return (0);
1017 }
1018
1019 /*
1020 * Look up the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
1021 * isn't in core, then go to the catalog.
1022 */
1023 static int
1024 do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
1025 struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
1026 {
1027 int error = 0;
1028
1029 /* if this id matches the one the fsctl was called with, skip the lookup */
1030 if (cnid == skip_cp->c_cnid) {
1031 cnattrp->ca_uid = skip_cp->c_uid;
1032 cnattrp->ca_gid = skip_cp->c_gid;
1033 cnattrp->ca_mode = skip_cp->c_mode;
1034 cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
1035 keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
1036 } else {
1037 struct cinfo c_info;
1038
1039 /* otherwise, check the cnode hash in case the file/dir is in core */
1040 error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info);
1041
1042 if (error == EACCES) {
1043 // File is deleted
1044 return ENOENT;
1045 } else if (!error) {
1046 cnattrp->ca_uid = c_info.uid;
1047 cnattrp->ca_gid = c_info.gid;
1048 cnattrp->ca_mode = c_info.mode;
1049 cnattrp->ca_recflags = c_info.recflags;
1050 keyp->hfsPlus.parentID = c_info.parentcnid;
1051 } else {
1052 int lockflags;
1053
1054 if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
1055 throttle_lowpri_io(1);
1056
1057 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
1058
1059 /* lookup this cnid in the catalog */
1060 error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
1061
1062 hfs_systemfile_unlock(hfsmp, lockflags);
1063
1064 cache->lookups++;
1065 }
1066 }
1067
1068 return (error);
1069 }
1070
1071
1072 /*
1073 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
1074 * up to CACHE_LEVELS as we progress towards the root.
1075 */
1076 static int
1077 do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
1078 struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
1079 struct vfs_context *my_context,
1080 char *bitmap,
1081 uint32_t map_size,
1082 cnid_t* parents,
1083 uint32_t num_parents)
1084 {
1085 int myErr = 0;
1086 int myResult;
1087 HFSCatalogNodeID thisNodeID;
1088 unsigned int myPerms;
1089 struct cat_attr cnattr;
1090 int cache_index = -1, scope_index = -1, scope_idx_start = -1;
1091 CatalogKey catkey;
1092
1093 int i = 0, ids_to_cache = 0;
1094 int parent_ids[CACHE_LEVELS];
1095
1096 thisNodeID = nodeID;
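/* Walk from nodeID up toward the root, checking access at each directory level. */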
1097 while (thisNodeID >= kRootDirID) {
1098 myResult = 0; /* default to "no access" */
1099
1100 /* check the cache before resorting to hitting the catalog */
1101
1102 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
1103 * to look any further after hitting cached dir */
1104
1105 if (lookup_bucket(cache, &cache_index, thisNodeID)) {
1106 cache->cachehits++;
1107 myErr = cache->haveaccess[cache_index];
1108 if (scope_index != -1) {
1109 if (myErr == ESRCH) {
1110 myErr = 0;
1111 }
1112 } else {
1113 scope_index = 0; // so we'll just use the cache result
1114 scope_idx_start = ids_to_cache;
1115 }
1116 myResult = (myErr == 0) ? 1 : 0;
1117 goto ExitThisRoutine;
1118 }
1119
1120
1121 if (parents) {
1122 int tmp;
1123 tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
1124 if (scope_index == -1)
1125 scope_index = tmp;
1126 if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
1127 scope_idx_start = ids_to_cache;
1128 }
1129 }
1130
1131 /* remember which parents we want to cache */
1132 if (ids_to_cache < CACHE_LEVELS) {
1133 parent_ids[ids_to_cache] = thisNodeID;
1134 ids_to_cache++;
1135 }
1136 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
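/* Mark this directory id in the caller-supplied bitmap: bit (id & 7) of byte ((id / 8) % map_size). */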
1137 if (bitmap && map_size) {
1138 bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
1139 }
1140
1141
1142 /* do the lookup (checks the cnode hash, then the catalog) */
1143 myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
1144 if (myErr) {
1145 goto ExitThisRoutine; /* no access */
1146 }
1147
1148 /* Root always gets access. */
1149 if (suser(myp_ucred, NULL) == 0) {
1150 thisNodeID = catkey.hfsPlus.parentID;
1151 myResult = 1;
1152 continue;
1153 }
1154
1155 // if the thing has ACLs, do the full permission check
1156 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1157 struct vnode *vp;
1158
1159 /* get the vnode for this cnid */
1160 myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
1161 if ( myErr ) {
1162 myResult = 0;
1163 goto ExitThisRoutine;
1164 }
1165
1166 thisNodeID = VTOC(vp)->c_parentcnid;
1167
1168 hfs_unlock(VTOC(vp));
1169
1170 if (vnode_vtype(vp) == VDIR) {
1171 myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
1172 } else {
1173 myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
1174 }
1175
1176 vnode_put(vp);
1177 if (myErr) {
1178 myResult = 0;
1179 goto ExitThisRoutine;
1180 }
1181 } else {
1182 unsigned int flags;
1183 int mode = cnattr.ca_mode & S_IFMT;
1184 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);
1185
1186 if (mode == S_IFDIR) {
1187 flags = R_OK | X_OK;
1188 } else {
1189 flags = R_OK;
1190 }
1191 if ( (myPerms & flags) != flags) {
1192 myResult = 0;
1193 myErr = EACCES;
1194 goto ExitThisRoutine; /* no access */
1195 }
1196
1197 /* up the hierarchy we go */
1198 thisNodeID = catkey.hfsPlus.parentID;
1199 }
1200 }
1201
1202 /* if here, we have access to this node */
1203 myResult = 1;
1204
1205 ExitThisRoutine:
1206 if (parents && myErr == 0 && scope_index == -1) {
1207 myErr = ESRCH;
1208 }
1209
1210 if (myErr) {
1211 myResult = 0;
1212 }
1213 *err = myErr;
1214
1215 /* cache the parent directory(ies) */
1216 for (i = 0; i < ids_to_cache; i++) {
1217 if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
1218 add_node(cache, -1, parent_ids[i], ESRCH);
1219 } else {
1220 add_node(cache, -1, parent_ids[i], myErr);
1221 }
1222 }
1223
1224 return (myResult);
1225 }
1226
1227 static int
1228 do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
1229 struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
1230 {
1231 boolean_t is64bit;
1232
1233 /*
1234 * NOTE: on entry, the vnode has an io_ref. In case this vnode
1235 * happens to be in our list of file_ids, we'll note it and
1236 * avoid calling hfs_chashget_nowait() on that id, as that
1237 * will cause a "locking against myself" panic.
1238 */
1239 Boolean check_leaf = true;
1240
1241 struct user64_ext_access_t *user_access_structp;
1242 struct user64_ext_access_t tmp_user_access;
1243 struct access_cache cache;
1244
1245 int error = 0, prev_parent_check_ok=1;
1246 unsigned int i;
1247
1248 short flags;
1249 unsigned int num_files = 0;
1250 int map_size = 0;
1251 int num_parents = 0;
1252 int *file_ids=NULL;
1253 short *access=NULL;
1254 char *bitmap=NULL;
1255 cnid_t *parents=NULL;
1256 int leaf_index;
1257
1258 cnid_t cnid;
1259 cnid_t prevParent_cnid = 0;
1260 unsigned int myPerms;
1261 short myaccess = 0;
1262 struct cat_attr cnattr;
1263 CatalogKey catkey;
1264 struct cnode *skip_cp = VTOC(vp);
1265 kauth_cred_t cred = vfs_context_ucred(context);
1266 proc_t p = vfs_context_proc(context);
1267
1268 is64bit = proc_is64bit(p);
1269
1270 /* initialize the local cache and buffers */
1271 cache.numcached = 0;
1272 cache.cachehits = 0;
1273 cache.lookups = 0;
1274 cache.acache = NULL;
1275 cache.haveaccess = NULL;
1276
1277 /* struct copyin done during dispatch... need to copy file_id array separately */
1278 if (ap->a_data == NULL) {
1279 error = EINVAL;
1280 goto err_exit_bulk_access;
1281 }
1282
1283 if (is64bit) {
1284 if (arg_size != sizeof(struct user64_ext_access_t)) {
1285 error = EINVAL;
1286 goto err_exit_bulk_access;
1287 }
1288
1289 user_access_structp = (struct user64_ext_access_t *)ap->a_data;
1290
1291 } else if (arg_size == sizeof(struct user32_access_t)) {
1292 struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
1293
1294 // convert an old style bulk-access struct to the new style
1295 tmp_user_access.flags = accessp->flags;
1296 tmp_user_access.num_files = accessp->num_files;
1297 tmp_user_access.map_size = 0;
1298 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1299 tmp_user_access.bitmap = USER_ADDR_NULL;
1300 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1301 tmp_user_access.num_parents = 0;
1302 user_access_structp = &tmp_user_access;
1303
1304 } else if (arg_size == sizeof(struct user32_ext_access_t)) {
1305 struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
1306
1307 // up-cast from a 32-bit version of the struct
1308 tmp_user_access.flags = accessp->flags;
1309 tmp_user_access.num_files = accessp->num_files;
1310 tmp_user_access.map_size = accessp->map_size;
1311 tmp_user_access.num_parents = accessp->num_parents;
1312
1313 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1314 tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
1315 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1316 tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);
1317
1318 user_access_structp = &tmp_user_access;
1319 } else {
1320 error = EINVAL;
1321 goto err_exit_bulk_access;
1322 }
1323
1324 map_size = user_access_structp->map_size;
1325
1326 num_files = user_access_structp->num_files;
1327
1328 num_parents= user_access_structp->num_parents;
1329
1330 if (num_files < 1) {
1331 goto err_exit_bulk_access;
1332 }
1333 if (num_files > 1024) {
1334 error = EINVAL;
1335 goto err_exit_bulk_access;
1336 }
1337
1338 if (num_parents > 1024) {
1339 error = EINVAL;
1340 goto err_exit_bulk_access;
1341 }
1342
1343 file_ids = hfs_malloc(sizeof(int) * num_files);
1344 access = hfs_malloc(sizeof(short) * num_files);
1345 if (map_size) {
1346 bitmap = hfs_mallocz(sizeof(char) * map_size);
1347 }
1348
1349 if (num_parents) {
1350 parents = hfs_malloc(sizeof(cnid_t) * num_parents);
1351 }
1352
1353 cache.acache = hfs_malloc(sizeof(int) * NUM_CACHE_ENTRIES);
1354 cache.haveaccess = hfs_malloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1355
1356 if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1357 num_files * sizeof(int)))) {
1358 goto err_exit_bulk_access;
1359 }
1360
1361 if (num_parents) {
1362 if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1363 num_parents * sizeof(cnid_t)))) {
1364 goto err_exit_bulk_access;
1365 }
1366 }
1367
1368 flags = user_access_structp->flags;
1369 if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1370 flags = R_OK;
1371 }
1372
1373 /* check if we've been passed leaf node ids or parent ids */
1374 if (flags & PARENT_IDS_FLAG) {
1375 check_leaf = false;
1376 }
1377
1378 /* Check access to each file_id passed in */
1379 for (i = 0; i < num_files; i++) {
1380 leaf_index=-1;
1381 cnid = (cnid_t) file_ids[i];
1382
1383 /* root always has access */
1384 if ((!parents) && (!suser(cred, NULL))) {
1385 access[i] = 0;
1386 continue;
1387 }
1388
1389 if (check_leaf) {
1390 /* do the lookup (checks the cnode hash, then the catalog) */
1391 error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
1392 if (error) {
1393 access[i] = (short) error;
1394 continue;
1395 }
1396
1397 if (parents) {
1398 // Check if the leaf matches one of the parent scopes
1399 leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1400 if (leaf_index >= 0 && parents[leaf_index] == cnid)
1401 prev_parent_check_ok = 0;
1402 else if (leaf_index >= 0)
1403 prev_parent_check_ok = 1;
1404 }
1405
1406 // if the thing has ACLs, do the full permission check
1407 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1408 struct vnode *cvp;
1409 int myErr = 0;
1410 /* get the vnode for this cnid */
1411 myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
1412 if ( myErr ) {
1413 access[i] = myErr;
1414 continue;
1415 }
1416
1417 hfs_unlock(VTOC(cvp));
1418
1419 if (vnode_vtype(cvp) == VDIR) {
1420 myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1421 } else {
1422 myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1423 }
1424
1425 vnode_put(cvp);
1426 if (myErr) {
1427 access[i] = myErr;
1428 continue;
1429 }
1430 } else {
1431 /* before calling CheckAccess(), check the target file for read access */
1432 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1433 cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1434
1435 /* fail fast if no access */
1436 if ((myPerms & flags) == 0) {
1437 access[i] = EACCES;
1438 continue;
1439 }
1440 }
1441 } else {
1442 /* we were passed an array of parent ids */
1443 catkey.hfsPlus.parentID = cnid;
1444 }
1445
1446 /* if the last guy had the same parent and had access, we're done */
1447 if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
1448 cache.cachehits++;
1449 access[i] = 0;
1450 continue;
1451 }
1452
1453 myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1454 skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
1455
1456 if (myaccess || (error == ESRCH && leaf_index != -1)) {
1457 access[i] = 0; // have access.. no errors to report
1458 } else {
1459 access[i] = (error != 0 ? (short) error : EACCES);
1460 }
1461
1462 prevParent_cnid = catkey.hfsPlus.parentID;
1463 }
1464
1465 /* copyout the access array */
1466 if ((error = copyout((caddr_t)access, user_access_structp->access,
1467 num_files * sizeof (short)))) {
1468 goto err_exit_bulk_access;
1469 }
1470 if (map_size && bitmap) {
1471 if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1472 map_size * sizeof (char)))) {
1473 goto err_exit_bulk_access;
1474 }
1475 }
1476
1477
1478 err_exit_bulk_access:
1479
1480 hfs_free(file_ids, sizeof(int) * num_files);
1481 hfs_free(parents, sizeof(cnid_t) * num_parents);
1482 hfs_free(bitmap, sizeof(char) * map_size);
1483 hfs_free(access, sizeof(short) * num_files);
1484 hfs_free(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1485 hfs_free(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1486
1487 return (error);
1488 }
1489
1490
1491 /* end "bulk-access" support */
1492
1493
1494 /*
1495 * Control filesystem operating characteristics.
1496 */
1497 int
1498 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1499 vnode_t a_vp;
1500 long a_command;
1501 caddr_t a_data;
1502 int a_fflag;
1503 vfs_context_t a_context;
1504 } */ *ap)
1505 {
1506 struct vnode * vp = ap->a_vp;
1507 struct hfsmount *hfsmp = VTOHFS(vp);
1508 vfs_context_t context = ap->a_context;
1509 kauth_cred_t cred = vfs_context_ucred(context);
1510 proc_t p = vfs_context_proc(context);
1511 struct vfsstatfs *vfsp;
1512 boolean_t is64bit;
1513 off_t jnl_start, jnl_size;
1514 struct hfs_journal_info *jip;
1515 #if HFS_COMPRESSION
1516 int compressed = 0;
1517 off_t uncompressed_size = -1;
1518 int decmpfs_error = 0;
1519
1520 if (ap->a_command == F_RDADVISE) {
1521 /* we need to inspect the decmpfs state of the file as early as possible */
1522 compressed = hfs_file_is_compressed(VTOC(vp), 0);
1523 if (compressed) {
1524 if (VNODE_IS_RSRC(vp)) {
1525 /* if this is the resource fork, treat it as if it were empty */
1526 uncompressed_size = 0;
1527 } else {
1528 decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
1529 if (decmpfs_error != 0) {
1530 /* failed to get the uncompressed size, we'll check for this later */
1531 uncompressed_size = -1;
1532 }
1533 }
1534 }
1535 }
1536 #endif /* HFS_COMPRESSION */
1537
1538 is64bit = proc_is64bit(p);
1539
1540 #if CONFIG_PROTECT
1541 #if HFS_CONFIG_KEY_ROLL
1542 // The HFS_KEY_ROLL fsctl does its own access checks
1543 if (ap->a_command != HFS_KEY_ROLL)
1544 #endif
1545 {
1546 int error = 0;
1547 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
1548 return error;
1549 }
1550 }
1551 #endif /* CONFIG_PROTECT */
1552
1553 switch (ap->a_command) {
1554
1555 case HFS_GETPATH:
1556 {
1557 struct vnode *file_vp;
1558 cnid_t cnid;
1559 int outlen;
1560 char *bufptr;
1561 int error;
1562 int flags = 0;
1563
1564 /* Caller must be owner of file system. */
1565 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1566 if (suser(cred, NULL) &&
1567 kauth_cred_getuid(cred) != vfsp->f_owner) {
1568 return (EACCES);
1569 }
1570 /* Target vnode must be file system's root. */
1571 if (!vnode_isvroot(vp)) {
1572 return (EINVAL);
1573 }
1574 bufptr = (char *)ap->a_data;
1575 cnid = strtoul(bufptr, NULL, 10);
1576 if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
1577 flags |= BUILDPATH_VOLUME_RELATIVE;
1578 }
1579
1580 /* We need to call hfs_vfs_vget to leverage the code that will
1581 * fix the origin list for us if needed, as opposed to calling
1582 * hfs_vget, since we will need the parent for the build_path call.
1583 */
1584
1585 if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1586 return (error);
1587 }
1588
1589 error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context);
1590 vnode_put(file_vp);
1591
1592 return (error);
1593 }
1594
1595 case HFS_SET_MAX_DEFRAG_SIZE:
1596 {
1597 int error = 0; /* Assume success */
1598 u_int32_t maxsize = 0;
1599
1600 if (vnode_vfsisrdonly(vp)) {
1601 return (EROFS);
1602 }
1603 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1604 if (!kauth_cred_issuser(cred)) {
1605 return (EACCES); /* must be root */
1606 }
1607
1608 maxsize = *(u_int32_t *)ap->a_data;
1609
1610 hfs_lock_mount(hfsmp);
1611 if (maxsize > HFS_MAX_DEFRAG_SIZE) {
1612 error = EINVAL;
1613 }
1614 else {
1615 hfsmp->hfs_defrag_max = maxsize;
1616 }
1617 hfs_unlock_mount(hfsmp);
1618
1619 return (error);
1620 }
1621
1622 case HFS_FORCE_ENABLE_DEFRAG:
1623 {
1624 int error = 0; /* Assume success */
1625 u_int32_t do_enable = 0;
1626
1627 if (vnode_vfsisrdonly(vp)) {
1628 return (EROFS);
1629 }
1630 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1631 if (!kauth_cred_issuser(cred)) {
1632 return (EACCES); /* must be root */
1633 }
1634
1635 do_enable = *(u_int32_t *)ap->a_data;
1636
1637 hfs_lock_mount(hfsmp);
1638 if (do_enable != 0) {
1639 hfsmp->hfs_defrag_nowait = 1;
1640 }
1641 else {
1642 error = EINVAL;
1643 }
1644
1645 hfs_unlock_mount(hfsmp);
1646
1647 return (error);
1648 }
1649
1650
1651 case HFS_TRANSFER_DOCUMENT_ID:
1652 {
1653 struct cnode *cp = NULL;
1654 int error;
1655 u_int32_t to_fd = *(u_int32_t *)ap->a_data;
1656 struct fileproc *to_fp;
1657 struct vnode *to_vp;
1658 struct cnode *to_cp;
1659
1660 cp = VTOC(vp);
1661
1662 if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
1663 //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
1664 return error;
1665 }
1666 if ( (error = vnode_getwithref(to_vp)) ) {
1667 file_drop(to_fd);
1668 return error;
1669 }
1670
1671 if (VTOHFS(to_vp) != hfsmp) {
1672 error = EXDEV;
1673 goto transfer_cleanup;
1674 }
1675
1676 int need_unlock = 1;
1677 to_cp = VTOC(to_vp);
1678 error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1679 if (error != 0) {
1680 //printf("could not lock the pair of cnodes (error %d)\n", error);
1681 goto transfer_cleanup;
1682 }
1683
1684 if (!(cp->c_bsdflags & UF_TRACKED)) {
1685 error = EINVAL;
1686 } else if (to_cp->c_bsdflags & UF_TRACKED) {
1687 //
1688 // if the destination is already tracked, return an error
1689 // as otherwise it's a silent deletion of the target's
1690 // document-id
1691 //
1692 error = EEXIST;
1693 } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
1694 //
1695 // we can use the FndrExtendedFileInfo because the doc-id is the first
1695 // thing in both it and the ExtendedDirInfo struct, which is fixed in
1696 // format and cannot change layout
1698 //
1699 struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
1700 struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16);
1701
1702 if (f_extinfo->document_id == 0) {
1703 uint32_t new_id;
1704
1705 hfs_unlockpair(cp, to_cp); // have to unlock to be able to get a new-id
1706
1707 if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
1708 //
1709 // re-lock the pair now that we have the document-id
1710 //
1711 hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1712 f_extinfo->document_id = new_id;
1713 } else {
1714 goto transfer_cleanup;
1715 }
1716 }
1717
1718 to_extinfo->document_id = f_extinfo->document_id;
1719 f_extinfo->document_id = 0;
1720 //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
1721
1722 // make sure the destination is also UF_TRACKED
1723 to_cp->c_bsdflags |= UF_TRACKED;
1724 cp->c_bsdflags &= ~UF_TRACKED;
1725
1726 // mark the cnodes dirty
1727 cp->c_flag |= C_MODIFIED;
1728 to_cp->c_flag |= C_MODIFIED;
1729
1730 int lockflags;
1731 if ((error = hfs_start_transaction(hfsmp)) == 0) {
1732
1733 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
1734
1735 (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
1736 (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);
1737
1738 hfs_systemfile_unlock (hfsmp, lockflags);
1739 (void) hfs_end_transaction(hfsmp);
1740 }
1741
1742 add_fsevent(FSE_DOCID_CHANGED, context,
1743 FSE_ARG_DEV, hfsmp->hfs_raw_dev,
1744 FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode #
1745 FSE_ARG_INO, (ino64_t)to_cp->c_fileid, // dst inode #
1746 FSE_ARG_INT32, to_extinfo->document_id,
1747 FSE_ARG_DONE);
1748
1749 hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents
1750 need_unlock = 0;
1751
1752 if (need_fsevent(FSE_STAT_CHANGED, vp)) {
1753 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
1754 }
1755 if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
1756 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
1757 }
1758 }
1759
1760 if (need_unlock) {
1761 hfs_unlockpair(cp, to_cp);
1762 }
1763
1764 transfer_cleanup:
1765 vnode_put(to_vp);
1766 file_drop(to_fd);
1767
1768 return error;
1769 }
1770
1771
1772
1773 case HFS_PREV_LINK:
1774 case HFS_NEXT_LINK:
1775 {
1776 cnid_t linkfileid;
1777 cnid_t nextlinkid;
1778 cnid_t prevlinkid;
1779 int error;
1780
1781 /* Caller must be owner of file system. */
1782 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1783 if (suser(cred, NULL) &&
1784 kauth_cred_getuid(cred) != vfsp->f_owner) {
1785 return (EACCES);
1786 }
1787 /* Target vnode must be file system's root. */
1788 if (!vnode_isvroot(vp)) {
1789 return (EINVAL);
1790 }
1791 linkfileid = *(cnid_t *)ap->a_data;
1792 if (linkfileid < kHFSFirstUserCatalogNodeID) {
1793 return (EINVAL);
1794 }
1795 if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1796 return (error);
1797 }
1798 if (ap->a_command == HFS_NEXT_LINK) {
1799 *(cnid_t *)ap->a_data = nextlinkid;
1800 } else {
1801 *(cnid_t *)ap->a_data = prevlinkid;
1802 }
1803 return (0);
1804 }
1805
1806 case HFS_RESIZE_PROGRESS: {
1807
1808 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1809 if (suser(cred, NULL) &&
1810 kauth_cred_getuid(cred) != vfsp->f_owner) {
1811 return (EACCES); /* must be owner of file system */
1812 }
1813 if (!vnode_isvroot(vp)) {
1814 return (EINVAL);
1815 }
1816 /* file system must not be mounted read-only */
1817 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1818 return (EROFS);
1819 }
1820
1821 return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1822 }
1823
1824 case HFS_RESIZE_VOLUME: {
1825 u_int64_t newsize;
1826 u_int64_t cursize;
1827 int ret;
1828
1829 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1830 if (suser(cred, NULL) &&
1831 kauth_cred_getuid(cred) != vfsp->f_owner) {
1832 return (EACCES); /* must be owner of file system */
1833 }
1834 if (!vnode_isvroot(vp)) {
1835 return (EINVAL);
1836 }
1837
1838 /* filesystem must not be mounted read only */
1839 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1840 return (EROFS);
1841 }
1842 newsize = *(u_int64_t *)ap->a_data;
1843 cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1844
1845 if (newsize == cursize) {
1846 return (0);
1847 }
1848 IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeWillResize);
1849 if (newsize > cursize) {
1850 ret = hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1851 } else {
1852 ret = hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1853 }
1854 IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeDidResize);
1855 return (ret);
1856 }
1857 case HFS_CHANGE_NEXT_ALLOCATION: {
1858 int error = 0; /* Assume success */
1859 u_int32_t location;
1860
1861 if (vnode_vfsisrdonly(vp)) {
1862 return (EROFS);
1863 }
1864 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1865 if (suser(cred, NULL) &&
1866 kauth_cred_getuid(cred) != vfsp->f_owner) {
1867 return (EACCES); /* must be owner of file system */
1868 }
1869 if (!vnode_isvroot(vp)) {
1870 return (EINVAL);
1871 }
1872 hfs_lock_mount(hfsmp);
1873 location = *(u_int32_t *)ap->a_data;
1874 if ((location >= hfsmp->allocLimit) &&
1875 (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1876 error = EINVAL;
1877 goto fail_change_next_allocation;
1878 }
1879 /* Return previous value. */
1880 *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1881 if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1882 /* On magic value for location, set nextAllocation to next block
1883 * after metadata zone and set flag in mount structure to indicate
1884 * that nextAllocation should not be updated again.
1885 */
1886 if (hfsmp->hfs_metazone_end != 0) {
1887 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1888 }
1889 hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1890 } else {
1891 hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1892 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1893 }
1894 MarkVCBDirty(hfsmp);
1895 fail_change_next_allocation:
1896 hfs_unlock_mount(hfsmp);
1897 return (error);
1898 }
1899
1900 #if HFS_SPARSE_DEV
1901 case HFS_SETBACKINGSTOREINFO: {
1902 struct vnode * di_vp;
1903 struct hfs_backingstoreinfo *bsdata;
1904 int error = 0;
1905
1906 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1907 return (EROFS);
1908 }
1909 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1910 return (EALREADY);
1911 }
1912 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1913 if (suser(cred, NULL) &&
1914 kauth_cred_getuid(cred) != vfsp->f_owner) {
1915 return (EACCES); /* must be owner of file system */
1916 }
1917 bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1918 if (bsdata == NULL) {
1919 return (EINVAL);
1920 }
1921 if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1922 return (error);
1923 }
1924 if ((error = vnode_getwithref(di_vp))) {
1925 file_drop(bsdata->backingfd);
1926 return(error);
1927 }
1928
1929 if (vnode_mount(vp) == vnode_mount(di_vp)) {
1930 (void)vnode_put(di_vp);
1931 file_drop(bsdata->backingfd);
1932 return (EINVAL);
1933 }
1934
1935 // Dropped in unmount
1936 vnode_ref(di_vp);
1937
1938 hfs_lock_mount(hfsmp);
1939 hfsmp->hfs_backingvp = di_vp;
1940 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1941 hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4;
1942 hfs_unlock_mount(hfsmp);
1943
1944 /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
1945
1946 /*
1947 * If the sparse image is on a sparse image file (as opposed to a sparse
1948 * bundle), then we may need to limit the free space to the maximum size
1949 * of a file on that volume. So we query (using pathconf), and if we get
1950 * a meaningful result, we cache the number of blocks for later use in
1951 * hfs_freeblks().
1952 */
1953 hfsmp->hfs_backingfs_maxblocks = 0;
1954 if (vnode_vtype(di_vp) == VREG) {
1955 int terr;
1956 int hostbits;
1957 terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
1958 if (terr == 0 && hostbits != 0 && hostbits < 64) {
1959 u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
1960
1961 hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
1962 }
1963 }
1964
1965 /* The free extent cache is managed differently for sparse devices.
1966 * There is a window between when the volume is mounted and when the
1967 * device is marked as sparse, so the free extent cache for this
1968 * volume is currently initialized as for a normal volume (sorted by block
1969 * count). Reset the cache so that it will be rebuilt
1970 * for a sparse device (sorted by start block).
1971 */
1972 ResetVCBFreeExtCache(hfsmp);
1973
1974 (void)vnode_put(di_vp);
1975 file_drop(bsdata->backingfd);
1976 return (0);
1977 }
1978 case HFS_CLRBACKINGSTOREINFO: {
1979 struct vnode * tmpvp;
1980
1981 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1982 if (suser(cred, NULL) &&
1983 kauth_cred_getuid(cred) != vfsp->f_owner) {
1984 return (EACCES); /* must be owner of file system */
1985 }
1986 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1987 return (EROFS);
1988 }
1989
1990 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
1991 hfsmp->hfs_backingvp) {
1992
1993 hfs_lock_mount(hfsmp);
1994 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
1995 tmpvp = hfsmp->hfs_backingvp;
1996 hfsmp->hfs_backingvp = NULLVP;
1997 hfsmp->hfs_sparsebandblks = 0;
1998 hfs_unlock_mount(hfsmp);
1999
2000 vnode_rele(tmpvp);
2001 }
2002 return (0);
2003 }
2004 #endif /* HFS_SPARSE_DEV */
2005
2006 /* Change the next CNID stored in the VH */
2007 case HFS_CHANGE_NEXTCNID: {
2008 int error = 0; /* Assume success */
2009 u_int32_t fileid;
2010 int wraparound = 0;
2011 int lockflags = 0;
2012
2013 if (vnode_vfsisrdonly(vp)) {
2014 return (EROFS);
2015 }
2016 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2017 if (suser(cred, NULL) &&
2018 kauth_cred_getuid(cred) != vfsp->f_owner) {
2019 return (EACCES); /* must be owner of file system */
2020 }
2021
2022 fileid = *(u_int32_t *)ap->a_data;
2023
2024 /* Must have catalog lock excl. to advance the CNID pointer */
2025 lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK);
2026
2027 hfs_lock_mount(hfsmp);
2028
2029 /* If it is less than the current next CNID, force the wraparound bit to be set */
2030 if (fileid < hfsmp->vcbNxtCNID) {
2031 wraparound=1;
2032 }
2033
2034 /* Return previous value. */
2035 *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
2036
2037 hfsmp->vcbNxtCNID = fileid;
2038
2039 if (wraparound) {
2040 hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
2041 }
2042
2043 MarkVCBDirty(hfsmp);
2044 hfs_unlock_mount(hfsmp);
2045 hfs_systemfile_unlock (hfsmp, lockflags);
2046
2047 return (error);
2048 }
2049
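/*
 * Freeze the file system.  Only journaled volumes can be frozen, and
 * only the superuser or the owner of the file system may do so.
 */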
2050 case F_FREEZE_FS: {
2051 struct mount *mp;
2052
2053 mp = vnode_mount(vp);
2054 hfsmp = VFSTOHFS(mp);
2055
2056 if (!(hfsmp->jnl))
2057 return (ENOTSUP);
2058
2059 vfsp = vfs_statfs(mp);
2060
2061 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2062 !kauth_cred_issuser(cred))
2063 return (EACCES);
2064
2065 return hfs_freeze(hfsmp);
2066 }
2067
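/* Thaw a previously frozen file system; same ownership check as freeze. */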
2068 case F_THAW_FS: {
2069 vfsp = vfs_statfs(vnode_mount(vp));
2070 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2071 !kauth_cred_issuser(cred))
2072 return (EACCES);
2073
2074 return hfs_thaw(hfsmp, current_proc());
2075 }
2076
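/*
 * Bulk access check: evaluate access for a batch of items in one call,
 * using the 32-bit or 64-bit argument layout as appropriate.
 */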
2077 case HFS_EXT_BULKACCESS_FSCTL: {
2078 int size;
2079 #if CONFIG_HFS_STD
2080 if (hfsmp->hfs_flags & HFS_STANDARD) {
2081 return EINVAL;
2082 }
2083 #endif
2084
2085 if (is64bit) {
2086 size = sizeof(struct user64_ext_access_t);
2087 } else {
2088 size = sizeof(struct user32_ext_access_t);
2089 }
2090
2091 return do_bulk_access_check(hfsmp, vp, ap, size, context);
2092 }
2093
2094 case HFS_SET_XATTREXTENTS_STATE: {
2095 int state;
2096
2097 if (ap->a_data == NULL) {
2098 return (EINVAL);
2099 }
2100
2101 state = *(int *)ap->a_data;
2102
2103 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2104 return (EROFS);
2105 }
2106
2107 /* The super-user can enable or disable extent-based extended
2108 * attribute support on a volume.
2109 * Note: Starting with Mac OS X 10.7, extent-based extended attributes
2110 * are enabled by default, so any change is transient and lasts only
2111 * until the volume is remounted.
2112 */
2113 if (!kauth_cred_issuser(kauth_cred_get())) {
2114 return (EPERM);
2115 }
2116 if (state == 0 || state == 1)
2117 return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
2118 else
2119 return (EINVAL);
2120 }
2121
2122 case F_SETSTATICCONTENT: {
2123 int error;
2124 int enable_static = 0;
2125 struct cnode *cp = NULL;
2126 /*
2127 * lock the cnode, decorate the cnode flag, and bail out.
2128 * VFS should have already authenticated the caller for us.
2129 */
2130
2131 if (ap->a_data) {
2132 /*
2133 * Note that even though ap->a_data is of type caddr_t,
2134 * the fcntl layer at the syscall handler will pass in NULL
2135 * or 1 depending on what the argument supplied to the fcntl
2136 * was. So it is in fact correct to check the ap->a_data
2137 * argument for zero or non-zero value when deciding whether or not
2138 * to enable the static bit in the cnode.
2139 */
2140 enable_static = 1;
2141 }
2142 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2143 return EROFS;
2144 }
2145 cp = VTOC(vp);
2146
2147 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2148 if (error == 0) {
2149 if (enable_static) {
2150 cp->c_flag |= C_SSD_STATIC;
2151 }
2152 else {
2153 cp->c_flag &= ~C_SSD_STATIC;
2154 }
2155 hfs_unlock (cp);
2156 }
2157 return error;
2158 }
2159
2160 case F_SET_GREEDY_MODE: {
2161 int error;
2162 int enable_greedy_mode = 0;
2163 struct cnode *cp = NULL;
2164 /*
2165 * lock the cnode, decorate the cnode flag, and bail out.
2166 * VFS should have already authenticated the caller for us.
2167 */
2168
2169 if (ap->a_data) {
2170 /*
2171 * Note that even though ap->a_data is of type caddr_t,
2172 * the fcntl layer at the syscall handler will pass in NULL
2173 * or 1 depending on what the argument supplied to the fcntl
2174 * was. So it is in fact correct to check the ap->a_data
2175 * argument for zero or non-zero value when deciding whether or not
2176 * to enable the greedy mode bit in the cnode.
2177 */
2178 enable_greedy_mode = 1;
2179 }
2180 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2181 return EROFS;
2182 }
2183 cp = VTOC(vp);
2184
2185 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2186 if (error == 0) {
2187 if (enable_greedy_mode) {
2188 cp->c_flag |= C_SSD_GREEDY_MODE;
2189 }
2190 else {
2191 cp->c_flag &= ~C_SSD_GREEDY_MODE;
2192 }
2193 hfs_unlock (cp);
2194 }
2195 return error;
2196 }
2197
2198 case F_SETIOTYPE: {
2199 int error;
2200 uint32_t iotypeflag = 0;
2201
2202 struct cnode *cp = NULL;
2203 /*
2204 * lock the cnode, decorate the cnode flag, and bail out.
2205 * VFS should have already authenticated the caller for us.
2206 */
2207
2208 if (ap->a_data == NULL) {
2209 return EINVAL;
2210 }
2211
2212 /*
2213 * Note that even though ap->a_data is of type caddr_t, we
2214 * can only use 32 bits of flag values.
2215 */
2216 iotypeflag = (uint32_t) ap->a_data;
2217 switch (iotypeflag) {
2218 case F_IOTYPE_ISOCHRONOUS:
2219 break;
2220 default:
2221 return EINVAL;
2222 }
2223
2224
2225 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2226 return EROFS;
2227 }
2228 cp = VTOC(vp);
2229
2230 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2231 if (error == 0) {
2232 switch (iotypeflag) {
2233 case F_IOTYPE_ISOCHRONOUS:
2234 cp->c_flag |= C_IO_ISOCHRONOUS;
2235 break;
2236 default:
2237 break;
2238 }
2239 hfs_unlock (cp);
2240 }
2241 return error;
2242 }
2243
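/*
 * Convert a file to decmpfs-compressed state: verify the caller's
 * generation count, set UF_COMPRESSED, and truncate the now-stale
 * uncompressed data fork.
 */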
2244 case F_MAKECOMPRESSED: {
2245 int error = 0;
2246 uint32_t gen_counter;
2247 struct cnode *cp = NULL;
2248 int reset_decmp = 0;
2249
2250 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2251 return EROFS;
2252 }
2253
2254 /*
2255 * acquire & lock the cnode.
2256 * VFS should have already authenticated the caller for us.
2257 */
2258
2259 if (ap->a_data) {
2260 /*
2261 * Cast the pointer into a uint32_t so we can extract the
2262 * supplied generation counter.
2263 */
2264 gen_counter = *((uint32_t*)ap->a_data);
2265 }
2266 else {
2267 return EINVAL;
2268 }
2269
2270 #if HFS_COMPRESSION
2271 cp = VTOC(vp);
2272 /* Grab truncate lock first; we may truncate the file */
2273 hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2274
2275 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2276 if (error) {
2277 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2278 return error;
2279 }
2280
2281 /* Are there any other usecounts/FDs? */
2282 if (vnode_isinuse(vp, 1)) {
2283 hfs_unlock(cp);
2284 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2285 return EBUSY;
2286 }
2287
2288 /* Now that we have the cnode locked down, validate the arguments */
2289 if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
2290 /* EINVAL if you are trying to manipulate an IMMUTABLE file */
2291 hfs_unlock(cp);
2292 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2293 return EINVAL;
2294 }
2295
2296 if ((hfs_get_gencount (cp)) == gen_counter) {
2297 /*
2298 * OK, the gen_counter matched. Go for it:
2299 * Toggle state bits, truncate file, and suppress mtime update
2300 */
2301 reset_decmp = 1;
2302 cp->c_bsdflags |= UF_COMPRESSED;
2303
2304 error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES,
2305 ap->a_context);
2306 }
2307 else {
2308 error = ESTALE;
2309 }
2310
2311 /* Unlock the cnode before executing decmpfs; it may need to get an EA */
2312 hfs_unlock(cp);
2313
2314 /*
2315 * Reset the decmp state while still holding the truncate lock. We need to
2316 * serialize here against a listxattr on this node which may occur at any
2317 * time.
2318 *
2319 * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed,
2320 * that will still potentially require getting the com.apple.decmpfs EA. If the
2321 * EA is required, then we can't hold the cnode lock, because the getxattr call is
2322 * generic(through VFS), and can't pass along any info telling it that we're already
2323 * holding it (the lock). If we don't serialize, then we risk listxattr stopping
2324 * and trying to fill in the hfs_file_is_compressed info during the callback
2325 * operation, which will result in deadlock against the b-tree node.
2326 *
2327 * So, to serialize against listxattr (which will grab buf_t meta references on
2328 * the b-tree blocks), we hold the truncate lock as we're manipulating the
2329 * decmpfs payload.
2330 */
2331 if ((reset_decmp) && (error == 0)) {
2332 decmpfs_cnode *dp = VTOCMP (vp);
2333 if (dp != NULL) {
2334 decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
2335 }
2336
2337 /* Initialize the decmpfs node as needed */
2338 (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */
2339 }
2340
2341 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2342
2343 #endif
2344 return error;
2345 }
2346
2347 case F_SETBACKINGSTORE: {
2348
2349 int error = 0;
2350
2351 /*
2352 * See comment in F_SETSTATICCONTENT re: using
2353 * a null check for a_data
2354 */
2355 if (ap->a_data) {
2356 error = hfs_set_backingstore (vp, 1);
2357 }
2358 else {
2359 error = hfs_set_backingstore (vp, 0);
2360 }
2361
2362 return error;
2363 }
2364
2365 case F_GETPATH_MTMINFO: {
2366 int error = 0;
2367
2368 int *data = (int*) ap->a_data;
2369
2370 /* Ask if this is a backingstore vnode */
2371 error = hfs_is_backingstore (vp, data);
2372
2373 return error;
2374 }
2375
2376 case F_FULLFSYNC: {
2377 int error;
2378
2379 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2380 return (EROFS);
2381 }
2382 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2383 if (error == 0) {
2384 error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_FULL, p);
2385 hfs_unlock(VTOC(vp));
2386 }
2387
2388 return error;
2389 }
2390
2391 case F_BARRIERFSYNC: {
2392 int error;
2393
2394 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2395 return (EROFS);
2396 }
2397 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2398 if (error == 0) {
2399 error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_BARRIER, p);
2400 hfs_unlock(VTOC(vp));
2401 }
2402
2403 return error;
2404 }
2405
2406 case F_CHKCLEAN: {
2407 register struct cnode *cp;
2408 int error;
2409
2410 if (!vnode_isreg(vp))
2411 return EINVAL;
2412
2413 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2414 if (error == 0) {
2415 cp = VTOC(vp);
2416 /*
2417 * Used by the regression test to determine if
2418 * all the dirty pages (via write) have been cleaned
2419 * after a call to 'fsync'.
2420 */
2421 error = is_file_clean(vp, VTOF(vp)->ff_size);
2422 hfs_unlock(cp);
2423 }
2424 return (error);
2425 }
2426
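/*
 * Issue a read-ahead advisory for the byte range described by the
 * radvisory argument, bounded by the (possibly uncompressed) file size.
 */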
2427 case F_RDADVISE: {
2428 register struct radvisory *ra;
2429 struct filefork *fp;
2430 int error;
2431
2432 if (!vnode_isreg(vp))
2433 return EINVAL;
2434
2435 ra = (struct radvisory *)(ap->a_data);
2436 fp = VTOF(vp);
2437
2438 /* Protect against a size change. */
2439 hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2440
2441 #if HFS_COMPRESSION
2442 if (compressed) {
2443 if (uncompressed_size == -1) {
2444 /* fetching the uncompressed size failed above, so return the error */
2445 error = decmpfs_error;
2446 } else if (ra->ra_offset >= uncompressed_size) {
2447 error = EFBIG;
2448 } else {
2449 error = advisory_read(vp, uncompressed_size, ra->ra_offset, ra->ra_count);
2450 }
2451 } else
2452 #endif /* HFS_COMPRESSION */
2453 if (ra->ra_offset >= fp->ff_size) {
2454 error = EFBIG;
2455 } else {
2456 error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
2457 }
2458
2459 hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
2460 return (error);
2461 }
2462
2463 case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */
2464 {
2465 if (is64bit) {
2466 *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2467 }
2468 else {
2469 *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2470 }
2471 return 0;
2472 }
2473
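/* Spotlight support: report the volume's mount time and last-mounted mtime. */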
2474 case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
2475 *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
2476 break;
2477
2478 case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
2479 *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
2480 break;
2481
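/*
 * Free-space notification thresholds.  The levels must stay ordered:
 * danger limit < warning limit < desired level.
 */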
2482 case HFS_FSCTL_GET_VERY_LOW_DISK:
2483 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
2484 break;
2485
2486 case HFS_FSCTL_SET_VERY_LOW_DISK:
2487 if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
2488 return EINVAL;
2489 }
2490
2491 hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
2492 break;
2493
2494 case HFS_FSCTL_GET_LOW_DISK:
2495 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
2496 break;
2497
2498 case HFS_FSCTL_SET_LOW_DISK:
2499 if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
2500 || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
2501
2502 return EINVAL;
2503 }
2504
2505 hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
2506 break;
2507
2508 case HFS_FSCTL_GET_DESIRED_DISK:
2509 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
2510 break;
2511
2512 case HFS_FSCTL_SET_DESIRED_DISK:
2513 if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
2514 return EINVAL;
2515 }
2516
2517 hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
2518 break;
2519
2520 case HFS_VOLUME_STATUS:
2521 *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
2522 break;
2523
2524 case HFS_SET_BOOT_INFO:
2525 if (!vnode_isvroot(vp))
2526 return(EINVAL);
2527 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
2528 return(EACCES); /* must be superuser or owner of filesystem */
2529 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2530 return (EROFS);
2531 }
2532 hfs_lock_mount (hfsmp);
2533 bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
2534 /* Null out the cached UUID, to be safe */
2535 uuid_clear (hfsmp->hfs_full_uuid);
2536 hfs_unlock_mount (hfsmp);
2537 (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT);
2538 break;
2539
2540 case HFS_GET_BOOT_INFO:
2541 if (!vnode_isvroot(vp))
2542 return(EINVAL);
2543 hfs_lock_mount (hfsmp);
2544 bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
2545 hfs_unlock_mount(hfsmp);
2546 break;
2547
2548 case HFS_MARK_BOOT_CORRUPT:
2549 /* Mark the boot volume corrupt by setting
2550 * kHFSVolumeInconsistentBit in the volume header. This will
2551 * force fsck_hfs on next mount.
2552 */
2553 if (!kauth_cred_issuser(kauth_cred_get())) {
2554 return EACCES;
2555 }
2556
2557 /* Allowed only on the root vnode of the boot volume */
2558 if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
2559 !vnode_isvroot(vp)) {
2560 return EINVAL;
2561 }
2562 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2563 return (EROFS);
2564 }
2565 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
2566 hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED);
2567 break;
2568
2569 case HFS_FSCTL_GET_JOURNAL_INFO:
2570 jip = (struct hfs_journal_info*)ap->a_data;
2571
2572 if (vp == NULLVP)
2573 return EINVAL;
2574
2575 if (hfsmp->jnl == NULL) {
2576 jnl_start = 0;
2577 jnl_size = 0;
2578 } else {
2579 jnl_start = hfs_blk_to_bytes(hfsmp->jnl_start, hfsmp->blockSize) + hfsmp->hfsPlusIOPosOffset;
2580 jnl_size = hfsmp->jnl_size;
2581 }
2582
2583 jip->jstart = jnl_start;
2584 jip->jsize = jnl_size;
2585 break;
2586
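/* Set or clear the C_ALWAYS_ZEROFILL flag on this cnode. */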
2587 case HFS_SET_ALWAYS_ZEROFILL: {
2588 struct cnode *cp = VTOC(vp);
2589
2590 if (*(int *)ap->a_data) {
2591 cp->c_flag |= C_ALWAYS_ZEROFILL;
2592 } else {
2593 cp->c_flag &= ~C_ALWAYS_ZEROFILL;
2594 }
2595 break;
2596 }
2597
2598 case HFS_DISABLE_METAZONE: {
2599 /* Only root can disable metadata zone */
2600 if (!kauth_cred_issuser(kauth_cred_get())) {
2601 return EACCES;
2602 }
2603 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2604 return (EROFS);
2605 }
2606
2607 /* Disable metadata zone now */
2608 (void) hfs_metadatazone_init(hfsmp, true);
2609 printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
2610 break;
2611 }
2612
2613
2614 case HFS_FSINFO_METADATA_BLOCKS: {
2615 int error;
2616 struct hfsinfo_metadata *hinfo;
2617
2618 hinfo = (struct hfsinfo_metadata *)ap->a_data;
2619
2620 /* Get information about number of metadata blocks */
2621 error = hfs_getinfo_metadata_blocks(hfsmp, hinfo);
2622 if (error) {
2623 return error;
2624 }
2625
2626 break;
2627 }
2628
2629 case HFS_GET_FSINFO: {
2630 hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;
2631
2632 /* Only root is allowed to get fsinfo */
2633 if (!kauth_cred_issuser(kauth_cred_get())) {
2634 return EACCES;
2635 }
2636
2637 /*
2638 * Make sure that the caller's version number matches with
2639 * the kernel's version number. This will make sure that
2640 * if the structures being read/written into are changed
2641 * by the kernel, the caller will not read incorrect data.
2642 *
2643 * The first three fields --- request_type, version and
2644 * flags are same for all the hfs_fsinfo structures, so
2645 * we can access the version number by assuming any
2646 * structure for now.
2647 */
2648 if (fsinfo->header.version != HFS_FSINFO_VERSION) {
2649 return ENOTSUP;
2650 }
2651
2652 /* Make sure that the current file system is not marked inconsistent */
2653 if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
2654 return EIO;
2655 }
2656
2657 return hfs_get_fsinfo(hfsmp, ap->a_data);
2658 }
2659
2660 case HFS_CS_FREESPACE_TRIM: {
2661 int error = 0;
2662 int lockflags = 0;
2663
2664 /* Only root allowed */
2665 if (!kauth_cred_issuser(kauth_cred_get())) {
2666 return EACCES;
2667 }
2668
2669 /*
2670 * This core functionality is similar to hfs_scan_blocks().
2671 * The main difference is that hfs_scan_blocks() is called
2672 * as part of mount where we are assured that the journal is
2673 * empty to start with. This fcntl() can be called on a
2674 * mounted volume, therefore it has to flush the content of
2675 * the journal as well as ensure the state of summary table.
2676 *
2677 * This fcntl scans over the entire allocation bitmap,
2678 * creates a list of all the free blocks, and issues TRIM
2679 * down to the underlying device. This can take a long time
2680 * as it can generate up to 512MB of read I/O.
2681 */
2682
2683 if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
2684 error = hfs_init_summary(hfsmp);
2685 if (error) {
2686 printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN);
2687 return error;
2688 }
2689 }
2690
2691 /*
2692 * The journal maintains a list of recently deallocated blocks to
2693 * issue DKIOCUNMAPs when the corresponding journal transaction is
2694 * flushed to the disk. To avoid any race conditions, we only
2695 * want one active trim list and only one thread issuing DKIOCUNMAPs.
2696 * Therefore we make sure that the journal trim list is sync'ed,
2697 * empty, and not modifiable for the duration of our scan.
2698 *
2699 * Take the journal lock before flushing the journal to the disk.
2700 * We keep holding the journal lock until we acquire the
2701 * bitmap lock, to make sure that no new journal transactions can
2702 * start. This guarantees that the journal trim list is not
2703 * modified after the journal flush and before we get the bitmap lock.
2704 * We can release the journal lock after we acquire the bitmap
2705 * lock as it will prevent any further block deallocations.
2706 */
2707 hfs_journal_lock(hfsmp);
2708
2709 /* Flush the journal and wait for all I/Os to finish up */
2710 error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META);
2711 if (error) {
2712 hfs_journal_unlock(hfsmp);
2713 return error;
2714 }
2715
2716 /* Take bitmap lock to ensure it is not being modified */
2717 lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
2718
2719 /* Release the journal lock */
2720 hfs_journal_unlock(hfsmp);
2721
2722 /*
2723 * ScanUnmapBlocks reads the bitmap in large blocks
2724 * (up to 1MB), unlike the runtime which reads the bitmap
2725 * in 4K blocks. This can cause buf_t collisions
2726 * and potential data corruption. To avoid this, we
2727 * invalidate all the existing buffers associated with
2728 * the bitmap vnode before scanning it.
2729 *
2730 * Note: ScanUnmapBlocks() cleans up all the buffers
2731 * after itself, so there won't be any large buffers left
2732 * for us to clean up after it returns.
2733 */
2734 error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
2735 if (error) {
2736 hfs_systemfile_unlock(hfsmp, lockflags);
2737 return error;
2738 }
2739
2740 /* Traverse bitmap and issue DKIOCUNMAPs */
2741 error = ScanUnmapBlocks(hfsmp);
2742 hfs_systemfile_unlock(hfsmp, lockflags);
2743 if (error) {
2744 return error;
2745 }
2746
2747 break;
2748 }
2749
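/*
 * Mark or unmark this file as a fast-device (hotfile) candidate,
 * unpinning any already-pinned blocks when candidacy is cleared.
 */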
2750 case HFS_SET_HOTFILE_STATE: {
2751 int error;
2752 struct cnode *cp = VTOC(vp);
2753 uint32_t hf_state = *((uint32_t*)ap->a_data);
2754 uint32_t num_unpinned = 0;
2755
2756 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2757 if (error) {
2758 return error;
2759 }
2760
2761 // printf("hfs: setting hotfile state %d on %s\n", hf_state, vp->v_name);
2762 if (hf_state == HFS_MARK_FASTDEVCANDIDATE) {
2763 vnode_setfastdevicecandidate(vp);
2764
2765 cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask;
2766 cp->c_attr.ca_recflags &= ~kHFSDoNotFastDevPinMask;
2767 cp->c_flag |= C_MODIFIED;
2768 } else if (hf_state == HFS_UNMARK_FASTDEVCANDIDATE || hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
2769 vnode_clearfastdevicecandidate(vp);
2770 hfs_removehotfile(vp);
2771
2772 if (cp->c_attr.ca_recflags & kHFSFastDevPinnedMask) {
2773 hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &num_unpinned);
2774 }
2775
2776 if (hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
2777 cp->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask;
2778 }
2779 cp->c_attr.ca_recflags &= ~(kHFSFastDevCandidateMask|kHFSFastDevPinnedMask);
2780 cp->c_flag |= C_MODIFIED;
2781
2782 } else {
2783 error = EINVAL;
2784 }
2785
2786 if (num_unpinned != 0) {
2787 lck_mtx_lock(&hfsmp->hfc_mutex);
2788 hfsmp->hfs_hotfile_freeblks += num_unpinned;
2789 lck_mtx_unlock(&hfsmp->hfc_mutex);
2790 }
2791
2792 hfs_unlock(cp);
2793 return error;
2794 }
2795
2796 case HFS_REPIN_HOTFILE_STATE: {
2797 int error=0;
2798 uint32_t repin_what = *((uint32_t*)ap->a_data);
2799
2800 /* Only root allowed */
2801 if (!kauth_cred_issuser(kauth_cred_get())) {
2802 return EACCES;
2803 }
2804
2805 if (!(hfsmp->hfs_flags & (HFS_CS_METADATA_PIN | HFS_CS_HOTFILE_PIN))) {
2806 // this system is neither regular Fusion nor Cooperative Fusion,
2807 // so this fsctl makes no sense.
2808 return EINVAL;
2809 }
2810
2811 //
2812 // After converting a CoreStorage volume to be encrypted, the
2813 // extents could have moved around underneath us. This call
2814 // allows corestoraged to re-pin everything that should be
2815 // pinned (it would happen on the next reboot too but that could
2816 // be a long time away).
2817 //
2818 if ((repin_what & HFS_REPIN_METADATA) && (hfsmp->hfs_flags & HFS_CS_METADATA_PIN)) {
2819 hfs_pin_fs_metadata(hfsmp);
2820 }
2821 if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) {
2822 hfs_repin_hotfiles(hfsmp);
2823 }
2824 if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_SWAPFILE_PIN)) {
2825 //XXX Swapfiles (marked SWAP_PINNED) may have moved too.
2826 //XXX Do we care? They have a more transient/dynamic nature/lifetime.
2827 }
2828
2829 return error;
2830 }
2831
2832 #if HFS_CONFIG_KEY_ROLL
2833
2834 case HFS_KEY_ROLL: {
2835 if (!kauth_cred_issuser(kauth_cred_get()))
2836 return EACCES;
2837
2838 hfs_key_roll_args_t *args = (hfs_key_roll_args_t *)ap->a_data;
2839
2840 return hfs_key_roll_op(ap->a_context, ap->a_vp, args);
2841 }
2842
2843 case HFS_GET_KEY_AUTO_ROLL: {
2844 if (!kauth_cred_issuser(kauth_cred_get()))
2845 return EACCES;
2846
2847 hfs_key_auto_roll_args_t *args = (hfs_key_auto_roll_args_t *)ap->a_data;
2848 if (args->api_version != HFS_KEY_AUTO_ROLL_API_VERSION_1)
2849 return ENOTSUP;
2850 args->flags = (ISSET(hfsmp->cproot_flags, CP_ROOT_AUTO_ROLL_OLD_CLASS_GENERATION)
2851 ? HFS_KEY_AUTO_ROLL_OLD_CLASS_GENERATION : 0);
2852 args->min_key_os_version = hfsmp->hfs_auto_roll_min_key_os_version;
2853 args->max_key_os_version = hfsmp->hfs_auto_roll_max_key_os_version;
2854 break;
2855 }
2856
2857 case HFS_SET_KEY_AUTO_ROLL: {
2858 if (!kauth_cred_issuser(kauth_cred_get()))
2859 return EACCES;
2860
2861 hfs_key_auto_roll_args_t *args = (hfs_key_auto_roll_args_t *)ap->a_data;
2862 if (args->api_version != HFS_KEY_AUTO_ROLL_API_VERSION_1)
2863 return ENOTSUP;
2864 return cp_set_auto_roll(hfsmp, args);
2865 }
2866
2867 #endif // HFS_CONFIG_KEY_ROLL
2868
2869 #if CONFIG_PROTECT
2870 case F_TRANSCODEKEY:
2871 /*
2872 * This API is only supported when called from within the kernel, so
2873 * a_fflag must be set to 1 (it's not possible to get here
2874 * with a_fflag set to 1 via fsctl).
2875 */
2876 if (ap->a_fflag != 1)
2877 return ENOTTY;
2878 return cp_vnode_transcode(vp, (cp_key_t *)ap->a_data);
2879
2880 case F_GETPROTECTIONLEVEL:
2881 return cp_get_root_major_vers (vp, (uint32_t *)ap->a_data);
2882
2883 case F_GETDEFAULTPROTLEVEL:
2884 return cp_get_default_level(vp, (uint32_t *)ap->a_data);
2885 #endif // CONFIG_PROTECT
2886
2887 case FIOPINSWAP:
2888 return hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT | HFS_DATALESS_PIN,
2889 NULL);
2890
2891 default:
2892 return (ENOTTY);
2893 }
2894
2895 return 0;
2896 }
2897
2898 /*
2899 * select
2900 */
2901 int
2902 hfs_vnop_select(__unused struct vnop_select_args *ap)
2903 /*
2904 struct vnop_select_args {
2905 vnode_t a_vp;
2906 int a_which;
2907 int a_fflags;
2908 void *a_wql;
2909 vfs_context_t a_context;
2910 };
2911 */
2912 {
2913 /*
2914 * We should really check to see if I/O is possible.
2915 */
2916 return (1);
2917 }
2918
2919 /*
2920 * Converts a logical block number to a physical block, and optionally returns
2921 * the number of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
2922 * The physical block number is based on the device block size, currently 512 bytes.
2923 * The block run is returned in logical blocks, and is the REMAINING number of blocks.
2924 */
2925 int
2926 hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
2927 {
2928 struct filefork *fp = VTOF(vp);
2929 struct hfsmount *hfsmp = VTOHFS(vp);
2930 int retval = E_NONE;
2931 u_int32_t logBlockSize;
2932 size_t bytesContAvail = 0;
2933 off_t blockposition;
2934 int lockExtBtree;
2935 int lockflags = 0;
2936
2937 /*
2938 * Check for underlying vnode requests and ensure that logical
2939 * to physical mapping is requested.
2940 */
2941 if (vpp != NULL)
2942 *vpp = hfsmp->hfs_devvp;
2943 if (bnp == NULL)
2944 return (0);
2945
2946 logBlockSize = GetLogicalBlockSize(vp);
2947 blockposition = (off_t)bn * logBlockSize;
2948
2949 lockExtBtree = overflow_extents(fp);
2950
2951 if (lockExtBtree)
2952 lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2953
2954 retval = MacToVFSError(
2955 MapFileBlockC (HFSTOVCB(hfsmp),
2956 (FCB*)fp,
2957 MAXPHYSIO,
2958 blockposition,
2959 bnp,
2960 &bytesContAvail));
2961
2962 if (lockExtBtree)
2963 hfs_systemfile_unlock(hfsmp, lockflags);
2964
2965 if (retval == E_NONE) {
2966 /* Figure out how many read ahead blocks there are */
2967 if (runp != NULL) {
2968 if (can_cluster(logBlockSize)) {
2969 /* Make sure this result never goes negative: */
2970 *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
2971 } else {
2972 *runp = 0;
2973 }
2974 }
2975 }
2976 return (retval);
2977 }
2978
2979 /*
2980 * Convert logical block number to file offset.
2981 */
2982 int
2983 hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
2984 /*
2985 struct vnop_blktooff_args {
2986 vnode_t a_vp;
2987 daddr64_t a_lblkno;
2988 off_t *a_offset;
2989 };
2990 */
2991 {
2992 if (ap->a_vp == NULL)
2993 return (EINVAL);
2994 *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
2995
2996 return(0);
2997 }
2998
2999 /*
3000 * Convert file offset to logical block number.
3001 */
3002 int
3003 hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
3004 /*
3005 struct vnop_offtoblk_args {
3006 vnode_t a_vp;
3007 off_t a_offset;
3008 daddr64_t *a_lblkno;
3009 };
3010 */
3011 {
3012 if (ap->a_vp == NULL)
3013 return (EINVAL);
3014 *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
3015
3016 return(0);
3017 }
3018
3019 /*
3020 * Map file offset to physical block number.
3021 *
3022 * If this function is called for write operation, and if the file
3023 * had virtual blocks allocated (delayed allocation), real blocks
3024 * are allocated by calling ExtendFileC().
3025 *
3026 * If this function is called for read operation, and if the file
3027 * had virtual blocks allocated (delayed allocation), no change
3028 * to the size of file is done, and if required, rangelist is
3029 * searched for mapping.
3030 *
3031 * System file cnodes are expected to be locked (shared or exclusive).
3032 *
3033 * -- INVALID RANGES --
3034 *
3035 * Invalid ranges are used to keep track of where we have extended a
3036 * file, but have not yet written that data to disk. In the past we
3037 * would clear up the invalid ranges as we wrote to those areas, but
3038 * before data was actually flushed to disk. The problem with that
3039 * approach is that the data can be left in the cache and is therefore
3040 * still not valid on disk. So now we clear up the ranges here, when
3041 * the flags field has VNODE_WRITE set, indicating a write is about to
3042 * occur. This isn't ideal (ideally we want to clear them up when
3043 * we know the data has been successfully written), but it's the best we
3044 * can do.
3045 *
3046 * For reads, we use the invalid ranges here in block map to indicate
3047 * to the caller that the data should be zeroed (a_bpn == -1). We
3048 * have to be careful about what ranges we return to the cluster code.
3049 * Currently the cluster code can only handle non-rounded values for
3050 * the EOF; it cannot handle funny sized ranges in the middle of the
3051 * file (the main problem is that it sends down odd sized I/Os to the
3052 * disk). Our code currently works because whilst the very first
3053 * offset and the last offset in the invalid ranges are not aligned,
3054 * gaps in the invalid ranges between the first and last have to be
3055 * aligned (because we always write page sized blocks). For example,
3056 * consider this arrangement:
3057 *
3058 * +-------------+-----+-------+------+
3059 * | |XXXXX| |XXXXXX|
3060 * +-------------+-----+-------+------+
3061 * a b c d
3062 *
3063 * This shows two invalid ranges <a, b> and <c, d>. Whilst a and d
3064 * are not necessarily aligned, b and c *must* be.
3065 *
3066 * Zero-filling occurs in a number of ways:
3067 *
3068 * 1. When a read occurs and we return with a_bpn == -1.
3069 *
3070 * 2. When hfs_fsync or hfs_filedone calls hfs_flush_invalid_ranges
3071 * which will cause us to iterate over the ranges bringing in
3072 * pages that are not present in the cache and zeroing them. Any
3073 * pages that are already in the cache are left untouched. Note
3074 * that hfs_fsync does not always flush invalid ranges.
3075 *
3076 * 3. When we extend a file we zero out from the old EOF to the end
3077 * of the page. It would be nice if we didn't have to do this if
3078 * the page wasn't present (and could defer it), but because of
3079 * the problem described above, we have to.
3080 *
3081 * The invalid ranges are also used to restrict the size that we write
3082 * out on disk: see hfs_prepare_fork_for_update.
3083 *
3084 * Note that invalid ranges are ignored when neither the VNODE_READ or
3085 * the VNODE_WRITE flag is specified. This is useful for the
3086 * F_LOG2PHYS* fcntls which are not interested in invalid ranges: they
3087 * just want to know whether blocks are physically allocated or not.
3088 */
3089 int
3090 hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
3091 /*
3092 struct vnop_blockmap_args {
3093 vnode_t a_vp;
3094 off_t a_foffset;
3095 size_t a_size;
3096 daddr64_t *a_bpn;
3097 size_t *a_run;
3098 void *a_poff;
3099 int a_flags;
3100 vfs_context_t a_context;
3101 };
3102 */
3103 {
3104 struct vnode *vp = ap->a_vp;
3105 struct cnode *cp;
3106 struct filefork *fp;
3107 struct hfsmount *hfsmp;
3108 size_t bytesContAvail = ap->a_size;
3109 int retval = E_NONE;
3110 int syslocks = 0;
3111 int lockflags = 0;
3112 struct rl_entry *invalid_range;
3113 enum rl_overlaptype overlaptype;
3114 int started_tr = 0;
3115 int tooklock = 0;
3116
3117 #if HFS_COMPRESSION
3118 if (VNODE_IS_RSRC(vp)) {
3119 /* allow blockmaps to the resource fork */
3120 } else {
3121 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
3122 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
3123 switch(state) {
3124 case FILE_IS_COMPRESSED:
3125 return ENOTSUP;
3126 case FILE_IS_CONVERTING:
3127 /* if FILE_IS_CONVERTING, we allow blockmap */
3128 break;
3129 default:
3130 printf("invalid state %d for compressed file\n", state);
3131 /* fall through */
3132 }
3133 }
3134 }
3135 #endif /* HFS_COMPRESSION */
3136
3137 /* Do not allow blockmap operation on a directory */
3138 if (vnode_isdir(vp)) {
3139 return (ENOTSUP);
3140 }
3141
3142 /*
3143 * Check for underlying vnode requests and ensure that logical
3144 * to physical mapping is requested.
3145 */
3146 if (ap->a_bpn == NULL)
3147 return (0);
3148
3149 hfsmp = VTOHFS(vp);
3150 cp = VTOC(vp);
3151 fp = VTOF(vp);
3152
3153 if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
3154 if (cp->c_lockowner != current_thread()) {
3155 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3156 tooklock = 1;
3157 }
3158
3159 // For reads, check the invalid ranges
3160 if (ISSET(ap->a_flags, VNODE_READ)) {
3161 if (ap->a_foffset >= fp->ff_size) {
3162 retval = ERANGE;
3163 goto exit;
3164 }
3165
3166 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
3167 ap->a_foffset + (off_t)bytesContAvail - 1,
3168 &invalid_range);
3169 switch(overlaptype) {
3170 case RL_MATCHINGOVERLAP:
3171 case RL_OVERLAPCONTAINSRANGE:
3172 case RL_OVERLAPSTARTSBEFORE:
3173 /* There's no valid block for this byte offset */
3174 *ap->a_bpn = (daddr64_t)-1;
3175 /* There's no point limiting the amount to be returned
3176 * if the invalid range that was hit extends all the way
3177 * to the EOF (i.e. there's no valid bytes between the
3178 * end of this range and the file's EOF):
3179 */
3180 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3181 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3182 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3183 }
3184
3185 retval = 0;
3186 goto exit;
3187
3188 case RL_OVERLAPISCONTAINED:
3189 case RL_OVERLAPENDSAFTER:
3190 /* The range of interest hits an invalid block before the end: */
3191 if (invalid_range->rl_start == ap->a_foffset) {
3192 /* There's actually no valid information to be had starting here: */
3193 *ap->a_bpn = (daddr64_t)-1;
3194 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3195 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3196 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3197 }
3198
3199 retval = 0;
3200 goto exit;
3201 } else {
3202 /*
3203 * Sadly, the lower layers don't like us to
3204 * return unaligned ranges, so we skip over
3205 * any invalid ranges here that are less than
3206 * a page: zeroing of those bits is not our
3207 * responsibility (it's dealt with elsewhere).
3208 */
3209 do {
3210 off_t rounded_start = round_page_64(invalid_range->rl_start);
3211 if ((off_t)bytesContAvail < rounded_start - ap->a_foffset)
3212 break;
3213 if (rounded_start < invalid_range->rl_end + 1) {
3214 bytesContAvail = rounded_start - ap->a_foffset;
3215 break;
3216 }
3217 } while ((invalid_range = TAILQ_NEXT(invalid_range,
3218 rl_link)));
3219 }
3220 break;
3221
3222 case RL_NOOVERLAP:
3223 break;
3224 } // switch
3225 }
3226 }
3227
3228 #if CONFIG_PROTECT
3229 if (cp->c_cpentry) {
3230 const int direction = (ISSET(ap->a_flags, VNODE_WRITE)
3231 ? VNODE_WRITE : VNODE_READ);
3232
3233 cp_io_params_t io_params;
3234 cp_io_params(hfsmp, cp->c_cpentry,
3235 off_rsrc_make(ap->a_foffset, VNODE_IS_RSRC(vp)),
3236 direction, &io_params);
3237
3238 if (io_params.max_len < (off_t)bytesContAvail)
3239 bytesContAvail = io_params.max_len;
3240
3241 if (io_params.phys_offset != -1) {
3242 *ap->a_bpn = ((io_params.phys_offset + hfsmp->hfsPlusIOPosOffset)
3243 / hfsmp->hfs_logical_block_size);
3244
3245 retval = 0;
3246 goto exit;
3247 }
3248 }
3249 #endif
3250
3251 retry:
3252
3253 /* Check virtual blocks only when performing write operation */
3254 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
3255 if (hfs_start_transaction(hfsmp) != 0) {
3256 retval = EINVAL;
3257 goto exit;
3258 } else {
3259 started_tr = 1;
3260 }
3261 syslocks = SFL_EXTENTS | SFL_BITMAP;
3262
3263 } else if (overflow_extents(fp)) {
3264 syslocks = SFL_EXTENTS;
3265 }
3266
3267 if (syslocks)
3268 lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
3269
3270 /*
3271 * Check for any delayed allocations.
3272 */
3273 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
3274 int64_t actbytes;
3275 u_int32_t loanedBlocks;
3276
3277 //
3278 // Make sure we have a transaction. It's possible
3279 // that we came in and fp->ff_unallocblocks was zero
3280 // but during the time we blocked acquiring the extents
3281 // btree, ff_unallocblocks became non-zero and so we
3282 // will need to start a transaction.
3283 //
3284 if (started_tr == 0) {
3285 if (syslocks) {
3286 hfs_systemfile_unlock(hfsmp, lockflags);
3287 syslocks = 0;
3288 }
3289 goto retry;
3290 }
3291
3292 /*
3293 * Note: ExtendFileC will release any blocks on loan and
3294 * acquire real blocks. So we ask to extend by zero bytes
3295 * since ExtendFileC will account for the virtual blocks.
3296 */
3297
3298 loanedBlocks = fp->ff_unallocblocks;
3299 retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
3300 kEFAllMask | kEFNoClumpMask, &actbytes);
3301
3302 if (retval) {
3303 fp->ff_unallocblocks = loanedBlocks;
3304 cp->c_blocks += loanedBlocks;
3305 fp->ff_blocks += loanedBlocks;
3306
3307 hfs_lock_mount (hfsmp);
3308 hfsmp->loanedBlocks += loanedBlocks;
3309 hfs_unlock_mount (hfsmp);
3310
3311 hfs_systemfile_unlock(hfsmp, lockflags);
3312 cp->c_flag |= C_MODIFIED;
3313 if (started_tr) {
3314 (void) hfs_update(vp, 0);
3315 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3316
3317 hfs_end_transaction(hfsmp);
3318 started_tr = 0;
3319 }
3320 goto exit;
3321 }
3322 }
3323
3324 retval = MapFileBlockC(hfsmp, (FCB *)fp, bytesContAvail, ap->a_foffset,
3325 ap->a_bpn, &bytesContAvail);
3326 if (syslocks) {
3327 hfs_systemfile_unlock(hfsmp, lockflags);
3328 syslocks = 0;
3329 }
3330
3331 if (retval) {
3332 /* On write, always return error because virtual blocks, if any,
3333 * should have been allocated in ExtendFileC(). We do not
3334 * allocate virtual blocks on read, therefore return error
3335 * only if no virtual blocks are allocated. Otherwise we search
3336 * the rangelist for zero-fills.
3337 */
3338 if ((MacToVFSError(retval) != ERANGE) ||
3339 (ap->a_flags & VNODE_WRITE) ||
3340 ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
3341 goto exit;
3342 }
3343
3344 /* Validate if the start offset is within logical file size */
3345 if (ap->a_foffset >= fp->ff_size) {
3346 goto exit;
3347 }
3348
3349 /*
3350 * At this point, we have encountered a failure during
3351 * MapFileBlockC that resulted in ERANGE, and we are not
3352 * servicing a write, and there are borrowed blocks.
3353 *
3354 * However, the cluster layer will not call blockmap for
3355 * blocks that are borrowed and in-cache. We have to assume
3356 * that because we observed ERANGE being emitted from
3357 * MapFileBlockC, this extent range is not valid on-disk. So
3358 * we treat this as a mapping that needs to be zero-filled
3359 * prior to reading.
3360 */
3361
3362 if (fp->ff_size - ap->a_foffset < (off_t)bytesContAvail)
3363 bytesContAvail = fp->ff_size - ap->a_foffset;
3364
3365 *ap->a_bpn = (daddr64_t) -1;
3366 retval = 0;
3367
3368 goto exit;
3369 }
3370
3371 exit:
3372 if (retval == 0) {
3373 if (ISSET(ap->a_flags, VNODE_WRITE)) {
3374 struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
3375
3376 // See if we might be overlapping invalid ranges...
3377 if (r && (ap->a_foffset + (off_t)bytesContAvail) > r->rl_start) {
3378 /*
3379 * Mark the file as needing an update if we think the
3380 * on-disk EOF has changed.
3381 */
3382 if (ap->a_foffset <= r->rl_start)
3383 SET(cp->c_flag, C_MODIFIED);
3384
3385 /*
3386 * This isn't the ideal place to put this. Ideally, we
3387 * should do something *after* we have successfully
3388 * written to the range, but that's difficult to do
3389 * because we cannot take locks in the callback. At
3390 * present, the cluster code will call us with VNODE_WRITE
3391 * set just before it's about to write the data so we know
3392 * that data is about to be written. If we get an I/O
3393 * error at this point then chances are the metadata
3394 * update to follow will also have an I/O error so the
3395 * risk here is small.
3396 */
3397 rl_remove(ap->a_foffset, ap->a_foffset + bytesContAvail - 1,
3398 &fp->ff_invalidranges);
3399
3400 if (!TAILQ_FIRST(&fp->ff_invalidranges)) {
3401 cp->c_flag &= ~C_ZFWANTSYNC;
3402 cp->c_zftimeout = 0;
3403 }
3404 }
3405 }
3406
3407 if (ap->a_run)
3408 *ap->a_run = bytesContAvail;
3409
3410 if (ap->a_poff)
3411 *(int *)ap->a_poff = 0;
3412 }
3413
3414 if (started_tr) {
3415 hfs_update(vp, TRUE);
3416 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3417 hfs_end_transaction(hfsmp);
3418 started_tr = 0;
3419 }
3420
3421 if (tooklock)
3422 hfs_unlock(cp);
3423
3424 return (MacToVFSError(retval));
3425 }
3426
3427 /*
3428 * prepare and issue the I/O
3429 * buf_strategy knows how to deal
3430 * with requests that require
3431 * fragmented I/Os
3432 */
3433 int
3434 hfs_vnop_strategy(struct vnop_strategy_args *ap)
3435 {
3436 buf_t bp = ap->a_bp;
3437 vnode_t vp = buf_vnode(bp);
3438 int error = 0;
3439
3440 /* Mark buffer as containing static data if cnode flag set */
3441 if (VTOC(vp)->c_flag & C_SSD_STATIC) {
3442 buf_markstatic(bp);
3443 }
3444
3445 /* Mark buffer as containing static data if cnode flag set */
3446 if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
3447 bufattr_markgreedymode(buf_attr(bp));
3448 }
3449
3450 /* mark buffer as containing burst mode data if cnode flag set */
3451 if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) {
3452 bufattr_markisochronous(buf_attr(bp));
3453 }
3454
3455 #if CONFIG_PROTECT
3456 error = cp_handle_strategy(bp);
3457
3458 if (error)
3459 return error;
3460 #endif
3461
3462 error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
3463
3464 return error;
3465 }
3466
3467 int
3468 do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
3469 {
3470 register struct cnode *cp = VTOC(vp);
3471 struct filefork *fp = VTOF(vp);
3472 kauth_cred_t cred = vfs_context_ucred(context);
3473 int retval;
3474 off_t bytesToAdd;
3475 off_t actualBytesAdded;
3476 off_t filebytes;
3477 u_int32_t fileblocks;
3478 int blksize;
3479 struct hfsmount *hfsmp;
3480 int lockflags;
3481 int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);
3482
3483 blksize = VTOVCB(vp)->blockSize;
3484 fileblocks = fp->ff_blocks;
3485 filebytes = (off_t)fileblocks * (off_t)blksize;
3486
3487 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START,
3488 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3489
3490 if (length < 0)
3491 return (EINVAL);
3492
3493 /* This should only happen with a corrupt filesystem */
3494 if ((off_t)fp->ff_size < 0)
3495 return (EINVAL);
3496
3497 if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
3498 return (EFBIG);
3499
3500 hfsmp = VTOHFS(vp);
3501
3502 retval = E_NONE;
3503
3504 /* Files that are changing size are not hot file candidates. */
3505 if (hfsmp->hfc_stage == HFC_RECORDING) {
3506 fp->ff_bytesread = 0;
3507 }
3508
3509 /*
3510 * We cannot just check if fp->ff_size == length (as an optimization)
3511 * since there may be extra physical blocks that also need truncation.
3512 */
3513 #if QUOTA
3514 if ((retval = hfs_getinoquota(cp)))
3515 return(retval);
3516 #endif /* QUOTA */
3517
3518 /*
3519 * Lengthen the size of the file. We must ensure that the
3520 * last byte of the file is allocated. Since the smallest
3521 * value of ff_size is 0, length will be at least 1.
3522 */
3523 if (length > (off_t)fp->ff_size) {
3524 #if QUOTA
3525 retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
3526 cred, 0);
3527 if (retval)
3528 goto Err_Exit;
3529 #endif /* QUOTA */
3530 /*
3531 * If we don't have enough physical space then
3532 * we need to extend the physical size.
3533 */
3534 if (length > filebytes) {
3535 int eflags;
3536 u_int32_t blockHint = 0;
3537
3538 /* All or nothing and don't round up to clumpsize. */
3539 eflags = kEFAllMask | kEFNoClumpMask;
3540
3541 if (cred && (suser(cred, NULL) != 0)) {
3542 eflags |= kEFReserveMask; /* keep a reserve */
3543 }
3544
3545 /*
3546 * Allocate Journal and Quota files in metadata zone.
3547 */
3548 if (filebytes == 0 &&
3549 hfsmp->hfs_flags & HFS_METADATA_ZONE &&
3550 hfs_virtualmetafile(cp)) {
3551 eflags |= kEFMetadataMask;
3552 blockHint = hfsmp->hfs_metazone_start;
3553 }
3554 if (hfs_start_transaction(hfsmp) != 0) {
3555 retval = EINVAL;
3556 goto Err_Exit;
3557 }
3558
3559 /* Protect extents b-tree and allocation bitmap */
3560 lockflags = SFL_BITMAP;
3561 if (overflow_extents(fp))
3562 lockflags |= SFL_EXTENTS;
3563 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3564
3565 /*
3566 * Keep growing the file as long as the current EOF is
3567 * less than the desired value.
3568 */
3569 while ((length > filebytes) && (retval == E_NONE)) {
3570 bytesToAdd = length - filebytes;
3571 retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
3572 (FCB*)fp,
3573 bytesToAdd,
3574 blockHint,
3575 eflags,
3576 &actualBytesAdded));
3577
3578 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3579 if (actualBytesAdded == 0 && retval == E_NONE) {
3580 if (length > filebytes)
3581 length = filebytes;
3582 break;
3583 }
3584 } /* endwhile */
3585
3586 hfs_systemfile_unlock(hfsmp, lockflags);
3587
3588 if (hfsmp->jnl) {
3589 hfs_update(vp, 0);
3590 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3591 }
3592
3593 hfs_end_transaction(hfsmp);
3594
3595 if (retval)
3596 goto Err_Exit;
3597
3598 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3599 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3600 }
3601
3602 if (ISSET(flags, IO_NOZEROFILL)) {
3603 // An optimisation for the hibernation file
3604 if (vnode_isswap(vp))
3605 rl_remove_all(&fp->ff_invalidranges);
3606 } else {
3607 if (!vnode_issystem(vp) && retval == E_NONE) {
3608 if (length > (off_t)fp->ff_size) {
3609 struct timeval tv;
3610
3611 /* Extending the file: time to fill out the current last page w. zeroes? */
3612 if (fp->ff_size & PAGE_MASK_64) {
3613 /* There might be some valid data at the start of the (current) last page
3614 of the file, so zero out the remainder of that page to ensure the
3615 entire page contains valid data. */
3616 hfs_unlock(cp);
3617 retval = hfs_zero_eof_page(vp, length);
3618 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3619 if (retval) goto Err_Exit;
3620 }
3621 microuptime(&tv);
3622 rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
3623 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3624 }
3625 } else {
3626 panic("hfs_truncate: invoked on non-UBC object?!");
3627 };
3628 }
3629 if (suppress_times == 0) {
3630 cp->c_touch_modtime = TRUE;
3631 }
3632 fp->ff_size = length;
3633
3634 } else { /* Shorten the size of the file */
3635
3636 // An optimisation for the hibernation file
3637 if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) {
3638 rl_remove_all(&fp->ff_invalidranges);
3639 } else if ((off_t)fp->ff_size > length) {
3640 /* Any space previously marked as invalid is now irrelevant: */
3641 rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
3642 }
3643
3644 /*
3645 * Account for any unmapped blocks. Note that the new
3646 * file length can still end up with unmapped blocks.
3647 */
3648 if (fp->ff_unallocblocks > 0) {
3649 u_int32_t finalblks;
3650 u_int32_t loanedBlocks;
3651
3652 hfs_lock_mount(hfsmp);
3653 loanedBlocks = fp->ff_unallocblocks;
3654 cp->c_blocks -= loanedBlocks;
3655 fp->ff_blocks -= loanedBlocks;
3656 fp->ff_unallocblocks = 0;
3657
3658 hfsmp->loanedBlocks -= loanedBlocks;
3659
3660 finalblks = (length + blksize - 1) / blksize;
3661 if (finalblks > fp->ff_blocks) {
3662 /* calculate required unmapped blocks */
3663 loanedBlocks = finalblks - fp->ff_blocks;
3664 hfsmp->loanedBlocks += loanedBlocks;
3665
3666 fp->ff_unallocblocks = loanedBlocks;
3667 cp->c_blocks += loanedBlocks;
3668 fp->ff_blocks += loanedBlocks;
3669 }
3670 hfs_unlock_mount (hfsmp);
3671 }
3672
3673 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
3674 if (hfs_start_transaction(hfsmp) != 0) {
3675 retval = EINVAL;
3676 goto Err_Exit;
3677 }
3678
3679 if (fp->ff_unallocblocks == 0) {
3680 /* Protect extents b-tree and allocation bitmap */
3681 lockflags = SFL_BITMAP;
3682 if (overflow_extents(fp))
3683 lockflags |= SFL_EXTENTS;
3684 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3685
3686 retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0,
3687 FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
3688
3689 hfs_systemfile_unlock(hfsmp, lockflags);
3690 }
3691 if (hfsmp->jnl) {
3692 if (retval == 0) {
3693 fp->ff_size = length;
3694 }
3695 hfs_update(vp, 0);
3696 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3697 }
3698 hfs_end_transaction(hfsmp);
3699
3700 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3701 if (retval)
3702 goto Err_Exit;
3703 #if QUOTA
3704 /* These are bytesreleased */
3705 (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
3706 #endif /* QUOTA */
3707
3708 //
3709 // Unlike when growing a file, we adjust the hotfile block count here
3710 // instead of deeper down in the block allocation code because we do
3711 // not necessarily have a vnode or "fcb" at the time we're deleting
3712 // the file and so we wouldn't know if it was hotfile cached or not
3713 //
3714 hfs_hotfile_adjust_blocks(vp, (int64_t)((savedbytes - filebytes) / blksize));
3715
3716
3717 /*
3718 * Only set update flag if the logical length changes & we aren't
3719 * suppressing modtime updates.
3720 */
3721 if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
3722 cp->c_touch_modtime = TRUE;
3723 }
3724 fp->ff_size = length;
3725 }
3726 if (cp->c_mode & (S_ISUID | S_ISGID)) {
3727 if (!vfs_context_issuser(context))
3728 cp->c_mode &= ~(S_ISUID | S_ISGID);
3729 }
3730 cp->c_flag |= C_MODIFIED;
3731 cp->c_touch_chgtime = TRUE; /* status changed */
3732 if (suppress_times == 0) {
3733 cp->c_touch_modtime = TRUE; /* file data was modified */
3734
3735 /*
3736 * If we are not suppressing the modtime update, then
3737 * update the gen count as well.
3738 */
3739 if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
3740 hfs_incr_gencount(cp);
3741 }
3742 }
3743
3744 retval = hfs_update(vp, 0);
3745 if (retval) {
3746 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3747 -1, -1, -1, retval, 0);
3748 }
3749
3750 Err_Exit:
3751
3752 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END,
3753 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
3754
3755 return (retval);
3756 }
3757
3758 /*
3759 * Preparation which must be done prior to deleting the catalog record
3760 * of a file or directory. In order to make the on-disk as safe as possible,
3761 * we remove the catalog entry before releasing the bitmap blocks and the
3762 * overflow extent records. However, some work must be done prior to deleting
3763 * the catalog record.
3764 *
3765 * When calling this function, the cnode must exist both in memory and on-disk.
3766 * If there are both resource fork and data fork vnodes, this function should
3767 * be called on both.
3768 */
3769
3770 int
3771 hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
3772
3773 struct filefork *fp = VTOF(vp);
3774 struct cnode *cp = VTOC(vp);
3775 #if QUOTA
3776 int retval = 0;
3777 #endif /* QUOTA */
3778
3779 /* Cannot truncate an HFS directory! */
3780 if (vnode_isdir(vp)) {
3781 return (EISDIR);
3782 }
3783
3784 /*
3785 * See the comment below in hfs_truncate for why we need to call
3786 * setsize here. Essentially we want to avoid pending IO if we
3787 * already know that the blocks are going to be released here.
3788 * This function is only called when totally removing all storage for a file, so
3789 * we can take a shortcut and immediately setsize (0);
3790 */
3791 ubc_setsize(vp, 0);
3792
3793 /* This should only happen with a corrupt filesystem */
3794 if ((off_t)fp->ff_size < 0)
3795 return (EINVAL);
3796
3797 /*
3798 * We cannot just check if fp->ff_size == length (as an optimization)
3799 * since there may be extra physical blocks that also need truncation.
3800 */
3801 #if QUOTA
3802 if ((retval = hfs_getinoquota(cp))) {
3803 return(retval);
3804 }
3805 #endif /* QUOTA */
3806
3807 /* Wipe out any invalid ranges which have yet to be backed by disk */
3808 rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);
3809
3810 /*
3811 * Account for any unmapped blocks. Since we're deleting the
3812 * entire file, we don't have to worry about just shrinking
3813 * to a smaller number of borrowed blocks.
3814 */
3815 if (fp->ff_unallocblocks > 0) {
3816 u_int32_t loanedBlocks;
3817
3818 hfs_lock_mount (hfsmp);
3819 loanedBlocks = fp->ff_unallocblocks;
3820 cp->c_blocks -= loanedBlocks;
3821 fp->ff_blocks -= loanedBlocks;
3822 fp->ff_unallocblocks = 0;
3823
3824 hfsmp->loanedBlocks -= loanedBlocks;
3825
3826 hfs_unlock_mount (hfsmp);
3827 }
3828
3829 return 0;
3830 }
3831
3832
3833 /*
3834 * Special wrapper around calling TruncateFileC. This function is usable
3835 * even when the catalog record does not exist any longer, making it ideal
3836 * for use when deleting a file. The simplification here is that we know
3837 * that we are releasing all blocks.
3838 *
3839 * Note that this function may be called when there is no vnode backing
3840 * the file fork in question. We may call this from hfs_vnop_inactive
3841 * to clear out resource fork data (and may not want to clear out the data
3842 * fork yet). As a result, we pointer-check both sets of inputs before
3843 * doing anything with them.
3844 *
3845 * The caller is responsible for saving off a copy of the filefork(s)
3846 * embedded within the cnode prior to calling this function. The pointers
3847 * supplied as arguments must be valid even if the cnode is no longer valid.
3848 */
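/*
 * A caller is expected to snapshot the fork structures before the cnode
 * can go away and then pass the copies in, roughly along these lines
 * (illustrative sketch only; the real call sites live in the
 * inactive/reclaim path mentioned above):
 *
 *	struct filefork dfork, rfork;
 *	if (cp->c_datafork)
 *		dfork = *cp->c_datafork;
 *	if (cp->c_rsrcfork)
 *		rfork = *cp->c_rsrcfork;
 *	error = hfs_release_storage(hfsmp, cp->c_datafork ? &dfork : NULL,
 *	                            cp->c_rsrcfork ? &rfork : NULL,
 *	                            cp->c_fileid);
 */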
3849
3850 int
3851 hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
3852 struct filefork *rsrcfork, u_int32_t fileid) {
3853
3854 off_t filebytes;
3855 u_int32_t fileblocks;
3856 int blksize = 0;
3857 int error = 0;
3858 int lockflags;
3859
3860 blksize = hfsmp->blockSize;
3861
3862 /* Data Fork */
3863 if (datafork) {
3864 off_t prev_filebytes;
3865
3866 datafork->ff_size = 0;
3867
3868 fileblocks = datafork->ff_blocks;
3869 filebytes = (off_t)fileblocks * (off_t)blksize;
3870 prev_filebytes = filebytes;
3871
3872 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3873
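/*
 * Release the fork in steps: each pass lowers the target size by at
 * most HFS_BIGFILE_SIZE and truncates down to it inside its own
 * transaction, so one very large fork never produces one enormous
 * journal transaction.
 */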
3874 while (filebytes > 0) {
3875 if (filebytes > HFS_BIGFILE_SIZE) {
3876 filebytes -= HFS_BIGFILE_SIZE;
3877 } else {
3878 filebytes = 0;
3879 }
3880
3881 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3882 if (hfs_start_transaction(hfsmp) != 0) {
3883 error = EINVAL;
3884 break;
3885 }
3886
3887 if (datafork->ff_unallocblocks == 0) {
3888 /* Protect extents b-tree and allocation bitmap */
3889 lockflags = SFL_BITMAP;
3890 if (overflow_extents(datafork))
3891 lockflags |= SFL_EXTENTS;
3892 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3893
3894 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));
3895
3896 hfs_systemfile_unlock(hfsmp, lockflags);
3897 }
3898 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3899
3900 struct cnode *cp = datafork ? FTOC(datafork) : NULL;
3901 struct vnode *vp;
3902 vp = cp ? CTOV(cp, 0) : NULL;
3903 hfs_hotfile_adjust_blocks(vp, (int64_t)((prev_filebytes - filebytes) / blksize));
3904 prev_filebytes = filebytes;
3905
3906 /* Finish the transaction and start over if necessary */
3907 hfs_end_transaction(hfsmp);
3908
3909 if (error) {
3910 break;
3911 }
3912 }
3913 }
3914
3915 /* Resource fork */
3916 if (error == 0 && rsrcfork) {
3917 rsrcfork->ff_size = 0;
3918
3919 fileblocks = rsrcfork->ff_blocks;
3920 filebytes = (off_t)fileblocks * (off_t)blksize;
3921
3922 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3923
3924 while (filebytes > 0) {
3925 if (filebytes > HFS_BIGFILE_SIZE) {
3926 filebytes -= HFS_BIGFILE_SIZE;
3927 } else {
3928 filebytes = 0;
3929 }
3930
3931 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3932 if (hfs_start_transaction(hfsmp) != 0) {
3933 error = EINVAL;
3934 break;
3935 }
3936
3937 if (rsrcfork->ff_unallocblocks == 0) {
3938 /* Protect extents b-tree and allocation bitmap */
3939 lockflags = SFL_BITMAP;
3940 if (overflow_extents(rsrcfork))
3941 lockflags |= SFL_EXTENTS;
3942 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3943
3944 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));
3945
3946 hfs_systemfile_unlock(hfsmp, lockflags);
3947 }
3948 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3949
3950 /* Finish the transaction and start over if necessary */
3951 hfs_end_transaction(hfsmp);
3952
3953 if (error) {
3954 break;
3955 }
3956 }
3957 }
3958
3959 return error;
3960 }
3961
3962 errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock)
3963 {
3964 errno_t error;
3965
3966 /*
3967 * Call ubc_setsize to give the VM subsystem a chance to do
3968 * whatever it needs to with existing pages before we delete
3969 * blocks. Note that symlinks don't use the UBC so we'll
3970 * get back ENOENT in that case.
3971 */
3972 if (have_cnode_lock) {
3973 error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY);
3974 if (error == EAGAIN) {
3975 cnode_t *cp = VTOC(vp);
3976
3977 if (cp->c_truncatelockowner != current_thread())
3978 hfs_warn("hfs: hfs_ubc_setsize called without exclusive truncate lock!");
3979
3980 hfs_unlock(cp);
3981 error = ubc_setsize_ex(vp, len, 0);
3982 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
3983 }
3984 } else
3985 error = ubc_setsize_ex(vp, len, 0);
3986
3987 return error == ENOENT ? 0 : error;
3988 }
3989
3990 /*
3991 * Truncate a cnode to at most length size, freeing (or adding) the
3992 * disk blocks.
3993 */
3994 int
3995 hfs_truncate(struct vnode *vp, off_t length, int flags,
3996 int truncateflags, vfs_context_t context)
3997 {
3998 struct filefork *fp = VTOF(vp);
3999 off_t filebytes;
4000 u_int32_t fileblocks;
4001 int blksize;
4002 errno_t error = 0;
4003 struct cnode *cp = VTOC(vp);
4004 hfsmount_t *hfsmp = VTOHFS(vp);
4005
4006 /* Cannot truncate an HFS directory! */
4007 if (vnode_isdir(vp)) {
4008 return (EISDIR);
4009 }
4010 /* A swap file cannot change size. */
4011 if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) {
4012 return (EPERM);
4013 }
4014
4015 blksize = hfsmp->blockSize;
4016 fileblocks = fp->ff_blocks;
4017 filebytes = (off_t)fileblocks * (off_t)blksize;
4018
4019 bool caller_has_cnode_lock = (cp->c_lockowner == current_thread());
4020
4021 error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock);
4022 if (error)
4023 return error;
4024
4025 if (!caller_has_cnode_lock) {
4026 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
4027 if (error)
4028 return error;
4029 }
4030
4031 if (vnode_islnk(vp) && cp->c_datafork->ff_symlinkptr) {
4032 hfs_free(cp->c_datafork->ff_symlinkptr, cp->c_datafork->ff_size);
4033 cp->c_datafork->ff_symlinkptr = NULL;
4034 }
4035
4036 // have to loop truncating or growing files that are
4037 // really big because otherwise transactions can get
4038 // enormous and consume too many kernel resources.
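// Each pass below moves filebytes by at most HFS_BIGFILE_SIZE, so a
// single call to do_hfs_truncate (and the transaction it starts) never
// has to shrink or grow the fork by more than that amount.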
4039
4040 if (length < filebytes) {
4041 while (filebytes > length) {
4042 if ((filebytes - length) > HFS_BIGFILE_SIZE) {
4043 filebytes -= HFS_BIGFILE_SIZE;
4044 } else {
4045 filebytes = length;
4046 }
4047 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
4048 if (error)
4049 break;
4050 }
4051 } else if (length > filebytes) {
4052 kauth_cred_t cred = vfs_context_ucred(context);
4053 const bool keep_reserve = cred && suser(cred, NULL) != 0;
4054
4055 if (hfs_freeblks(hfsmp, keep_reserve)
4056 < howmany(length - filebytes, blksize)) {
4057 error = ENOSPC;
4058 } else {
4059 while (filebytes < length) {
4060 if ((length - filebytes) > HFS_BIGFILE_SIZE) {
4061 filebytes += HFS_BIGFILE_SIZE;
4062 } else {
4063 filebytes = length;
4064 }
4065 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
4066 if (error)
4067 break;
4068 }
4069 }
4070 } else /* Same logical size */ {
4071
4072 error = do_hfs_truncate(vp, length, flags, truncateflags, context);
4073 }
4074 /* Files that are changing size are not hot file candidates. */
4075 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4076 fp->ff_bytesread = 0;
4077 }
4078
4079 #if HFS_CONFIG_KEY_ROLL
4080 if (!error && cp->c_truncatelockowner == current_thread()) {
4081 hfs_key_roll_check(cp, true);
4082 }
4083 #endif
4084
4085 if (!caller_has_cnode_lock)
4086 hfs_unlock(cp);
4087
4088 // Make sure UBC's size matches up (in case we didn't completely succeed)
4089 errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock);
4090 if (!error)
4091 error = err2;
4092
4093 return error;
4094 }
4095
4096
4097 /*
4098 * Preallocate file storage space.
4099 */
4100 int
4101 hfs_vnop_allocate(struct vnop_allocate_args /* {
4102 vnode_t a_vp;
4103 off_t a_length;
4104 u_int32_t a_flags;
4105 off_t *a_bytesallocated;
4106 off_t a_offset;
4107 vfs_context_t a_context;
4108 } */ *ap)
4109 {
4110 struct vnode *vp = ap->a_vp;
4111 struct cnode *cp;
4112 struct filefork *fp;
4113 ExtendedVCB *vcb;
4114 off_t length = ap->a_length;
4115 off_t startingPEOF;
4116 off_t moreBytesRequested;
4117 off_t actualBytesAdded;
4118 off_t filebytes;
4119 u_int32_t fileblocks;
4120 int retval, retval2;
4121 u_int32_t blockHint;
4122 u_int32_t extendFlags; /* For call to ExtendFileC */
4123 struct hfsmount *hfsmp;
4124 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
4125 int lockflags;
4126 time_t orig_ctime;
4127
4128 *(ap->a_bytesallocated) = 0;
4129
4130 if (!vnode_isreg(vp))
4131 return (EISDIR);
4132 if (length < (off_t)0)
4133 return (EINVAL);
4134
4135 cp = VTOC(vp);
4136
4137 orig_ctime = VTOC(vp)->c_ctime;
4138
4139 nspace_snapshot_event(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
4140
4141 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
4142
4143 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4144 goto Err_Exit;
4145 }
4146
4147 fp = VTOF(vp);
4148 hfsmp = VTOHFS(vp);
4149 vcb = VTOVCB(vp);
4150
4151 fileblocks = fp->ff_blocks;
4152 filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
4153
4154 if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
4155 retval = EINVAL;
4156 goto Err_Exit;
4157 }
4158
4159 /* Fill in the flags word for the call to Extend the file */
4160
4161 extendFlags = kEFNoClumpMask;
4162 if (ap->a_flags & ALLOCATECONTIG)
4163 extendFlags |= kEFContigMask;
4164 if (ap->a_flags & ALLOCATEALL)
4165 extendFlags |= kEFAllMask;
4166 if (cred && suser(cred, NULL) != 0)
4167 extendFlags |= kEFReserveMask;
4168 if (hfs_virtualmetafile(cp))
4169 extendFlags |= kEFMetadataMask;
4170
4171 retval = E_NONE;
4172 blockHint = 0;
4173 startingPEOF = filebytes;
4174
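/*
 * ALLOCATEFROMPEOF means a_length is relative to the current physical
 * EOF, so convert it into an absolute size; ALLOCATEFROMVOL means
 * a_offset is a volume-relative position to use as an allocation hint.
 */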
4175 if (ap->a_flags & ALLOCATEFROMPEOF)
4176 length += filebytes;
4177 else if (ap->a_flags & ALLOCATEFROMVOL)
4178 blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
4179
4180 /* If no changes are necessary, then we're done */
4181 if (filebytes == length)
4182 goto Std_Exit;
4183
4184 /*
4185 * Lengthen the size of the file. We must ensure that the
4186 * last byte of the file is allocated. Since the smallest
4187 * value of filebytes is 0, length will be at least 1.
4188 */
4189 if (length > filebytes) {
4190 if (ISSET(extendFlags, kEFAllMask)
4191 && (hfs_freeblks(hfsmp, ISSET(extendFlags, kEFReserveMask))
4192 < howmany(length - filebytes, hfsmp->blockSize))) {
4193 retval = ENOSPC;
4194 goto Err_Exit;
4195 }
4196
4197 off_t total_bytes_added = 0, orig_request_size;
4198
4199 orig_request_size = moreBytesRequested = length - filebytes;
4200
4201 #if QUOTA
4202 retval = hfs_chkdq(cp,
4203 (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
4204 cred, 0);
4205 if (retval)
4206 goto Err_Exit;
4207
4208 #endif /* QUOTA */
4209 /*
4210 * Metadata zone checks.
4211 */
4212 if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
4213 /*
4214 * Allocate Journal and Quota files in metadata zone.
4215 */
4216 if (hfs_virtualmetafile(cp)) {
4217 blockHint = hfsmp->hfs_metazone_start;
4218 } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
4219 (blockHint <= hfsmp->hfs_metazone_end)) {
4220 /*
4221 * Move blockHint outside metadata zone.
4222 */
4223 blockHint = hfsmp->hfs_metazone_end + 1;
4224 }
4225 }
4226
4227
4228 while ((length > filebytes) && (retval == E_NONE)) {
4229 off_t bytesRequested;
4230
4231 if (hfs_start_transaction(hfsmp) != 0) {
4232 retval = EINVAL;
4233 goto Err_Exit;
4234 }
4235
4236 /* Protect extents b-tree and allocation bitmap */
4237 lockflags = SFL_BITMAP;
4238 if (overflow_extents(fp))
4239 lockflags |= SFL_EXTENTS;
4240 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4241
4242 if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
4243 bytesRequested = HFS_BIGFILE_SIZE;
4244 } else {
4245 bytesRequested = moreBytesRequested;
4246 }
4247
4248 if (extendFlags & kEFContigMask) {
4249 // if we're on a sparse device, this will force it to do a
4250 // full scan to find the space needed.
4251 hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
4252 }
4253
4254 retval = MacToVFSError(ExtendFileC(vcb,
4255 (FCB*)fp,
4256 bytesRequested,
4257 blockHint,
4258 extendFlags,
4259 &actualBytesAdded));
4260
4261 if (retval == E_NONE) {
4262 *(ap->a_bytesallocated) += actualBytesAdded;
4263 total_bytes_added += actualBytesAdded;
4264 moreBytesRequested -= actualBytesAdded;
4265 if (blockHint != 0) {
4266 blockHint += actualBytesAdded / vcb->blockSize;
4267 }
4268 }
4269 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4270
4271 hfs_systemfile_unlock(hfsmp, lockflags);
4272
4273 if (hfsmp->jnl) {
4274 (void) hfs_update(vp, 0);
4275 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
4276 }
4277
4278 hfs_end_transaction(hfsmp);
4279 }
4280
4281
4282 /*
4283 * if we get an error and no changes were made then exit;
4284 * otherwise we must do the hfs_update to reflect the changes
4285 */
4286 if (retval && (startingPEOF == filebytes))
4287 goto Err_Exit;
4288
4289 /*
4290 * Adjust actualBytesAdded to be allocation block aligned, not
4291 * clump size aligned.
4292 * NOTE: So what we are reporting does not affect reality
4293 * until the file is closed, when we truncate the file to allocation
4294 * block size.
4295 */
4296 if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
4297 *(ap->a_bytesallocated) =
4298 roundup(orig_request_size, (off_t)vcb->blockSize);
4299
4300 } else { /* Shorten the size of the file */
4301
4302 /*
4303 * N.B. At present, this code is never called. If and when we
4304 * do start using it, it looks like there might be slightly
4305 * strange semantics with the file size: it's possible for the
4306 * file size to *increase* e.g. if current file size is 5,
4307 * length is 1024 and filebytes is 4096, the file size will
4308 * end up being 1024 bytes. This isn't necessarily a problem
4309 * but it's not consistent with the code above which doesn't
4310 * change the file size.
4311 */
4312
4313 retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
4314 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4315
4316 /*
4317 * if we get an error and no changes were made then exit;
4318 * otherwise we must do the hfs_update to reflect the changes
4319 */
4320 if (retval && (startingPEOF == filebytes)) goto Err_Exit;
4321 #if QUOTA
4322 /* These are bytesreleased */
4323 (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
4324 #endif /* QUOTA */
4325
4326 if (fp->ff_size > filebytes) {
4327 fp->ff_size = filebytes;
4328
4329 hfs_ubc_setsize(vp, fp->ff_size, true);
4330 }
4331 }
4332
4333 Std_Exit:
4334 cp->c_flag |= C_MODIFIED;
4335 cp->c_touch_chgtime = TRUE;
4336 cp->c_touch_modtime = TRUE;
4337 retval2 = hfs_update(vp, 0);
4338
4339 if (retval == 0)
4340 retval = retval2;
4341 Err_Exit:
4342 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4343 hfs_unlock(cp);
4344 return (retval);
4345 }
4346
4347
4348 /*
4349 * Pagein for HFS filesystem
4350 */
4351 int
4352 hfs_vnop_pagein(struct vnop_pagein_args *ap)
4353 /*
4354 struct vnop_pagein_args {
4355 vnode_t a_vp,
4356 upl_t a_pl,
4357 vm_offset_t a_pl_offset,
4358 off_t a_f_offset,
4359 size_t a_size,
4360 int a_flags
4361 vfs_context_t a_context;
4362 };
4363 */
4364 {
4365 vnode_t vp;
4366 struct cnode *cp;
4367 struct filefork *fp;
4368 int error = 0;
4369 upl_t upl;
4370 upl_page_info_t *pl;
4371 off_t f_offset;
4372 off_t page_needed_f_offset;
4373 int offset;
4374 int isize;
4375 int upl_size;
4376 int pg_index;
4377 boolean_t truncate_lock_held = FALSE;
4378 boolean_t file_converted = FALSE;
4379 kern_return_t kret;
4380
4381 vp = ap->a_vp;
4382 cp = VTOC(vp);
4383 fp = VTOF(vp);
4384
4385 #if CONFIG_PROTECT
4386 if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
4387 /*
4388 * If we errored here, then this means that one of two things occurred:
4389 * 1. there was a problem with the decryption of the key.
4390 * 2. the device is locked and we are not allowed to access this particular file.
4391 *
4392 * Either way, this means that we need to shut down this upl now. As long as
4393 * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves),
4394 * we create a UPL and immediately abort it.
4395 */
4396 if (ap->a_pl == NULL) {
4397 /* create the upl */
4398 ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl,
4399 UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4400 /* mark the range as needed so it doesn't immediately get discarded upon abort */
4401 ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);
4402
4403 /* Abort the range */
4404 ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4405 }
4406
4407
4408 return error;
4409 }
4410 #endif /* CONFIG_PROTECT */
4411
4412 if (ap->a_pl != NULL) {
4413 /*
4414 * this can only happen for swap files now that
4415 * we're asking for V2 paging behavior...
4416 * so don't need to worry about decompression, or
4417 * keeping track of blocks read or taking the truncate lock
4418 */
4419 error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
4420 ap->a_size, (off_t)fp->ff_size, ap->a_flags);
4421 goto pagein_done;
4422 }
4423
4424 page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset;
4425
4426 retry_pagein:
4427 /*
4428 * take truncate lock (shared/recursive) to guard against
4429 * zero-fill thru fsync interfering, but only for v2
4430 *
4431 * the HFS_LOCK_SKIP_IF_EXCLUSIVE arg indicates that we want the
4432 * lock shared and we are allowed to recurse 1 level if this thread already
4433 * owns the lock exclusively... this can legally occur
4434 * if we are doing a shrinking ftruncate against a file
4435 * that is mapped private, and the pages being truncated
4436 * do not currently exist in the cache... in that case
4437 * we will have to page-in the missing pages in order
4438 * to provide them to the private mapping... we must
4439 * also call hfs_unlock_truncate with a positive been_recursed
4440 * arg to indicate that if we have recursed, there is no need to drop
4441 * the lock. Allowing this simple recursion is necessary
4442 * in order to avoid a certain deadlock... since the ftruncate
4443 * already holds the truncate lock exclusively, if we try
4444 * to acquire it shared to protect the pagein path, we will
4445 * hang this thread
4446 *
4447 * NOTE: The if () block below is a workaround in order to prevent a
4448 * VM deadlock. See rdar://7853471.
4449 *
4450 * If we are in a forced unmount, then launchd will still have the
4451 * dyld_shared_cache file mapped as it is trying to reboot. If we
4452 * take the truncate lock here to service a page fault, then our
4453 * thread could deadlock with the forced-unmount. The forced unmount
4454 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
4455 * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount
4456 * thread will think it needs to copy all of the data out of the file
4457 * and into a VM copy object. If we hold the cnode lock here, then that
4458 * VM operation will not be able to proceed, because we'll set a busy page
4459 * before attempting to grab the lock. Note that this isn't as simple as "don't
4460 * call ubc_setsize" because doing that would just shift the problem to the
4461 * ubc_msync done before the vnode is reclaimed.
4462 *
4463 * So, if a forced unmount on this volume is in flight AND the cnode is
4464 * marked C_DELETED, then just go ahead and do the page in without taking
4465 * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file
4466 * that is not going to be available on the next mount, this seems like an
4467 * OK solution from a correctness point of view, even though it is hacky.
4468 */
4469 if (vfs_isforce(vnode_mount(vp))) {
4470 if (cp->c_flag & C_DELETED) {
4471 /* If we don't get it, then just go ahead and operate without the lock */
4472 truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4473 }
4474 }
4475 else {
4476 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4477 truncate_lock_held = TRUE;
4478 }
4479
4480 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4481
4482 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4483 error = EINVAL;
4484 goto pagein_done;
4485 }
4486 ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
4487
4488 upl_size = isize = ap->a_size;
4489
4490 /*
4491 * Scan from the back to find the last page in the UPL, so that we
4492 * aren't looking at a UPL that may have already been freed by the
4493 * preceding aborts/completions.
4494 */
4495 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4496 if (upl_page_present(pl, --pg_index))
4497 break;
4498 if (pg_index == 0) {
4499 /*
4500 * no absent pages were found in the range specified
4501 * just abort the UPL to get rid of it and then we're done
4502 */
4503 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4504 goto pagein_done;
4505 }
4506 }
4507 /*
4508 * initialize the offset variables before we touch the UPL.
4509 * f_offset is the position into the file, in bytes
4510 * offset is the position into the UPL, in bytes
4511 * pg_index is the pg# of the UPL we're operating on
4512 * isize is the offset into the UPL of the last page that is present.
4513 */
4514 isize = ((pg_index + 1) * PAGE_SIZE);
4515 pg_index = 0;
4516 offset = 0;
4517 f_offset = ap->a_f_offset;
4518
4519 while (isize) {
4520 int xsize;
4521 int num_of_pages;
4522
4523 if ( !upl_page_present(pl, pg_index)) {
4524 /*
4525 * we asked for RET_ONLY_ABSENT, so it's possible
4526 * to get back empty slots in the UPL.
4527 * just skip over them
4528 */
4529 f_offset += PAGE_SIZE;
4530 offset += PAGE_SIZE;
4531 isize -= PAGE_SIZE;
4532 pg_index++;
4533
4534 continue;
4535 }
4536 /*
4537 * We know that we have at least one absent page.
4538 * Now checking to see how many in a row we have
4539 */
4540 num_of_pages = 1;
4541 xsize = isize - PAGE_SIZE;
4542
4543 while (xsize) {
4544 if ( !upl_page_present(pl, pg_index + num_of_pages))
4545 break;
4546 num_of_pages++;
4547 xsize -= PAGE_SIZE;
4548 }
4549 xsize = num_of_pages * PAGE_SIZE;
4550
4551 #if HFS_COMPRESSION
4552 if (VNODE_IS_RSRC(vp)) {
4553 /* allow pageins of the resource fork */
4554 } else {
4555 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
4556
4557 if (compressed) {
4558
4559 if (truncate_lock_held) {
4560 /*
4561 * can't hold the truncate lock when calling into the decmpfs layer
4562 * since it calls back into this layer... even though we're only
4563 * holding the lock in shared mode, and the re-entrant path only
4564 * takes the lock shared, we can deadlock if some other thread
4565 * tries to grab the lock exclusively in between.
4566 */
4567 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4568 truncate_lock_held = FALSE;
4569 }
4570 ap->a_pl = upl;
4571 ap->a_pl_offset = offset;
4572 ap->a_f_offset = f_offset;
4573 ap->a_size = xsize;
4574
4575 error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
4576 /*
4577 * note that decmpfs_pagein_compressed can change the state of
4578 * 'compressed'... it will set it to 0 if the file is no longer
4579 * compressed once the compression lock is successfully taken
4580 * i.e. we would block on that lock while the file is being inflated
4581 */
4582 if (error == 0 && vnode_isfastdevicecandidate(vp)) {
4583 (void) hfs_addhotfile(vp);
4584 }
4585 if (compressed) {
4586 if (error == 0) {
4587 /* successful page-in, update the access time */
4588 VTOC(vp)->c_touch_acctime = TRUE;
4589
4590 //
4591 // compressed files are not traditional hot file candidates
4592 // but they may be for CF (which ignores the ff_bytesread
4593 // field)
4594 //
4595 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4596 fp->ff_bytesread = 0;
4597 }
4598 } else if (error == EAGAIN) {
4599 /*
4600 * EAGAIN indicates someone else already holds the compression lock...
4601 * to avoid deadlocking, we'll abort this range of pages with an
4602 * indication that the pagein needs to be redriven
4603 */
4604 ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
4605 } else if (error == ENOSPC) {
4606
4607 if (upl_size == PAGE_SIZE)
4608 panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n");
4609
4610 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4611
4612 ap->a_size = PAGE_SIZE;
4613 ap->a_pl = NULL;
4614 ap->a_pl_offset = 0;
4615 ap->a_f_offset = page_needed_f_offset;
4616
4617 goto retry_pagein;
4618 } else {
4619 ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4620 goto pagein_done;
4621 }
4622 goto pagein_next_range;
4623 }
4624 else {
4625 /*
4626 * Set file_converted only if the file became decompressed while we were
4627 * paging in. If it were still compressed, we would re-start the loop using the goto
4628 * in the above block. This avoids overloading truncate_lock_held as our retry_pagein
4629 * condition below, since we could have avoided taking the truncate lock to prevent
4630 * a deadlock in the force unmount case.
4631 */
4632 file_converted = TRUE;
4633 }
4634 }
4635 if (file_converted == TRUE) {
4636 /*
4637 * the file was converted back to a regular file after we first saw it as compressed
4638 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
4639 * reset a_size so that we consider what remains of the original request
4640 * and null out a_upl and a_pl_offset.
4641 *
4642 * We should only be able to get into this block if the decmpfs_pagein_compressed
4643 * successfully decompressed the range in question for this file.
4644 */
4645 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4646
4647 ap->a_size = isize;
4648 ap->a_pl = NULL;
4649 ap->a_pl_offset = 0;
4650
4651 /* Reset file_converted back to false so that we don't infinite-loop. */
4652 file_converted = FALSE;
4653 goto retry_pagein;
4654 }
4655 }
4656 #endif
4657 error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);
4658
4659 /*
4660 * Keep track of blocks read.
4661 */
4662 if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
4663 int bytesread;
4664 int took_cnode_lock = 0;
4665
4666 if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
4667 bytesread = fp->ff_size;
4668 else
4669 bytesread = xsize;
4670
4671 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
4672 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
4673 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4674 took_cnode_lock = 1;
4675 }
4676 /*
4677 * If this file hasn't been seen since the start of
4678 * the current sampling period then start over.
4679 */
4680 if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
4681 struct timeval tv;
4682
4683 fp->ff_bytesread = bytesread;
4684 microtime(&tv);
4685 cp->c_atime = tv.tv_sec;
4686 } else {
4687 fp->ff_bytesread += bytesread;
4688 }
4689 cp->c_touch_acctime = TRUE;
4690
4691 if (vnode_isfastdevicecandidate(vp)) {
4692 (void) hfs_addhotfile(vp);
4693 }
4694 if (took_cnode_lock)
4695 hfs_unlock(cp);
4696 }
4697 pagein_next_range:
4698 f_offset += xsize;
4699 offset += xsize;
4700 isize -= xsize;
4701 pg_index += num_of_pages;
4702
4703 error = 0;
4704 }
4705
4706 pagein_done:
4707 if (truncate_lock_held == TRUE) {
4708 /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
4709 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4710 }
4711
4712 return (error);
4713 }
4714
4715 /*
4716 * Pageout for HFS filesystem.
4717 */
4718 int
4719 hfs_vnop_pageout(struct vnop_pageout_args *ap)
4720 /*
4721 struct vnop_pageout_args {
4722 vnode_t a_vp,
4723 upl_t a_pl,
4724 vm_offset_t a_pl_offset,
4725 off_t a_f_offset,
4726 size_t a_size,
4727 int a_flags
4728 vfs_context_t a_context;
4729 };
4730 */
4731 {
4732 vnode_t vp = ap->a_vp;
4733 struct cnode *cp;
4734 struct filefork *fp;
4735 int retval = 0;
4736 off_t filesize;
4737 upl_t upl;
4738 upl_page_info_t* pl = NULL;
4739 vm_offset_t a_pl_offset;
4740 int a_flags;
4741 int is_pageoutv2 = 0;
4742 kern_return_t kret;
4743
4744 cp = VTOC(vp);
4745 fp = VTOF(vp);
4746
4747 a_flags = ap->a_flags;
4748 a_pl_offset = ap->a_pl_offset;
4749
4750 /*
4751 * we can tell if we're getting the new or old behavior from the UPL
4752 */
4753 if ((upl = ap->a_pl) == NULL) {
4754 int request_flags;
4755
4756 is_pageoutv2 = 1;
4757 /*
4758 * we're in control of any UPL we commit
4759 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
4760 */
4761 a_flags &= ~UPL_NOCOMMIT;
4762 a_pl_offset = 0;
4763
4764 /*
4765 * For V2 semantics, we want to take the cnode truncate lock
4766 * shared to guard against the file size changing via zero-filling.
4767 *
4768 * However, we have to be careful because we may be invoked
4769 * via the ubc_msync path to write out dirty mmap'd pages
4770 * in response to a lock event on a content-protected
4771 * filesystem (e.g. to write out class A files).
4772 * As a result, we want to take the truncate lock 'SHARED' with
4773 * the mini-recursion locktype so that we don't deadlock/panic
4774 * because we may be already holding the truncate lock exclusive to force any other
4775 * IOs to have blocked behind us.
4776 */
4777 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4778
4779 if (a_flags & UPL_MSYNC) {
4780 request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
4781 }
4782 else {
4783 request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
4784 }
4785
4786 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
4787
4788 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4789 retval = EINVAL;
4790 goto pageout_done;
4791 }
4792 }
4793 /*
4794 * from this point forward upl points at the UPL we're working with
4795 * it was either passed in or we successfully created it
4796 */
4797
4798 /*
4799 * Figure out where the file ends, for pageout purposes. If
4800 * ff_new_size > ff_size, then we're in the middle of extending the
4801 * file via a write, so it is safe (and necessary) that we be able
4802 * to pageout up to that point.
4803 */
4804 filesize = fp->ff_size;
4805 if (fp->ff_new_size > filesize)
4806 filesize = fp->ff_new_size;
4807
4808 /*
4809 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
4810 * UPL instead of relying on the UPL passed into us. We go ahead and do that here,
4811 * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for
4812 * N dirty ranges in the UPL. Note that this is almost a direct copy of the
4813 * logic in vnode_pageout except that we need to do it after grabbing the truncate
4814 * lock in HFS so that we don't lock invert ourselves.
4815 *
4816 * Note that we can still get into this function on behalf of the default pager with
4817 * non-V2 behavior (swapfiles). However in that case, we did not grab locks above
4818 * since fsync and other writing threads will grab the locks, then mark the
4819 * relevant pages as busy. But the pageout codepath marks the pages as busy,
4820 * and THEN would attempt to grab the truncate lock, which would result in deadlock. So
4821 * we do not try to grab anything for the pre-V2 case, which should only be accessed
4822 * by the paging/VM system.
4823 */
4824
4825 if (is_pageoutv2) {
4826 off_t f_offset;
4827 int offset;
4828 int isize;
4829 int pg_index;
4830 int error;
4831 int error_ret = 0;
4832
4833 isize = ap->a_size;
4834 f_offset = ap->a_f_offset;
4835
4836 /*
4837 * Scan from the back to find the last page in the UPL, so that we
4838 * aren't looking at a UPL that may have already been freed by the
4839 * preceding aborts/completions.
4840 */
4841 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4842 if (upl_page_present(pl, --pg_index))
4843 break;
4844 if (pg_index == 0) {
4845 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4846 goto pageout_done;
4847 }
4848 }
4849
4850 /*
4851 * initialize the offset variables before we touch the UPL.
4852 * a_f_offset is the position into the file, in bytes
4853 * offset is the position into the UPL, in bytes
4854 * pg_index is the pg# of the UPL we're operating on.
4855 * isize is the offset into the UPL of the last non-clean page.
4856 */
4857 isize = ((pg_index + 1) * PAGE_SIZE);
4858
4859 offset = 0;
4860 pg_index = 0;
4861
4862 while (isize) {
4863 int xsize;
4864 int num_of_pages;
4865
4866 if ( !upl_page_present(pl, pg_index)) {
4867 /*
4868 * we asked for RET_ONLY_DIRTY, so it's possible
4869 * to get back empty slots in the UPL.
4870 * just skip over them
4871 */
4872 f_offset += PAGE_SIZE;
4873 offset += PAGE_SIZE;
4874 isize -= PAGE_SIZE;
4875 pg_index++;
4876
4877 continue;
4878 }
4879 if ( !upl_dirty_page(pl, pg_index)) {
4880 panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
4881 }
4882
4883 /*
4884 * We know that we have at least one dirty page.
4885 * Now checking to see how many in a row we have
4886 */
4887 num_of_pages = 1;
4888 xsize = isize - PAGE_SIZE;
4889
4890 while (xsize) {
4891 if ( !upl_dirty_page(pl, pg_index + num_of_pages))
4892 break;
4893 num_of_pages++;
4894 xsize -= PAGE_SIZE;
4895 }
4896 xsize = num_of_pages * PAGE_SIZE;
4897
4898 if ((error = cluster_pageout(vp, upl, offset, f_offset,
4899 xsize, filesize, a_flags))) {
4900 if (error_ret == 0)
4901 error_ret = error;
4902 }
4903 f_offset += xsize;
4904 offset += xsize;
4905 isize -= xsize;
4906 pg_index += num_of_pages;
4907 }
4908 /* capture errnos bubbled out of cluster_pageout if they occurred */
4909 if (error_ret != 0) {
4910 retval = error_ret;
4911 }
4912 } /* end block for v2 pageout behavior */
4913 else {
4914 /*
4915 * just call cluster_pageout for old pre-v2 behavior
4916 */
4917 retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
4918 ap->a_size, filesize, a_flags);
4919 }
4920
4921 /*
4922 * If data was written, update the modification time of the file
4923 * but only if it's mapped writable; we will have touched the
4924 * modification time for direct writes.
4925 */
4926 if (retval == 0 && (ubc_is_mapped_writable(vp)
4927 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) {
4928 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4929
4930 // Check again with lock
4931 bool mapped_writable = ubc_is_mapped_writable(vp);
4932 if (mapped_writable
4933 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) {
4934 cp->c_touch_modtime = TRUE;
4935 cp->c_touch_chgtime = TRUE;
4936
4937 /*
4938 * We only need to increment the generation counter if
4939 * it's currently mapped writable because we incremented
4940 * the counter in hfs_vnop_mnomap.
4941 */
4942 if (mapped_writable)
4943 hfs_incr_gencount(VTOC(vp));
4944
4945 /*
4946 * If setuid or setgid bits are set and this process is
4947 * not the superuser then clear the setuid and setgid bits
4948 * as a precaution against tampering.
4949 */
4950 if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
4951 (vfs_context_suser(ap->a_context) != 0)) {
4952 cp->c_mode &= ~(S_ISUID | S_ISGID);
4953 }
4954 }
4955
4956 hfs_unlock(cp);
4957 }
4958
4959 pageout_done:
4960 if (is_pageoutv2) {
4961 /*
4962 * Release the truncate lock. Note that because
4963 * we may have taken the lock recursively by
4964 * being invoked via ubc_msync due to lockdown,
4965 * we should release it recursively, too.
4966 */
4967 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4968 }
4969 return (retval);
4970 }
4971
4972 /*
4973 * Intercept B-Tree node writes to unswap them if necessary.
4974 */
4975 int
4976 hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
4977 {
4978 int retval = 0;
4979 register struct buf *bp = ap->a_bp;
4980 register struct vnode *vp = buf_vnode(bp);
4981 BlockDescriptor block;
4982
4983 /* Trap B-Tree writes */
4984 if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
4985 (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
4986 (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
4987 (vp == VTOHFS(vp)->hfc_filevp)) {
4988
4989 /*
4990 * Swap and validate the node if it is in native byte order.
4991 * This is always true on big endian, so we always validate
4992 * before writing here. On little endian, the node typically has
4993 * been swapped and validated when it was written to the journal,
4994 * so we won't do anything here.
4995 */
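/*
 * The last two bytes of a B-tree node hold the offset of the node's
 * first record, which is always sizeof(BTNodeDescriptor), i.e. 14
 * (0x000e).  Seeing 0x000e there when the buffer is read in host byte
 * order means the node is still in host byte order, so it gets
 * swapped/validated here before the write.
 */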
4996 if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
4997 /* Prepare the block pointer */
4998 block.blockHeader = bp;
4999 block.buffer = (char *)buf_dataptr(bp);
5000 block.blockNum = buf_lblkno(bp);
5001 /* not found in cache ==> came from disk */
5002 block.blockReadFromDisk = (buf_fromcache(bp) == 0);
5003 block.blockSize = buf_count(bp);
5004
5005 /* Endian un-swap B-Tree node */
5006 retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
5007 if (retval)
5008 panic("hfs_vnop_bwrite: about to write corrupt node!\n");
5009 }
5010 }
5011
5012 /* This buffer shouldn't be locked anymore but if it is clear it */
5013 if ((buf_flags(bp) & B_LOCKED)) {
5014 // XXXdbg
5015 if (VTOHFS(vp)->jnl) {
5016 panic("hfs: CLEARING the lock bit on bp %p\n", bp);
5017 }
5018 buf_clearflags(bp, B_LOCKED);
5019 }
5020 retval = vn_bwrite (ap);
5021
5022 return (retval);
5023 }
5024
5025
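/*
 * Pin or unpin a run of allocation blocks on the fast media of a
 * composite (Fusion) device: translate pin_state into the matching
 * CoreStorage _DKIOCCSPINEXTENT / _DKIOCCSUNPINEXTENT request and issue
 * it against the device vnode.
 */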
5026 int
5027 hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks)
5028 {
5029 _dk_cs_pin_t pin;
5030 unsigned ioc;
5031 int err;
5032
5033 memset(&pin, 0, sizeof(pin));
5034 pin.cp_extent.offset = ((uint64_t)start_block) * HFSTOVCB(hfsmp)->blockSize;
5035 pin.cp_extent.length = ((uint64_t)nblocks) * HFSTOVCB(hfsmp)->blockSize;
5036 switch (pin_state) {
5037 case HFS_PIN_IT:
5038 ioc = _DKIOCCSPINEXTENT;
5039 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA;
5040 break;
5041 case HFS_PIN_IT | HFS_TEMP_PIN:
5042 ioc = _DKIOCCSPINEXTENT;
5043 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSTEMPORARYPIN;
5044 break;
5045 case HFS_PIN_IT | HFS_DATALESS_PIN:
5046 ioc = _DKIOCCSPINEXTENT;
5047 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSPINFORSWAPFILE;
5048 break;
5049 case HFS_UNPIN_IT:
5050 ioc = _DKIOCCSUNPINEXTENT;
5051 pin.cp_flags = 0;
5052 break;
5053 case HFS_UNPIN_IT | HFS_EVICT_PIN:
5054 ioc = _DKIOCCSPINEXTENT;
5055 pin.cp_flags = _DKIOCCSPINTOSLOWMEDIA;
5056 break;
5057 default:
5058 return EINVAL;
5059 }
5060 err = VNOP_IOCTL(hfsmp->hfs_devvp, ioc, (caddr_t)&pin, 0, vfs_context_kernel());
5061 return err;
5062 }
5063
5064 //
5065 // The cnode lock should already be held on entry to this function
5066 //
5067 int
5068 hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned)
5069 {
5070 struct filefork *fp = VTOF(vp);
5071 int i, err=0, need_put=0;
5072 struct vnode *rsrc_vp=NULL;
5073 uint32_t npinned = 0;
5074 off_t offset;
5075
5076 if (num_blocks_pinned) {
5077 *num_blocks_pinned = 0;
5078 }
5079
5080 if (vnode_vtype(vp) != VREG) {
5081 /* Not allowed to pin directories or symlinks */
5082 printf("hfs: can't pin vnode of type %d\n", vnode_vtype(vp));
5083 return (EPERM);
5084 }
5085
5086 if (fp->ff_unallocblocks) {
5087 printf("hfs: can't pin a vnode w/unalloced blocks (%d)\n", fp->ff_unallocblocks);
5088 return (EINVAL);
5089 }
5090
5091 /*
5092 * It is possible that if the caller unlocked/re-locked the cnode after checking
5093 * for C_NOEXISTS|C_DELETED that the file could have been deleted while the
5094 * cnode was unlocked. So check the condition again and return ENOENT so that
5095 * the caller knows why we failed to pin the vnode.
5096 */
5097 if (VTOC(vp)->c_flag & (C_NOEXISTS|C_DELETED)) {
5098 // makes no sense to pin something that's pending deletion
5099 return ENOENT;
5100 }
5101
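/*
 * A compressed file with an empty data fork keeps its payload either in
 * an extended attribute or in the resource fork; if there is a resource
 * fork, pin that instead of the (empty) data fork.
 */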
5102 if (fp->ff_blocks == 0 && (VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
5103 if (!VNODE_IS_RSRC(vp) && hfs_vgetrsrc(hfsmp, vp, &rsrc_vp) == 0) {
5104 //printf("hfs: fileid %d resource fork nblocks: %d / size: %lld\n", VTOC(vp)->c_fileid,
5105 // VTOC(rsrc_vp)->c_rsrcfork->ff_blocks,VTOC(rsrc_vp)->c_rsrcfork->ff_size);
5106
5107 fp = VTOC(rsrc_vp)->c_rsrcfork;
5108 need_put = 1;
5109 }
5110 }
5111 if (fp->ff_blocks == 0) {
5112 if (need_put) {
5113 //
5114 // use a distinct error code for a compressed file that has no resource fork;
5115 // we return EALREADY to indicate that the data is probably already hot-file
5116 // cached because it's in an EA and the attributes btree is on the ssd
5117 //
5118 err = EALREADY;
5119 } else {
5120 err = EINVAL;
5121 }
5122 goto out;
5123 }
5124
5125 offset = 0;
5126 for (i = 0; i < kHFSPlusExtentDensity; i++) {
5127 if (fp->ff_extents[i].startBlock == 0) {
5128 break;
5129 }
5130
5131 err = hfs_pin_block_range(hfsmp, pin_state, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount);
5132 if (err) {
5133 break;
5134 } else {
5135 npinned += fp->ff_extents[i].blockCount;
5136 }
5137 }
5138
5139 if (err || npinned == 0) {
5140 goto out;
5141 }
5142
5143 if (fp->ff_extents[kHFSPlusExtentDensity-1].startBlock) {
5144 uint32_t pblocks;
5145 uint8_t forktype = 0;
5146
5147 if (fp == VTOC(vp)->c_rsrcfork) {
5148 forktype = 0xff;
5149 }
5150 /*
5151 * The file could have overflow extents, better pin them.
5152 *
5153 * We assume that since we are holding the cnode lock for this cnode,
5154 * the file's extents cannot be manipulated, but the tree could, so we
5155 * need to ensure that it doesn't change behind our back as we iterate it.
5156 */
5157 int lockflags = hfs_systemfile_lock (hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
5158 err = hfs_pin_overflow_extents(hfsmp, VTOC(vp)->c_fileid, forktype, &pblocks);
5159 hfs_systemfile_unlock (hfsmp, lockflags);
5160
5161 if (err) {
5162 goto out;
5163 }
5164 npinned += pblocks;
5165 }
5166
5167 out:
5168 if (num_blocks_pinned) {
5169 *num_blocks_pinned = npinned;
5170 }
5171
5172 if (need_put && rsrc_vp) {
5173 //
5174 // have to unlock the cnode since it's shared between the
5175 // resource fork vnode and the data fork vnode (and the
5176 // vnode_put() may need to re-acquire the cnode lock to
5177 // reclaim the resource fork vnode)
5178 //
5179 hfs_unlock(VTOC(vp));
5180 vnode_put(rsrc_vp);
5181 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5182 }
5183 return err;
5184 }
5185
5186
5187 /*
5188 * Relocate a file to a new location on disk
5189 * cnode must be locked on entry
5190 *
5191 * Relocation occurs by cloning the file's data from its
5192 * current set of blocks to a new set of blocks. During
5193 * the relocation all of the blocks (old and new) are
5194 * owned by the file.
5195 *
5196 * -----------------
5197 * |///////////////|
5198 * -----------------
5199 * 0 N (file offset)
5200 *
5201 * ----------------- -----------------
5202 * |///////////////| | | STEP 1 (acquire new blocks)
5203 * ----------------- -----------------
5204 * 0 N N+1 2N
5205 *
5206 * ----------------- -----------------
5207 * |///////////////| |///////////////| STEP 2 (clone data)
5208 * ----------------- -----------------
5209 * 0 N N+1 2N
5210 *
5211 * -----------------
5212 * |///////////////| STEP 3 (head truncate blocks)
5213 * -----------------
5214 * 0 N
5215 *
5216 * During steps 2 and 3 page-outs to file offsets less
5217 * than or equal to N are suspended.
5218 *
5219 * During step 3 page-ins to the file get suspended.
5220 */
5221 int
5222 hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
5223 struct proc *p)
5224 {
5225 struct cnode *cp;
5226 struct filefork *fp;
5227 struct hfsmount *hfsmp;
5228 u_int32_t headblks;
5229 u_int32_t datablks;
5230 u_int32_t blksize;
5231 u_int32_t growsize;
5232 u_int32_t nextallocsave;
5233 daddr64_t sector_a, sector_b;
5234 int eflags;
5235 off_t newbytes;
5236 int retval;
5237 int lockflags = 0;
5238 int took_trunc_lock = 0;
5239 int started_tr = 0;
5240 enum vtype vnodetype;
5241
5242 vnodetype = vnode_vtype(vp);
5243 if (vnodetype != VREG) {
5244 /* Not allowed to move symlinks. */
5245 return (EPERM);
5246 }
5247
5248 hfsmp = VTOHFS(vp);
5249 if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
5250 return (ENOSPC);
5251 }
5252
5253 cp = VTOC(vp);
5254 fp = VTOF(vp);
5255 if (fp->ff_unallocblocks)
5256 return (EINVAL);
5257
5258 #if CONFIG_PROTECT
5259 /*
5260 * <rdar://problem/9118426>
5261 * Disable HFS file relocation on content-protected filesystems
5262 */
5263 if (cp_fs_protected (hfsmp->hfs_mp)) {
5264 return EINVAL;
5265 }
5266 #endif
5267 /* If it's an SSD, also disable HFS relocation */
5268 if (hfsmp->hfs_flags & HFS_SSD) {
5269 return EINVAL;
5270 }
5271
5272
5273 blksize = hfsmp->blockSize;
5274 if (blockHint == 0)
5275 blockHint = hfsmp->nextAllocation;
5276
5277 if (fp->ff_size > 0x7fffffff) {
5278 return (EFBIG);
5279 }
5280
5281 if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
5282 hfs_unlock(cp);
5283 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
5284 /* Force lock since callers expect the lock to be held. */
5285 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
5286 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5287 return (retval);
5288 }
5289 /* No need to continue if file was removed. */
5290 if (cp->c_flag & C_NOEXISTS) {
5291 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5292 return (ENOENT);
5293 }
5294 took_trunc_lock = 1;
5295 }
5296 headblks = fp->ff_blocks;
5297 datablks = howmany(fp->ff_size, blksize);
5298 growsize = datablks * blksize;
5299 eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
5300 if (blockHint >= hfsmp->hfs_metazone_start &&
5301 blockHint <= hfsmp->hfs_metazone_end)
5302 eflags |= kEFMetadataMask;
5303
5304 if (hfs_start_transaction(hfsmp) != 0) {
5305 if (took_trunc_lock)
5306 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5307 return (EINVAL);
5308 }
5309 started_tr = 1;
5310 /*
5311 * Protect the extents b-tree and the allocation bitmap
5312 * during MapFileBlockC and ExtendFileC operations.
5313 */
5314 lockflags = SFL_BITMAP;
5315 if (overflow_extents(fp))
5316 lockflags |= SFL_EXTENTS;
5317 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5318
5319 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
5320 if (retval) {
5321 retval = MacToVFSError(retval);
5322 goto out;
5323 }
5324
5325 /*
5326 * STEP 1 - acquire new allocation blocks.
5327 */
5328 nextallocsave = hfsmp->nextAllocation;
5329 retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
5330 if (eflags & kEFMetadataMask) {
5331 hfs_lock_mount(hfsmp);
5332 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
5333 MarkVCBDirty(hfsmp);
5334 hfs_unlock_mount(hfsmp);
5335 }
5336
5337 retval = MacToVFSError(retval);
5338 if (retval == 0) {
5339 cp->c_flag |= C_MODIFIED;
5340 if (newbytes < growsize) {
5341 retval = ENOSPC;
5342 goto restore;
5343 } else if (fp->ff_blocks < (headblks + datablks)) {
5344 printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
5345 retval = ENOSPC;
5346 goto restore;
5347 }
5348
5349 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
5350 if (retval) {
5351 retval = MacToVFSError(retval);
5352 } else if ((sector_a + 1) == sector_b) {
5353 retval = ENOSPC;
5354 goto restore;
5355 } else if ((eflags & kEFMetadataMask) &&
5356 ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
5357 hfsmp->hfs_metazone_end)) {
5358 #if 0
5359 const char * filestr;
5360 char emptystr = '\0';
5361
5362 if (cp->c_desc.cd_nameptr != NULL) {
5363 filestr = (const char *)&cp->c_desc.cd_nameptr[0];
5364 } else if (vnode_name(vp) != NULL) {
5365 filestr = vnode_name(vp);
5366 } else {
5367 filestr = &emptystr;
5368 }
5369 #endif
5370 retval = ENOSPC;
5371 goto restore;
5372 }
5373 }
5374 /* Done with system locks and journal for now. */
5375 hfs_systemfile_unlock(hfsmp, lockflags);
5376 lockflags = 0;
5377 hfs_end_transaction(hfsmp);
5378 started_tr = 0;
5379
5380 if (retval) {
5381 /*
5382 * Check to see if failure is due to excessive fragmentation.
5383 */
5384 if ((retval == ENOSPC) &&
5385 (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
5386 hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
5387 }
5388 goto out;
5389 }
5390 /*
5391 * STEP 2 - clone file data into the new allocation blocks.
5392 */
5393
5394 if (vnodetype == VLNK)
5395 retval = EPERM;
5396 else if (vnode_issystem(vp))
5397 retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
5398 else
5399 retval = hfs_clonefile(vp, headblks, datablks, blksize);
5400
5401 /* Start transaction for step 3 or for a restore. */
5402 if (hfs_start_transaction(hfsmp) != 0) {
5403 retval = EINVAL;
5404 goto out;
5405 }
5406 started_tr = 1;
5407 if (retval)
5408 goto restore;
5409
5410 /*
5411 * STEP 3 - switch to cloned data and remove old blocks.
5412 */
5413 lockflags = SFL_BITMAP;
5414 if (overflow_extents(fp))
5415 lockflags |= SFL_EXTENTS;
5416 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5417
5418 retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
5419
5420 hfs_systemfile_unlock(hfsmp, lockflags);
5421 lockflags = 0;
5422 if (retval)
5423 goto restore;
5424 out:
5425 if (took_trunc_lock)
5426 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5427
5428 if (lockflags) {
5429 hfs_systemfile_unlock(hfsmp, lockflags);
5430 lockflags = 0;
5431 }
5432
5433 /* Push cnode's new extent data to disk. */
5434 if (retval == 0) {
5435 hfs_update(vp, 0);
5436 }
5437 if (hfsmp->jnl) {
5438 if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
5439 (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT);
5440 else
5441 (void) hfs_flushvolumeheader(hfsmp, 0);
5442 }
5443 exit:
5444 if (started_tr)
5445 hfs_end_transaction(hfsmp);
5446
5447 return (retval);
5448
5449 restore:
5450 if (fp->ff_blocks == headblks) {
5451 if (took_trunc_lock)
5452 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5453 goto exit;
5454 }
5455 /*
5456 * Give back any newly allocated space.
5457 */
5458 if (lockflags == 0) {
5459 lockflags = SFL_BITMAP;
5460 if (overflow_extents(fp))
5461 lockflags |= SFL_EXTENTS;
5462 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5463 }
5464
5465 (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
5466 FTOC(fp)->c_fileid, false);
5467
5468 hfs_systemfile_unlock(hfsmp, lockflags);
5469 lockflags = 0;
5470
5471 if (took_trunc_lock)
5472 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5473 goto exit;
5474 }
5475
5476
5477 /*
5478 * Clone a file's data within the file.
5479 *
5480 */
5481 static int
5482 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
5483 {
5484 caddr_t bufp;
5485 size_t bufsize;
5486 size_t copysize;
5487 size_t iosize;
5488 size_t offset;
5489 off_t writebase;
5490 uio_t auio;
5491 int error = 0;
5492
5493 writebase = blkstart * blksize;
5494 copysize = blkcnt * blksize;
5495 iosize = bufsize = MIN(copysize, 128 * 1024);
5496 offset = 0;
5497
5498 hfs_unlock(VTOC(vp));
5499
5500 #if CONFIG_PROTECT
5501 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
5502 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5503 return (error);
5504 }
5505 #endif /* CONFIG_PROTECT */
5506
5507 bufp = hfs_malloc(bufsize);
5508
5509 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
5510
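/*
 * Copy the fork's data in bufsize chunks: read at logical offset
 * `offset` (the original blocks) and rewrite the same bytes at
 * writebase + offset, which falls in the newly allocated blocks that
 * hfs_relocate appended to the fork.
 */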
5511 while (offset < copysize) {
5512 iosize = MIN(copysize - offset, iosize);
5513
5514 uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
5515 uio_addiov(auio, (uintptr_t)bufp, iosize);
5516
5517 error = cluster_read(vp, auio, copysize, IO_NOCACHE);
5518 if (error) {
5519 printf("hfs_clonefile: cluster_read failed - %d\n", error);
5520 break;
5521 }
5522 if (uio_resid(auio) != 0) {
5523 printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
5524 error = EIO;
5525 break;
5526 }
5527
5528 uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
5529 uio_addiov(auio, (uintptr_t)bufp, iosize);
5530
5531 error = cluster_write(vp, auio, writebase + offset,
5532 writebase + offset + iosize,
5533 uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
5534 if (error) {
5535 printf("hfs_clonefile: cluster_write failed - %d\n", error);
5536 break;
5537 }
5538 if (uio_resid(auio) != 0) {
5539 printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
5540 error = EIO;
5541 break;
5542 }
5543 offset += iosize;
5544 }
5545 uio_free(auio);
5546
5547 if ((blksize & PAGE_MASK)) {
5548 /*
5549 * since the copy may not have started on a PAGE
5550 * boundary (or may not have ended on one), we
5551 * may have pages left in the cache since NOCACHE
5552 * will let partially written pages linger...
5553 * let's just flush the entire range to make sure
5554 * we don't have any pages left that are beyond
5555 * (or intersect) the real LEOF of this file
5556 */
5557 ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
5558 } else {
5559 /*
5560 * No need to call ubc_msync or hfs_invalbuf
5561 * since the file was copied using IO_NOCACHE and
5562 * the copy was done starting and ending on a page
5563 * boundary in the file.
5564 */
5565 }
5566 hfs_free(bufp, bufsize);
5567
5568 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5569 return (error);
5570 }
5571
5572 /*
5573 * Clone a system (metadata) file.
5574 *
5575 */
5576 static int
5577 hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
5578 kauth_cred_t cred, struct proc *p)
5579 {
5580 caddr_t bufp;
5581 char * offset;
5582 size_t bufsize;
5583 size_t iosize;
5584 struct buf *bp = NULL;
5585 daddr64_t blkno;
5586 daddr64_t blk;
5587 daddr64_t start_blk;
5588 daddr64_t last_blk;
5589 int breadcnt;
5590 int i;
5591 int error = 0;
5592
5593
5594 iosize = GetLogicalBlockSize(vp);
5595 bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
5596 breadcnt = bufsize / iosize;
5597
5598 bufp = hfs_malloc(bufsize);
5599
5600 start_blk = ((daddr64_t)blkstart * blksize) / iosize;
5601 last_blk = ((daddr64_t)blkcnt * blksize) / iosize;
5602 blkno = 0;
5603
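/*
 * Copy last_blk logical-size blocks of metadata: each outer pass reads
 * up to bufsize worth of blocks from the front of the fork and rewrites
 * them at start_blk + blkno, i.e. into the newly allocated region.
 */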
5604 while (blkno < last_blk) {
5605 /*
5606 * Read up to a megabyte
5607 */
5608 offset = bufp;
5609 for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
5610 error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
5611 if (error) {
5612 printf("hfs_clonesysfile: meta_bread error %d\n", error);
5613 goto out;
5614 }
5615 if (buf_count(bp) != iosize) {
5616 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
5617 goto out;
5618 }
5619 bcopy((char *)buf_dataptr(bp), offset, iosize);
5620
5621 buf_markinvalid(bp);
5622 buf_brelse(bp);
5623 bp = NULL;
5624
5625 offset += iosize;
5626 }
5627
5628 /*
5629 * Write up to a megabyte
5630 */
5631 offset = bufp;
5632 for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
5633 bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
5634 if (bp == NULL) {
5635 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
5636 error = EIO;
5637 goto out;
5638 }
5639 bcopy(offset, (char *)buf_dataptr(bp), iosize);
5640 error = (int)buf_bwrite(bp);
5641 bp = NULL;
5642 if (error)
5643 goto out;
5644 offset += iosize;
5645 }
5646 }
5647 out:
5648 if (bp) {
5649 buf_brelse(bp);
5650 }
5651
5652 hfs_free(bufp, bufsize);
5653
5654 error = hfs_fsync(vp, MNT_WAIT, 0, p);
5655
5656 return (error);
5657 }
5658
5659 errno_t hfs_flush_invalid_ranges(vnode_t vp)
5660 {
5661 cnode_t *cp = VTOC(vp);
5662
5663 hfs_assert(cp->c_lockowner == current_thread());
5664 hfs_assert(cp->c_truncatelockowner == current_thread());
5665
5666 if (!ISSET(cp->c_flag, C_ZFWANTSYNC) && !cp->c_zftimeout)
5667 return 0;
5668
5669 filefork_t *fp = VTOF(vp);
5670
5671 /*
5672 * We can't hold the cnode lock whilst we call cluster_write so we
5673 * need to copy the extents into a local buffer.
5674 */
5675 int max_exts = 16;
5676 struct ext {
5677 off_t start, end;
5678 } exts_buf[max_exts]; // 256 bytes
5679 struct ext *exts = exts_buf;
5680 int ext_count = 0;
5681 errno_t ret;
5682
5683 struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
5684
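/*
 * Walk the invalid-range list in batches: copy up to max_exts ranges
 * into exts[], drop the cnode lock, zero-fill each batched range with
 * cluster_write(IO_HEADZEROFILL), then retake the lock and resume the
 * walk just past the last range flushed.
 */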
5685 while (r) {
5686 /* If we have more than can fit in our stack buffer, switch
5687 to a heap buffer. */
5688 if (exts == exts_buf && ext_count == max_exts) {
5689 max_exts = 256;
5690 exts = hfs_malloc(sizeof(struct ext) * max_exts);
5691 memcpy(exts, exts_buf, ext_count * sizeof(struct ext));
5692 }
5693
5694 struct rl_entry *next = TAILQ_NEXT(r, rl_link);
5695
5696 exts[ext_count++] = (struct ext){ r->rl_start, r->rl_end };
5697
5698 if (!next || (ext_count == max_exts && exts != exts_buf)) {
5699 hfs_unlock(cp);
5700 for (int i = 0; i < ext_count; ++i) {
5701 ret = cluster_write(vp, NULL, fp->ff_size, exts[i].end + 1,
5702 exts[i].start, 0,
5703 IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE);
5704 if (ret) {
5705 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5706 goto exit;
5707 }
5708 }
5709
5710 if (!next) {
5711 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5712 break;
5713 }
5714
5715 /* Push any existing clusters which should clean up our invalid
5716 ranges as they go through hfs_vnop_blockmap. */
5717 cluster_push(vp, 0);
5718
5719 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5720
5721 /*
5722 * Get back to where we were (given we dropped the lock).
5723 * This shouldn't be many because we pushed above.
5724 */
5725 TAILQ_FOREACH(r, &fp->ff_invalidranges, rl_link) {
5726 if (r->rl_end > exts[ext_count - 1].end)
5727 break;
5728 }
5729
5730 ext_count = 0;
5731 } else
5732 r = next;
5733 }
5734
5735 ret = 0;
5736
5737 exit:
5738
5739 if (exts != exts_buf)
5740 hfs_free(exts, sizeof(struct ext) * max_exts);
5741
5742 return ret;
5743 }