1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* @(#)hfs_readwrite.c 1.0
29 *
30 * (c) 1998-2001 Apple Inc. All Rights Reserved
31 *
32 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
33 *
34 */
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/fcntl.h>
40 #include <sys/stat.h>
41 #include <sys/buf.h>
42 #include <sys/proc.h>
43 #include <sys/kauth.h>
44 #include <sys/vnode.h>
45 #include <sys/uio.h>
46 #include <sys/vfs_context.h>
47 #include <sys/disk.h>
48 #include <sys/sysctl.h>
49 #include <sys/fsctl.h>
50 #include <sys/ubc.h>
51 #include <sys/fsevents.h>
52
53 #include <libkern/OSDebug.h>
54
55 #include <miscfs/specfs/specdev.h>
56
57 #include <sys/ubc.h>
58
59 #include <vm/vm_pageout.h>
60 #include <vm/vm_kern.h>
61
62 #include <IOKit/IOBSD.h>
63
64 #include <sys/kdebug.h>
65
66 #include "hfs.h"
67 #include "hfs_attrlist.h"
68 #include "hfs_endian.h"
69 #include "hfs_fsctl.h"
70 #include "hfs_quota.h"
71 #include "FileMgrInternal.h"
72 #include "BTreesInternal.h"
73 #include "hfs_cnode.h"
74 #include "hfs_dbg.h"
75
76 #if HFS_CONFIG_KEY_ROLL
77 #include "hfs_key_roll.h"
78 #endif
79
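/* can_cluster: true when size is a multiple of 4 KiB and no more than MAXPHYSIO/2. */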
80 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
81
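/*
 * Largest file size representable on an HFS Standard volume (2^31 - 1 bytes);
 * used by the EFBIG check in hfs_vnop_read() below.
 */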
82 enum {
83 MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */
84 };
85
86 /* from bsd/hfs/hfs_vfsops.c */
87 extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
88
89 /* from hfs_hotfiles.c */
90 extern int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid,
91 uint8_t forktype, uint32_t *pinned);
92
93 static int hfs_clonefile(struct vnode *, int, int, int);
94 static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
95 static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
96
97
98 /*
99 * Read data from a file.
100 */
101 int
102 hfs_vnop_read(struct vnop_read_args *ap)
103 {
104 /*
105 struct vnop_read_args {
106 struct vnodeop_desc *a_desc;
107 vnode_t a_vp;
108 struct uio *a_uio;
109 int a_ioflag;
110 vfs_context_t a_context;
111 };
112 */
113
114 uio_t uio = ap->a_uio;
115 struct vnode *vp = ap->a_vp;
116 struct cnode *cp;
117 struct filefork *fp;
118 struct hfsmount *hfsmp;
119 off_t filesize;
120 off_t filebytes;
121 off_t start_resid = uio_resid(uio);
122 off_t offset = uio_offset(uio);
123 int retval = 0;
124 int took_truncate_lock = 0;
125 int io_throttle = 0;
126 int throttled_count = 0;
127
128 /* Preflight checks */
129 if (!vnode_isreg(vp)) {
130 /* can only read regular files */
131 if (vnode_isdir(vp))
132 return (EISDIR);
133 else
134 return (EPERM);
135 }
136 if (start_resid == 0)
137 return (0); /* Nothing left to do */
138 if (offset < 0)
139 		return (EINVAL); /* can't read from a negative offset */
140
141 #if SECURE_KERNEL
142 if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
143 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
144 		/* Don't allow unencrypted I/O requests from user space */
145 return EPERM;
146 }
147 #endif
148
149 #if HFS_COMPRESSION
150 if (VNODE_IS_RSRC(vp)) {
151 if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
152 return 0;
153 }
154 /* otherwise read the resource fork normally */
155 } else {
156 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
157 if (compressed) {
158 retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
159 if (retval == 0 && !(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
160 (void) hfs_addhotfile(vp);
161 }
162 if (compressed) {
163 if (retval == 0) {
164 /* successful read, update the access time */
165 VTOC(vp)->c_touch_acctime = TRUE;
166
167 //
168 // compressed files are not traditional hot file candidates
169 // but they may be for CF (which ignores the ff_bytesread
170 // field)
171 //
172 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
173 VTOF(vp)->ff_bytesread = 0;
174 }
175 }
176 return retval;
177 }
178 /* otherwise the file was converted back to a regular file while we were reading it */
179 retval = 0;
180 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
181 int error;
182
183 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
184 if (error) {
185 return error;
186 }
187
188 }
189 }
190 #endif /* HFS_COMPRESSION */
191
192 cp = VTOC(vp);
193 fp = VTOF(vp);
194 hfsmp = VTOHFS(vp);
195
196 #if CONFIG_PROTECT
197 if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
198 goto exit;
199 }
200
201 #if HFS_CONFIG_KEY_ROLL
202 if (ISSET(ap->a_ioflag, IO_ENCRYPTED)) {
203 off_rsrc_t off_rsrc = off_rsrc_make(offset + start_resid,
204 VNODE_IS_RSRC(vp));
205
206 retval = hfs_key_roll_up_to(ap->a_context, vp, off_rsrc);
207 if (retval)
208 goto exit;
209 }
210 #endif // HFS_CONFIG_KEY_ROLL
211 #endif // CONFIG_PROTECT
212
213 /*
214 * If this read request originated from a syscall (as opposed to
215 * an in-kernel page fault or something), then set it up for
216 * throttle checks
217 */
218 if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
219 io_throttle = IO_RETURN_ON_THROTTLE;
220 }
221
222 read_again:
223
224 /* Protect against a size change. */
225 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
226 took_truncate_lock = 1;
227
228 filesize = fp->ff_size;
229 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
230
231 /*
232 * Check the file size. Note that per POSIX spec, we return 0 at
233 * file EOF, so attempting a read at an offset that is too big
234 * should just return 0 on HFS+. Since the return value was initialized
235 * to 0 above, we just jump to exit. HFS Standard has its own behavior.
236 */
237 if (offset > filesize) {
238 #if CONFIG_HFS_STD
239 if ((hfsmp->hfs_flags & HFS_STANDARD) &&
240 (offset > (off_t)MAXHFSFILESIZE)) {
241 retval = EFBIG;
242 }
243 #endif
244 goto exit;
245 }
246
247 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START,
248 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
249
250 retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle);
251
252 cp->c_touch_acctime = TRUE;
253
254 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END,
255 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
256
257 /*
258 	 * Keep track of blocks read
259 */
260 if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
261 int took_cnode_lock = 0;
262 off_t bytesread;
263
264 bytesread = start_resid - uio_resid(uio);
265
266 		/* When ff_bytesread would exceed 32 bits, update it behind the cnode lock. */
267 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
268 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
269 took_cnode_lock = 1;
270 }
271 /*
272 * If this file hasn't been seen since the start of
273 * the current sampling period then start over.
274 */
275 if (cp->c_atime < hfsmp->hfc_timebase) {
276 struct timeval tv;
277
278 fp->ff_bytesread = bytesread;
279 microtime(&tv);
280 cp->c_atime = tv.tv_sec;
281 } else {
282 fp->ff_bytesread += bytesread;
283 }
284
285 if (!(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
286 //
287 // We don't add hotfiles for processes doing IO_EVTONLY I/O
288 // on the assumption that they're system processes such as
289 // mdworker which scan everything in the system (and thus
290 // do not represent user-initiated access to files)
291 //
292 (void) hfs_addhotfile(vp);
293 }
294 if (took_cnode_lock)
295 hfs_unlock(cp);
296 }
297 exit:
298 if (took_truncate_lock) {
299 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
300 }
301 if (retval == EAGAIN) {
302 throttle_lowpri_io(1);
303 throttled_count++;
304
305 retval = 0;
306 goto read_again;
307 }
308 if (throttled_count)
309 throttle_info_reset_window(NULL);
310 return (retval);
311 }
312
313 /*
314 * Ideally, this wouldn't be necessary; the cluster code should be
315 * able to handle this on the read-side. See <rdar://20420068>.
316 */
317 static errno_t hfs_zero_eof_page(vnode_t vp, off_t zero_up_to)
318 {
319 hfs_assert(VTOC(vp)->c_lockowner != current_thread());
320 hfs_assert(VTOC(vp)->c_truncatelockowner == current_thread());
321
322 struct filefork *fp = VTOF(vp);
323
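	/*
	 * If the current EOF is already page aligned there is no partial
	 * page to zero, and if the zero range does not extend past EOF
	 * there is nothing new to expose; either way, return early.
	 */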
324 if (!(fp->ff_size & PAGE_MASK_64) || zero_up_to <= fp->ff_size) {
325 // Nothing to do
326 return 0;
327 }
328
329 zero_up_to = MIN(zero_up_to, (off_t)round_page_64(fp->ff_size));
330
331 /* N.B. At present, @zero_up_to is not important because the cluster
332 code will always zero up to the end of the page anyway. */
333 return cluster_write(vp, NULL, fp->ff_size, zero_up_to,
334 fp->ff_size, 0, IO_HEADZEROFILL);
335 }
336
337 /*
338 * Write data to a file.
339 */
340 int
341 hfs_vnop_write(struct vnop_write_args *ap)
342 {
343 uio_t uio = ap->a_uio;
344 struct vnode *vp = ap->a_vp;
345 struct cnode *cp;
346 struct filefork *fp;
347 struct hfsmount *hfsmp;
348 kauth_cred_t cred = NULL;
349 off_t origFileSize;
350 off_t writelimit;
351 off_t bytesToAdd = 0;
352 off_t actualBytesAdded;
353 off_t filebytes;
354 off_t offset;
355 ssize_t resid;
356 int eflags;
357 int ioflag = ap->a_ioflag;
358 int retval = 0;
359 int lockflags;
360 int cnode_locked = 0;
361 int partialwrite = 0;
362 int do_snapshot = 1;
363 time_t orig_ctime=VTOC(vp)->c_ctime;
364 int took_truncate_lock = 0;
365 int io_return_on_throttle = 0;
366 int throttled_count = 0;
367
368 #if HFS_COMPRESSION
369 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
370 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
371 switch(state) {
372 case FILE_IS_COMPRESSED:
373 return EACCES;
374 case FILE_IS_CONVERTING:
375 /* if FILE_IS_CONVERTING, we allow writes but do not
376 bother with snapshots or else we will deadlock.
377 */
378 do_snapshot = 0;
379 break;
380 default:
381 printf("invalid state %d for compressed file\n", state);
382 /* fall through */
383 }
384 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
385 int error;
386
387 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
388 if (error != 0) {
389 return error;
390 }
391 }
392
393 if (do_snapshot) {
394 nspace_snapshot_event(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
395 }
396
397 #endif
398
399 #if SECURE_KERNEL
400 if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
401 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
402 		/* Don't allow unencrypted I/O requests from user space */
403 return EPERM;
404 }
405 #endif
406
407 resid = uio_resid(uio);
408 offset = uio_offset(uio);
409
410 if (offset < 0)
411 return (EINVAL);
412 if (resid == 0)
413 return (E_NONE);
414 if (!vnode_isreg(vp))
415 return (EPERM); /* Can only write regular files */
416
417 cp = VTOC(vp);
418 fp = VTOF(vp);
419 hfsmp = VTOHFS(vp);
420
421 #if CONFIG_PROTECT
422 if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
423 goto exit;
424 }
425 #endif
426
427 eflags = kEFDeferMask; /* defer file block allocations */
428 #if HFS_SPARSE_DEV
429 /*
430 * When the underlying device is sparse and space
431 	 * is low (fewer than 2048 free allocation blocks), stop doing
432 	 * delayed allocations and begin doing synchronous I/O.
433 */
434 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
435 (hfs_freeblks(hfsmp, 0) < 2048)) {
436 eflags &= ~kEFDeferMask;
437 ioflag |= IO_SYNC;
438 }
439 #endif /* HFS_SPARSE_DEV */
440
441 if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) ==
442 (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
443 io_return_on_throttle = IO_RETURN_ON_THROTTLE;
444 }
445
446 again:
447 /*
448 * Protect against a size change.
449 *
450 * Note: If took_truncate_lock is true, then we previously got the lock shared
451 * but needed to upgrade to exclusive. So try getting it exclusive from the
452 * start.
453 */
454 if (ioflag & IO_APPEND || took_truncate_lock) {
455 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
456 }
457 else {
458 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
459 }
460 took_truncate_lock = 1;
461
462 /* Update UIO */
463 if (ioflag & IO_APPEND) {
464 uio_setoffset(uio, fp->ff_size);
465 offset = fp->ff_size;
466 }
467 if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
468 retval = EPERM;
469 goto exit;
470 }
471
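	/*
	 * Non-superuser writers set kEFReserveMask so that block allocation
	 * honors the volume's reserved free space (the flag is passed through
	 * to the hfs_freeblks() call below).
	 */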
472 cred = vfs_context_ucred(ap->a_context);
473 if (cred && suser(cred, NULL) != 0)
474 eflags |= kEFReserveMask;
475
476 origFileSize = fp->ff_size;
477 writelimit = offset + resid;
478
479 /*
480 * We may need an exclusive truncate lock for several reasons, all
481 * of which are because we may be writing to a (portion of a) block
482 * for the first time, and we need to make sure no readers see the
483 * prior, uninitialized contents of the block. The cases are:
484 *
485 * 1. We have unallocated (delayed allocation) blocks. We may be
486 * allocating new blocks to the file and writing to them.
487 * (A more precise check would be whether the range we're writing
488 * to contains delayed allocation blocks.)
489 * 2. We need to extend the file. The bytes between the old EOF
490 * and the new EOF are not yet initialized. This is important
491 * even if we're not allocating new blocks to the file. If the
492 * old EOF and new EOF are in the same block, we still need to
493 * protect that range of bytes until they are written for the
494 * first time.
495 *
496 * If we had a shared lock with the above cases, we need to try to upgrade
497 * to an exclusive lock. If the upgrade fails, we will lose the shared
498 * lock, and will need to take the truncate lock again; the took_truncate_lock
499 * flag will still be set, causing us to try for an exclusive lock next time.
500 */
501 if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
502 ((fp->ff_unallocblocks != 0) ||
503 (writelimit > origFileSize))) {
504 if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
505 /*
506 * Lock upgrade failed and we lost our shared lock, try again.
507 * Note: we do not set took_truncate_lock=0 here. Leaving it
508 * set to 1 will cause us to try to get the lock exclusive.
509 */
510 goto again;
511 }
512 else {
513 /* Store the owner in the c_truncatelockowner field if we successfully upgrade */
514 cp->c_truncatelockowner = current_thread();
515 }
516 }
517
518 if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
519 goto exit;
520 }
521 cnode_locked = 1;
522
523 filebytes = hfs_blk_to_bytes(fp->ff_blocks, hfsmp->blockSize);
524
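	/*
	 * Fail early with ENOSPC if the write begins beyond the currently
	 * allocated blocks and the volume's free space cannot even cover
	 * the gap between the allocated end and the write offset.
	 */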
525 if (offset > filebytes
526 && (hfs_blk_to_bytes(hfs_freeblks(hfsmp, ISSET(eflags, kEFReserveMask)),
527 hfsmp->blockSize) < offset - filebytes)) {
528 retval = ENOSPC;
529 goto exit;
530 }
531
532 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START,
533 (int)offset, uio_resid(uio), (int)fp->ff_size,
534 (int)filebytes, 0);
535
536 	/* If the write fits within the currently allocated blocks, we do not need to extend the file. */
537 if (writelimit <= filebytes) {
538 goto sizeok;
539 }
540
541 bytesToAdd = writelimit - filebytes;
542
543 #if QUOTA
544 retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
545 cred, 0);
546 if (retval)
547 goto exit;
548 #endif /* QUOTA */
549
550 if (hfs_start_transaction(hfsmp) != 0) {
551 retval = EINVAL;
552 goto exit;
553 }
554
555 while (writelimit > filebytes) {
556 bytesToAdd = writelimit - filebytes;
557
558 /* Protect extents b-tree and allocation bitmap */
559 lockflags = SFL_BITMAP;
560 if (overflow_extents(fp))
561 lockflags |= SFL_EXTENTS;
562 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
563
564 /* Files that are changing size are not hot file candidates. */
565 if (hfsmp->hfc_stage == HFC_RECORDING) {
566 fp->ff_bytesread = 0;
567 }
568 retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
569 0, eflags, &actualBytesAdded));
570
571 hfs_systemfile_unlock(hfsmp, lockflags);
572
573 if ((actualBytesAdded == 0) && (retval == E_NONE))
574 retval = ENOSPC;
575 if (retval != E_NONE)
576 break;
577 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
578 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE,
579 (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
580 }
581 (void) hfs_update(vp, 0);
582 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
583 (void) hfs_end_transaction(hfsmp);
584
585 /*
586 	 * If we didn't grow the file enough, try a partial write.
587 * POSIX expects this behavior.
588 */
589 if ((retval == ENOSPC) && (filebytes > offset)) {
590 retval = 0;
591 partialwrite = 1;
592 uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
593 resid -= bytesToAdd;
594 writelimit = filebytes;
595 }
596 sizeok:
597 if (retval == E_NONE) {
598 off_t filesize;
599 off_t head_off;
600 int lflag;
601
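		/*
		 * If this write extends the fork, record the bytes between the
		 * old EOF and the end of the write as an invalid range (not yet
		 * initialized on disk) and arm the zero-fill timeout.
		 */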
602 if (writelimit > fp->ff_size) {
603 filesize = writelimit;
604 struct timeval tv;
605 rl_add(fp->ff_size, writelimit - 1 , &fp->ff_invalidranges);
606 microuptime(&tv);
607 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
608 } else
609 filesize = fp->ff_size;
610
611 lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
612
613 /*
614 * We no longer use IO_HEADZEROFILL or IO_TAILZEROFILL (except
615 * for one case below). For the regions that lie before the
616 * beginning and after the end of this write that are in the
617 * same page, we let the cluster code handle zeroing that out
618 * if necessary. If those areas are not cached, the cluster
619 		 * code will try to read those areas in; for regions that have
620 		 * never been written to, hfs_vnop_blockmap will consult the
621 		 * invalid ranges and report them as such. The cluster code will
622 		 * then zero out those areas.
623 		 */
624
625 head_off = trunc_page_64(offset);
626
627 if (head_off < offset && head_off >= fp->ff_size) {
628 /*
629 * The first page is beyond current EOF, so as an
630 * optimisation, we can pass IO_HEADZEROFILL.
631 */
632 lflag |= IO_HEADZEROFILL;
633 }
634
635 hfs_unlock(cp);
636 cnode_locked = 0;
637
638 /*
639 * We need to tell UBC the fork's new size BEFORE calling
640 * cluster_write, in case any of the new pages need to be
641 * paged out before cluster_write completes (which does happen
642 * in embedded systems due to extreme memory pressure).
643 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
644 * will be, so that it can pass that on to cluster_pageout, and
645 * allow those pageouts.
646 *
647 * We don't update ff_size yet since we don't want pageins to
648 * be able to see uninitialized data between the old and new
649 * EOF, until cluster_write has completed and initialized that
650 * part of the file.
651 *
652 * The vnode pager relies on the file size last given to UBC via
653 * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or
654 * ff_size (whichever is larger). NOTE: ff_new_size is always
655 * zero, unless we are extending the file via write.
656 */
657 if (filesize > fp->ff_size) {
658 retval = hfs_zero_eof_page(vp, offset);
659 if (retval)
660 goto exit;
661 fp->ff_new_size = filesize;
662 ubc_setsize(vp, filesize);
663 }
664 retval = cluster_write(vp, uio, fp->ff_size, filesize, head_off,
665 0, lflag | IO_NOZERODIRTY | io_return_on_throttle);
666 if (retval) {
667 fp->ff_new_size = 0; /* no longer extending; use ff_size */
668
669 if (retval == EAGAIN) {
670 /*
671 * EAGAIN indicates that we still have I/O to do, but
672 * that we now need to be throttled
673 */
674 if (resid != uio_resid(uio)) {
675 /*
676 				 * we managed to do some I/O before returning EAGAIN
677 */
678 resid = uio_resid(uio);
679 offset = uio_offset(uio);
680
681 cp->c_touch_chgtime = TRUE;
682 cp->c_touch_modtime = TRUE;
683 hfs_incr_gencount(cp);
684 }
685 if (filesize > fp->ff_size) {
686 /*
687 * we called ubc_setsize before the call to
688 * cluster_write... since we only partially
689 * completed the I/O, we need to
690 * re-adjust our idea of the filesize based
691 * on our interim EOF
692 */
693 ubc_setsize(vp, offset);
694
695 fp->ff_size = offset;
696 }
697 goto exit;
698 }
699 if (filesize > origFileSize) {
700 ubc_setsize(vp, origFileSize);
701 }
702 goto ioerr_exit;
703 }
704
705 if (filesize > origFileSize) {
706 fp->ff_size = filesize;
707
708 /* Files that are changing size are not hot file candidates. */
709 if (hfsmp->hfc_stage == HFC_RECORDING) {
710 fp->ff_bytesread = 0;
711 }
712 }
713 fp->ff_new_size = 0; /* ff_size now has the correct size */
714 }
715 if (partialwrite) {
716 uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
717 resid += bytesToAdd;
718 }
719
720 if (vnode_should_flush_after_write(vp, ioflag))
721 hfs_flush(hfsmp, HFS_FLUSH_CACHE);
722
723 ioerr_exit:
724 if (!cnode_locked) {
725 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
726 cnode_locked = 1;
727 }
728
729 if (resid > uio_resid(uio)) {
730 cp->c_touch_chgtime = TRUE;
731 cp->c_touch_modtime = TRUE;
732 hfs_incr_gencount(cp);
733
734 /*
735 		 * If we successfully wrote any data, and we are not the superuser,
736 * we clear the setuid and setgid bits as a precaution against
737 * tampering.
738 */
739 if (cp->c_mode & (S_ISUID | S_ISGID)) {
740 cred = vfs_context_ucred(ap->a_context);
741 if (cred && suser(cred, NULL)) {
742 cp->c_mode &= ~(S_ISUID | S_ISGID);
743 }
744 }
745 }
746 if (retval) {
747 if (ioflag & IO_UNIT) {
748 (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
749 0, ap->a_context);
750 uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
751 uio_setresid(uio, resid);
752 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
753 }
754 } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio)))
755 retval = hfs_update(vp, 0);
756
757 /* Updating vcbWrCnt doesn't need to be atomic. */
758 hfsmp->vcbWrCnt++;
759
760 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END,
761 (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
762 exit:
763 if (retval && took_truncate_lock
764 && cp->c_truncatelockowner == current_thread()) {
765 fp->ff_new_size = 0;
766 rl_remove(fp->ff_size, RL_INFINITY, &fp->ff_invalidranges);
767 }
768
769 if (cnode_locked)
770 hfs_unlock(cp);
771
772 if (took_truncate_lock) {
773 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
774 }
775 if (retval == EAGAIN) {
776 throttle_lowpri_io(1);
777 throttled_count++;
778
779 retval = 0;
780 goto again;
781 }
782 if (throttled_count)
783 throttle_info_reset_window(NULL);
784 return (retval);
785 }
786
787 /* support for the "bulk-access" fcntl */
788
789 #define CACHE_LEVELS 16
790 #define NUM_CACHE_ENTRIES (64*16)
791 #define PARENT_IDS_FLAG 0x100
792
793 struct access_cache {
794 int numcached;
795 int cachehits; /* these two for statistics gathering */
796 int lookups;
797 unsigned int *acache;
798 unsigned char *haveaccess;
799 };
800
801 struct access_t {
802 uid_t uid; /* IN: effective user id */
803 short flags; /* IN: access requested (i.e. R_OK) */
804 short num_groups; /* IN: number of groups user belongs to */
805 int num_files; /* IN: number of files to process */
806 int *file_ids; /* IN: array of file ids */
807 gid_t *groups; /* IN: array of groups */
808 short *access; /* OUT: access info for each file (0 for 'has access') */
809 } __attribute__((unavailable)); // this structure is for reference purposes only
810
811 struct user32_access_t {
812 uid_t uid; /* IN: effective user id */
813 short flags; /* IN: access requested (i.e. R_OK) */
814 short num_groups; /* IN: number of groups user belongs to */
815 int num_files; /* IN: number of files to process */
816 user32_addr_t file_ids; /* IN: array of file ids */
817 user32_addr_t groups; /* IN: array of groups */
818 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
819 };
820
821 struct user64_access_t {
822 uid_t uid; /* IN: effective user id */
823 short flags; /* IN: access requested (i.e. R_OK) */
824 short num_groups; /* IN: number of groups user belongs to */
825 int num_files; /* IN: number of files to process */
826 user64_addr_t file_ids; /* IN: array of file ids */
827 user64_addr_t groups; /* IN: array of groups */
828 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
829 };
830
831
832 // these are the "extended" versions of the above structures
833 // note that it is crucial that they be a different size than
834 // the regular versions
835 struct ext_access_t {
836 uint32_t flags; /* IN: access requested (i.e. R_OK) */
837 uint32_t num_files; /* IN: number of files to process */
838 uint32_t map_size; /* IN: size of the bit map */
839 uint32_t *file_ids; /* IN: Array of file ids */
840 char *bitmap; /* OUT: hash-bitmap of interesting directory ids */
841 short *access; /* OUT: access info for each file (0 for 'has access') */
842 uint32_t num_parents; /* future use */
843 cnid_t *parents; /* future use */
844 } __attribute__((unavailable)); // this structure is for reference purposes only
845
846 struct user32_ext_access_t {
847 uint32_t flags; /* IN: access requested (i.e. R_OK) */
848 uint32_t num_files; /* IN: number of files to process */
849 uint32_t map_size; /* IN: size of the bit map */
850 user32_addr_t file_ids; /* IN: Array of file ids */
851 user32_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */
852 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
853 uint32_t num_parents; /* future use */
854 user32_addr_t parents; /* future use */
855 };
856
857 struct user64_ext_access_t {
858 uint32_t flags; /* IN: access requested (i.e. R_OK) */
859 uint32_t num_files; /* IN: number of files to process */
860 uint32_t map_size; /* IN: size of the bit map */
861 user64_addr_t file_ids; /* IN: array of file ids */
862 	user64_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */
863 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
864 uint32_t num_parents;/* future use */
865 user64_addr_t parents;/* future use */
866 };
867
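// The size difference matters because do_bulk_access_check() below uses the
// fsctl argument size to tell the old-style and extended structures apart
// when converting them to the common user64_ext_access_t form.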
868
869 /*
870 * Perform a binary search for the given parent_id. Return value is
871  * the index if there is a match. If no_match_indexp is non-NULL,
872  * it is assigned the index at which to insert the item (whether or
873  * not a match was found).
874 */
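/*
 * Example (hypothetical values): with array = {5, 9, 17} and hi = 2,
 * searching for 9 returns index 1; searching for 12 returns -1 and sets
 * *no_match_indexp to 2, the position at which 12 would be inserted.
 */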
875 static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
876 {
877 int index=-1;
878 unsigned int lo=0;
879
880 do {
881 unsigned int mid = ((hi - lo)/2) + lo;
882 unsigned int this_id = array[mid];
883
884 if (parent_id == this_id) {
885 hi = mid;
886 break;
887 }
888
889 if (parent_id < this_id) {
890 hi = mid;
891 continue;
892 }
893
894 if (parent_id > this_id) {
895 lo = mid + 1;
896 continue;
897 }
898 } while(lo < hi);
899
900 /* check if lo and hi converged on the match */
901 if (parent_id == array[hi]) {
902 index = hi;
903 }
904
905 if (no_match_indexp) {
906 *no_match_indexp = hi;
907 }
908
909 return index;
910 }
911
912
913 static int
914 lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
915 {
916 unsigned int hi;
917 int matches = 0;
918 int index, no_match_index;
919
920 if (cache->numcached == 0) {
921 *indexp = 0;
922 return 0; // table is empty, so insert at index=0 and report no match
923 }
924
925 if (cache->numcached > NUM_CACHE_ENTRIES) {
926 cache->numcached = NUM_CACHE_ENTRIES;
927 }
928
929 hi = cache->numcached - 1;
930
931 index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
932
933 /* if no existing entry found, find index for new one */
934 if (index == -1) {
935 index = no_match_index;
936 matches = 0;
937 } else {
938 matches = 1;
939 }
940
941 *indexp = index;
942 return matches;
943 }
944
945 /*
946 * Add a node to the access_cache at the given index (or do a lookup first
947 * to find the index if -1 is passed in). We currently do a replace rather
948 * than an insert if the cache is full.
949 */
950 static void
951 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
952 {
953 int lookup_index = -1;
954
955 /* need to do a lookup first if -1 passed for index */
956 if (index == -1) {
957 if (lookup_bucket(cache, &lookup_index, nodeID)) {
958 if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
959 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
960 cache->haveaccess[lookup_index] = access;
961 }
962
963 /* mission accomplished */
964 return;
965 } else {
966 index = lookup_index;
967 }
968
969 }
970
971 /* if the cache is full, do a replace rather than an insert */
972 if (cache->numcached >= NUM_CACHE_ENTRIES) {
973 cache->numcached = NUM_CACHE_ENTRIES-1;
974
975 if (index > cache->numcached) {
976 index = cache->numcached;
977 }
978 }
979
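	/*
	 * Keep the cache sorted: if the new nodeID sorts after the entry
	 * currently at this index, shift the insertion point one slot right.
	 */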
980 if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
981 index++;
982 }
983
984 if (index >= 0 && index < cache->numcached) {
985 /* only do bcopy if we're inserting */
986 bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
987 bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
988 }
989
990 cache->acache[index] = nodeID;
991 cache->haveaccess[index] = access;
992 cache->numcached++;
993 }
994
995
996 struct cinfo {
997 uid_t uid;
998 gid_t gid;
999 mode_t mode;
1000 cnid_t parentcnid;
1001 u_int16_t recflags;
1002 };
1003
1004 static int
1005 snoop_callback(const cnode_t *cp, void *arg)
1006 {
1007 struct cinfo *cip = arg;
1008
1009 cip->uid = cp->c_uid;
1010 cip->gid = cp->c_gid;
1011 cip->mode = cp->c_mode;
1012 cip->parentcnid = cp->c_parentcnid;
1013 cip->recflags = cp->c_attr.ca_recflags;
1014
1015 return (0);
1016 }
1017
1018 /*
1019  * Look up the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
1020  * isn't in core, go to the catalog.
1021 */
1022 static int
1023 do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
1024 struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
1025 {
1026 int error = 0;
1027
1028 /* if this id matches the one the fsctl was called with, skip the lookup */
1029 if (cnid == skip_cp->c_cnid) {
1030 cnattrp->ca_uid = skip_cp->c_uid;
1031 cnattrp->ca_gid = skip_cp->c_gid;
1032 cnattrp->ca_mode = skip_cp->c_mode;
1033 cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
1034 keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
1035 } else {
1036 struct cinfo c_info;
1037
1038 		/* otherwise, check the cnode hash in case the file/dir is in core */
1039 error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info);
1040
1041 if (error == EACCES) {
1042 // File is deleted
1043 return ENOENT;
1044 } else if (!error) {
1045 cnattrp->ca_uid = c_info.uid;
1046 cnattrp->ca_gid = c_info.gid;
1047 cnattrp->ca_mode = c_info.mode;
1048 cnattrp->ca_recflags = c_info.recflags;
1049 keyp->hfsPlus.parentID = c_info.parentcnid;
1050 } else {
1051 int lockflags;
1052
1053 if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
1054 throttle_lowpri_io(1);
1055
1056 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
1057
1058 /* lookup this cnid in the catalog */
1059 error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
1060
1061 hfs_systemfile_unlock(hfsmp, lockflags);
1062
1063 cache->lookups++;
1064 }
1065 }
1066
1067 return (error);
1068 }
1069
1070
1071 /*
1072 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
1073 * up to CACHE_LEVELS as we progress towards the root.
1074 */
1075 static int
1076 do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
1077 struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
1078 struct vfs_context *my_context,
1079 char *bitmap,
1080 uint32_t map_size,
1081 cnid_t* parents,
1082 uint32_t num_parents)
1083 {
1084 int myErr = 0;
1085 int myResult;
1086 HFSCatalogNodeID thisNodeID;
1087 unsigned int myPerms;
1088 struct cat_attr cnattr;
1089 int cache_index = -1, scope_index = -1, scope_idx_start = -1;
1090 CatalogKey catkey;
1091
1092 int i = 0, ids_to_cache = 0;
1093 int parent_ids[CACHE_LEVELS];
1094
1095 thisNodeID = nodeID;
1096 while (thisNodeID >= kRootDirID) {
1097 myResult = 0; /* default to "no access" */
1098
1099 /* check the cache before resorting to hitting the catalog */
1100
1101 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
1102 * to look any further after hitting cached dir */
1103
1104 if (lookup_bucket(cache, &cache_index, thisNodeID)) {
1105 cache->cachehits++;
1106 myErr = cache->haveaccess[cache_index];
1107 if (scope_index != -1) {
1108 if (myErr == ESRCH) {
1109 myErr = 0;
1110 }
1111 } else {
1112 scope_index = 0; // so we'll just use the cache result
1113 scope_idx_start = ids_to_cache;
1114 }
1115 myResult = (myErr == 0) ? 1 : 0;
1116 goto ExitThisRoutine;
1117 }
1118
1119
1120 if (parents) {
1121 int tmp;
1122 tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
1123 if (scope_index == -1)
1124 scope_index = tmp;
1125 if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
1126 scope_idx_start = ids_to_cache;
1127 }
1128 }
1129
1130 /* remember which parents we want to cache */
1131 if (ids_to_cache < CACHE_LEVELS) {
1132 parent_ids[ids_to_cache] = thisNodeID;
1133 ids_to_cache++;
1134 }
1135 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
1136 if (bitmap && map_size) {
1137 bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
1138 }
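		/*
		 * Worked example (hypothetical values): for thisNodeID = 37 and
		 * map_size = 16, byte (37/8) % 16 = 4 and bit 37 & 7 = 5 are set.
		 * Distinct ids can collide since the map wraps modulo map_size.
		 */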
1139
1140
1141 /* do the lookup (checks the cnode hash, then the catalog) */
1142 myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
1143 if (myErr) {
1144 goto ExitThisRoutine; /* no access */
1145 }
1146
1147 /* Root always gets access. */
1148 if (suser(myp_ucred, NULL) == 0) {
1149 thisNodeID = catkey.hfsPlus.parentID;
1150 myResult = 1;
1151 continue;
1152 }
1153
1154 		// if the thing has ACLs, do the full permission check
1155 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1156 struct vnode *vp;
1157
1158 /* get the vnode for this cnid */
1159 myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
1160 if ( myErr ) {
1161 myResult = 0;
1162 goto ExitThisRoutine;
1163 }
1164
1165 thisNodeID = VTOC(vp)->c_parentcnid;
1166
1167 hfs_unlock(VTOC(vp));
1168
1169 if (vnode_vtype(vp) == VDIR) {
1170 myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
1171 } else {
1172 myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
1173 }
1174
1175 vnode_put(vp);
1176 if (myErr) {
1177 myResult = 0;
1178 goto ExitThisRoutine;
1179 }
1180 } else {
1181 unsigned int flags;
1182 int mode = cnattr.ca_mode & S_IFMT;
1183 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);
1184
1185 if (mode == S_IFDIR) {
1186 flags = R_OK | X_OK;
1187 } else {
1188 flags = R_OK;
1189 }
1190 if ( (myPerms & flags) != flags) {
1191 myResult = 0;
1192 myErr = EACCES;
1193 goto ExitThisRoutine; /* no access */
1194 }
1195
1196 /* up the hierarchy we go */
1197 thisNodeID = catkey.hfsPlus.parentID;
1198 }
1199 }
1200
1201 /* if here, we have access to this node */
1202 myResult = 1;
1203
1204 ExitThisRoutine:
1205 if (parents && myErr == 0 && scope_index == -1) {
1206 myErr = ESRCH;
1207 }
1208
1209 if (myErr) {
1210 myResult = 0;
1211 }
1212 *err = myErr;
1213
1214 /* cache the parent directory(ies) */
1215 for (i = 0; i < ids_to_cache; i++) {
1216 if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
1217 add_node(cache, -1, parent_ids[i], ESRCH);
1218 } else {
1219 add_node(cache, -1, parent_ids[i], myErr);
1220 }
1221 }
1222
1223 return (myResult);
1224 }
1225
1226 static int
1227 do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
1228 struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
1229 {
1230 boolean_t is64bit;
1231
1232 /*
1233 * NOTE: on entry, the vnode has an io_ref. In case this vnode
1234 	 * happens to be in our list of file_ids, we'll note it so we
1235 	 * avoid calling hfs_chashget_nowait() on that id, as that
1236 	 * would cause a "locking against myself" panic.
1237 */
1238 Boolean check_leaf = true;
1239
1240 struct user64_ext_access_t *user_access_structp;
1241 struct user64_ext_access_t tmp_user_access;
1242 struct access_cache cache;
1243
1244 int error = 0, prev_parent_check_ok=1;
1245 unsigned int i;
1246
1247 short flags;
1248 unsigned int num_files = 0;
1249 int map_size = 0;
1250 int num_parents = 0;
1251 int *file_ids=NULL;
1252 short *access=NULL;
1253 char *bitmap=NULL;
1254 cnid_t *parents=NULL;
1255 int leaf_index;
1256
1257 cnid_t cnid;
1258 cnid_t prevParent_cnid = 0;
1259 unsigned int myPerms;
1260 short myaccess = 0;
1261 struct cat_attr cnattr;
1262 CatalogKey catkey;
1263 struct cnode *skip_cp = VTOC(vp);
1264 kauth_cred_t cred = vfs_context_ucred(context);
1265 proc_t p = vfs_context_proc(context);
1266
1267 is64bit = proc_is64bit(p);
1268
1269 /* initialize the local cache and buffers */
1270 cache.numcached = 0;
1271 cache.cachehits = 0;
1272 cache.lookups = 0;
1273 cache.acache = NULL;
1274 cache.haveaccess = NULL;
1275
1276 /* struct copyin done during dispatch... need to copy file_id array separately */
1277 if (ap->a_data == NULL) {
1278 error = EINVAL;
1279 goto err_exit_bulk_access;
1280 }
1281
1282 if (is64bit) {
1283 if (arg_size != sizeof(struct user64_ext_access_t)) {
1284 error = EINVAL;
1285 goto err_exit_bulk_access;
1286 }
1287
1288 user_access_structp = (struct user64_ext_access_t *)ap->a_data;
1289
1290 } else if (arg_size == sizeof(struct user32_access_t)) {
1291 struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
1292
1293 // convert an old style bulk-access struct to the new style
1294 tmp_user_access.flags = accessp->flags;
1295 tmp_user_access.num_files = accessp->num_files;
1296 tmp_user_access.map_size = 0;
1297 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1298 tmp_user_access.bitmap = USER_ADDR_NULL;
1299 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1300 tmp_user_access.num_parents = 0;
1301 user_access_structp = &tmp_user_access;
1302
1303 } else if (arg_size == sizeof(struct user32_ext_access_t)) {
1304 struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
1305
1306 // up-cast from a 32-bit version of the struct
1307 tmp_user_access.flags = accessp->flags;
1308 tmp_user_access.num_files = accessp->num_files;
1309 tmp_user_access.map_size = accessp->map_size;
1310 tmp_user_access.num_parents = accessp->num_parents;
1311
1312 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1313 tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
1314 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1315 tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);
1316
1317 user_access_structp = &tmp_user_access;
1318 } else {
1319 error = EINVAL;
1320 goto err_exit_bulk_access;
1321 }
1322
1323 map_size = user_access_structp->map_size;
1324
1325 num_files = user_access_structp->num_files;
1326
1327 	num_parents = user_access_structp->num_parents;
1328
1329 if (num_files < 1) {
1330 goto err_exit_bulk_access;
1331 }
1332 if (num_files > 1024) {
1333 error = EINVAL;
1334 goto err_exit_bulk_access;
1335 }
1336
1337 if (num_parents > 1024) {
1338 error = EINVAL;
1339 goto err_exit_bulk_access;
1340 }
1341
1342 file_ids = hfs_malloc(sizeof(int) * num_files);
1343 access = hfs_malloc(sizeof(short) * num_files);
1344 if (map_size) {
1345 bitmap = hfs_mallocz(sizeof(char) * map_size);
1346 }
1347
1348 if (num_parents) {
1349 parents = hfs_malloc(sizeof(cnid_t) * num_parents);
1350 }
1351
1352 cache.acache = hfs_malloc(sizeof(int) * NUM_CACHE_ENTRIES);
1353 cache.haveaccess = hfs_malloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1354
1355 if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1356 num_files * sizeof(int)))) {
1357 goto err_exit_bulk_access;
1358 }
1359
1360 if (num_parents) {
1361 if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1362 num_parents * sizeof(cnid_t)))) {
1363 goto err_exit_bulk_access;
1364 }
1365 }
1366
1367 flags = user_access_structp->flags;
1368 if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1369 flags = R_OK;
1370 }
1371
1372 /* check if we've been passed leaf node ids or parent ids */
1373 if (flags & PARENT_IDS_FLAG) {
1374 check_leaf = false;
1375 }
1376
1377 /* Check access to each file_id passed in */
1378 for (i = 0; i < num_files; i++) {
1379 leaf_index=-1;
1380 cnid = (cnid_t) file_ids[i];
1381
1382 /* root always has access */
1383 if ((!parents) && (!suser(cred, NULL))) {
1384 access[i] = 0;
1385 continue;
1386 }
1387
1388 if (check_leaf) {
1389 /* do the lookup (checks the cnode hash, then the catalog) */
1390 error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
1391 if (error) {
1392 access[i] = (short) error;
1393 continue;
1394 }
1395
1396 if (parents) {
1397 // Check if the leaf matches one of the parent scopes
1398 leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1399 if (leaf_index >= 0 && parents[leaf_index] == cnid)
1400 prev_parent_check_ok = 0;
1401 else if (leaf_index >= 0)
1402 prev_parent_check_ok = 1;
1403 }
1404
1405 			// if the thing has ACLs, do the full permission check
1406 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1407 struct vnode *cvp;
1408 int myErr = 0;
1409 /* get the vnode for this cnid */
1410 myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
1411 if ( myErr ) {
1412 access[i] = myErr;
1413 continue;
1414 }
1415
1416 hfs_unlock(VTOC(cvp));
1417
1418 if (vnode_vtype(cvp) == VDIR) {
1419 myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1420 } else {
1421 myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1422 }
1423
1424 vnode_put(cvp);
1425 if (myErr) {
1426 access[i] = myErr;
1427 continue;
1428 }
1429 } else {
1430 /* before calling CheckAccess(), check the target file for read access */
1431 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1432 cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1433
1434 /* fail fast if no access */
1435 if ((myPerms & flags) == 0) {
1436 access[i] = EACCES;
1437 continue;
1438 }
1439 }
1440 } else {
1441 /* we were passed an array of parent ids */
1442 catkey.hfsPlus.parentID = cnid;
1443 }
1444
1445 /* if the last guy had the same parent and had access, we're done */
1446 if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
1447 cache.cachehits++;
1448 access[i] = 0;
1449 continue;
1450 }
1451
1452 myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1453 skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
1454
1455 if (myaccess || (error == ESRCH && leaf_index != -1)) {
1456 access[i] = 0; // have access.. no errors to report
1457 } else {
1458 access[i] = (error != 0 ? (short) error : EACCES);
1459 }
1460
1461 prevParent_cnid = catkey.hfsPlus.parentID;
1462 }
1463
1464 /* copyout the access array */
1465 if ((error = copyout((caddr_t)access, user_access_structp->access,
1466 num_files * sizeof (short)))) {
1467 goto err_exit_bulk_access;
1468 }
1469 if (map_size && bitmap) {
1470 if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1471 map_size * sizeof (char)))) {
1472 goto err_exit_bulk_access;
1473 }
1474 }
1475
1476
1477 err_exit_bulk_access:
1478
1479 hfs_free(file_ids, sizeof(int) * num_files);
1480 hfs_free(parents, sizeof(cnid_t) * num_parents);
1481 hfs_free(bitmap, sizeof(char) * map_size);
1482 hfs_free(access, sizeof(short) * num_files);
1483 hfs_free(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1484 hfs_free(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1485
1486 return (error);
1487 }
1488
1489
1490 /* end "bulk-access" support */
1491
1492
1493 /*
1494 * Control filesystem operating characteristics.
1495 */
1496 int
1497 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1498 vnode_t a_vp;
1499 long a_command;
1500 caddr_t a_data;
1501 int a_fflag;
1502 vfs_context_t a_context;
1503 } */ *ap)
1504 {
1505 struct vnode * vp = ap->a_vp;
1506 struct hfsmount *hfsmp = VTOHFS(vp);
1507 vfs_context_t context = ap->a_context;
1508 kauth_cred_t cred = vfs_context_ucred(context);
1509 proc_t p = vfs_context_proc(context);
1510 struct vfsstatfs *vfsp;
1511 boolean_t is64bit;
1512 off_t jnl_start, jnl_size;
1513 struct hfs_journal_info *jip;
1514 #if HFS_COMPRESSION
1515 int compressed = 0;
1516 off_t uncompressed_size = -1;
1517 int decmpfs_error = 0;
1518
1519 if (ap->a_command == F_RDADVISE) {
1520 /* we need to inspect the decmpfs state of the file as early as possible */
1521 compressed = hfs_file_is_compressed(VTOC(vp), 0);
1522 if (compressed) {
1523 if (VNODE_IS_RSRC(vp)) {
1524 /* if this is the resource fork, treat it as if it were empty */
1525 uncompressed_size = 0;
1526 } else {
1527 decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
1528 if (decmpfs_error != 0) {
1529 /* failed to get the uncompressed size, we'll check for this later */
1530 uncompressed_size = -1;
1531 }
1532 }
1533 }
1534 }
1535 #endif /* HFS_COMPRESSION */
1536
1537 is64bit = proc_is64bit(p);
1538
1539 #if CONFIG_PROTECT
1540 #if HFS_CONFIG_KEY_ROLL
1541 // The HFS_KEY_ROLL fsctl does its own access checks
1542 if (ap->a_command != HFS_KEY_ROLL)
1543 #endif
1544 {
1545 int error = 0;
1546 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
1547 return error;
1548 }
1549 }
1550 #endif /* CONFIG_PROTECT */
1551
1552 switch (ap->a_command) {
1553
1554 case HFS_GETPATH:
1555 {
1556 struct vnode *file_vp;
1557 cnid_t cnid;
1558 int outlen;
1559 char *bufptr;
1560 int error;
1561 int flags = 0;
1562
1563 /* Caller must be owner of file system. */
1564 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1565 if (suser(cred, NULL) &&
1566 kauth_cred_getuid(cred) != vfsp->f_owner) {
1567 return (EACCES);
1568 }
1569 /* Target vnode must be file system's root. */
1570 if (!vnode_isvroot(vp)) {
1571 return (EINVAL);
1572 }
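		/*
		 * The fsctl data buffer is reused: on input it carries the target
		 * cnid as a decimal string, and on output build_path() overwrites
		 * it with the resulting path (up to sizeof(pathname_t) bytes).
		 */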
1573 bufptr = (char *)ap->a_data;
1574 cnid = strtoul(bufptr, NULL, 10);
1575 if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
1576 flags |= BUILDPATH_VOLUME_RELATIVE;
1577 }
1578
1579 /* We need to call hfs_vfs_vget to leverage the code that will
1580 * fix the origin list for us if needed, as opposed to calling
1581 		 * hfs_vget, since we will need the parent for the build_path call.
1582 */
1583
1584 if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1585 return (error);
1586 }
1587
1588 error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context);
1589 vnode_put(file_vp);
1590
1591 return (error);
1592 }
1593
1594 case HFS_SET_MAX_DEFRAG_SIZE:
1595 {
1596 int error = 0; /* Assume success */
1597 u_int32_t maxsize = 0;
1598
1599 if (vnode_vfsisrdonly(vp)) {
1600 return (EROFS);
1601 }
1602 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1603 if (!kauth_cred_issuser(cred)) {
1604 return (EACCES); /* must be root */
1605 }
1606
1607 maxsize = *(u_int32_t *)ap->a_data;
1608
1609 hfs_lock_mount(hfsmp);
1610 if (maxsize > HFS_MAX_DEFRAG_SIZE) {
1611 error = EINVAL;
1612 }
1613 else {
1614 hfsmp->hfs_defrag_max = maxsize;
1615 }
1616 hfs_unlock_mount(hfsmp);
1617
1618 return (error);
1619 }
1620
1621 case HFS_FORCE_ENABLE_DEFRAG:
1622 {
1623 int error = 0; /* Assume success */
1624 u_int32_t do_enable = 0;
1625
1626 if (vnode_vfsisrdonly(vp)) {
1627 return (EROFS);
1628 }
1629 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1630 if (!kauth_cred_issuser(cred)) {
1631 return (EACCES); /* must be root */
1632 }
1633
1634 do_enable = *(u_int32_t *)ap->a_data;
1635
1636 hfs_lock_mount(hfsmp);
1637 if (do_enable != 0) {
1638 hfsmp->hfs_defrag_nowait = 1;
1639 }
1640 else {
1641 error = EINVAL;
1642 }
1643
1644 hfs_unlock_mount(hfsmp);
1645
1646 return (error);
1647 }
1648
1649
1650 case HFS_TRANSFER_DOCUMENT_ID:
1651 {
1652 struct cnode *cp = NULL;
1653 int error;
1654 u_int32_t to_fd = *(u_int32_t *)ap->a_data;
1655 struct fileproc *to_fp;
1656 struct vnode *to_vp;
1657 struct cnode *to_cp;
1658
1659 cp = VTOC(vp);
1660
1661 if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
1662 //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
1663 return error;
1664 }
1665 if ( (error = vnode_getwithref(to_vp)) ) {
1666 file_drop(to_fd);
1667 return error;
1668 }
1669
1670 if (VTOHFS(to_vp) != hfsmp) {
1671 error = EXDEV;
1672 goto transfer_cleanup;
1673 }
1674
1675 int need_unlock = 1;
1676 to_cp = VTOC(to_vp);
1677 error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1678 if (error != 0) {
1679 //printf("could not lock the pair of cnodes (error %d)\n", error);
1680 goto transfer_cleanup;
1681 }
1682
1683 if (!(cp->c_bsdflags & UF_TRACKED)) {
1684 error = EINVAL;
1685 } else if (to_cp->c_bsdflags & UF_TRACKED) {
1686 //
1687 // if the destination is already tracked, return an error
1688 // as otherwise it's a silent deletion of the target's
1689 // document-id
1690 //
1691 error = EEXIST;
1692 } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
1693 //
1694 // we can use the FndrExtendedFileInfo because the doc-id is the first
1695 // thing in both it and the ExtendedDirInfo struct which is fixed in
1696 			// format and cannot change layout
1697 //
1698 struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
1699 struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16);
1700
1701 if (f_extinfo->document_id == 0) {
1702 uint32_t new_id;
1703
1704 hfs_unlockpair(cp, to_cp); // have to unlock to be able to get a new-id
1705
1706 if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
1707 //
1708 // re-lock the pair now that we have the document-id
1709 //
1710 hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1711 f_extinfo->document_id = new_id;
1712 } else {
1713 goto transfer_cleanup;
1714 }
1715 }
1716
1717 to_extinfo->document_id = f_extinfo->document_id;
1718 f_extinfo->document_id = 0;
1719 //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
1720
1721 // make sure the destination is also UF_TRACKED
1722 to_cp->c_bsdflags |= UF_TRACKED;
1723 cp->c_bsdflags &= ~UF_TRACKED;
1724
1725 // mark the cnodes dirty
1726 cp->c_flag |= C_MODIFIED;
1727 to_cp->c_flag |= C_MODIFIED;
1728
1729 int lockflags;
1730 if ((error = hfs_start_transaction(hfsmp)) == 0) {
1731
1732 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
1733
1734 (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
1735 (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);
1736
1737 hfs_systemfile_unlock (hfsmp, lockflags);
1738 (void) hfs_end_transaction(hfsmp);
1739 }
1740
1741 add_fsevent(FSE_DOCID_CHANGED, context,
1742 FSE_ARG_DEV, hfsmp->hfs_raw_dev,
1743 FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode #
1744 FSE_ARG_INO, (ino64_t)to_cp->c_fileid, // dst inode #
1745 FSE_ARG_INT32, to_extinfo->document_id,
1746 FSE_ARG_DONE);
1747
1748 hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents
1749 need_unlock = 0;
1750
1751 if (need_fsevent(FSE_STAT_CHANGED, vp)) {
1752 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
1753 }
1754 if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
1755 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
1756 }
1757 }
1758
1759 if (need_unlock) {
1760 hfs_unlockpair(cp, to_cp);
1761 }
1762
1763 transfer_cleanup:
1764 vnode_put(to_vp);
1765 file_drop(to_fd);
1766
1767 return error;
1768 }
1769
1770
1771
1772 case HFS_PREV_LINK:
1773 case HFS_NEXT_LINK:
1774 {
1775 cnid_t linkfileid;
1776 cnid_t nextlinkid;
1777 cnid_t prevlinkid;
1778 int error;
1779
1780 /* Caller must be owner of file system. */
1781 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1782 if (suser(cred, NULL) &&
1783 kauth_cred_getuid(cred) != vfsp->f_owner) {
1784 return (EACCES);
1785 }
1786 /* Target vnode must be file system's root. */
1787 if (!vnode_isvroot(vp)) {
1788 return (EINVAL);
1789 }
1790 linkfileid = *(cnid_t *)ap->a_data;
1791 if (linkfileid < kHFSFirstUserCatalogNodeID) {
1792 return (EINVAL);
1793 }
1794 if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1795 return (error);
1796 }
1797 if (ap->a_command == HFS_NEXT_LINK) {
1798 *(cnid_t *)ap->a_data = nextlinkid;
1799 } else {
1800 *(cnid_t *)ap->a_data = prevlinkid;
1801 }
1802 return (0);
1803 }
1804
1805 case HFS_RESIZE_PROGRESS: {
1806
1807 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1808 if (suser(cred, NULL) &&
1809 kauth_cred_getuid(cred) != vfsp->f_owner) {
1810 return (EACCES); /* must be owner of file system */
1811 }
1812 if (!vnode_isvroot(vp)) {
1813 return (EINVAL);
1814 }
1815 /* file system must not be mounted read-only */
1816 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1817 return (EROFS);
1818 }
1819
1820 return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1821 }
1822
1823 case HFS_RESIZE_VOLUME: {
1824 u_int64_t newsize;
1825 u_int64_t cursize;
1826 int ret;
1827
1828 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1829 if (suser(cred, NULL) &&
1830 kauth_cred_getuid(cred) != vfsp->f_owner) {
1831 return (EACCES); /* must be owner of file system */
1832 }
1833 if (!vnode_isvroot(vp)) {
1834 return (EINVAL);
1835 }
1836
1837 /* filesystem must not be mounted read only */
1838 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1839 return (EROFS);
1840 }
1841 newsize = *(u_int64_t *)ap->a_data;
1842 cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1843
1844 if (newsize == cursize) {
1845 return (0);
1846 }
1847 IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeWillResize);
1848 if (newsize > cursize) {
1849 ret = hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1850 } else {
1851 ret = hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1852 }
1853 IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeDidResize);
1854 return (ret);
1855 }
1856 case HFS_CHANGE_NEXT_ALLOCATION: {
1857 int error = 0; /* Assume success */
1858 u_int32_t location;
1859
1860 if (vnode_vfsisrdonly(vp)) {
1861 return (EROFS);
1862 }
1863 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1864 if (suser(cred, NULL) &&
1865 kauth_cred_getuid(cred) != vfsp->f_owner) {
1866 return (EACCES); /* must be owner of file system */
1867 }
1868 if (!vnode_isvroot(vp)) {
1869 return (EINVAL);
1870 }
1871 hfs_lock_mount(hfsmp);
1872 location = *(u_int32_t *)ap->a_data;
1873 if ((location >= hfsmp->allocLimit) &&
1874 (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1875 error = EINVAL;
1876 goto fail_change_next_allocation;
1877 }
1878 /* Return previous value. */
1879 *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1880 if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1881 /* On magic value for location, set nextAllocation to next block
1882 * after metadata zone and set flag in mount structure to indicate
1883 * that nextAllocation should not be updated again.
1884 */
1885 if (hfsmp->hfs_metazone_end != 0) {
1886 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1887 }
1888 hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1889 } else {
1890 hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1891 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1892 }
1893 MarkVCBDirty(hfsmp);
1894 fail_change_next_allocation:
1895 hfs_unlock_mount(hfsmp);
1896 return (error);
1897 }
1898
1899 #if HFS_SPARSE_DEV
1900 case HFS_SETBACKINGSTOREINFO: {
1901 struct vnode * di_vp;
1902 struct hfs_backingstoreinfo *bsdata;
1903 int error = 0;
1904
1905 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1906 return (EROFS);
1907 }
1908 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1909 return (EALREADY);
1910 }
1911 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1912 if (suser(cred, NULL) &&
1913 kauth_cred_getuid(cred) != vfsp->f_owner) {
1914 return (EACCES); /* must be owner of file system */
1915 }
1916 bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1917 if (bsdata == NULL) {
1918 return (EINVAL);
1919 }
1920 if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1921 return (error);
1922 }
1923 if ((error = vnode_getwithref(di_vp))) {
1924 file_drop(bsdata->backingfd);
1925 return(error);
1926 }
1927
1928 if (vnode_mount(vp) == vnode_mount(di_vp)) {
1929 (void)vnode_put(di_vp);
1930 file_drop(bsdata->backingfd);
1931 return (EINVAL);
1932 }
1933
1934 // Dropped in unmount
1935 vnode_ref(di_vp);
1936
1937 hfs_lock_mount(hfsmp);
1938 hfsmp->hfs_backingvp = di_vp;
1939 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1940 hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4;
1941 hfs_unlock_mount(hfsmp);
1942
1943 /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
1944
1945 /*
1946 * If the sparse image is on a sparse image file (as opposed to a sparse
1947 * bundle), then we may need to limit the free space to the maximum size
1948 * of a file on that volume. So we query (using pathconf), and if we get
1949 * a meaningful result, we cache the number of blocks for later use in
1950 * hfs_freeblks().
1951 */
1952 hfsmp->hfs_backingfs_maxblocks = 0;
1953 if (vnode_vtype(di_vp) == VREG) {
1954 int terr;
1955 int hostbits;
1956 terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
1957 if (terr == 0 && hostbits != 0 && hostbits < 64) {
1958 u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
1959
1960 hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
1961 }
1962 }
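			/*
			 * For example (illustrative arithmetic): if the host file system
			 * reports _PC_FILESIZEBITS == 31 and this volume's allocation
			 * block size is 4096 bytes, hfs_backingfs_maxblocks becomes
			 * (1ULL << 31) / 4096 = 524288 blocks.
			 */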
1963
1964 /* The free extent cache is managed differently for sparse devices.
1965 * There is a window between when the volume is mounted and when the
1966 * device is marked as sparse, so the free extent cache for this
1967 * volume is currently initialized as for a normal volume (sorted by
1968 * block count). Reset the cache so that it will be rebuilt for a
1969 * sparse device (sorted by start block).
1970 */
1971 ResetVCBFreeExtCache(hfsmp);
1972
1973 (void)vnode_put(di_vp);
1974 file_drop(bsdata->backingfd);
1975 return (0);
1976 }
1977 case HFS_CLRBACKINGSTOREINFO: {
1978 struct vnode * tmpvp;
1979
1980 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1981 if (suser(cred, NULL) &&
1982 kauth_cred_getuid(cred) != vfsp->f_owner) {
1983 return (EACCES); /* must be owner of file system */
1984 }
1985 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1986 return (EROFS);
1987 }
1988
1989 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
1990 hfsmp->hfs_backingvp) {
1991
1992 hfs_lock_mount(hfsmp);
1993 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
1994 tmpvp = hfsmp->hfs_backingvp;
1995 hfsmp->hfs_backingvp = NULLVP;
1996 hfsmp->hfs_sparsebandblks = 0;
1997 hfs_unlock_mount(hfsmp);
1998
1999 vnode_rele(tmpvp);
2000 }
2001 return (0);
2002 }
2003 #endif /* HFS_SPARSE_DEV */
2004
2005 /* Change the next CNID stored in the VH */
2006 case HFS_CHANGE_NEXTCNID: {
2007 int error = 0; /* Assume success */
2008 u_int32_t fileid;
2009 int wraparound = 0;
2010 int lockflags = 0;
2011
2012 if (vnode_vfsisrdonly(vp)) {
2013 return (EROFS);
2014 }
2015 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2016 if (suser(cred, NULL) &&
2017 kauth_cred_getuid(cred) != vfsp->f_owner) {
2018 return (EACCES); /* must be owner of file system */
2019 }
2020
2021 fileid = *(u_int32_t *)ap->a_data;
2022
2023 /* Must have catalog lock excl. to advance the CNID pointer */
2024 lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK);
2025
2026 hfs_lock_mount(hfsmp);
2027
2028 /* If it is less than the current next CNID, force the wraparound bit to be set */
2029 if (fileid < hfsmp->vcbNxtCNID) {
2030 wraparound=1;
2031 }
2032
2033 /* Return previous value. */
2034 *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
2035
2036 hfsmp->vcbNxtCNID = fileid;
2037
2038 if (wraparound) {
2039 hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
2040 }
2041
2042 MarkVCBDirty(hfsmp);
2043 hfs_unlock_mount(hfsmp);
2044 hfs_systemfile_unlock (hfsmp, lockflags);
2045
2046 return (error);
2047 }
2048
2049 case F_FREEZE_FS: {
2050 struct mount *mp;
2051
2052 mp = vnode_mount(vp);
2053 hfsmp = VFSTOHFS(mp);
2054
2055 if (!(hfsmp->jnl))
2056 return (ENOTSUP);
2057
2058 vfsp = vfs_statfs(mp);
2059
2060 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2061 !kauth_cred_issuser(cred))
2062 return (EACCES);
2063
2064 return hfs_freeze(hfsmp);
2065 }
2066
2067 case F_THAW_FS: {
2068 vfsp = vfs_statfs(vnode_mount(vp));
2069 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2070 !kauth_cred_issuser(cred))
2071 return (EACCES);
2072
2073 return hfs_thaw(hfsmp, current_proc());
2074 }
2075
2076 case HFS_EXT_BULKACCESS_FSCTL: {
2077 int size;
2078 #if CONFIG_HFS_STD
2079 if (hfsmp->hfs_flags & HFS_STANDARD) {
2080 return EINVAL;
2081 }
2082 #endif
2083
2084 if (is64bit) {
2085 size = sizeof(struct user64_ext_access_t);
2086 } else {
2087 size = sizeof(struct user32_ext_access_t);
2088 }
2089
2090 return do_bulk_access_check(hfsmp, vp, ap, size, context);
2091 }
2092
2093 case HFS_SET_XATTREXTENTS_STATE: {
2094 int state;
2095
2096 if (ap->a_data == NULL) {
2097 return (EINVAL);
2098 }
2099
2100 state = *(int *)ap->a_data;
2101
2102 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2103 return (EROFS);
2104 }
2105
2106 /* The super-user can enable or disable extent-based extended
2107 * attribute support on a volume.
2108 * Note: Starting with Mac OS X 10.7, extent-based extended attributes
2109 * are enabled by default, so any change is transient and only lasts
2110 * until the volume is remounted.
2111 */
2112 if (!kauth_cred_issuser(kauth_cred_get())) {
2113 return (EPERM);
2114 }
2115 if (state == 0 || state == 1)
2116 return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
2117 else
2118 return (EINVAL);
2119 }
2120
2121 case F_SETSTATICCONTENT: {
2122 int error;
2123 int enable_static = 0;
2124 struct cnode *cp = NULL;
2125 /*
2126 * lock the cnode, decorate the cnode flag, and bail out.
2127 * VFS should have already authenticated the caller for us.
2128 */
2129
2130 if (ap->a_data) {
2131 /*
2132 * Note that even though ap->a_data is of type caddr_t,
2133 * the fcntl layer at the syscall handler will pass in NULL
2134 * or 1 depending on what the argument supplied to the fcntl
2135 * was. So it is in fact correct to check the ap->a_data
2136 * argument for zero or non-zero value when deciding whether or not
2137 * to enable the static bit in the cnode.
2138 */
2139 enable_static = 1;
2140 }
2141 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2142 return EROFS;
2143 }
2144 cp = VTOC(vp);
2145
2146 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2147 if (error == 0) {
2148 if (enable_static) {
2149 cp->c_flag |= C_SSD_STATIC;
2150 }
2151 else {
2152 cp->c_flag &= ~C_SSD_STATIC;
2153 }
2154 hfs_unlock (cp);
2155 }
2156 return error;
2157 }
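	/*
	 * Illustrative sketch (not part of this file): a userland caller that knows
	 * a file's contents will rarely change could hint this with the (private)
	 * F_SETSTATICCONTENT fcntl; a non-zero argument sets C_SSD_STATIC and zero
	 * clears it, as implemented above.
	 *
	 *	fcntl(fd, F_SETSTATICCONTENT, 1);	// mark as static content
	 *	fcntl(fd, F_SETSTATICCONTENT, 0);	// clear the hint
	 */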
2158
2159 case F_SET_GREEDY_MODE: {
2160 int error;
2161 int enable_greedy_mode = 0;
2162 struct cnode *cp = NULL;
2163 /*
2164 * lock the cnode, decorate the cnode flag, and bail out.
2165 * VFS should have already authenticated the caller for us.
2166 */
2167
2168 if (ap->a_data) {
2169 /*
2170 * Note that even though ap->a_data is of type caddr_t,
2171 * the fcntl layer at the syscall handler will pass in NULL
2172 * or 1 depending on what the argument supplied to the fcntl
2173 * was. So it is in fact correct to check the ap->a_data
2174 * argument for zero or non-zero value when deciding whether or not
2175 * to enable the greedy mode bit in the cnode.
2176 */
2177 enable_greedy_mode = 1;
2178 }
2179 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2180 return EROFS;
2181 }
2182 cp = VTOC(vp);
2183
2184 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2185 if (error == 0) {
2186 if (enable_greedy_mode) {
2187 cp->c_flag |= C_SSD_GREEDY_MODE;
2188 }
2189 else {
2190 cp->c_flag &= ~C_SSD_GREEDY_MODE;
2191 }
2192 hfs_unlock (cp);
2193 }
2194 return error;
2195 }
2196
2197 case F_SETIOTYPE: {
2198 int error;
2199 uint32_t iotypeflag = 0;
2200
2201 struct cnode *cp = NULL;
2202 /*
2203 * lock the cnode, decorate the cnode flag, and bail out.
2204 * VFS should have already authenticated the caller for us.
2205 */
2206
2207 if (ap->a_data == NULL) {
2208 return EINVAL;
2209 }
2210
2211 /*
2212 * Note that even though ap->a_data is of type caddr_t, we
2213 * can only use 32 bits of flag values.
2214 */
2215 iotypeflag = (uint32_t) ap->a_data;
2216 switch (iotypeflag) {
2217 case F_IOTYPE_ISOCHRONOUS:
2218 break;
2219 default:
2220 return EINVAL;
2221 }
2222
2223
2224 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2225 return EROFS;
2226 }
2227 cp = VTOC(vp);
2228
2229 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2230 if (error == 0) {
2231 switch (iotypeflag) {
2232 case F_IOTYPE_ISOCHRONOUS:
2233 cp->c_flag |= C_IO_ISOCHRONOUS;
2234 break;
2235 default:
2236 break;
2237 }
2238 hfs_unlock (cp);
2239 }
2240 return error;
2241 }
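	/*
	 * Illustrative sketch (not part of this file): the only I/O type currently
	 * accepted above is F_IOTYPE_ISOCHRONOUS, which tags the cnode so that
	 * hfs_vnop_strategy() marks its buffers as isochronous.
	 *
	 *	fcntl(fd, F_SETIOTYPE, F_IOTYPE_ISOCHRONOUS);
	 */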
2242
2243 case F_MAKECOMPRESSED: {
2244 int error = 0;
2245 uint32_t gen_counter;
2246 struct cnode *cp = NULL;
2247 int reset_decmp = 0;
2248
2249 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2250 return EROFS;
2251 }
2252
2253 /*
2254 * acquire & lock the cnode.
2255 * VFS should have already authenticated the caller for us.
2256 */
2257
2258 if (ap->a_data) {
2259 /*
2260 * Cast the pointer into a uint32_t so we can extract the
2261 * supplied generation counter.
2262 */
2263 gen_counter = *((uint32_t*)ap->a_data);
2264 }
2265 else {
2266 return EINVAL;
2267 }
2268
2269 #if HFS_COMPRESSION
2270 cp = VTOC(vp);
2271 /* Grab truncate lock first; we may truncate the file */
2272 hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2273
2274 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2275 if (error) {
2276 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2277 return error;
2278 }
2279
2280 /* Are there any other usecounts/FDs? */
2281 if (vnode_isinuse(vp, 1)) {
2282 hfs_unlock(cp);
2283 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2284 return EBUSY;
2285 }
2286
2287 /* now we have the cnode locked down; Validate arguments */
2288 if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
2289 /* EINVAL if you are trying to manipulate an IMMUTABLE file */
2290 hfs_unlock(cp);
2291 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2292 return EINVAL;
2293 }
2294
2295 if ((hfs_get_gencount (cp)) == gen_counter) {
2296 /*
2297 * OK, the gen_counter matched. Go for it:
2298 * Toggle state bits, truncate file, and suppress mtime update
2299 */
2300 reset_decmp = 1;
2301 cp->c_bsdflags |= UF_COMPRESSED;
2302
2303 error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES,
2304 ap->a_context);
2305 }
2306 else {
2307 error = ESTALE;
2308 }
2309
2310 /* Unlock the cnode before calling into decmpfs; it may need to get an EA */
2311 hfs_unlock(cp);
2312
2313 /*
2314 * Reset the decmp state while still holding the truncate lock. We need to
2315 * serialize here against a listxattr on this node which may occur at any
2316 * time.
2317 *
2318 * Even if '0/skiplock' is passed as the 2nd argument to hfs_file_is_compressed,
2319 * that will still potentially require getting the com.apple.decmpfs EA. If the
2320 * EA is required, then we can't hold the cnode lock, because the getxattr call is
2321 * generic(through VFS), and can't pass along any info telling it that we're already
2322 * holding it (the lock). If we don't serialize, then we risk listxattr stopping
2323 * and trying to fill in the hfs_file_is_compressed info during the callback
2324 * operation, which will result in deadlock against the b-tree node.
2325 *
2326 * So, to serialize against listxattr (which will grab buf_t meta references on
2327 * the b-tree blocks), we hold the truncate lock as we're manipulating the
2328 * decmpfs payload.
2329 */
2330 if ((reset_decmp) && (error == 0)) {
2331 decmpfs_cnode *dp = VTOCMP (vp);
2332 if (dp != NULL) {
2333 decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
2334 }
2335
2336 /* Initialize the decmpfs node as needed */
2337 (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */
2338 }
2339
2340 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2341
2342 #endif
2343 return error;
2344 }
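	/*
	 * Protocol summary for F_MAKECOMPRESSED, as implemented above: the caller
	 * supplies the generation count it observed before writing the decmpfs
	 * payload; if the count still matches, UF_COMPRESSED is set and the data
	 * fork is truncated with the mtime update suppressed, otherwise ESTALE is
	 * returned. EBUSY is returned if the file has other users, and EINVAL if it
	 * is already immutable or compressed. A hypothetical userland sketch:
	 *
	 *	uint32_t gen = ...;	// generation count read before writing the EA
	 *	// write the com.apple.decmpfs EA, then:
	 *	if (fcntl(fd, F_MAKECOMPRESSED, &gen) == -1 && errno == ESTALE) {
	 *		// the file changed underneath us; retry or give up
	 *	}
	 */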
2345
2346 case F_SETBACKINGSTORE: {
2347
2348 int error = 0;
2349
2350 /*
2351 * See comment in F_SETSTATICCONTENT re: using
2352 * a null check for a_data
2353 */
2354 if (ap->a_data) {
2355 error = hfs_set_backingstore (vp, 1);
2356 }
2357 else {
2358 error = hfs_set_backingstore (vp, 0);
2359 }
2360
2361 return error;
2362 }
2363
2364 case F_GETPATH_MTMINFO: {
2365 int error = 0;
2366
2367 int *data = (int*) ap->a_data;
2368
2369 /* Ask if this is a backingstore vnode */
2370 error = hfs_is_backingstore (vp, data);
2371
2372 return error;
2373 }
2374
2375 case F_FULLFSYNC: {
2376 int error;
2377
2378 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2379 return (EROFS);
2380 }
2381 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2382 if (error == 0) {
2383 error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_FULL, p);
2384 hfs_unlock(VTOC(vp));
2385 }
2386
2387 return error;
2388 }
2389
2390 case F_BARRIERFSYNC: {
2391 int error;
2392
2393 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2394 return (EROFS);
2395 }
2396 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2397 if (error == 0) {
2398 error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_BARRIER, p);
2399 hfs_unlock(VTOC(vp));
2400 }
2401
2402 return error;
2403 }
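	/*
	 * Usage note: as implemented above, both requests take the cnode lock and
	 * call hfs_fsync() with HFS_FSYNC_FULL or HFS_FSYNC_BARRIER respectively.
	 * A minimal (illustrative) userland call:
	 *
	 *	fcntl(fd, F_FULLFSYNC, 0);	// full flush to stable storage
	 *	fcntl(fd, F_BARRIERFSYNC, 0);	// cheaper ordering barrier
	 */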
2404
2405 case F_CHKCLEAN: {
2406 register struct cnode *cp;
2407 int error;
2408
2409 if (!vnode_isreg(vp))
2410 return EINVAL;
2411
2412 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2413 if (error == 0) {
2414 cp = VTOC(vp);
2415 /*
2416 * Used by the regression tests to determine whether
2417 * all the dirty pages (written via write) have been cleaned
2418 * after a call to 'fsync'.
2419 */
2420 error = is_file_clean(vp, VTOF(vp)->ff_size);
2421 hfs_unlock(cp);
2422 }
2423 return (error);
2424 }
2425
2426 case F_RDADVISE: {
2427 register struct radvisory *ra;
2428 struct filefork *fp;
2429 int error;
2430
2431 if (!vnode_isreg(vp))
2432 return EINVAL;
2433
2434 ra = (struct radvisory *)(ap->a_data);
2435 fp = VTOF(vp);
2436
2437 /* Protect against a size change. */
2438 hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2439
2440 #if HFS_COMPRESSION
2441 if (compressed) {
2442 if (uncompressed_size == -1) {
2443 /* fetching the uncompressed size failed above, so return the error */
2444 error = decmpfs_error;
2445 } else if (ra->ra_offset >= uncompressed_size) {
2446 error = EFBIG;
2447 } else {
2448 error = advisory_read(vp, uncompressed_size, ra->ra_offset, ra->ra_count);
2449 }
2450 } else
2451 #endif /* HFS_COMPRESSION */
2452 if (ra->ra_offset >= fp->ff_size) {
2453 error = EFBIG;
2454 } else {
2455 error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
2456 }
2457
2458 hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
2459 return (error);
2460 }
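	/*
	 * Illustrative sketch (not part of this file): F_RDADVISE takes a struct
	 * radvisory describing the region to read ahead; as implemented above, the
	 * offset is validated against the (possibly uncompressed) file size before
	 * advisory_read() is issued.
	 *
	 *	struct radvisory ra;
	 *	ra.ra_offset = 0;		// start of the region
	 *	ra.ra_count  = 1024 * 1024;	// bytes to read ahead
	 *	fcntl(fd, F_RDADVISE, &ra);
	 */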
2461
2462 case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */
2463 {
2464 if (is64bit) {
2465 *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2466 }
2467 else {
2468 *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2469 }
2470 return 0;
2471 }
2472
2473 case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
2474 *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
2475 break;
2476
2477 case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
2478 *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
2479 break;
2480
2481 case HFS_FSCTL_GET_VERY_LOW_DISK:
2482 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
2483 break;
2484
2485 case HFS_FSCTL_SET_VERY_LOW_DISK:
2486 if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
2487 return EINVAL;
2488 }
2489
2490 hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
2491 break;
2492
2493 case HFS_FSCTL_GET_LOW_DISK:
2494 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
2495 break;
2496
2497 case HFS_FSCTL_SET_LOW_DISK:
2498 if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
2499 || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
2500
2501 return EINVAL;
2502 }
2503
2504 hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
2505 break;
2506
2507 case HFS_FSCTL_GET_DESIRED_DISK:
2508 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
2509 break;
2510
2511 case HFS_FSCTL_SET_DESIRED_DISK:
2512 if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
2513 return EINVAL;
2514 }
2515
2516 hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
2517 break;
2518
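	/*
	 * Note on the three thresholds above: the SET handlers keep them strictly
	 * ordered, i.e. hfs_freespace_notify_dangerlimit <
	 * hfs_freespace_notify_warninglimit < hfs_freespace_notify_desiredlevel;
	 * attempts to violate that ordering return EINVAL.
	 */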
2519 case HFS_VOLUME_STATUS:
2520 *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
2521 break;
2522
2523 case HFS_SET_BOOT_INFO:
2524 if (!vnode_isvroot(vp))
2525 return(EINVAL);
2526 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
2527 return(EACCES); /* must be superuser or owner of filesystem */
2528 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2529 return (EROFS);
2530 }
2531 hfs_lock_mount (hfsmp);
2532 bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
2533 hfs_unlock_mount (hfsmp);
2534 (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT);
2535 break;
2536
2537 case HFS_GET_BOOT_INFO:
2538 if (!vnode_isvroot(vp))
2539 return(EINVAL);
2540 hfs_lock_mount (hfsmp);
2541 bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
2542 hfs_unlock_mount(hfsmp);
2543 break;
2544
2545 case HFS_MARK_BOOT_CORRUPT:
2546 /* Mark the boot volume corrupt by setting
2547 * kHFSVolumeInconsistentBit in the volume header. This will
2548 * force fsck_hfs on next mount.
2549 */
2550 if (!kauth_cred_issuser(kauth_cred_get())) {
2551 return EACCES;
2552 }
2553
2554 /* Allowed only on the root vnode of the boot volume */
2555 if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
2556 !vnode_isvroot(vp)) {
2557 return EINVAL;
2558 }
2559 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2560 return (EROFS);
2561 }
2562 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
2563 hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED);
2564 break;
2565
2566 case HFS_FSCTL_GET_JOURNAL_INFO:
2567 jip = (struct hfs_journal_info*)ap->a_data;
2568
2569 if (vp == NULLVP)
2570 return EINVAL;
2571
2572 if (hfsmp->jnl == NULL) {
2573 jnl_start = 0;
2574 jnl_size = 0;
2575 } else {
2576 jnl_start = hfs_blk_to_bytes(hfsmp->jnl_start, hfsmp->blockSize) + hfsmp->hfsPlusIOPosOffset;
2577 jnl_size = hfsmp->jnl_size;
2578 }
2579
2580 jip->jstart = jnl_start;
2581 jip->jsize = jnl_size;
2582 break;
2583
2584 case HFS_SET_ALWAYS_ZEROFILL: {
2585 struct cnode *cp = VTOC(vp);
2586
2587 if (*(int *)ap->a_data) {
2588 cp->c_flag |= C_ALWAYS_ZEROFILL;
2589 } else {
2590 cp->c_flag &= ~C_ALWAYS_ZEROFILL;
2591 }
2592 break;
2593 }
2594
2595 case HFS_DISABLE_METAZONE: {
2596 /* Only root can disable metadata zone */
2597 if (!kauth_cred_issuser(kauth_cred_get())) {
2598 return EACCES;
2599 }
2600 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2601 return (EROFS);
2602 }
2603
2604 /* Disable metadata zone now */
2605 (void) hfs_metadatazone_init(hfsmp, true);
2606 printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
2607 break;
2608 }
2609
2610
2611 case HFS_FSINFO_METADATA_BLOCKS: {
2612 int error;
2613 struct hfsinfo_metadata *hinfo;
2614
2615 hinfo = (struct hfsinfo_metadata *)ap->a_data;
2616
2617 /* Get information about number of metadata blocks */
2618 error = hfs_getinfo_metadata_blocks(hfsmp, hinfo);
2619 if (error) {
2620 return error;
2621 }
2622
2623 break;
2624 }
2625
2626 case HFS_GET_FSINFO: {
2627 hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;
2628
2629 /* Only root is allowed to get fsinfo */
2630 if (!kauth_cred_issuser(kauth_cred_get())) {
2631 return EACCES;
2632 }
2633
2634 /*
2635 * Make sure that the caller's version number matches
2636 * the kernel's version number. This will make sure that
2637 * if the structures being read/written into are changed
2638 * by the kernel, the caller will not read incorrect data.
2639 *
2640 * The first three fields --- request_type, version and
2641 * flags --- are the same for all the hfs_fsinfo structures, so
2642 * we can read the version number through any of the
2643 * structures.
2644 */
2645 if (fsinfo->header.version != HFS_FSINFO_VERSION) {
2646 return ENOTSUP;
2647 }
2648
2649 /* Make sure that the current file system is not marked inconsistent */
2650 if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
2651 return EIO;
2652 }
2653
2654 return hfs_get_fsinfo(hfsmp, ap->a_data);
2655 }
2656
2657 case HFS_CS_FREESPACE_TRIM: {
2658 int error = 0;
2659 int lockflags = 0;
2660
2661 /* Only root allowed */
2662 if (!kauth_cred_issuser(kauth_cred_get())) {
2663 return EACCES;
2664 }
2665
2666 /*
2667 * This core functionality is similar to hfs_scan_blocks().
2668 * The main difference is that hfs_scan_blocks() is called
2669 * as part of mount where we are assured that the journal is
2670 * empty to start with. This fcntl() can be called on a
2671 * mounted volume, therefore it has to flush the contents of
2672 * the journal and ensure the state of the summary table.
2673 *
2674 * This fcntl scans over the entire allocation bitmap,
2675 * creates a list of all the free blocks, and issues TRIM
2676 * down to the underlying device. This can take a long time
2677 * as it can generate up to 512MB of read I/O.
2678 */
2679
2680 if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
2681 error = hfs_init_summary(hfsmp);
2682 if (error) {
2683 printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN);
2684 return error;
2685 }
2686 }
2687
2688 /*
2689 * The journal maintains a list of recently deallocated blocks so it can
2690 * issue DKIOCUNMAPs when the corresponding journal transaction is
2691 * flushed to the disk. To avoid any race conditions, we only
2692 * want one active trim list and only one thread issuing DKIOCUNMAPs.
2693 * Therefore we make sure that the journal trim list is sync'ed,
2694 * empty, and not modifiable for the duration of our scan.
2695 *
2696 * Take the journal lock before flushing the journal to the disk.
2697 * We keep holding the journal lock until we acquire the
2698 * bitmap lock, to make sure that no new journal transactions can
2699 * start. This also ensures that the journal trim list is not
2700 * modified after the journal flush and before we get the bitmap lock.
2701 * We can release the journal lock after we acquire the bitmap
2702 * lock as it will prevent any further block deallocations.
2703 */
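	/*
	 * Lock ordering used below: hfs_journal_lock() -> hfs_flush(JOURNAL_META)
	 * -> hfs_systemfile_lock(SFL_BITMAP) -> hfs_journal_unlock() ->
	 * buf_invalidateblks()/ScanUnmapBlocks() -> hfs_systemfile_unlock().
	 */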
2704 hfs_journal_lock(hfsmp);
2705
2706 /* Flush the journal and wait for all I/Os to finish up */
2707 error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META);
2708 if (error) {
2709 hfs_journal_unlock(hfsmp);
2710 return error;
2711 }
2712
2713 /* Take bitmap lock to ensure it is not being modified */
2714 lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
2715
2716 /* Release the journal lock */
2717 hfs_journal_unlock(hfsmp);
2718
2719 /*
2720 * ScanUnmapBlocks reads the bitmap in large blocks
2721 * (up to 1MB), unlike the runtime, which reads the bitmap
2722 * in 4K blocks. This can cause buf_t collisions
2723 * and potential data corruption. To avoid this, we
2724 * invalidate all the existing buffers associated with
2725 * the bitmap vnode before scanning it.
2726 *
2727 * Note: ScanUnmapBlocks() cleans up all the buffers
2728 * after itself, so there won't be any large buffers left
2729 * for us to clean up after it returns.
2730 */
2731 error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
2732 if (error) {
2733 hfs_systemfile_unlock(hfsmp, lockflags);
2734 return error;
2735 }
2736
2737 /* Traverse bitmap and issue DKIOCUNMAPs */
2738 error = ScanUnmapBlocks(hfsmp);
2739 hfs_systemfile_unlock(hfsmp, lockflags);
2740 if (error) {
2741 return error;
2742 }
2743
2744 break;
2745 }
2746
2747 case HFS_SET_HOTFILE_STATE: {
2748 int error;
2749 struct cnode *cp = VTOC(vp);
2750 uint32_t hf_state = *((uint32_t*)ap->a_data);
2751 uint32_t num_unpinned = 0;
2752
2753 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2754 if (error) {
2755 return error;
2756 }
2757
2758 // printf("hfs: setting hotfile state %d on %s\n", hf_state, vp->v_name);
2759 if (hf_state == HFS_MARK_FASTDEVCANDIDATE) {
2760 vnode_setfastdevicecandidate(vp);
2761
2762 cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask;
2763 cp->c_attr.ca_recflags &= ~kHFSDoNotFastDevPinMask;
2764 cp->c_flag |= C_MODIFIED;
2765 } else if (hf_state == HFS_UNMARK_FASTDEVCANDIDATE || hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
2766 vnode_clearfastdevicecandidate(vp);
2767 hfs_removehotfile(vp);
2768
2769 if (cp->c_attr.ca_recflags & kHFSFastDevPinnedMask) {
2770 hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &num_unpinned);
2771 }
2772
2773 if (hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
2774 cp->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask;
2775 }
2776 cp->c_attr.ca_recflags &= ~(kHFSFastDevCandidateMask|kHFSFastDevPinnedMask);
2777 cp->c_flag |= C_MODIFIED;
2778
2779 } else {
2780 error = EINVAL;
2781 }
2782
2783 if (num_unpinned != 0) {
2784 lck_mtx_lock(&hfsmp->hfc_mutex);
2785 hfsmp->hfs_hotfile_freeblks += num_unpinned;
2786 lck_mtx_unlock(&hfsmp->hfc_mutex);
2787 }
2788
2789 hfs_unlock(cp);
2790 return error;
2791 }
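	/*
	 * State summary for HFS_SET_HOTFILE_STATE, as implemented above:
	 * HFS_MARK_FASTDEVCANDIDATE marks the vnode/cnode as a fast-device
	 * candidate; HFS_UNMARK_FASTDEVCANDIDATE and HFS_NEVER_FASTDEVCANDIDATE
	 * clear the candidate/pinned flags and unpin any pinned blocks (crediting
	 * them back to hfs_hotfile_freeblks), with the latter additionally setting
	 * kHFSDoNotFastDevPinMask so the file is never pinned again.
	 */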
2792
2793 case HFS_REPIN_HOTFILE_STATE: {
2794 int error=0;
2795 uint32_t repin_what = *((uint32_t*)ap->a_data);
2796
2797 /* Only root allowed */
2798 if (!kauth_cred_issuser(kauth_cred_get())) {
2799 return EACCES;
2800 }
2801
2802 if (!(hfsmp->hfs_flags & (HFS_CS_METADATA_PIN | HFS_CS_HOTFILE_PIN))) {
2803 // this system is neither regular Fusion nor Cooperative Fusion,
2804 // so this fsctl makes no sense.
2805 return EINVAL;
2806 }
2807
2808 //
2809 // After converting a CoreStorage volume to be encrypted, the
2810 // extents could have moved around underneath us. This call
2811 // allows corestoraged to re-pin everything that should be
2812 // pinned (it would happen on the next reboot too but that could
2813 // be a long time away).
2814 //
2815 if ((repin_what & HFS_REPIN_METADATA) && (hfsmp->hfs_flags & HFS_CS_METADATA_PIN)) {
2816 hfs_pin_fs_metadata(hfsmp);
2817 }
2818 if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) {
2819 hfs_repin_hotfiles(hfsmp);
2820 }
2821 if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_SWAPFILE_PIN)) {
2822 //XXX Swapfiles (marked SWAP_PINNED) may have moved too.
2823 //XXX Do we care? They have a more transient/dynamic nature/lifetime.
2824 }
2825
2826 return error;
2827 }
2828
2829 #if HFS_CONFIG_KEY_ROLL
2830
2831 case HFS_KEY_ROLL: {
2832 if (!kauth_cred_issuser(kauth_cred_get()))
2833 return EACCES;
2834
2835 hfs_key_roll_args_t *args = (hfs_key_roll_args_t *)ap->a_data;
2836
2837 return hfs_key_roll_op(ap->a_context, ap->a_vp, args);
2838 }
2839
2840 case HFS_GET_KEY_AUTO_ROLL: {
2841 if (!kauth_cred_issuser(kauth_cred_get()))
2842 return EACCES;
2843
2844 hfs_key_auto_roll_args_t *args = (hfs_key_auto_roll_args_t *)ap->a_data;
2845 if (args->api_version != HFS_KEY_AUTO_ROLL_API_VERSION_1)
2846 return ENOTSUP;
2847 args->flags = (ISSET(hfsmp->cproot_flags, CP_ROOT_AUTO_ROLL_OLD_CLASS_GENERATION)
2848 ? HFS_KEY_AUTO_ROLL_OLD_CLASS_GENERATION : 0);
2849 args->min_key_os_version = hfsmp->hfs_auto_roll_min_key_os_version;
2850 args->max_key_os_version = hfsmp->hfs_auto_roll_max_key_os_version;
2851 break;
2852 }
2853
2854 case HFS_SET_KEY_AUTO_ROLL: {
2855 if (!kauth_cred_issuser(kauth_cred_get()))
2856 return EACCES;
2857
2858 hfs_key_auto_roll_args_t *args = (hfs_key_auto_roll_args_t *)ap->a_data;
2859 if (args->api_version != HFS_KEY_AUTO_ROLL_API_VERSION_1)
2860 return ENOTSUP;
2861 return cp_set_auto_roll(hfsmp, args);
2862 }
2863
2864 #endif // HFS_CONFIG_KEY_ROLL
2865
2866 #if CONFIG_PROTECT
2867 case F_TRANSCODEKEY:
2868 /*
2869 * This API is only supported when called via kernel so
2870 * a_fflag must be set to 1 (it's not possible to get here
2871 * with it set to 1 via fsctl).
2872 */
2873 if (ap->a_fflag != 1)
2874 return ENOTTY;
2875 return cp_vnode_transcode(vp, (cp_key_t *)ap->a_data);
2876
2877 case F_GETPROTECTIONLEVEL:
2878 return cp_get_root_major_vers (vp, (uint32_t *)ap->a_data);
2879
2880 case F_GETDEFAULTPROTLEVEL:
2881 return cp_get_default_level(vp, (uint32_t *)ap->a_data);
2882 #endif // CONFIG_PROTECT
2883
2884 case FIOPINSWAP:
2885 return hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT | HFS_DATALESS_PIN,
2886 NULL);
2887
2888 default:
2889 return (ENOTTY);
2890 }
2891
2892 return 0;
2893 }
2894
2895 /*
2896 * select
2897 */
2898 int
2899 hfs_vnop_select(__unused struct vnop_select_args *ap)
2900 /*
2901 struct vnop_select_args {
2902 vnode_t a_vp;
2903 int a_which;
2904 int a_fflags;
2905 void *a_wql;
2906 vfs_context_t a_context;
2907 };
2908 */
2909 {
2910 /*
2911 * We should really check to see if I/O is possible.
2912 */
2913 return (1);
2914 }
2915
2916 /*
2917 * Converts a logical block number to a physical block, and optionally returns
2918 * the number of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
2919 * The physical block number is based on the device block size, currently 512 bytes.
2920 * The block run is returned in logical blocks, and is the REMAINING number of blocks
2921 */
2922 int
2923 hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
2924 {
2925 struct filefork *fp = VTOF(vp);
2926 struct hfsmount *hfsmp = VTOHFS(vp);
2927 int retval = E_NONE;
2928 u_int32_t logBlockSize;
2929 size_t bytesContAvail = 0;
2930 off_t blockposition;
2931 int lockExtBtree;
2932 int lockflags = 0;
2933
2934 /*
2935 * Check for underlying vnode requests and ensure that logical
2936 * to physical mapping is requested.
2937 */
2938 if (vpp != NULL)
2939 *vpp = hfsmp->hfs_devvp;
2940 if (bnp == NULL)
2941 return (0);
2942
2943 logBlockSize = GetLogicalBlockSize(vp);
2944 blockposition = (off_t)bn * logBlockSize;
2945
2946 lockExtBtree = overflow_extents(fp);
2947
2948 if (lockExtBtree)
2949 lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2950
2951 retval = MacToVFSError(
2952 MapFileBlockC (HFSTOVCB(hfsmp),
2953 (FCB*)fp,
2954 MAXPHYSIO,
2955 blockposition,
2956 bnp,
2957 &bytesContAvail));
2958
2959 if (lockExtBtree)
2960 hfs_systemfile_unlock(hfsmp, lockflags);
2961
2962 if (retval == E_NONE) {
2963 /* Figure out how many read ahead blocks there are */
2964 if (runp != NULL) {
2965 if (can_cluster(logBlockSize)) {
2966 /* Make sure this result never goes negative: */
2967 *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
2968 } else {
2969 *runp = 0;
2970 }
2971 }
2972 }
2973 return (retval);
2974 }
2975
2976 /*
2977 * Convert logical block number to file offset.
2978 */
2979 int
2980 hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
2981 /*
2982 struct vnop_blktooff_args {
2983 vnode_t a_vp;
2984 daddr64_t a_lblkno;
2985 off_t *a_offset;
2986 };
2987 */
2988 {
2989 if (ap->a_vp == NULL)
2990 return (EINVAL);
2991 *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
2992
2993 return(0);
2994 }
2995
2996 /*
2997 * Convert file offset to logical block number.
2998 */
2999 int
3000 hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
3001 /*
3002 struct vnop_offtoblk_args {
3003 vnode_t a_vp;
3004 off_t a_offset;
3005 daddr64_t *a_lblkno;
3006 };
3007 */
3008 {
3009 if (ap->a_vp == NULL)
3010 return (EINVAL);
3011 *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
3012
3013 return(0);
3014 }
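/*
 * Worked example for the two conversions above: with a logical block size of
 * 4096 bytes, logical block 3 corresponds to offset 3 * 4096 = 12288, and
 * offset 12288 maps back to logical block 12288 / 4096 = 3.
 */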
3015
3016 /*
3017 * Map file offset to physical block number.
3018 *
3019 * If this function is called for write operation, and if the file
3020 * had virtual blocks allocated (delayed allocation), real blocks
3021 * are allocated by calling ExtendFileC().
3022 *
3023 * If this function is called for read operation, and if the file
3024 * had virtual blocks allocated (delayed allocation), no change
3025 * to the size of file is done, and if required, rangelist is
3026 * searched for mapping.
3027 *
3028 * System file cnodes are expected to be locked (shared or exclusive).
3029 *
3030 * -- INVALID RANGES --
3031 *
3032 * Invalid ranges are used to keep track of where we have extended a
3033 * file, but have not yet written that data to disk. In the past we
3034 * would clear up the invalid ranges as we wrote to those areas, but
3035 * before data was actually flushed to disk. The problem with that
3036 * approach is that the data can be left in the cache and is therefore
3037 * still not valid on disk. So now we clear up the ranges here, when
3038 * the flags field has VNODE_WRITE set, indicating a write is about to
3039 * occur. This isn't ideal (ideally we want to clear them up when we
3040 * know the data has been successfully written), but it's the best we
3041 * can do.
3042 *
3043 * For reads, we use the invalid ranges here in block map to indicate
3044 * to the caller that the data should be zeroed (a_bpn == -1). We
3045 * have to be careful about what ranges we return to the cluster code.
3046 * Currently the cluster code can only handle non-rounded values for
3047 * the EOF; it cannot handle funny sized ranges in the middle of the
3048 * file (the main problem is that it sends down odd sized I/Os to the
3049 * disk). Our code currently works because whilst the very first
3050 * offset and the last offset in the invalid ranges are not aligned,
3051 * gaps in the invalid ranges between the first and last, have to be
3052 * aligned (because we always write page sized blocks). For example,
3053 * consider this arrangement:
3054 *
3055 * +-------------+-----+-------+------+
3056 * | |XXXXX| |XXXXXX|
3057 * +-------------+-----+-------+------+
3058 * a b c d
3059 *
3060 * This shows two invalid ranges <a, b> and <c, d>. Whilst a and d
3061 * are not necessarily aligned, b and c *must* be.
3062 *
3063 * Zero-filling occurs in a number of ways:
3064 *
3065 * 1. When a read occurs and we return with a_bpn == -1.
3066 *
3067 * 2. When hfs_fsync or hfs_filedone calls hfs_flush_invalid_ranges
3068 * which will cause us to iterate over the ranges bringing in
3069 * pages that are not present in the cache and zeroing them. Any
3070 * pages that are already in the cache are left untouched. Note
3071 * that hfs_fsync does not always flush invalid ranges.
3072 *
3073 * 3. When we extend a file we zero out from the old EOF to the end
3074 * of the page. It would be nice if we didn't have to do this if
3075 * the page wasn't present (and could defer it), but because of
3076 * the problem described above, we have to.
3077 *
3078 * The invalid ranges are also used to restrict the size that we write
3079 * out on disk: see hfs_prepare_fork_for_update.
3080 *
3081 * Note that invalid ranges are ignored when neither the VNODE_READ or
3082 * the VNODE_WRITE flag is specified. This is useful for the
3083 * F_LOG2PHYS* fcntls which are not interested in invalid ranges: they
3084 * just want to know whether blocks are physically allocated or not.
3085 */
3086 int
3087 hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
3088 /*
3089 struct vnop_blockmap_args {
3090 vnode_t a_vp;
3091 off_t a_foffset;
3092 size_t a_size;
3093 daddr64_t *a_bpn;
3094 size_t *a_run;
3095 void *a_poff;
3096 int a_flags;
3097 vfs_context_t a_context;
3098 };
3099 */
3100 {
3101 struct vnode *vp = ap->a_vp;
3102 struct cnode *cp;
3103 struct filefork *fp;
3104 struct hfsmount *hfsmp;
3105 size_t bytesContAvail = ap->a_size;
3106 int retval = E_NONE;
3107 int syslocks = 0;
3108 int lockflags = 0;
3109 struct rl_entry *invalid_range;
3110 enum rl_overlaptype overlaptype;
3111 int started_tr = 0;
3112 int tooklock = 0;
3113
3114 #if HFS_COMPRESSION
3115 if (VNODE_IS_RSRC(vp)) {
3116 /* allow blockmaps to the resource fork */
3117 } else {
3118 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
3119 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
3120 switch(state) {
3121 case FILE_IS_COMPRESSED:
3122 return ENOTSUP;
3123 case FILE_IS_CONVERTING:
3124 /* if FILE_IS_CONVERTING, we allow blockmap */
3125 break;
3126 default:
3127 printf("invalid state %d for compressed file\n", state);
3128 /* fall through */
3129 }
3130 }
3131 }
3132 #endif /* HFS_COMPRESSION */
3133
3134 /* Do not allow blockmap operation on a directory */
3135 if (vnode_isdir(vp)) {
3136 return (ENOTSUP);
3137 }
3138
3139 /*
3140 * Check for underlying vnode requests and ensure that logical
3141 * to physical mapping is requested.
3142 */
3143 if (ap->a_bpn == NULL)
3144 return (0);
3145
3146 hfsmp = VTOHFS(vp);
3147 cp = VTOC(vp);
3148 fp = VTOF(vp);
3149
3150 if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
3151 if (cp->c_lockowner != current_thread()) {
3152 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3153 tooklock = 1;
3154 }
3155
3156 // For reads, check the invalid ranges
3157 if (ISSET(ap->a_flags, VNODE_READ)) {
3158 if (ap->a_foffset >= fp->ff_size) {
3159 retval = ERANGE;
3160 goto exit;
3161 }
3162
3163 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
3164 ap->a_foffset + (off_t)bytesContAvail - 1,
3165 &invalid_range);
3166 switch(overlaptype) {
3167 case RL_MATCHINGOVERLAP:
3168 case RL_OVERLAPCONTAINSRANGE:
3169 case RL_OVERLAPSTARTSBEFORE:
3170 /* There's no valid block for this byte offset */
3171 *ap->a_bpn = (daddr64_t)-1;
3172 /* There's no point limiting the amount to be returned
3173 * if the invalid range that was hit extends all the way
3174 * to the EOF (i.e. there's no valid bytes between the
3175 * end of this range and the file's EOF):
3176 */
3177 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3178 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3179 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3180 }
3181
3182 retval = 0;
3183 goto exit;
3184
3185 case RL_OVERLAPISCONTAINED:
3186 case RL_OVERLAPENDSAFTER:
3187 /* The range of interest hits an invalid block before the end: */
3188 if (invalid_range->rl_start == ap->a_foffset) {
3189 /* There's actually no valid information to be had starting here: */
3190 *ap->a_bpn = (daddr64_t)-1;
3191 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3192 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3193 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3194 }
3195
3196 retval = 0;
3197 goto exit;
3198 } else {
3199 /*
3200 * Sadly, the lower layers don't like us to
3201 * return unaligned ranges, so we skip over
3202 * any invalid ranges here that are less than
3203 * a page: zeroing of those bits is not our
3204 * responsibility (it's dealt with elsewhere).
3205 */
3206 do {
3207 off_t rounded_start = round_page_64(invalid_range->rl_start);
3208 if ((off_t)bytesContAvail < rounded_start - ap->a_foffset)
3209 break;
3210 if (rounded_start < invalid_range->rl_end + 1) {
3211 bytesContAvail = rounded_start - ap->a_foffset;
3212 break;
3213 }
3214 } while ((invalid_range = TAILQ_NEXT(invalid_range,
3215 rl_link)));
3216 }
3217 break;
3218
3219 case RL_NOOVERLAP:
3220 break;
3221 } // switch
3222 }
3223 }
3224
3225 #if CONFIG_PROTECT
3226 if (cp->c_cpentry) {
3227 const int direction = (ISSET(ap->a_flags, VNODE_WRITE)
3228 ? VNODE_WRITE : VNODE_READ);
3229
3230 cp_io_params_t io_params;
3231 cp_io_params(hfsmp, cp->c_cpentry,
3232 off_rsrc_make(ap->a_foffset, VNODE_IS_RSRC(vp)),
3233 direction, &io_params);
3234
3235 if (io_params.max_len < (off_t)bytesContAvail)
3236 bytesContAvail = io_params.max_len;
3237
3238 if (io_params.phys_offset != -1) {
3239 *ap->a_bpn = ((io_params.phys_offset + hfsmp->hfsPlusIOPosOffset)
3240 / hfsmp->hfs_logical_block_size);
3241
3242 retval = 0;
3243 goto exit;
3244 }
3245 }
3246 #endif
3247
3248 retry:
3249
3250 /* Check virtual blocks only when performing write operation */
3251 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
3252 if (hfs_start_transaction(hfsmp) != 0) {
3253 retval = EINVAL;
3254 goto exit;
3255 } else {
3256 started_tr = 1;
3257 }
3258 syslocks = SFL_EXTENTS | SFL_BITMAP;
3259
3260 } else if (overflow_extents(fp)) {
3261 syslocks = SFL_EXTENTS;
3262 }
3263
3264 if (syslocks)
3265 lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
3266
3267 /*
3268 * Check for any delayed allocations.
3269 */
3270 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
3271 int64_t actbytes;
3272 u_int32_t loanedBlocks;
3273
3274 //
3275 // Make sure we have a transaction. It's possible
3276 // that we came in and fp->ff_unallocblocks was zero
3277 // but during the time we blocked acquiring the extents
3278 // btree, ff_unallocblocks became non-zero and so we
3279 // will need to start a transaction.
3280 //
3281 if (started_tr == 0) {
3282 if (syslocks) {
3283 hfs_systemfile_unlock(hfsmp, lockflags);
3284 syslocks = 0;
3285 }
3286 goto retry;
3287 }
3288
3289 /*
3290 * Note: ExtendFileC will release any blocks on loan and
3291 * acquire real blocks. So we ask to extend by zero bytes,
3292 * since ExtendFileC will account for the virtual blocks.
3293 */
3294
3295 loanedBlocks = fp->ff_unallocblocks;
3296 retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
3297 kEFAllMask | kEFNoClumpMask, &actbytes);
3298
3299 if (retval) {
3300 fp->ff_unallocblocks = loanedBlocks;
3301 cp->c_blocks += loanedBlocks;
3302 fp->ff_blocks += loanedBlocks;
3303
3304 hfs_lock_mount (hfsmp);
3305 hfsmp->loanedBlocks += loanedBlocks;
3306 hfs_unlock_mount (hfsmp);
3307
3308 hfs_systemfile_unlock(hfsmp, lockflags);
3309 cp->c_flag |= C_MODIFIED;
3310 if (started_tr) {
3311 (void) hfs_update(vp, 0);
3312 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3313
3314 hfs_end_transaction(hfsmp);
3315 started_tr = 0;
3316 }
3317 goto exit;
3318 }
3319 }
3320
3321 retval = MapFileBlockC(hfsmp, (FCB *)fp, bytesContAvail, ap->a_foffset,
3322 ap->a_bpn, &bytesContAvail);
3323 if (syslocks) {
3324 hfs_systemfile_unlock(hfsmp, lockflags);
3325 syslocks = 0;
3326 }
3327
3328 if (retval) {
3329 /* On write, always return the error because virtual blocks, if any,
3330 * should have been allocated in ExtendFileC(). We do not
3331 * allocate virtual blocks on read, therefore return the error
3332 * only if no virtual blocks are allocated. Otherwise we search
3333 * the rangelist for zero-fills.
3334 */
3335 if ((MacToVFSError(retval) != ERANGE) ||
3336 (ap->a_flags & VNODE_WRITE) ||
3337 ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
3338 goto exit;
3339 }
3340
3341 /* Validate if the start offset is within logical file size */
3342 if (ap->a_foffset >= fp->ff_size) {
3343 goto exit;
3344 }
3345
3346 /*
3347 * At this point, we have encountered a failure during
3348 * MapFileBlockC that resulted in ERANGE, and we are not
3349 * servicing a write, and there are borrowed blocks.
3350 *
3351 * However, the cluster layer will not call blockmap for
3352 * blocks that are borrowed and in-cache. We have to assume
3353 * that because we observed ERANGE being emitted from
3354 * MapFileBlockC, this extent range is not valid on-disk. So
3355 * we treat this as a mapping that needs to be zero-filled
3356 * prior to reading.
3357 */
3358
3359 if (fp->ff_size - ap->a_foffset < (off_t)bytesContAvail)
3360 bytesContAvail = fp->ff_size - ap->a_foffset;
3361
3362 *ap->a_bpn = (daddr64_t) -1;
3363 retval = 0;
3364
3365 goto exit;
3366 }
3367
3368 exit:
3369 if (retval == 0) {
3370 if (ISSET(ap->a_flags, VNODE_WRITE)) {
3371 struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
3372
3373 // See if we might be overlapping invalid ranges...
3374 if (r && (ap->a_foffset + (off_t)bytesContAvail) > r->rl_start) {
3375 /*
3376 * Mark the file as needing an update if we think the
3377 * on-disk EOF has changed.
3378 */
3379 if (ap->a_foffset <= r->rl_start)
3380 SET(cp->c_flag, C_MODIFIED);
3381
3382 /*
3383 * This isn't the ideal place to put this. Ideally, we
3384 * should do something *after* we have successfully
3385 * written to the range, but that's difficult to do
3386 * because we cannot take locks in the callback. At
3387 * present, the cluster code will call us with VNODE_WRITE
3388 * set just before it's about to write the data so we know
3389 * that data is about to be written. If we get an I/O
3390 * error at this point then chances are the metadata
3391 * update to follow will also have an I/O error so the
3392 * risk here is small.
3393 */
3394 rl_remove(ap->a_foffset, ap->a_foffset + bytesContAvail - 1,
3395 &fp->ff_invalidranges);
3396
3397 if (!TAILQ_FIRST(&fp->ff_invalidranges)) {
3398 cp->c_flag &= ~C_ZFWANTSYNC;
3399 cp->c_zftimeout = 0;
3400 }
3401 }
3402 }
3403
3404 if (ap->a_run)
3405 *ap->a_run = bytesContAvail;
3406
3407 if (ap->a_poff)
3408 *(int *)ap->a_poff = 0;
3409 }
3410
3411 if (started_tr) {
3412 hfs_update(vp, TRUE);
3413 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3414 hfs_end_transaction(hfsmp);
3415 started_tr = 0;
3416 }
3417
3418 if (tooklock)
3419 hfs_unlock(cp);
3420
3421 return (MacToVFSError(retval));
3422 }
3423
3424 /*
3425 * prepare and issue the I/O
3426 * buf_strategy knows how to deal
3427 * with requests that require
3428 * fragmented I/Os
3429 */
3430 int
3431 hfs_vnop_strategy(struct vnop_strategy_args *ap)
3432 {
3433 buf_t bp = ap->a_bp;
3434 vnode_t vp = buf_vnode(bp);
3435 int error = 0;
3436
3437 /* Mark buffer as containing static data if cnode flag set */
3438 if (VTOC(vp)->c_flag & C_SSD_STATIC) {
3439 buf_markstatic(bp);
3440 }
3441
3442 /* Mark buffer for greedy mode writes if cnode flag set */
3443 if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
3444 bufattr_markgreedymode(buf_attr(bp));
3445 }
3446
3447 /* mark buffer as containing burst mode data if cnode flag set */
3448 if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) {
3449 bufattr_markisochronous(buf_attr(bp));
3450 }
3451
3452 #if CONFIG_PROTECT
3453 error = cp_handle_strategy(bp);
3454
3455 if (error)
3456 return error;
3457 #endif
3458
3459 error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
3460
3461 return error;
3462 }
3463
3464 int
3465 do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
3466 {
3467 register struct cnode *cp = VTOC(vp);
3468 struct filefork *fp = VTOF(vp);
3469 kauth_cred_t cred = vfs_context_ucred(context);
3470 int retval;
3471 off_t bytesToAdd;
3472 off_t actualBytesAdded;
3473 off_t filebytes;
3474 u_int32_t fileblocks;
3475 int blksize;
3476 struct hfsmount *hfsmp;
3477 int lockflags;
3478 int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);
3479
3480 blksize = VTOVCB(vp)->blockSize;
3481 fileblocks = fp->ff_blocks;
3482 filebytes = (off_t)fileblocks * (off_t)blksize;
3483
3484 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START,
3485 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3486
3487 if (length < 0)
3488 return (EINVAL);
3489
3490 /* This should only happen with a corrupt filesystem */
3491 if ((off_t)fp->ff_size < 0)
3492 return (EINVAL);
3493
3494 if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
3495 return (EFBIG);
3496
3497 hfsmp = VTOHFS(vp);
3498
3499 retval = E_NONE;
3500
3501 /* Files that are changing size are not hot file candidates. */
3502 if (hfsmp->hfc_stage == HFC_RECORDING) {
3503 fp->ff_bytesread = 0;
3504 }
3505
3506 /*
3507 * We cannot just check if fp->ff_size == length (as an optimization)
3508 * since there may be extra physical blocks that also need truncation.
3509 */
3510 #if QUOTA
3511 if ((retval = hfs_getinoquota(cp)))
3512 return(retval);
3513 #endif /* QUOTA */
3514
3515 /*
3516 * Lengthen the size of the file. We must ensure that the
3517 * last byte of the file is allocated. Since the smallest
3518 * value of ff_size is 0, length will be at least 1.
3519 */
3520 if (length > (off_t)fp->ff_size) {
3521 #if QUOTA
3522 retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
3523 cred, 0);
3524 if (retval)
3525 goto Err_Exit;
3526 #endif /* QUOTA */
3527 /*
3528 * If we don't have enough physical space then
3529 * we need to extend the physical size.
3530 */
3531 if (length > filebytes) {
3532 int eflags;
3533 u_int32_t blockHint = 0;
3534
3535 /* All or nothing and don't round up to clumpsize. */
3536 eflags = kEFAllMask | kEFNoClumpMask;
3537
3538 if (cred && (suser(cred, NULL) != 0)) {
3539 eflags |= kEFReserveMask; /* keep a reserve */
3540 }
3541
3542 /*
3543 * Allocate Journal and Quota files in metadata zone.
3544 */
3545 if (filebytes == 0 &&
3546 hfsmp->hfs_flags & HFS_METADATA_ZONE &&
3547 hfs_virtualmetafile(cp)) {
3548 eflags |= kEFMetadataMask;
3549 blockHint = hfsmp->hfs_metazone_start;
3550 }
3551 if (hfs_start_transaction(hfsmp) != 0) {
3552 retval = EINVAL;
3553 goto Err_Exit;
3554 }
3555
3556 /* Protect extents b-tree and allocation bitmap */
3557 lockflags = SFL_BITMAP;
3558 if (overflow_extents(fp))
3559 lockflags |= SFL_EXTENTS;
3560 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3561
3562 /*
3563 * Keep growing the file as long as the current EOF is
3564 * less than the desired value.
3565 */
3566 while ((length > filebytes) && (retval == E_NONE)) {
3567 bytesToAdd = length - filebytes;
3568 retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
3569 (FCB*)fp,
3570 bytesToAdd,
3571 blockHint,
3572 eflags,
3573 &actualBytesAdded));
3574
3575 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3576 if (actualBytesAdded == 0 && retval == E_NONE) {
3577 if (length > filebytes)
3578 length = filebytes;
3579 break;
3580 }
3581 } /* endwhile */
3582
3583 hfs_systemfile_unlock(hfsmp, lockflags);
3584
3585 if (hfsmp->jnl) {
3586 hfs_update(vp, 0);
3587 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3588 }
3589
3590 hfs_end_transaction(hfsmp);
3591
3592 if (retval)
3593 goto Err_Exit;
3594
3595 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3596 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3597 }
3598
3599 if (ISSET(flags, IO_NOZEROFILL)) {
3600 // An optimisation for the hibernation file
3601 if (vnode_isswap(vp))
3602 rl_remove_all(&fp->ff_invalidranges);
3603 } else {
3604 if (!vnode_issystem(vp) && retval == E_NONE) {
3605 if (length > (off_t)fp->ff_size) {
3606 struct timeval tv;
3607
3608 /* Extending the file: time to fill out the current last page with zeroes? */
3609 if (fp->ff_size & PAGE_MASK_64) {
3610 /* There might be some valid data at the start of the (current) last page
3611 of the file, so zero out the remainder of that page to ensure the
3612 entire page contains valid data. */
3613 hfs_unlock(cp);
3614 retval = hfs_zero_eof_page(vp, length);
3615 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3616 if (retval) goto Err_Exit;
3617 }
3618 microuptime(&tv);
3619 rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
3620 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3621 }
3622 } else {
3623 panic("hfs_truncate: invoked on non-UBC object?!");
3624 };
3625 }
3626 if (suppress_times == 0) {
3627 cp->c_touch_modtime = TRUE;
3628 }
3629 fp->ff_size = length;
3630
3631 } else { /* Shorten the size of the file */
3632
3633 // An optimisation for the hibernation file
3634 if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) {
3635 rl_remove_all(&fp->ff_invalidranges);
3636 } else if ((off_t)fp->ff_size > length) {
3637 /* Any space previously marked as invalid is now irrelevant: */
3638 rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
3639 }
3640
3641 /*
3642 * Account for any unmapped blocks. Note that the new
3643 * file length can still end up with unmapped blocks.
3644 */
3645 if (fp->ff_unallocblocks > 0) {
3646 u_int32_t finalblks;
3647 u_int32_t loanedBlocks;
3648
3649 hfs_lock_mount(hfsmp);
3650 loanedBlocks = fp->ff_unallocblocks;
3651 cp->c_blocks -= loanedBlocks;
3652 fp->ff_blocks -= loanedBlocks;
3653 fp->ff_unallocblocks = 0;
3654
3655 hfsmp->loanedBlocks -= loanedBlocks;
3656
3657 finalblks = (length + blksize - 1) / blksize;
3658 if (finalblks > fp->ff_blocks) {
3659 /* calculate required unmapped blocks */
3660 loanedBlocks = finalblks - fp->ff_blocks;
3661 hfsmp->loanedBlocks += loanedBlocks;
3662
3663 fp->ff_unallocblocks = loanedBlocks;
3664 cp->c_blocks += loanedBlocks;
3665 fp->ff_blocks += loanedBlocks;
3666 }
3667 hfs_unlock_mount (hfsmp);
3668 }
3669
3670 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
3671 if (hfs_start_transaction(hfsmp) != 0) {
3672 retval = EINVAL;
3673 goto Err_Exit;
3674 }
3675
3676 if (fp->ff_unallocblocks == 0) {
3677 /* Protect extents b-tree and allocation bitmap */
3678 lockflags = SFL_BITMAP;
3679 if (overflow_extents(fp))
3680 lockflags |= SFL_EXTENTS;
3681 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3682
3683 retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0,
3684 FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
3685
3686 hfs_systemfile_unlock(hfsmp, lockflags);
3687 }
3688 if (hfsmp->jnl) {
3689 if (retval == 0) {
3690 fp->ff_size = length;
3691 }
3692 hfs_update(vp, 0);
3693 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3694 }
3695 hfs_end_transaction(hfsmp);
3696
3697 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3698 if (retval)
3699 goto Err_Exit;
3700 #if QUOTA
3701 /* These are bytesreleased */
3702 (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
3703 #endif /* QUOTA */
3704
3705 //
3706 // Unlike when growing a file, we adjust the hotfile block count here
3707 // instead of deeper down in the block allocation code because we do
3708 // not necessarily have a vnode or "fcb" at the time we're deleting
3709 // the file and so we wouldn't know if it was hotfile cached or not
3710 //
3711 hfs_hotfile_adjust_blocks(vp, (int64_t)((savedbytes - filebytes) / blksize));
3712
3713
3714 /*
3715 * Only set update flag if the logical length changes & we aren't
3716 * suppressing modtime updates.
3717 */
3718 if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
3719 cp->c_touch_modtime = TRUE;
3720 }
3721 fp->ff_size = length;
3722 }
3723 if (cp->c_mode & (S_ISUID | S_ISGID)) {
3724 if (!vfs_context_issuser(context))
3725 cp->c_mode &= ~(S_ISUID | S_ISGID);
3726 }
3727 cp->c_flag |= C_MODIFIED;
3728 cp->c_touch_chgtime = TRUE; /* status changed */
3729 if (suppress_times == 0) {
3730 cp->c_touch_modtime = TRUE; /* file data was modified */
3731
3732 /*
3733 * If we are not suppressing the modtime update, then
3734 * update the gen count as well.
3735 */
3736 if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
3737 hfs_incr_gencount(cp);
3738 }
3739 }
3740
3741 retval = hfs_update(vp, 0);
3742 if (retval) {
3743 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3744 -1, -1, -1, retval, 0);
3745 }
3746
3747 Err_Exit:
3748
3749 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END,
3750 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
3751
3752 return (retval);
3753 }
3754
3755 /*
3756 * Preparation which must be done prior to deleting the catalog record
3757 * of a file or directory. In order to make the on-disk as safe as possible,
3758 * we remove the catalog entry before releasing the bitmap blocks and the
3759 * overflow extent records. However, some work must be done prior to deleting
3760 * the catalog record.
3761 *
3762 * When calling this function, the cnode must exist both in memory and on-disk.
3763 * If there are both resource fork and data fork vnodes, this function should
3764 * be called on both.
3765 */
3766
3767 int
3768 hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
3769
3770 struct filefork *fp = VTOF(vp);
3771 struct cnode *cp = VTOC(vp);
3772 #if QUOTA
3773 int retval = 0;
3774 #endif /* QUOTA */
3775
3776 /* Cannot truncate an HFS directory! */
3777 if (vnode_isdir(vp)) {
3778 return (EISDIR);
3779 }
3780
3781 /*
3782 * See the comment below in hfs_truncate for why we need to call
3783 * setsize here. Essentially we want to avoid pending I/O if we
3784 * already know that the blocks are going to be released here.
3785 * This function is only called when totally removing all storage for a file, so
3786 * we can take a shortcut and call ubc_setsize(vp, 0) immediately.
3787 */
3788 ubc_setsize(vp, 0);
3789
3790 /* This should only happen with a corrupt filesystem */
3791 if ((off_t)fp->ff_size < 0)
3792 return (EINVAL);
3793
3794 /*
3795 * We cannot just check if fp->ff_size == length (as an optimization)
3796 * since there may be extra physical blocks that also need truncation.
3797 */
3798 #if QUOTA
3799 if ((retval = hfs_getinoquota(cp))) {
3800 return(retval);
3801 }
3802 #endif /* QUOTA */
3803
3804 /* Wipe out any invalid ranges which have yet to be backed by disk */
3805 rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);
3806
3807 /*
3808 * Account for any unmapped blocks. Since we're deleting the
3809 * entire file, we don't have to worry about just shrinking
3810 * to a smaller number of borrowed blocks.
3811 */
3812 if (fp->ff_unallocblocks > 0) {
3813 u_int32_t loanedBlocks;
3814
3815 hfs_lock_mount (hfsmp);
3816 loanedBlocks = fp->ff_unallocblocks;
3817 cp->c_blocks -= loanedBlocks;
3818 fp->ff_blocks -= loanedBlocks;
3819 fp->ff_unallocblocks = 0;
3820
3821 hfsmp->loanedBlocks -= loanedBlocks;
3822
3823 hfs_unlock_mount (hfsmp);
3824 }
3825
3826 return 0;
3827 }
3828
3829
3830 /*
3831 * Special wrapper around calling TruncateFileC. This function is usable
3832 * even when the catalog record does not exist any longer, making it ideal
3833 * for use when deleting a file. The simplification here is that we know
3834 * that we are releasing all blocks.
3835 *
3836 * Note that this function may be called when there is no vnode backing
3837 * the file fork in question. We may call this from hfs_vnop_inactive
3838 * to clear out resource fork data (and may not want to clear out the data
3839 * fork yet). As a result, we pointer-check both sets of inputs before
3840 * doing anything with them.
3841 *
3842 * The caller is responsible for saving off a copy of the filefork(s)
3843 * embedded within the cnode prior to calling this function. The pointers
3844 * supplied as arguments must be valid even if the cnode is no longer valid.
3845 */
3846
3847 int
3848 hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
3849 struct filefork *rsrcfork, u_int32_t fileid) {
3850
3851 off_t filebytes;
3852 u_int32_t fileblocks;
3853 int blksize = 0;
3854 int error = 0;
3855 int lockflags;
3856
3857 blksize = hfsmp->blockSize;
3858
3859 /* Data Fork */
3860 if (datafork) {
3861 off_t prev_filebytes;
3862
3863 datafork->ff_size = 0;
3864
3865 fileblocks = datafork->ff_blocks;
3866 filebytes = (off_t)fileblocks * (off_t)blksize;
3867 prev_filebytes = filebytes;
3868
3869 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3870
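/*
 * Release the data fork in HFS_BIGFILE_SIZE chunks: each pass lowers the
 * filebytes target and calls TruncateFileC inside its own transaction so
 * that no single transaction has to free an unbounded number of blocks.
 */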
3871 while (filebytes > 0) {
3872 if (filebytes > HFS_BIGFILE_SIZE) {
3873 filebytes -= HFS_BIGFILE_SIZE;
3874 } else {
3875 filebytes = 0;
3876 }
3877
3878 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3879 if (hfs_start_transaction(hfsmp) != 0) {
3880 error = EINVAL;
3881 break;
3882 }
3883
3884 if (datafork->ff_unallocblocks == 0) {
3885 /* Protect extents b-tree and allocation bitmap */
3886 lockflags = SFL_BITMAP;
3887 if (overflow_extents(datafork))
3888 lockflags |= SFL_EXTENTS;
3889 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3890
3891 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));
3892
3893 hfs_systemfile_unlock(hfsmp, lockflags);
3894 }
3895 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3896
3897 struct cnode *cp = datafork ? FTOC(datafork) : NULL;
3898 struct vnode *vp;
3899 vp = cp ? CTOV(cp, 0) : NULL;
3900 hfs_hotfile_adjust_blocks(vp, (int64_t)((prev_filebytes - filebytes) / blksize));
3901 prev_filebytes = filebytes;
3902
3903 /* Finish the transaction and start over if necessary */
3904 hfs_end_transaction(hfsmp);
3905
3906 if (error) {
3907 break;
3908 }
3909 }
3910 }
3911
3912 /* Resource fork */
3913 if (error == 0 && rsrcfork) {
3914 rsrcfork->ff_size = 0;
3915
3916 fileblocks = rsrcfork->ff_blocks;
3917 filebytes = (off_t)fileblocks * (off_t)blksize;
3918
3919 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3920
3921 while (filebytes > 0) {
3922 if (filebytes > HFS_BIGFILE_SIZE) {
3923 filebytes -= HFS_BIGFILE_SIZE;
3924 } else {
3925 filebytes = 0;
3926 }
3927
3928 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3929 if (hfs_start_transaction(hfsmp) != 0) {
3930 error = EINVAL;
3931 break;
3932 }
3933
3934 if (rsrcfork->ff_unallocblocks == 0) {
3935 /* Protect extents b-tree and allocation bitmap */
3936 lockflags = SFL_BITMAP;
3937 if (overflow_extents(rsrcfork))
3938 lockflags |= SFL_EXTENTS;
3939 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3940
3941 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));
3942
3943 hfs_systemfile_unlock(hfsmp, lockflags);
3944 }
3945 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3946
3947 /* Finish the transaction and start over if necessary */
3948 hfs_end_transaction(hfsmp);
3949
3950 if (error) {
3951 break;
3952 }
3953 }
3954 }
3955
3956 return error;
3957 }
3958
3959 errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock)
3960 {
3961 errno_t error;
3962
3963 /*
3964 * Call ubc_setsize to give the VM subsystem a chance to do
3965 * whatever it needs to with existing pages before we delete
3966 * blocks. Note that symlinks don't use the UBC so we'll
3967 * get back ENOENT in that case.
3968 */
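/*
 * If the caller holds the cnode lock, first try ubc_setsize_ex with
 * UBC_SETSIZE_NO_FS_REENTRY; EAGAIN means the resize could not complete
 * without re-entering the filesystem, so drop the cnode lock, retry
 * without the restriction, and re-take the lock afterwards.
 */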
3969 if (have_cnode_lock) {
3970 error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY);
3971 if (error == EAGAIN) {
3972 cnode_t *cp = VTOC(vp);
3973
3974 if (cp->c_truncatelockowner != current_thread())
3975 hfs_warn("hfs: hfs_ubc_setsize called without exclusive truncate lock!");
3976
3977 hfs_unlock(cp);
3978 error = ubc_setsize_ex(vp, len, 0);
3979 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
3980 }
3981 } else
3982 error = ubc_setsize_ex(vp, len, 0);
3983
3984 return error == ENOENT ? 0 : error;
3985 }
3986
3987 /*
3988 * Truncate a cnode to at most length size, freeing (or adding) the
3989 * disk blocks.
3990 */
3991 int
3992 hfs_truncate(struct vnode *vp, off_t length, int flags,
3993 int truncateflags, vfs_context_t context)
3994 {
3995 struct filefork *fp = VTOF(vp);
3996 off_t filebytes;
3997 u_int32_t fileblocks;
3998 int blksize;
3999 errno_t error = 0;
4000 struct cnode *cp = VTOC(vp);
4001 hfsmount_t *hfsmp = VTOHFS(vp);
4002
4003 /* Cannot truncate an HFS directory! */
4004 if (vnode_isdir(vp)) {
4005 return (EISDIR);
4006 }
4007 /* A swap file cannot change size. */
4008 if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) {
4009 return (EPERM);
4010 }
4011
4012 blksize = hfsmp->blockSize;
4013 fileblocks = fp->ff_blocks;
4014 filebytes = (off_t)fileblocks * (off_t)blksize;
4015
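/* If this thread already holds the cnode lock, leave lock management to the
   caller and let hfs_ubc_setsize drop and re-take it when necessary. */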
4016 bool caller_has_cnode_lock = (cp->c_lockowner == current_thread());
4017
4018 error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock);
4019 if (error)
4020 return error;
4021
4022 if (!caller_has_cnode_lock) {
4023 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
4024 if (error)
4025 return error;
4026 }
4027
4028 if (vnode_islnk(vp) && cp->c_datafork->ff_symlinkptr) {
4029 hfs_free(cp->c_datafork->ff_symlinkptr, cp->c_datafork->ff_size);
4030 cp->c_datafork->ff_symlinkptr = NULL;
4031 }
4032
4033 // have to loop truncating or growing files that are
4034 // really big because otherwise transactions can get
4035 // enormous and consume too many kernel resources.
4036
4037 if (length < filebytes) {
4038 while (filebytes > length) {
4039 if ((filebytes - length) > HFS_BIGFILE_SIZE) {
4040 filebytes -= HFS_BIGFILE_SIZE;
4041 } else {
4042 filebytes = length;
4043 }
4044 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
4045 if (error)
4046 break;
4047 }
4048 } else if (length > filebytes) {
4049 kauth_cred_t cred = vfs_context_ucred(context);
4050 const bool keep_reserve = cred && suser(cred, NULL) != 0;
4051
4052 if (hfs_freeblks(hfsmp, keep_reserve)
4053 < howmany(length - filebytes, blksize)) {
4054 error = ENOSPC;
4055 } else {
4056 while (filebytes < length) {
4057 if ((length - filebytes) > HFS_BIGFILE_SIZE) {
4058 filebytes += HFS_BIGFILE_SIZE;
4059 } else {
4060 filebytes = length;
4061 }
4062 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
4063 if (error)
4064 break;
4065 }
4066 }
4067 } else /* Same logical size */ {
4068
4069 error = do_hfs_truncate(vp, length, flags, truncateflags, context);
4070 }
4071 /* Files that are changing size are not hot file candidates. */
4072 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4073 fp->ff_bytesread = 0;
4074 }
4075
4076 #if HFS_CONFIG_KEY_ROLL
4077 if (!error && cp->c_truncatelockowner == current_thread()) {
4078 hfs_key_roll_check(cp, true);
4079 }
4080 #endif
4081
4082 if (!caller_has_cnode_lock)
4083 hfs_unlock(cp);
4084
4085 // Make sure UBC's size matches up (in case we didn't completely succeed)
4086 errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock);
4087 if (!error)
4088 error = err2;
4089
4090 return error;
4091 }
4092
4093
4094 /*
4095 * Preallocate file storage space.
4096 */
4097 int
4098 hfs_vnop_allocate(struct vnop_allocate_args /* {
4099 vnode_t a_vp;
4100 off_t a_length;
4101 u_int32_t a_flags;
4102 off_t *a_bytesallocated;
4103 off_t a_offset;
4104 vfs_context_t a_context;
4105 } */ *ap)
4106 {
4107 struct vnode *vp = ap->a_vp;
4108 struct cnode *cp;
4109 struct filefork *fp;
4110 ExtendedVCB *vcb;
4111 off_t length = ap->a_length;
4112 off_t startingPEOF;
4113 off_t moreBytesRequested;
4114 off_t actualBytesAdded;
4115 off_t filebytes;
4116 u_int32_t fileblocks;
4117 int retval, retval2;
4118 u_int32_t blockHint;
4119 u_int32_t extendFlags; /* For call to ExtendFileC */
4120 struct hfsmount *hfsmp;
4121 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
4122 int lockflags;
4123 time_t orig_ctime;
4124
4125 *(ap->a_bytesallocated) = 0;
4126
4127 if (!vnode_isreg(vp))
4128 return (EISDIR);
4129 if (length < (off_t)0)
4130 return (EINVAL);
4131
4132 cp = VTOC(vp);
4133
4134 orig_ctime = VTOC(vp)->c_ctime;
4135
4136 nspace_snapshot_event(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
4137
4138 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
4139
4140 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4141 goto Err_Exit;
4142 }
4143
4144 fp = VTOF(vp);
4145 hfsmp = VTOHFS(vp);
4146 vcb = VTOVCB(vp);
4147
4148 fileblocks = fp->ff_blocks;
4149 filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
4150
4151 if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
4152 retval = EINVAL;
4153 goto Err_Exit;
4154 }
4155
4156 /* Fill in the flags word for the call to Extend the file */
4157
4158 extendFlags = kEFNoClumpMask;
4159 if (ap->a_flags & ALLOCATECONTIG)
4160 extendFlags |= kEFContigMask;
4161 if (ap->a_flags & ALLOCATEALL)
4162 extendFlags |= kEFAllMask;
4163 if (cred && suser(cred, NULL) != 0)
4164 extendFlags |= kEFReserveMask;
4165 if (hfs_virtualmetafile(cp))
4166 extendFlags |= kEFMetadataMask;
4167
4168 retval = E_NONE;
4169 blockHint = 0;
4170 startingPEOF = filebytes;
4171
4172 if (ap->a_flags & ALLOCATEFROMPEOF)
4173 length += filebytes;
4174 else if (ap->a_flags & ALLOCATEFROMVOL)
4175 blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
4176
4177 /* If no changes are necessary, then we're done */
4178 if (filebytes == length)
4179 goto Std_Exit;
4180
4181 /*
4182 * Lengthen the size of the file. We must ensure that the
4183 * last byte of the file is allocated. Since the smallest
4184 * value of filebytes is 0, length will be at least 1.
4185 */
4186 if (length > filebytes) {
4187 if (ISSET(extendFlags, kEFAllMask)
4188 && (hfs_freeblks(hfsmp, ISSET(extendFlags, kEFReserveMask))
4189 < howmany(length - filebytes, hfsmp->blockSize))) {
4190 retval = ENOSPC;
4191 goto Err_Exit;
4192 }
4193
4194 off_t total_bytes_added = 0, orig_request_size;
4195
4196 orig_request_size = moreBytesRequested = length - filebytes;
4197
4198 #if QUOTA
4199 retval = hfs_chkdq(cp,
4200 (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
4201 cred, 0);
4202 if (retval)
4203 goto Err_Exit;
4204
4205 #endif /* QUOTA */
4206 /*
4207 * Metadata zone checks.
4208 */
4209 if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
4210 /*
4211 * Allocate Journal and Quota files in metadata zone.
4212 */
4213 if (hfs_virtualmetafile(cp)) {
4214 blockHint = hfsmp->hfs_metazone_start;
4215 } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
4216 (blockHint <= hfsmp->hfs_metazone_end)) {
4217 /*
4218 * Move blockHint outside metadata zone.
4219 */
4220 blockHint = hfsmp->hfs_metazone_end + 1;
4221 }
4222 }
4223
4224
4225 while ((length > filebytes) && (retval == E_NONE)) {
4226 off_t bytesRequested;
4227
4228 if (hfs_start_transaction(hfsmp) != 0) {
4229 retval = EINVAL;
4230 goto Err_Exit;
4231 }
4232
4233 /* Protect extents b-tree and allocation bitmap */
4234 lockflags = SFL_BITMAP;
4235 if (overflow_extents(fp))
4236 lockflags |= SFL_EXTENTS;
4237 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4238
4239 if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
4240 bytesRequested = HFS_BIGFILE_SIZE;
4241 } else {
4242 bytesRequested = moreBytesRequested;
4243 }
4244
4245 if (extendFlags & kEFContigMask) {
4246 // if we're on a sparse device, this will force it to do a
4247 // full scan to find the space needed.
4248 hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
4249 }
4250
4251 retval = MacToVFSError(ExtendFileC(vcb,
4252 (FCB*)fp,
4253 bytesRequested,
4254 blockHint,
4255 extendFlags,
4256 &actualBytesAdded));
4257
4258 if (retval == E_NONE) {
4259 *(ap->a_bytesallocated) += actualBytesAdded;
4260 total_bytes_added += actualBytesAdded;
4261 moreBytesRequested -= actualBytesAdded;
4262 if (blockHint != 0) {
4263 blockHint += actualBytesAdded / vcb->blockSize;
4264 }
4265 }
4266 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4267
4268 hfs_systemfile_unlock(hfsmp, lockflags);
4269
4270 if (hfsmp->jnl) {
4271 (void) hfs_update(vp, 0);
4272 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
4273 }
4274
4275 hfs_end_transaction(hfsmp);
4276 }
4277
4278
4279 /*
4280 * if we get an error and no changes were made then exit
4281 * otherwise we must do the hfs_update to reflect the changes
4282 */
4283 if (retval && (startingPEOF == filebytes))
4284 goto Err_Exit;
4285
4286 /*
4287 * Adjust actualBytesAdded to be allocation block aligned, not
4288 * clump size aligned.
4289 * NOTE: What we are reporting does not affect reality
4290 * until the file is closed, when we truncate the file to allocation
4291 * block size.
4292 */
4293 if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
4294 *(ap->a_bytesallocated) =
4295 roundup(orig_request_size, (off_t)vcb->blockSize);
4296
4297 } else { /* Shorten the size of the file */
4298
4299 /*
4300 * N.B. At present, this code is never called. If and when we
4301 * do start using it, it looks like there might be slightly
4302 * strange semantics with the file size: it's possible for the
4303 * file size to *increase* e.g. if current file size is 5,
4304 * length is 1024 and filebytes is 4096, the file size will
4305 * end up being 1024 bytes. This isn't necessarily a problem
4306 * but it's not consistent with the code above which doesn't
4307 * change the file size.
4308 */
4309
4310 retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
4311 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4312
4313 /*
4314 * if we get an error and no changes were made then exit
4315 * otherwise we must do the hfs_update to reflect the changes
4316 */
4317 if (retval && (startingPEOF == filebytes)) goto Err_Exit;
4318 #if QUOTA
4319 /* These are bytesreleased */
4320 (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
4321 #endif /* QUOTA */
4322
4323 if (fp->ff_size > filebytes) {
4324 fp->ff_size = filebytes;
4325
4326 hfs_ubc_setsize(vp, fp->ff_size, true);
4327 }
4328 }
4329
4330 Std_Exit:
4331 cp->c_flag |= C_MODIFIED;
4332 cp->c_touch_chgtime = TRUE;
4333 cp->c_touch_modtime = TRUE;
4334 retval2 = hfs_update(vp, 0);
4335
4336 if (retval == 0)
4337 retval = retval2;
4338 Err_Exit:
4339 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4340 hfs_unlock(cp);
4341 return (retval);
4342 }
4343
4344
4345 /*
4346 * Pagein for HFS filesystem
4347 */
4348 int
4349 hfs_vnop_pagein(struct vnop_pagein_args *ap)
4350 /*
4351 struct vnop_pagein_args {
4352 vnode_t a_vp,
4353 upl_t a_pl,
4354 vm_offset_t a_pl_offset,
4355 off_t a_f_offset,
4356 size_t a_size,
4357 int a_flags
4358 vfs_context_t a_context;
4359 };
4360 */
4361 {
4362 vnode_t vp;
4363 struct cnode *cp;
4364 struct filefork *fp;
4365 int error = 0;
4366 upl_t upl;
4367 upl_page_info_t *pl;
4368 off_t f_offset;
4369 off_t page_needed_f_offset;
4370 int offset;
4371 int isize;
4372 int upl_size;
4373 int pg_index;
4374 boolean_t truncate_lock_held = FALSE;
4375 boolean_t file_converted = FALSE;
4376 kern_return_t kret;
4377
4378 vp = ap->a_vp;
4379 cp = VTOC(vp);
4380 fp = VTOF(vp);
4381
4382 #if CONFIG_PROTECT
4383 if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
4384 /*
4385 * If we errored here, then this means that one of two things occurred:
4386 * 1. there was a problem with the decryption of the key.
4387 * 2. the device is locked and we are not allowed to access this particular file.
4388 *
4389 * Either way, this means that we need to shut down this upl now. As long as
4390 * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves)
4391 * then we create a upl and immediately abort it.
4392 */
4393 if (ap->a_pl == NULL) {
4394 /* create the upl */
4395 ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl,
4396 UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4397 /* mark the range as needed so it doesn't immediately get discarded upon abort */
4398 ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);
4399
4400 /* Abort the range */
4401 ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4402 }
4403
4404
4405 return error;
4406 }
4407 #endif /* CONFIG_PROTECT */
4408
4409 if (ap->a_pl != NULL) {
4410 /*
4411 * this can only happen for swap files now that
4412 * we're asking for V2 paging behavior...
4413 * so don't need to worry about decompression, or
4414 * keeping track of blocks read or taking the truncate lock
4415 */
4416 error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
4417 ap->a_size, (off_t)fp->ff_size, ap->a_flags);
4418 goto pagein_done;
4419 }
4420
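/*
 * Remember the file offset of the page that actually triggered this fault;
 * the ENOSPC retry path below redrives the pagein for just that one page.
 */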
4421 page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset;
4422
4423 retry_pagein:
4424 /*
4425 * take truncate lock (shared/recursive) to guard against
4426 * zero-fill thru fsync interfering, but only for v2
4427 *
4428 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
4429 * lock shared and we are allowed to recurse 1 level if this thread already
4430 * owns the lock exclusively... this can legally occur
4431 * if we are doing a shrinking ftruncate against a file
4432 * that is mapped private, and the pages being truncated
4433 * do not currently exist in the cache... in that case
4434 * we will have to page-in the missing pages in order
4435 * to provide them to the private mapping... we must
4436 * also call hfs_unlock_truncate with a positive been_recursed
4437 * arg to indicate that if we have recursed, there is no need to drop
4438 * the lock. Allowing this simple recursion is necessary
4439 * in order to avoid a certain deadlock... since the ftruncate
4440 * already holds the truncate lock exclusively, if we try
4441 * to acquire it shared to protect the pagein path, we will
4442 * hang this thread
4443 *
4444 * NOTE: The if () block below is a workaround in order to prevent a
4445 * VM deadlock. See rdar://7853471.
4446 *
4447 * If we are in a forced unmount, then launchd will still have the
4448 * dyld_shared_cache file mapped as it is trying to reboot. If we
4449 * take the truncate lock here to service a page fault, then our
4450 * thread could deadlock with the forced-unmount. The forced unmount
4451 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
4452 * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount
4453 * thread will think it needs to copy all of the data out of the file
4454 * and into a VM copy object. If we hold the cnode lock here, then that
4455 * VM operation will not be able to proceed, because we'll set a busy page
4456 * before attempting to grab the lock. Note that this isn't as simple as "don't
4457 * call ubc_setsize" because doing that would just shift the problem to the
4458 * ubc_msync done before the vnode is reclaimed.
4459 *
4460 * So, if a forced unmount on this volume is in flight AND the cnode is
4461 * marked C_DELETED, then just go ahead and do the page in without taking
4462 * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file
4463 * that is not going to be available on the next mount, this seems like an
4464 * OK solution from a correctness point of view, even though it is hacky.
4465 */
4466 if (vfs_isforce(vnode_mount(vp))) {
4467 if (cp->c_flag & C_DELETED) {
4468 /* If we don't get it, then just go ahead and operate without the lock */
4469 truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4470 }
4471 }
4472 else {
4473 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4474 truncate_lock_held = TRUE;
4475 }
4476
4477 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4478
4479 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4480 error = EINVAL;
4481 goto pagein_done;
4482 }
4483 ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
4484
4485 upl_size = isize = ap->a_size;
4486
4487 /*
4488 * Scan from the back to find the last page in the UPL, so that we
4489 * aren't looking at a UPL that may have already been freed by the
4490 * preceding aborts/completions.
4491 */
4492 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4493 if (upl_page_present(pl, --pg_index))
4494 break;
4495 if (pg_index == 0) {
4496 /*
4497 * no absent pages were found in the range specified
4498 * just abort the UPL to get rid of it and then we're done
4499 */
4500 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4501 goto pagein_done;
4502 }
4503 }
4504 /*
4505 * initialize the offset variables before we touch the UPL.
4506 * f_offset is the position into the file, in bytes
4507 * offset is the position into the UPL, in bytes
4508 * pg_index is the pg# of the UPL we're operating on
4509 * isize is the offset into the UPL of the last page that is present.
4510 */
4511 isize = ((pg_index + 1) * PAGE_SIZE);
4512 pg_index = 0;
4513 offset = 0;
4514 f_offset = ap->a_f_offset;
4515
4516 while (isize) {
4517 int xsize;
4518 int num_of_pages;
4519
4520 if ( !upl_page_present(pl, pg_index)) {
4521 /*
4522 * we asked for RET_ONLY_ABSENT, so it's possible
4523 * to get back empty slots in the UPL.
4524 * just skip over them
4525 */
4526 f_offset += PAGE_SIZE;
4527 offset += PAGE_SIZE;
4528 isize -= PAGE_SIZE;
4529 pg_index++;
4530
4531 continue;
4532 }
4533 /*
4534 * We know that we have at least one absent page.
4535 * Now checking to see how many in a row we have
4536 */
4537 num_of_pages = 1;
4538 xsize = isize - PAGE_SIZE;
4539
4540 while (xsize) {
4541 if ( !upl_page_present(pl, pg_index + num_of_pages))
4542 break;
4543 num_of_pages++;
4544 xsize -= PAGE_SIZE;
4545 }
4546 xsize = num_of_pages * PAGE_SIZE;
4547
4548 #if HFS_COMPRESSION
4549 if (VNODE_IS_RSRC(vp)) {
4550 /* allow pageins of the resource fork */
4551 } else {
4552 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
4553
4554 if (compressed) {
4555
4556 if (truncate_lock_held) {
4557 /*
4558 * can't hold the truncate lock when calling into the decmpfs layer
4559 * since it calls back into this layer... even though we're only
4560 * holding the lock in shared mode, and the re-entrant path only
4561 * takes the lock shared, we can deadlock if some other thread
4562 * tries to grab the lock exclusively in between.
4563 */
4564 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4565 truncate_lock_held = FALSE;
4566 }
4567 ap->a_pl = upl;
4568 ap->a_pl_offset = offset;
4569 ap->a_f_offset = f_offset;
4570 ap->a_size = xsize;
4571
4572 error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
4573 /*
4574 * note that decmpfs_pagein_compressed can change the state of
4575 * 'compressed'... it will set it to 0 if the file is no longer
4576 * compressed once the compression lock is successfully taken
4577 * i.e. we would block on that lock while the file is being inflated
4578 */
4579 if (error == 0 && vnode_isfastdevicecandidate(vp)) {
4580 (void) hfs_addhotfile(vp);
4581 }
4582 if (compressed) {
4583 if (error == 0) {
4584 /* successful page-in, update the access time */
4585 VTOC(vp)->c_touch_acctime = TRUE;
4586
4587 //
4588 // compressed files are not traditional hot file candidates
4589 // but they may be for CF (which ignores the ff_bytesread
4590 // field)
4591 //
4592 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4593 fp->ff_bytesread = 0;
4594 }
4595 } else if (error == EAGAIN) {
4596 /*
4597 * EAGAIN indicates someone else already holds the compression lock...
4598 * to avoid deadlocking, we'll abort this range of pages with an
4599 * indication that the pagein needs to be redriven
4600 */
4601 ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
4602 } else if (error == ENOSPC) {
4603
4604 if (upl_size == PAGE_SIZE)
4605 panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n");
4606
4607 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4608
4609 ap->a_size = PAGE_SIZE;
4610 ap->a_pl = NULL;
4611 ap->a_pl_offset = 0;
4612 ap->a_f_offset = page_needed_f_offset;
4613
4614 goto retry_pagein;
4615 } else {
4616 ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4617 goto pagein_done;
4618 }
4619 goto pagein_next_range;
4620 }
4621 else {
4622 /*
4623 * Set file_converted only if the file became decompressed while we were
4624 * paging in. If it were still compressed, we would re-start the loop using the goto
4625 * in the above block. This avoids overloading truncate_lock_held as our retry_pagein
4626 * condition below, since we could have avoided taking the truncate lock to prevent
4627 * a deadlock in the force unmount case.
4628 */
4629 file_converted = TRUE;
4630 }
4631 }
4632 if (file_converted == TRUE) {
4633 /*
4634 * the file was converted back to a regular file after we first saw it as compressed
4635 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
4636 * reset a_size so that we consider what remains of the original request
4637 * and null out a_upl and a_pl_offset.
4638 *
4639 * We should only be able to get into this block if the decmpfs_pagein_compressed
4640 * successfully decompressed the range in question for this file.
4641 */
4642 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4643
4644 ap->a_size = isize;
4645 ap->a_pl = NULL;
4646 ap->a_pl_offset = 0;
4647
4648 /* Reset file_converted back to false so that we don't infinite-loop. */
4649 file_converted = FALSE;
4650 goto retry_pagein;
4651 }
4652 }
4653 #endif
4654 error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);
4655
4656 /*
4657 * Keep track of blocks read.
4658 */
4659 if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
4660 int bytesread;
4661 int took_cnode_lock = 0;
4662
4663 if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
4664 bytesread = fp->ff_size;
4665 else
4666 bytesread = xsize;
4667
4668 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
4669 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
4670 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4671 took_cnode_lock = 1;
4672 }
4673 /*
4674 * If this file hasn't been seen since the start of
4675 * the current sampling period then start over.
4676 */
4677 if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
4678 struct timeval tv;
4679
4680 fp->ff_bytesread = bytesread;
4681 microtime(&tv);
4682 cp->c_atime = tv.tv_sec;
4683 } else {
4684 fp->ff_bytesread += bytesread;
4685 }
4686 cp->c_touch_acctime = TRUE;
4687
4688 if (vnode_isfastdevicecandidate(vp)) {
4689 (void) hfs_addhotfile(vp);
4690 }
4691 if (took_cnode_lock)
4692 hfs_unlock(cp);
4693 }
4694 pagein_next_range:
4695 f_offset += xsize;
4696 offset += xsize;
4697 isize -= xsize;
4698 pg_index += num_of_pages;
4699
4700 error = 0;
4701 }
4702
4703 pagein_done:
4704 if (truncate_lock_held == TRUE) {
4705 /* HFS_LOCK_SKIP_IF_EXCLUSIVE acts as the been_recursed argument to hfs_unlock_truncate */
4706 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4707 }
4708
4709 return (error);
4710 }
4711
4712 /*
4713 * Pageout for HFS filesystem.
4714 */
4715 int
4716 hfs_vnop_pageout(struct vnop_pageout_args *ap)
4717 /*
4718 struct vnop_pageout_args {
4719 vnode_t a_vp,
4720 upl_t a_pl,
4721 vm_offset_t a_pl_offset,
4722 off_t a_f_offset,
4723 size_t a_size,
4724 int a_flags
4725 vfs_context_t a_context;
4726 };
4727 */
4728 {
4729 vnode_t vp = ap->a_vp;
4730 struct cnode *cp;
4731 struct filefork *fp;
4732 int retval = 0;
4733 off_t filesize;
4734 upl_t upl;
4735 upl_page_info_t* pl = NULL;
4736 vm_offset_t a_pl_offset;
4737 int a_flags;
4738 int is_pageoutv2 = 0;
4739 kern_return_t kret;
4740
4741 cp = VTOC(vp);
4742 fp = VTOF(vp);
4743
4744 a_flags = ap->a_flags;
4745 a_pl_offset = ap->a_pl_offset;
4746
4747 /*
4748 * we can tell if we're getting the new or old behavior from the UPL
4749 */
4750 if ((upl = ap->a_pl) == NULL) {
4751 int request_flags;
4752
4753 is_pageoutv2 = 1;
4754 /*
4755 * we're in control of any UPL we commit
4756 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
4757 */
4758 a_flags &= ~UPL_NOCOMMIT;
4759 a_pl_offset = 0;
4760
4761 /*
4762 * For V2 semantics, we want to take the cnode truncate lock
4763 * shared to guard against the file size changing via zero-filling.
4764 *
4765 * However, we have to be careful because we may be invoked
4766 * via the ubc_msync path to write out dirty mmap'd pages
4767 * in response to a lock event on a content-protected
4768 * filesystem (e.g. to write out class A files).
4769 * As a result, we want to take the truncate lock 'SHARED' with
4770 * the mini-recursion locktype so that we don't deadlock/panic
4771 * because we may be already holding the truncate lock exclusive to force any other
4772 * IOs to have blocked behind us.
4773 */
4774 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4775
4776 if (a_flags & UPL_MSYNC) {
4777 request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
4778 }
4779 else {
4780 request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
4781 }
4782
4783 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
4784
4785 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4786 retval = EINVAL;
4787 goto pageout_done;
4788 }
4789 }
4790 /*
4791 * from this point forward upl points at the UPL we're working with
4792 * it was either passed in or we successfully created it
4793 */
4794
4795 /*
4796 * Figure out where the file ends, for pageout purposes. If
4797 * ff_new_size > ff_size, then we're in the middle of extending the
4798 * file via a write, so it is safe (and necessary) that we be able
4799 * to pageout up to that point.
4800 */
4801 filesize = fp->ff_size;
4802 if (fp->ff_new_size > filesize)
4803 filesize = fp->ff_new_size;
4804
4805 /*
4806 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
4807 * UPL instead of relying on the UPL passed into us. We go ahead and do that here,
4808 * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for
4809 * N dirty ranges in the UPL. Note that this is almost a direct copy of the
4810 * logic in vnode_pageout except that we need to do it after grabbing the truncate
4811 * lock in HFS so that we don't lock invert ourselves.
4812 *
4813 * Note that we can still get into this function on behalf of the default pager with
4814 * non-V2 behavior (swapfiles). However in that case, we did not grab locks above
4815 * since fsync and other writing threads will grab the locks, then mark the
4816 * relevant pages as busy. But the pageout codepath marks the pages as busy,
4817 * and THEN would attempt to grab the truncate lock, which would result in deadlock. So
4818 * we do not try to grab anything for the pre-V2 case, which should only be accessed
4819 * by the paging/VM system.
4820 */
4821
4822 if (is_pageoutv2) {
4823 off_t f_offset;
4824 int offset;
4825 int isize;
4826 int pg_index;
4827 int error;
4828 int error_ret = 0;
4829
4830 isize = ap->a_size;
4831 f_offset = ap->a_f_offset;
4832
4833 /*
4834 * Scan from the back to find the last page in the UPL, so that we
4835 * aren't looking at a UPL that may have already been freed by the
4836 * preceding aborts/completions.
4837 */
4838 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4839 if (upl_page_present(pl, --pg_index))
4840 break;
4841 if (pg_index == 0) {
4842 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4843 goto pageout_done;
4844 }
4845 }
4846
4847 /*
4848 * initialize the offset variables before we touch the UPL.
4849 * a_f_offset is the position into the file, in bytes
4850 * offset is the position into the UPL, in bytes
4851 * pg_index is the pg# of the UPL we're operating on.
4852 * isize is the offset into the UPL of the last non-clean page.
4853 */
4854 isize = ((pg_index + 1) * PAGE_SIZE);
4855
4856 offset = 0;
4857 pg_index = 0;
4858
4859 while (isize) {
4860 int xsize;
4861 int num_of_pages;
4862
4863 if ( !upl_page_present(pl, pg_index)) {
4864 /*
4865 * we asked for RET_ONLY_DIRTY, so it's possible
4866 * to get back empty slots in the UPL.
4867 * just skip over them
4868 */
4869 f_offset += PAGE_SIZE;
4870 offset += PAGE_SIZE;
4871 isize -= PAGE_SIZE;
4872 pg_index++;
4873
4874 continue;
4875 }
4876 if ( !upl_dirty_page(pl, pg_index)) {
4877 panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
4878 }
4879
4880 /*
4881 * We know that we have at least one dirty page.
4882 * Now checking to see how many in a row we have
4883 */
4884 num_of_pages = 1;
4885 xsize = isize - PAGE_SIZE;
4886
4887 while (xsize) {
4888 if ( !upl_dirty_page(pl, pg_index + num_of_pages))
4889 break;
4890 num_of_pages++;
4891 xsize -= PAGE_SIZE;
4892 }
4893 xsize = num_of_pages * PAGE_SIZE;
4894
4895 if ((error = cluster_pageout(vp, upl, offset, f_offset,
4896 xsize, filesize, a_flags))) {
4897 if (error_ret == 0)
4898 error_ret = error;
4899 }
4900 f_offset += xsize;
4901 offset += xsize;
4902 isize -= xsize;
4903 pg_index += num_of_pages;
4904 }
4905 /* capture errnos bubbled out of cluster_pageout if they occurred */
4906 if (error_ret != 0) {
4907 retval = error_ret;
4908 }
4909 } /* end block for v2 pageout behavior */
4910 else {
4911 /*
4912 * just call cluster_pageout for old pre-v2 behavior
4913 */
4914 retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
4915 ap->a_size, filesize, a_flags);
4916 }
4917
4918 /*
4919 * If data was written, update the modification time of the file
4920 * but only if it's mapped writable; we will have touched the
4921 * modification time for direct writes.
4922 */
4923 if (retval == 0 && (ubc_is_mapped_writable(vp)
4924 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) {
4925 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4926
4927 // Check again with lock
4928 bool mapped_writable = ubc_is_mapped_writable(vp);
4929 if (mapped_writable
4930 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) {
4931 cp->c_touch_modtime = TRUE;
4932 cp->c_touch_chgtime = TRUE;
4933
4934 /*
4935 * We only need to increment the generation counter if
4936 * it's currently mapped writable because we incremented
4937 * the counter in hfs_vnop_mnomap.
4938 */
4939 if (mapped_writable)
4940 hfs_incr_gencount(VTOC(vp));
4941
4942 /*
4943 * If setuid or setgid bits are set and this process is
4944 * not the superuser then clear the setuid and setgid bits
4945 * as a precaution against tampering.
4946 */
4947 if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
4948 (vfs_context_suser(ap->a_context) != 0)) {
4949 cp->c_mode &= ~(S_ISUID | S_ISGID);
4950 }
4951 }
4952
4953 hfs_unlock(cp);
4954 }
4955
4956 pageout_done:
4957 if (is_pageoutv2) {
4958 /*
4959 * Release the truncate lock. Note that because
4960 * we may have taken the lock recursively by
4961 * being invoked via ubc_msync due to lockdown,
4962 * we should release it recursively, too.
4963 */
4964 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4965 }
4966 return (retval);
4967 }
4968
4969 /*
4970 * Intercept B-Tree node writes to unswap them if necessary.
4971 */
4972 int
4973 hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
4974 {
4975 int retval = 0;
4976 register struct buf *bp = ap->a_bp;
4977 register struct vnode *vp = buf_vnode(bp);
4978 BlockDescriptor block;
4979
4980 /* Trap B-Tree writes */
4981 if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
4982 (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
4983 (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
4984 (vp == VTOHFS(vp)->hfc_filevp)) {
4985
4986 /*
4987 * Swap and validate the node if it is in native byte order.
4988 * This is always true on big endian, so we always validate
4989 * before writing here. On little endian, the node typically has
4990 * been swapped and validated when it was written to the journal,
4991 * so we won't do anything here.
4992 */
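/*
 * The last u_int16_t of a B-tree node is the offset of record 0, which is
 * always sizeof(BTNodeDescriptor) == 14 (0x000e); reading 0x000e here in
 * host byte order indicates the node has not yet been swapped to big endian.
 */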
4993 if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
4994 /* Prepare the block pointer */
4995 block.blockHeader = bp;
4996 block.buffer = (char *)buf_dataptr(bp);
4997 block.blockNum = buf_lblkno(bp);
4998 /* not found in cache ==> came from disk */
4999 block.blockReadFromDisk = (buf_fromcache(bp) == 0);
5000 block.blockSize = buf_count(bp);
5001
5002 /* Endian un-swap B-Tree node */
5003 retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
5004 if (retval)
5005 panic("hfs_vnop_bwrite: about to write corrupt node!\n");
5006 }
5007 }
5008
5009 /* This buffer shouldn't be locked anymore but if it is clear it */
5010 if ((buf_flags(bp) & B_LOCKED)) {
5011 // XXXdbg
5012 if (VTOHFS(vp)->jnl) {
5013 panic("hfs: CLEARING the lock bit on bp %p\n", bp);
5014 }
5015 buf_clearflags(bp, B_LOCKED);
5016 }
5017 retval = vn_bwrite (ap);
5018
5019 return (retval);
5020 }
5021
5022
5023 int
5024 hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks)
5025 {
5026 _dk_cs_pin_t pin;
5027 unsigned ioc;
5028 int err;
5029
5030 memset(&pin, 0, sizeof(pin));
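/* Express the extent to (un)pin as a byte offset and length on the device. */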
5031 pin.cp_extent.offset = ((uint64_t)start_block) * HFSTOVCB(hfsmp)->blockSize;
5032 pin.cp_extent.length = ((uint64_t)nblocks) * HFSTOVCB(hfsmp)->blockSize;
5033 switch (pin_state) {
5034 case HFS_PIN_IT:
5035 ioc = _DKIOCCSPINEXTENT;
5036 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA;
5037 break;
5038 case HFS_PIN_IT | HFS_TEMP_PIN:
5039 ioc = _DKIOCCSPINEXTENT;
5040 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSTEMPORARYPIN;
5041 break;
5042 case HFS_PIN_IT | HFS_DATALESS_PIN:
5043 ioc = _DKIOCCSPINEXTENT;
5044 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSPINFORSWAPFILE;
5045 break;
5046 case HFS_UNPIN_IT:
5047 ioc = _DKIOCCSUNPINEXTENT;
5048 pin.cp_flags = 0;
5049 break;
5050 case HFS_UNPIN_IT | HFS_EVICT_PIN:
5051 ioc = _DKIOCCSPINEXTENT;
5052 pin.cp_flags = _DKIOCCSPINTOSLOWMEDIA;
5053 break;
5054 default:
5055 return EINVAL;
5056 }
5057 err = VNOP_IOCTL(hfsmp->hfs_devvp, ioc, (caddr_t)&pin, 0, vfs_context_kernel());
5058 return err;
5059 }
5060
5061 //
5062 // The cnode lock should already be held on entry to this function
5063 //
5064 int
5065 hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned)
5066 {
5067 struct filefork *fp = VTOF(vp);
5068 int i, err=0, need_put=0;
5069 struct vnode *rsrc_vp=NULL;
5070 uint32_t npinned = 0;
5071 off_t offset;
5072
5073 if (num_blocks_pinned) {
5074 *num_blocks_pinned = 0;
5075 }
5076
5077 if (vnode_vtype(vp) != VREG) {
5078 /* Not allowed to pin directories or symlinks */
5079 printf("hfs: can't pin vnode of type %d\n", vnode_vtype(vp));
5080 return (EPERM);
5081 }
5082
5083 if (fp->ff_unallocblocks) {
5084 printf("hfs: can't pin a vnode w/unalloced blocks (%d)\n", fp->ff_unallocblocks);
5085 return (EINVAL);
5086 }
5087
5088 /*
5089 * It is possible that if the caller unlocked/re-locked the cnode after checking
5090 * for C_NOEXISTS|C_DELETED that the file could have been deleted while the
5091 * cnode was unlocked. So check the condition again and return ENOENT so that
5092 * the caller knows why we failed to pin the vnode.
5093 */
5094 if (VTOC(vp)->c_flag & (C_NOEXISTS|C_DELETED)) {
5095 // makes no sense to pin something that's pending deletion
5096 return ENOENT;
5097 }
5098
5099 if (fp->ff_blocks == 0 && (VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
5100 if (!VNODE_IS_RSRC(vp) && hfs_vgetrsrc(hfsmp, vp, &rsrc_vp) == 0) {
5101 //printf("hfs: fileid %d resource fork nblocks: %d / size: %lld\n", VTOC(vp)->c_fileid,
5102 // VTOC(rsrc_vp)->c_rsrcfork->ff_blocks,VTOC(rsrc_vp)->c_rsrcfork->ff_size);
5103
5104 fp = VTOC(rsrc_vp)->c_rsrcfork;
5105 need_put = 1;
5106 }
5107 }
5108 if (fp->ff_blocks == 0) {
5109 if (need_put) {
5110 //
5111 // use a distinct error code for a compressed file that has no resource fork;
5112 // we return EALREADY to indicate that the data is already probably hot file
5113 // cached because it's in an EA and the attributes btree is on the ssd
5114 //
5115 err = EALREADY;
5116 } else {
5117 err = EINVAL;
5118 }
5119 goto out;
5120 }
5121
5122 offset = 0;
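/*
 * Pin the extents stored inline in the catalog record (at most
 * kHFSPlusExtentDensity of them); a zero startBlock ends the list, and any
 * overflow extents are handled separately below.
 */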
5123 for (i = 0; i < kHFSPlusExtentDensity; i++) {
5124 if (fp->ff_extents[i].startBlock == 0) {
5125 break;
5126 }
5127
5128 err = hfs_pin_block_range(hfsmp, pin_state, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount);
5129 if (err) {
5130 break;
5131 } else {
5132 npinned += fp->ff_extents[i].blockCount;
5133 }
5134 }
5135
5136 if (err || npinned == 0) {
5137 goto out;
5138 }
5139
5140 if (fp->ff_extents[kHFSPlusExtentDensity-1].startBlock) {
5141 uint32_t pblocks;
5142 uint8_t forktype = 0;
5143
5144 if (fp == VTOC(vp)->c_rsrcfork) {
5145 forktype = 0xff;
5146 }
5147 /*
5148 * The file could have overflow extents, better pin them.
5149 *
5150 * We assume that since we are holding the cnode lock for this cnode,
5151 * the file's extents cannot be manipulated, but the tree could, so we
5152 * need to ensure that it doesn't change behind our back as we iterate it.
5153 */
5154 int lockflags = hfs_systemfile_lock (hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
5155 err = hfs_pin_overflow_extents(hfsmp, VTOC(vp)->c_fileid, forktype, &pblocks);
5156 hfs_systemfile_unlock (hfsmp, lockflags);
5157
5158 if (err) {
5159 goto out;
5160 }
5161 npinned += pblocks;
5162 }
5163
5164 out:
5165 if (num_blocks_pinned) {
5166 *num_blocks_pinned = npinned;
5167 }
5168
5169 if (need_put && rsrc_vp) {
5170 //
5171 // have to unlock the cnode since it's shared between the
5172 // resource fork vnode and the data fork vnode (and the
5173 // vnode_put() may need to re-acquire the cnode lock to
5174 // reclaim the resource fork vnode)
5175 //
5176 hfs_unlock(VTOC(vp));
5177 vnode_put(rsrc_vp);
5178 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5179 }
5180 return err;
5181 }
5182
5183
5184 /*
5185 * Relocate a file to a new location on disk
5186 * cnode must be locked on entry
5187 *
5188 * Relocation occurs by cloning the file's data from its
5189 * current set of blocks to a new set of blocks. During
5190 * the relocation all of the blocks (old and new) are
5191 * owned by the file.
5192 *
5193 * -----------------
5194 * |///////////////|
5195 * -----------------
5196 * 0 N (file offset)
5197 *
5198 * ----------------- -----------------
5199 * |///////////////| | | STEP 1 (acquire new blocks)
5200 * ----------------- -----------------
5201 * 0 N N+1 2N
5202 *
5203 * ----------------- -----------------
5204 * |///////////////| |///////////////| STEP 2 (clone data)
5205 * ----------------- -----------------
5206 * 0 N N+1 2N
5207 *
5208 * -----------------
5209 * |///////////////| STEP 3 (head truncate blocks)
5210 * -----------------
5211 * 0 N
5212 *
5213 * During steps 2 and 3 page-outs to file offsets less
5214 * than or equal to N are suspended.
5215 *
5216 * During step 3 page-ins to the file get suspended.
5217 */
5218 int
5219 hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
5220 struct proc *p)
5221 {
5222 struct cnode *cp;
5223 struct filefork *fp;
5224 struct hfsmount *hfsmp;
5225 u_int32_t headblks;
5226 u_int32_t datablks;
5227 u_int32_t blksize;
5228 u_int32_t growsize;
5229 u_int32_t nextallocsave;
5230 daddr64_t sector_a, sector_b;
5231 int eflags;
5232 off_t newbytes;
5233 int retval;
5234 int lockflags = 0;
5235 int took_trunc_lock = 0;
5236 int started_tr = 0;
5237 enum vtype vnodetype;
5238
5239 vnodetype = vnode_vtype(vp);
5240 if (vnodetype != VREG) {
5241 /* Not allowed to move symlinks. */
5242 return (EPERM);
5243 }
5244
5245 hfsmp = VTOHFS(vp);
5246 if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
5247 return (ENOSPC);
5248 }
5249
5250 cp = VTOC(vp);
5251 fp = VTOF(vp);
5252 if (fp->ff_unallocblocks)
5253 return (EINVAL);
5254
5255 #if CONFIG_PROTECT
5256 /*
5257 * <rdar://problem/9118426>
5258 * Disable HFS file relocation on content-protected filesystems
5259 */
5260 if (cp_fs_protected (hfsmp->hfs_mp)) {
5261 return EINVAL;
5262 }
5263 #endif
5264 /* If it's an SSD, also disable HFS relocation */
5265 if (hfsmp->hfs_flags & HFS_SSD) {
5266 return EINVAL;
5267 }
5268
5269
5270 blksize = hfsmp->blockSize;
5271 if (blockHint == 0)
5272 blockHint = hfsmp->nextAllocation;
5273
5274 if (fp->ff_size > 0x7fffffff) {
5275 return (EFBIG);
5276 }
5277
5278 if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
5279 hfs_unlock(cp);
5280 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
5281 /* Force lock since callers expects lock to be held. */
5282 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
5283 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5284 return (retval);
5285 }
5286 /* No need to continue if file was removed. */
5287 if (cp->c_flag & C_NOEXISTS) {
5288 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5289 return (ENOENT);
5290 }
5291 took_trunc_lock = 1;
5292 }
5293 headblks = fp->ff_blocks;
5294 datablks = howmany(fp->ff_size, blksize);
5295 growsize = datablks * blksize;
5296 eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
5297 if (blockHint >= hfsmp->hfs_metazone_start &&
5298 blockHint <= hfsmp->hfs_metazone_end)
5299 eflags |= kEFMetadataMask;
5300
5301 if (hfs_start_transaction(hfsmp) != 0) {
5302 if (took_trunc_lock)
5303 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5304 return (EINVAL);
5305 }
5306 started_tr = 1;
5307 /*
5308 * Protect the extents b-tree and the allocation bitmap
5309 * during MapFileBlockC and ExtendFileC operations.
5310 */
5311 lockflags = SFL_BITMAP;
5312 if (overflow_extents(fp))
5313 lockflags |= SFL_EXTENTS;
5314 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5315
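/*
 * Map the last data block (offset growsize - 1) to its device sector
 * (sector_a); after extending, it is compared with the first sector of the
 * new space (sector_b), so an allocation that merely continues the existing
 * extent can be rejected, since relocating into it would not move the file.
 */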
5316 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
5317 if (retval) {
5318 retval = MacToVFSError(retval);
5319 goto out;
5320 }
5321
5322 /*
5323 * STEP 1 - acquire new allocation blocks.
5324 */
5325 nextallocsave = hfsmp->nextAllocation;
5326 retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
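/*
 * A metadata-zone allocation used the zone as its block hint; restore the
 * saved roving allocation pointer, presumably so that ordinary allocations
 * do not start scanning from inside the metadata zone.
 */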
5327 if (eflags & kEFMetadataMask) {
5328 hfs_lock_mount(hfsmp);
5329 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
5330 MarkVCBDirty(hfsmp);
5331 hfs_unlock_mount(hfsmp);
5332 }
5333
5334 retval = MacToVFSError(retval);
5335 if (retval == 0) {
5336 cp->c_flag |= C_MODIFIED;
5337 if (newbytes < growsize) {
5338 retval = ENOSPC;
5339 goto restore;
5340 } else if (fp->ff_blocks < (headblks + datablks)) {
5341 printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
5342 retval = ENOSPC;
5343 goto restore;
5344 }
5345
5346 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
5347 if (retval) {
5348 retval = MacToVFSError(retval);
5349 } else if ((sector_a + 1) == sector_b) {
5350 retval = ENOSPC;
5351 goto restore;
5352 } else if ((eflags & kEFMetadataMask) &&
5353 ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
5354 hfsmp->hfs_metazone_end)) {
5355 #if 0
5356 const char * filestr;
5357 char emptystr = '\0';
5358
5359 if (cp->c_desc.cd_nameptr != NULL) {
5360 filestr = (const char *)&cp->c_desc.cd_nameptr[0];
5361 } else if (vnode_name(vp) != NULL) {
5362 filestr = vnode_name(vp);
5363 } else {
5364 filestr = &emptystr;
5365 }
5366 #endif
5367 retval = ENOSPC;
5368 goto restore;
5369 }
5370 }
5371 /* Done with system locks and journal for now. */
5372 hfs_systemfile_unlock(hfsmp, lockflags);
5373 lockflags = 0;
5374 hfs_end_transaction(hfsmp);
5375 started_tr = 0;
5376
5377 if (retval) {
5378 /*
5379 * Check to see if failure is due to excessive fragmentation.
5380 */
5381 if ((retval == ENOSPC) &&
5382 (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
5383 hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
5384 }
5385 goto out;
5386 }
5387 /*
5388 * STEP 2 - clone file data into the new allocation blocks.
5389 */
5390
5391 if (vnodetype == VLNK)
5392 retval = EPERM;
5393 else if (vnode_issystem(vp))
5394 retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
5395 else
5396 retval = hfs_clonefile(vp, headblks, datablks, blksize);
5397
5398 /* Start transaction for step 3 or for a restore. */
5399 if (hfs_start_transaction(hfsmp) != 0) {
5400 retval = EINVAL;
5401 goto out;
5402 }
5403 started_tr = 1;
5404 if (retval)
5405 goto restore;
5406
5407 /*
5408 * STEP 3 - switch to cloned data and remove old blocks.
5409 */
5410 lockflags = SFL_BITMAP;
5411 if (overflow_extents(fp))
5412 lockflags |= SFL_EXTENTS;
5413 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5414
5415 retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
5416
5417 hfs_systemfile_unlock(hfsmp, lockflags);
5418 lockflags = 0;
5419 if (retval)
5420 goto restore;
5421 out:
5422 if (took_trunc_lock)
5423 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5424
5425 if (lockflags) {
5426 hfs_systemfile_unlock(hfsmp, lockflags);
5427 lockflags = 0;
5428 }
5429
5430 /* Push cnode's new extent data to disk. */
5431 if (retval == 0) {
5432 hfs_update(vp, 0);
5433 }
5434 if (hfsmp->jnl) {
5435 if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
5436 (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT);
5437 else
5438 (void) hfs_flushvolumeheader(hfsmp, 0);
5439 }
5440 exit:
5441 if (started_tr)
5442 hfs_end_transaction(hfsmp);
5443
5444 return (retval);
5445
5446 restore:
5447 if (fp->ff_blocks == headblks) {
5448 if (took_trunc_lock)
5449 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5450 goto exit;
5451 }
5452 /*
5453 * Give back any newly allocated space.
5454 */
5455 if (lockflags == 0) {
5456 lockflags = SFL_BITMAP;
5457 if (overflow_extents(fp))
5458 lockflags |= SFL_EXTENTS;
5459 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5460 }
5461
5462 (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
5463 FTOC(fp)->c_fileid, false);
5464
5465 hfs_systemfile_unlock(hfsmp, lockflags);
5466 lockflags = 0;
5467
5468 if (took_trunc_lock)
5469 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5470 goto exit;
5471 }
5472
5473
5474 /*
5475 * Clone a file's data within the file.
5476 *
5477 */
5478 static int
5479 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
5480 {
5481 caddr_t bufp;
5482 size_t bufsize;
5483 size_t copysize;
5484 size_t iosize;
5485 size_t offset;
5486 off_t writebase;
5487 uio_t auio;
5488 int error = 0;
5489
5490 writebase = blkstart * blksize;
5491 copysize = blkcnt * blksize;
5492 iosize = bufsize = MIN(copysize, 128 * 1024);
5493 offset = 0;
5494
5495 hfs_unlock(VTOC(vp));
5496
5497 #if CONFIG_PROTECT
5498 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
5499 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5500 return (error);
5501 }
5502 #endif /* CONFIG_PROTECT */
5503
5504 bufp = hfs_malloc(bufsize);
5505
5506 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
5507
5508 while (offset < copysize) {
5509 iosize = MIN(copysize - offset, iosize);
5510
5511 uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
5512 uio_addiov(auio, (uintptr_t)bufp, iosize);
5513
5514 error = cluster_read(vp, auio, copysize, IO_NOCACHE);
5515 if (error) {
5516 printf("hfs_clonefile: cluster_read failed - %d\n", error);
5517 break;
5518 }
5519 if (uio_resid(auio) != 0) {
5520 printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
5521 error = EIO;
5522 break;
5523 }
5524
5525 uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
5526 uio_addiov(auio, (uintptr_t)bufp, iosize);
5527
5528 error = cluster_write(vp, auio, writebase + offset,
5529 writebase + offset + iosize,
5530 uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
5531 if (error) {
5532 printf("hfs_clonefile: cluster_write failed - %d\n", error);
5533 break;
5534 }
5535 if (uio_resid(auio) != 0) {
5536 printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
5537 error = EIO;
5538 break;
5539 }
5540 offset += iosize;
5541 }
5542 uio_free(auio);
5543
5544 if ((blksize & PAGE_MASK)) {
5545 /*
5546 * since the copy may not have started on a PAGE
5547 * boundary (or may not have ended on one), we
5548 * may have pages left in the cache since NOCACHE
5549 * will let partially written pages linger...
5550 * let's just flush the entire range to make sure
5551 * we don't have any pages left that are beyond
5552 * (or intersect) the real LEOF of this file
5553 */
5554 ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
5555 } else {
5556 /*
5557 * No need to call ubc_msync or hfs_invalbuf
5558 * since the file was copied using IO_NOCACHE and
5559 * the copy was done starting and ending on a page
5560 * boundary in the file.
5561 */
5562 }
5563 hfs_free(bufp, bufsize);
5564
5565 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5566 return (error);
5567 }
5568
5569 /*
5570 * Clone a system (metadata) file.
5571 *
5572 */
5573 static int
5574 hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
5575 kauth_cred_t cred, struct proc *p)
5576 {
5577 caddr_t bufp;
5578 char * offset;
5579 size_t bufsize;
5580 size_t iosize;
5581 struct buf *bp = NULL;
5582 daddr64_t blkno;
5583 daddr64_t blk;
5584 daddr64_t start_blk;
5585 daddr64_t last_blk;
5586 int breadcnt;
5587 int i;
5588 int error = 0;
5589
5590
5591 iosize = GetLogicalBlockSize(vp);
5592 bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
5593 breadcnt = bufsize / iosize;
5594
5595 bufp = hfs_malloc(bufsize);
5596
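/*
 * start_blk and last_blk are in device logical blocks (iosize units) rather
 * than allocation blocks, since buf_meta_bread and buf_getblk address the
 * fork in logical blocks.
 */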
5597 start_blk = ((daddr64_t)blkstart * blksize) / iosize;
5598 last_blk = ((daddr64_t)blkcnt * blksize) / iosize;
5599 blkno = 0;
5600
5601 while (blkno < last_blk) {
5602 /*
5603 * Read up to a megabyte
5604 */
5605 offset = bufp;
5606 for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
5607 error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
5608 if (error) {
5609 printf("hfs_clonesysfile: meta_bread error %d\n", error);
5610 goto out;
5611 }
5612 if (buf_count(bp) != iosize) {
5613 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
5614 goto out;
5615 }
5616 bcopy((char *)buf_dataptr(bp), offset, iosize);
5617
5618 buf_markinvalid(bp);
5619 buf_brelse(bp);
5620 bp = NULL;
5621
5622 offset += iosize;
5623 }
5624
5625 /*
5626 * Write up to a megabyte
5627 */
5628 offset = bufp;
5629 for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
5630 bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
5631 if (bp == NULL) {
5632 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
5633 error = EIO;
5634 goto out;
5635 }
5636 bcopy(offset, (char *)buf_dataptr(bp), iosize);
5637 error = (int)buf_bwrite(bp);
5638 bp = NULL;
5639 if (error)
5640 goto out;
5641 offset += iosize;
5642 }
5643 }
5644 out:
5645 if (bp) {
5646 buf_brelse(bp);
5647 }
5648
5649 hfs_free(bufp, bufsize);
5650
5651 error = hfs_fsync(vp, MNT_WAIT, 0, p);
5652
5653 return (error);
5654 }
5655
5656 errno_t hfs_flush_invalid_ranges(vnode_t vp)
5657 {
5658 cnode_t *cp = VTOC(vp);
5659
5660 hfs_assert(cp->c_lockowner == current_thread());
5661 hfs_assert(cp->c_truncatelockowner == current_thread());
5662
5663 if (!ISSET(cp->c_flag, C_ZFWANTSYNC) && !cp->c_zftimeout)
5664 return 0;
5665
5666 filefork_t *fp = VTOF(vp);
5667
5668 /*
5669 * We can't hold the cnode lock whilst we call cluster_write so we
5670 * need to copy the extents into a local buffer.
5671 */
5672 int max_exts = 16;
5673 struct ext {
5674 off_t start, end;
5675 } exts_buf[max_exts]; // 256 bytes
5676 struct ext *exts = exts_buf;
5677 int ext_count = 0;
5678 errno_t ret;
5679
5680 struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
5681
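/*
 * Walk the invalid-range list, batching ranges into exts[]; once the batch
 * is full (or the list ends), drop the cnode lock, zero-fill each range on
 * disk via cluster_write with IO_HEADZEROFILL, then re-take the lock and
 * resume from where we left off in the list.
 */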
5682 while (r) {
5683 /* If we have more than can fit in our stack buffer, switch
5684 to a heap buffer. */
5685 if (exts == exts_buf && ext_count == max_exts) {
5686 max_exts = 256;
5687 exts = hfs_malloc(sizeof(struct ext) * max_exts);
5688 memcpy(exts, exts_buf, ext_count * sizeof(struct ext));
5689 }
5690
5691 struct rl_entry *next = TAILQ_NEXT(r, rl_link);
5692
5693 exts[ext_count++] = (struct ext){ r->rl_start, r->rl_end };
5694
5695 if (!next || (ext_count == max_exts && exts != exts_buf)) {
5696 hfs_unlock(cp);
5697 for (int i = 0; i < ext_count; ++i) {
5698 ret = cluster_write(vp, NULL, fp->ff_size, exts[i].end + 1,
5699 exts[i].start, 0,
5700 IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE);
5701 if (ret) {
5702 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5703 goto exit;
5704 }
5705 }
5706
5707 if (!next) {
5708 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5709 break;
5710 }
5711
5712 /* Push any existing clusters which should clean up our invalid
5713 ranges as they go through hfs_vnop_blockmap. */
5714 cluster_push(vp, 0);
5715
5716 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5717
5718 /*
5719 * Get back to where we were (given we dropped the lock).
5720 * This shouldn't be many because we pushed above.
5721 */
5722 TAILQ_FOREACH(r, &fp->ff_invalidranges, rl_link) {
5723 if (r->rl_end > exts[ext_count - 1].end)
5724 break;
5725 }
5726
5727 ext_count = 0;
5728 } else
5729 r = next;
5730 }
5731
5732 ret = 0;
5733
5734 exit:
5735
5736 if (exts != exts_buf)
5737 hfs_free(exts, sizeof(struct ext) * max_exts);
5738
5739 return ret;
5740 }