/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*	@(#)hfs_readwrite.c	1.0
 *
 *	(c) 1998-2001 Apple Inc. All Rights Reserved
 *
 *	hfs_readwrite.c -- vnode operations to deal with reading and writing files.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/kauth.h>
#include <sys/vnode.h>
#include <sys/vfs_context.h>
#include <sys/sysctl.h>
#include <sys/fsctl.h>
#include <sys/fsevents.h>
#include <uuid/uuid.h>

#include <libkern/OSDebug.h>

#include <miscfs/specfs/specdev.h>

#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>

#include <IOKit/IOBSD.h>

#include <sys/kdebug.h>

#include "hfs_attrlist.h"
#include "hfs_endian.h"
#include "hfs_fsctl.h"
#include "hfs_quota.h"
#include "FileMgrInternal.h"
#include "BTreesInternal.h"
#include "hfs_cnode.h"

#if HFS_CONFIG_KEY_ROLL
#include "hfs_key_roll.h"
#endif
#define can_cluster(size)  ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))

enum {
    MAXHFSFILESIZE = 0x7FFFFFFF   /* this needs to go in the mount structure */
};
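
/*
 * Illustrative note (not part of the original source): with a 128 KiB
 * MAXPHYSIO, can_cluster(65536) evaluates true (a 4096-byte multiple no
 * larger than MAXPHYSIO/2), while can_cluster(65537) evaluates false
 * because the size is not a multiple of 4096.
 */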
/* from bsd/hfs/hfs_vfsops.c */
extern int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);

/* from hfs_hotfiles.c */
extern int hfs_pin_overflow_extents(struct hfsmount *hfsmp, uint32_t fileid,
                                    uint8_t forktype, uint32_t *pinned);

static int  hfs_clonefile(struct vnode *, int, int, int);
static int  hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
static int  do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
/*
 * Read data from a file.
 */
int
hfs_vnop_read(struct vnop_read_args *ap)
{
    /*
       struct vnop_read_args {
           struct vnodeop_desc *a_desc;
           vfs_context_t a_context;
       };
     */
    uio_t uio = ap->a_uio;
    struct vnode *vp = ap->a_vp;
    struct hfsmount *hfsmp;
    off_t start_resid = uio_resid(uio);
    off_t offset = uio_offset(uio);
    int took_truncate_lock = 0;
    int throttled_count = 0;

    /* Preflight checks */
    if (!vnode_isreg(vp)) {
        /* can only read regular files */
    }
    if (start_resid == 0)
        return (0);             /* Nothing left to do */
    if (offset < 0)
        return (EINVAL);        /* can't read from a negative offset */

    if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
        (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
        /* Don't allow unencrypted io request from user space */
    }

#if HFS_COMPRESSION
    if (VNODE_IS_RSRC(vp)) {
        if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
        }
        /* otherwise read the resource fork normally */
    } else {
        int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */

        retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
        if (retval == 0 && !(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
            (void) hfs_addhotfile(vp);
        }

        /* successful read, update the access time */
        VTOC(vp)->c_touch_acctime = TRUE;

        // compressed files are not traditional hot file candidates
        // but they may be for CF (which ignores the ff_bytesread field)
        if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
            VTOF(vp)->ff_bytesread = 0;
        }
        /* otherwise the file was converted back to a regular file while we were reading it */
    } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
        error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
    }
#endif /* HFS_COMPRESSION */
    if ((retval = cp_handle_vnop(vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
    }
#if HFS_CONFIG_KEY_ROLL
    if (ISSET(ap->a_ioflag, IO_ENCRYPTED)) {
        off_rsrc_t off_rsrc = off_rsrc_make(offset + start_resid,
        retval = hfs_key_roll_up_to(ap->a_context, vp, off_rsrc);
    }
#endif // HFS_CONFIG_KEY_ROLL
#endif // CONFIG_PROTECT

    /*
     * If this read request originated from a syscall (as opposed to
     * an in-kernel page fault or something), then set it up for
     */
    if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
        io_throttle = IO_RETURN_ON_THROTTLE;
    }

    /* Protect against a size change. */
    hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
    took_truncate_lock = 1;
    filesize = fp->ff_size;
    filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;

    /*
     * Check the file size. Note that per POSIX spec, we return 0 at
     * file EOF, so attempting a read at an offset that is too big
     * should just return 0 on HFS+. Since the return value was initialized
     * to 0 above, we just jump to exit.  HFS Standard has its own behavior.
     */
    if (offset > filesize) {
        if ((hfsmp->hfs_flags & HFS_STANDARD) &&
            (offset > (off_t)MAXHFSFILESIZE)) {
        }
    }

    KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START,
        (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
    retval = cluster_read(vp, uio, filesize, ap->a_ioflag | io_throttle);

    cp->c_touch_acctime = TRUE;

    KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END,
        (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
    /*
     * Keep track of blocks read.
     */
    if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
        int took_cnode_lock = 0;

        bytesread = start_resid - uio_resid(uio);

        /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
        if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
            hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
        }
        /*
         * If this file hasn't been seen since the start of
         * the current sampling period then start over.
         */
        if (cp->c_atime < hfsmp->hfc_timebase) {
            fp->ff_bytesread = bytesread;
            cp->c_atime = tv.tv_sec;
        } else {
            fp->ff_bytesread += bytesread;
        }

        if (!(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
            //
            // We don't add hotfiles for processes doing IO_EVTONLY I/O
            // on the assumption that they're system processes such as
            // mdworker which scan everything in the system (and thus
            // do not represent user-initiated access to files)
            //
            (void) hfs_addhotfile(vp);
        }
    }

    if (took_truncate_lock) {
        hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
    }
    if (retval == EAGAIN) {
        throttle_lowpri_io(1);
    }
    throttle_info_reset_window(NULL);
/*
 * Ideally, this wouldn't be necessary; the cluster code should be
 * able to handle this on the read-side.  See <rdar://20420068>.
 */
static errno_t
hfs_zero_eof_page(vnode_t vp, off_t zero_up_to)
{
    hfs_assert(VTOC(vp)->c_lockowner != current_thread());
    hfs_assert(VTOC(vp)->c_truncatelockowner == current_thread());

    struct filefork *fp = VTOF(vp);

    if (!(fp->ff_size & PAGE_MASK_64) || zero_up_to <= fp->ff_size) {
        return 0;
    }

    zero_up_to = MIN(zero_up_to, (off_t)round_page_64(fp->ff_size));

    /* N.B. At present, @zero_up_to is not important because the cluster
       code will always zero up to the end of the page anyway. */
    return cluster_write(vp, NULL, fp->ff_size, zero_up_to,
                         fp->ff_size, 0, IO_HEADZEROFILL);
}
/*
 * Write data to a file.
 */
int
hfs_vnop_write(struct vnop_write_args *ap)
{
    uio_t uio = ap->a_uio;
    struct vnode *vp = ap->a_vp;
    struct hfsmount *hfsmp;
    kauth_cred_t cred = NULL;
    off_t bytesToAdd = 0;
    off_t actualBytesAdded;
    int ioflag = ap->a_ioflag;
    int cnode_locked = 0;
    int partialwrite = 0;
    time_t orig_ctime = VTOC(vp)->c_ctime;
    int took_truncate_lock = 0;
    int io_return_on_throttle = 0;
    int throttled_count = 0;
    if (hfs_file_is_compressed(VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
        int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
        switch (state) {
        case FILE_IS_COMPRESSED:
        case FILE_IS_CONVERTING:
            /* if FILE_IS_CONVERTING, we allow writes but do not
               bother with snapshots or else we will deadlock. */
            printf("invalid state %d for compressed file\n", state);
        }
    } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
        error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
    }

    nspace_snapshot_event(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);

    if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
        (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
        /* Don't allow unencrypted io request from user space */
    resid = uio_resid(uio);
    offset = uio_offset(uio);

    if (!vnode_isreg(vp))
        return (EPERM);         /* Can only write regular files */

    if ((retval = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
    }

    eflags = kEFDeferMask;      /* defer file block allocations */
#if HFS_SPARSE_DEV
    /*
     * When the underlying device is sparse and space
     * is low (< 8MB), stop doing delayed allocations
     * and begin doing synchronous I/O.
     */
    if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
        (hfs_freeblks(hfsmp, 0) < 2048)) {
        eflags &= ~kEFDeferMask;
    }
#endif /* HFS_SPARSE_DEV */

    if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) ==
        (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
        io_return_on_throttle = IO_RETURN_ON_THROTTLE;
    }
    /*
     * Protect against a size change.
     *
     * Note: If took_truncate_lock is true, then we previously got the lock shared
     * but needed to upgrade to exclusive.  So try getting it exclusive from the
     * start.
     */
    if (ioflag & IO_APPEND || took_truncate_lock) {
        hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
    } else {
        hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
    }
    took_truncate_lock = 1;

    if (ioflag & IO_APPEND) {
        uio_setoffset(uio, fp->ff_size);
        offset = fp->ff_size;
    }
    if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
    }

    cred = vfs_context_ucred(ap->a_context);
    if (cred && suser(cred, NULL) != 0)
        eflags |= kEFReserveMask;

    origFileSize = fp->ff_size;
    writelimit = offset + resid;
    /*
     * We may need an exclusive truncate lock for several reasons, all
     * of which are because we may be writing to a (portion of a) block
     * for the first time, and we need to make sure no readers see the
     * prior, uninitialized contents of the block.  The cases are:
     *
     * 1. We have unallocated (delayed allocation) blocks.  We may be
     *    allocating new blocks to the file and writing to them.
     *    (A more precise check would be whether the range we're writing
     *    to contains delayed allocation blocks.)
     * 2. We need to extend the file.  The bytes between the old EOF
     *    and the new EOF are not yet initialized.  This is important
     *    even if we're not allocating new blocks to the file.  If the
     *    old EOF and new EOF are in the same block, we still need to
     *    protect that range of bytes until they are written for the
     *    first time.
     *
     * If we had a shared lock with the above cases, we need to try to upgrade
     * to an exclusive lock.  If the upgrade fails, we will lose the shared
     * lock, and will need to take the truncate lock again; the took_truncate_lock
     * flag will still be set, causing us to try for an exclusive lock next time.
     */
    if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
        ((fp->ff_unallocblocks != 0) ||
         (writelimit > origFileSize))) {
        if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
            /*
             * Lock upgrade failed and we lost our shared lock, try again.
             * Note: we do not set took_truncate_lock=0 here.  Leaving it
             * set to 1 will cause us to try to get the lock exclusive.
             */
        }
        /* Store the owner in the c_truncatelockowner field if we successfully upgrade */
        cp->c_truncatelockowner = current_thread();
    }
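
    /*
     * Illustrative sketch (not part of the original source): the upgrade path
     * above relies on lck_rw_lock_shared_to_exclusive() dropping the shared
     * lock entirely when the upgrade fails, so callers retry in a loop, e.g.:
     *
     *	for (;;) {
     *		hfs_lock_truncate(cp, took_truncate_lock ? HFS_EXCLUSIVE_LOCK
     *		                                         : HFS_SHARED_LOCK,
     *		                  HFS_LOCK_DEFAULT);
     *		took_truncate_lock = 1;
     *		if (!need_exclusive)                 // shared lock is enough
     *			break;
     *		if (cp->c_truncatelockowner != HFS_SHARED_OWNER ||
     *		    lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock)) {
     *			cp->c_truncatelockowner = current_thread();
     *			break;                       // upgraded (or already exclusive)
     *		}
     *		// upgrade failed: the shared lock is gone; go around again.
     *		// took_truncate_lock stays set so the retry asks for exclusive.
     *	}
     *
     * need_exclusive is a hypothetical stand-in for the delayed-allocation /
     * file-extension checks described in the comment above.
     */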
    if ((retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
    }

    filebytes = hfs_blk_to_bytes(fp->ff_blocks, hfsmp->blockSize);

    if (offset > filebytes
        && (hfs_blk_to_bytes(hfs_freeblks(hfsmp, ISSET(eflags, kEFReserveMask)),
                             hfsmp->blockSize) < offset - filebytes)) {
    }

    KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START,
        (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
    /* Check if we do not need to extend the file */
    if (writelimit <= filebytes) {
    }

    bytesToAdd = writelimit - filebytes;

    retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),

    if (hfs_start_transaction(hfsmp) != 0) {
    }

    while (writelimit > filebytes) {
        bytesToAdd = writelimit - filebytes;

        /* Protect extents b-tree and allocation bitmap */
        lockflags = SFL_BITMAP;
        if (overflow_extents(fp))
            lockflags |= SFL_EXTENTS;
        lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

        /* Files that are changing size are not hot file candidates. */
        if (hfsmp->hfc_stage == HFC_RECORDING) {
            fp->ff_bytesread = 0;
        }
        retval = MacToVFSError(ExtendFileC(hfsmp, (FCB *)fp, bytesToAdd,
                                           0, eflags, &actualBytesAdded));

        hfs_systemfile_unlock(hfsmp, lockflags);

        if ((actualBytesAdded == 0) && (retval == E_NONE))
        if (retval != E_NONE)
        filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
        KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE,
            (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
    }
    (void) hfs_update(vp, 0);
    (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
    (void) hfs_end_transaction(hfsmp);
    /*
     * If we didn't grow the file enough try a partial write.
     * POSIX expects this behavior.
     */
    if ((retval == ENOSPC) && (filebytes > offset)) {
        uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
        writelimit = filebytes;
    }

    if (retval == E_NONE) {
        if (writelimit > fp->ff_size) {
            filesize = writelimit;

            rl_add(fp->ff_size, writelimit - 1, &fp->ff_invalidranges);
            cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
        } else {
            filesize = fp->ff_size;
        }

        lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
        /*
         * We no longer use IO_HEADZEROFILL or IO_TAILZEROFILL (except
         * for one case below).  For the regions that lie before the
         * beginning and after the end of this write that are in the
         * same page, we let the cluster code handle zeroing that out
         * if necessary.  If those areas are not cached, the cluster
         * code will try and read those areas in, and in the case
         * where those regions have never been written to,
         * hfs_vnop_blockmap will consult the invalid ranges and then
         * indicate that.  The cluster code will zero out those areas.
         */

        head_off = trunc_page_64(offset);

        if (head_off < offset && head_off >= fp->ff_size) {
            /*
             * The first page is beyond current EOF, so as an
             * optimisation, we can pass IO_HEADZEROFILL.
             */
            lflag |= IO_HEADZEROFILL;
        }
        /*
         * We need to tell UBC the fork's new size BEFORE calling
         * cluster_write, in case any of the new pages need to be
         * paged out before cluster_write completes (which does happen
         * in embedded systems due to extreme memory pressure).
         * Similarly, we need to tell hfs_vnop_pageout what the new EOF
         * will be, so that it can pass that on to cluster_pageout, and
         * allow those pageouts.
         *
         * We don't update ff_size yet since we don't want pageins to
         * be able to see uninitialized data between the old and new
         * EOF, until cluster_write has completed and initialized that
         * part of the file.
         *
         * The vnode pager relies on the file size last given to UBC via
         * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
         * ff_size (whichever is larger).  NOTE: ff_new_size is always
         * zero, unless we are extending the file via write.
         */
        if (filesize > fp->ff_size) {
            retval = hfs_zero_eof_page(vp, offset);

            fp->ff_new_size = filesize;
            ubc_setsize(vp, filesize);
        }
        retval = cluster_write(vp, uio, fp->ff_size, filesize, head_off,
                               0, lflag | IO_NOZERODIRTY | io_return_on_throttle);

        fp->ff_new_size = 0;	/* no longer extending; use ff_size */
        if (retval == EAGAIN) {
            /*
             * EAGAIN indicates that we still have I/O to do, but
             * that we now need to be throttled
             */
            if (resid != uio_resid(uio)) {
                /*
                 * did manage to do some I/O before returning EAGAIN
                 */
                resid = uio_resid(uio);
                offset = uio_offset(uio);

                cp->c_touch_chgtime = TRUE;
                cp->c_touch_modtime = TRUE;
                hfs_incr_gencount(cp);
            }
            if (filesize > fp->ff_size) {
                /*
                 * we called ubc_setsize before the call to
                 * cluster_write... since we only partially
                 * completed the I/O, we need to
                 * re-adjust our idea of the filesize based
                 */
                ubc_setsize(vp, offset);

                fp->ff_size = offset;
            }
        }
        if (filesize > origFileSize) {
            ubc_setsize(vp, origFileSize);
        }

        if (filesize > origFileSize) {
            fp->ff_size = filesize;

            /* Files that are changing size are not hot file candidates. */
            if (hfsmp->hfc_stage == HFC_RECORDING) {
                fp->ff_bytesread = 0;
            }
        }
        fp->ff_new_size = 0;	/* ff_size now has the correct size */

        uio_setresid(uio, (uio_resid(uio) + bytesToAdd));

    if (vnode_should_flush_after_write(vp, ioflag))
        hfs_flush(hfsmp, HFS_FLUSH_CACHE);
    hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);

    if (resid > uio_resid(uio)) {
        cp->c_touch_chgtime = TRUE;
        cp->c_touch_modtime = TRUE;
        hfs_incr_gencount(cp);

        /*
         * If we successfully wrote any data, and we are not the superuser
         * we clear the setuid and setgid bits as a precaution against
         * tampering.
         */
        if (cp->c_mode & (S_ISUID | S_ISGID)) {
            cred = vfs_context_ucred(ap->a_context);
            if (cred && suser(cred, NULL)) {
                cp->c_mode &= ~(S_ISUID | S_ISGID);
            }
        }
    }
    if (retval) {
        if (ioflag & IO_UNIT) {
            (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
            uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
            uio_setresid(uio, resid);
            filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
        }
    } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio)))
        retval = hfs_update(vp, 0);
    /* Updating vcbWrCnt doesn't need to be atomic. */

    KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END,
        (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);

    if (retval && took_truncate_lock
        && cp->c_truncatelockowner == current_thread()) {
        rl_remove(fp->ff_size, RL_INFINITY, &fp->ff_invalidranges);
    }

    if (took_truncate_lock) {
        hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
    }
    if (retval == EAGAIN) {
        throttle_lowpri_io(1);
    }
    throttle_info_reset_window(NULL);
/* support for the "bulk-access" fcntl */

#define CACHE_LEVELS 16
#define NUM_CACHE_ENTRIES (64*16)
#define PARENT_IDS_FLAG 0x100

struct access_cache {
    int numcached;
    int cachehits;              /* these two for statistics gathering */
    unsigned int *acache;
    unsigned char *haveaccess;
};

struct access_t {
    uid_t   uid;                /* IN: effective user id */
    short   flags;              /* IN: access requested (i.e. R_OK) */
    short   num_groups;         /* IN: number of groups user belongs to */
    int     num_files;          /* IN: number of files to process */
    int     *file_ids;          /* IN: array of file ids */
    gid_t   *groups;            /* IN: array of groups */
    short   *access;            /* OUT: access info for each file (0 for 'has access') */
} __attribute__((unavailable)); // this structure is for reference purposes only

struct user32_access_t {
    uid_t   uid;                /* IN: effective user id */
    short   flags;              /* IN: access requested (i.e. R_OK) */
    short   num_groups;         /* IN: number of groups user belongs to */
    int     num_files;          /* IN: number of files to process */
    user32_addr_t file_ids;     /* IN: array of file ids */
    user32_addr_t groups;       /* IN: array of groups */
    user32_addr_t access;       /* OUT: access info for each file (0 for 'has access') */
};

struct user64_access_t {
    uid_t   uid;                /* IN: effective user id */
    short   flags;              /* IN: access requested (i.e. R_OK) */
    short   num_groups;         /* IN: number of groups user belongs to */
    int     num_files;          /* IN: number of files to process */
    user64_addr_t file_ids;     /* IN: array of file ids */
    user64_addr_t groups;       /* IN: array of groups */
    user64_addr_t access;       /* OUT: access info for each file (0 for 'has access') */
};


// these are the "extended" versions of the above structures
// note that it is crucial that they be different sized than
// the regular version
struct ext_access_t {
    uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
    uint32_t   num_files;       /* IN: number of files to process */
    uint32_t   map_size;        /* IN: size of the bit map */
    uint32_t   *file_ids;       /* IN: Array of file ids */
    char       *bitmap;         /* OUT: hash-bitmap of interesting directory ids */
    short      *access;         /* OUT: access info for each file (0 for 'has access') */
    uint32_t   num_parents;     /* future use */
    cnid_t     *parents;        /* future use */
} __attribute__((unavailable)); // this structure is for reference purposes only

struct user32_ext_access_t {
    uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
    uint32_t   num_files;       /* IN: number of files to process */
    uint32_t   map_size;        /* IN: size of the bit map */
    user32_addr_t file_ids;     /* IN: Array of file ids */
    user32_addr_t bitmap;       /* OUT: hash-bitmap of interesting directory ids */
    user32_addr_t access;       /* OUT: access info for each file (0 for 'has access') */
    uint32_t   num_parents;     /* future use */
    user32_addr_t parents;      /* future use */
};
struct user64_ext_access_t {
    uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
    uint32_t   num_files;       /* IN: number of files to process */
    uint32_t   map_size;        /* IN: size of the bit map */
    user64_addr_t file_ids;     /* IN: array of file ids */
    user64_addr_t bitmap;       /* OUT: hash-bitmap of interesting directory ids */
    user64_addr_t access;       /* OUT: access info for each file (0 for 'has access') */
    uint32_t   num_parents;     /* future use */
    user64_addr_t parents;      /* future use */
};
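
/*
 * Illustrative sketch (not part of the original source): a user-space caller
 * fills in the extended bulk-access structure and hands it to the file system
 * through fsctl(2).  The command constant (shown here as
 * HFS_EXT_BULKACCESS_FSCTL) and the user-visible struct name come from
 * <hfs/hfs_fsctl.h> and are assumptions; the field layout mirrors
 * ext_access_t above.
 *
 *	uint32_t ids[2] = { 16, 17 };        // cnids to test
 *	short    result[2];
 *	struct ext_access_t args = {
 *		.flags     = R_OK,           // access being requested
 *		.num_files = 2,
 *		.map_size  = 0,              // no directory bitmap wanted
 *		.file_ids  = ids,
 *		.bitmap    = NULL,
 *		.access    = result,         // one short per file id, 0 == has access
 *	};
 *	if (fsctl("/Volumes/MyHFS", HFS_EXT_BULKACCESS_FSCTL, &args, 0) == 0) {
 *		// result[i] is 0 or an errno-style value for ids[i]
 *	}
 */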
/*
 * Perform a binary search for the given parent_id.  Return value is
 * the index if there is a match.  If no_match_indexp is non-NULL it
 * will be assigned with the index to insert the item (even if it was
 * not found).
 */
static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
{
    unsigned int mid = ((hi - lo)/2) + lo;
    unsigned int this_id = array[mid];

    if (parent_id == this_id) {
    }
    if (parent_id < this_id) {
    }
    if (parent_id > this_id) {
    }

    /* check if lo and hi converged on the match */
    if (parent_id == array[hi]) {
    }
    if (no_match_indexp) {
        *no_match_indexp = hi;
    }
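
/*
 * Illustrative example (not part of the original source): given the sorted
 * array {10, 20, 30} with hi == 2, cache_binSearch(array, 2, 20, &idx) is
 * intended to return the index of the match (1).  On a miss, the callers
 * below rely on the insertion point written through no_match_indexp -- see
 * lookup_bucket(), which uses it to decide where a new entry should go.
 */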
static int
lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
{
    int index, no_match_index;

    if (cache->numcached == 0) {
        return 0; // table is empty, so insert at index=0 and report no match
    }

    if (cache->numcached > NUM_CACHE_ENTRIES) {
        cache->numcached = NUM_CACHE_ENTRIES;
    }

    hi = cache->numcached - 1;

    index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);

    /* if no existing entry found, find index for new one */
    index = no_match_index;
/*
 * Add a node to the access_cache at the given index (or do a lookup first
 * to find the index if -1 is passed in).  We currently do a replace rather
 * than an insert if the cache is full.
 */
static void
add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
{
    int lookup_index = -1;

    /* need to do a lookup first if -1 passed for index */
    if (lookup_bucket(cache, &lookup_index, nodeID)) {
        if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
            // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
            cache->haveaccess[lookup_index] = access;
        }

        /* mission accomplished */
        index = lookup_index;
    }

    /* if the cache is full, do a replace rather than an insert */
    if (cache->numcached >= NUM_CACHE_ENTRIES) {
        cache->numcached = NUM_CACHE_ENTRIES - 1;

        if (index > cache->numcached) {
            index = cache->numcached;
        }
    }

    if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
    }

    if (index >= 0 && index < cache->numcached) {
        /* only do bcopy if we're inserting */
        bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
        bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
    }

    cache->acache[index] = nodeID;
    cache->haveaccess[index] = access;
static int
snoop_callback(const cnode_t *cp, void *arg)
{
    struct cinfo *cip = arg;

    cip->uid = cp->c_uid;
    cip->gid = cp->c_gid;
    cip->mode = cp->c_mode;
    cip->parentcnid = cp->c_parentcnid;
    cip->recflags = cp->c_attr.ca_recflags;
/*
 * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id.  If the item
 * isn't incore, then go to the catalog.
 */
static int
do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
    struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
{
    /* if this id matches the one the fsctl was called with, skip the lookup */
    if (cnid == skip_cp->c_cnid) {
        cnattrp->ca_uid = skip_cp->c_uid;
        cnattrp->ca_gid = skip_cp->c_gid;
        cnattrp->ca_mode = skip_cp->c_mode;
        cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
        keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
    } else {
        struct cinfo c_info;

        /* otherwise, check the cnode hash in case the file/dir is incore */
        error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info);

        if (error == EACCES) {
        } else if (!error) {
            cnattrp->ca_uid = c_info.uid;
            cnattrp->ca_gid = c_info.gid;
            cnattrp->ca_mode = c_info.mode;
            cnattrp->ca_recflags = c_info.recflags;
            keyp->hfsPlus.parentID = c_info.parentcnid;
        } else {
            if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
                throttle_lowpri_io(1);

            lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);

            /* lookup this cnid in the catalog */
            error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);

            hfs_systemfile_unlock(hfsmp, lockflags);
        }
    }
/*
 * Compute whether we have access to the given directory (nodeID) and all its parents.  Cache
 * up to CACHE_LEVELS as we progress towards the root.
 */
static int
do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
    struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
    struct vfs_context *my_context,
    char *bitmap,
    uint32_t map_size,
    cnid_t *parents,
    uint32_t num_parents)
{
    HFSCatalogNodeID thisNodeID;
    unsigned int myPerms;
    struct cat_attr cnattr;
    int cache_index = -1, scope_index = -1, scope_idx_start = -1;
    int i = 0, ids_to_cache = 0;
    int parent_ids[CACHE_LEVELS];

    thisNodeID = nodeID;
    while (thisNodeID >= kRootDirID) {
        myResult = 0;   /* default to "no access" */

        /* check the cache before resorting to hitting the catalog */

        /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
         * to look any further after hitting cached dir */

        if (lookup_bucket(cache, &cache_index, thisNodeID)) {
            myErr = cache->haveaccess[cache_index];
            if (scope_index != -1) {
                if (myErr == ESRCH) {
                }
            } else {
                scope_index = 0;   // so we'll just use the cache result
                scope_idx_start = ids_to_cache;
            }
            myResult = (myErr == 0) ? 1 : 0;
            goto ExitThisRoutine;
        }

        tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
        if (scope_index == -1)
        if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
            scope_idx_start = ids_to_cache;
        }

        /* remember which parents we want to cache */
        if (ids_to_cache < CACHE_LEVELS) {
            parent_ids[ids_to_cache] = thisNodeID;
        }
        // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
        if (bitmap && map_size) {
            bitmap[(thisNodeID/8)%(map_size)] |= (1 << (thisNodeID&7));
        }
        /* do the lookup (checks the cnode hash, then the catalog) */
        myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
        if (myErr) {
            goto ExitThisRoutine;   /* no access */
        }

        /* Root always gets access. */
        if (suser(myp_ucred, NULL) == 0) {
            thisNodeID = catkey.hfsPlus.parentID;
        }

        // if the thing has acl's, do the full permission check
        if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
            /* get the vnode for this cnid */
            myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
            if (myErr) {
                goto ExitThisRoutine;
            }

            thisNodeID = VTOC(vp)->c_parentcnid;

            hfs_unlock(VTOC(vp));

            if (vnode_vtype(vp) == VDIR) {
                myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
            } else {
                myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
            }
            if (myErr) {
                goto ExitThisRoutine;
            }
        } else {
            int mode = cnattr.ca_mode & S_IFMT;

            myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp, myp_ucred, theProcPtr);

            if (mode == S_IFDIR) {
                flags = R_OK | X_OK;
            }
            if ((myPerms & flags) != flags) {
                goto ExitThisRoutine;   /* no access */
            }

            /* up the hierarchy we go */
            thisNodeID = catkey.hfsPlus.parentID;
        }
    }

    /* if here, we have access to this node */

ExitThisRoutine:
    if (parents && myErr == 0 && scope_index == -1) {
    }

    /* cache the parent directory(ies) */
    for (i = 0; i < ids_to_cache; i++) {
        if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
            add_node(cache, -1, parent_ids[i], ESRCH);
        } else {
            add_node(cache, -1, parent_ids[i], myErr);
        }
    }
static int
do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
    struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
{
    /*
     * NOTE: on entry, the vnode has an io_ref.  In case this vnode
     * happens to be in our list of file_ids, we'll note it so we can
     * avoid calling hfs_chashget_nowait() on that id as that
     * will cause a "locking against myself" panic.
     */
    Boolean check_leaf = true;

    struct user64_ext_access_t *user_access_structp;
    struct user64_ext_access_t tmp_user_access;
    struct access_cache cache;

    int error = 0, prev_parent_check_ok = 1;
    unsigned int num_files = 0;
    int num_parents = 0;
    cnid_t *parents = NULL;
    cnid_t prevParent_cnid = 0;
    unsigned int myPerms;
    struct cat_attr cnattr;
    struct cnode *skip_cp = VTOC(vp);
    kauth_cred_t cred = vfs_context_ucred(context);
    proc_t p = vfs_context_proc(context);

    is64bit = proc_is64bit(p);

    /* initialize the local cache and buffers */
    cache.numcached = 0;
    cache.cachehits = 0;
    cache.acache = NULL;
    cache.haveaccess = NULL;
    /* struct copyin done during dispatch... need to copy file_id array separately */
    if (ap->a_data == NULL) {
        goto err_exit_bulk_access;
    }

    if (arg_size != sizeof(struct user64_ext_access_t)) {
        goto err_exit_bulk_access;
    }

    user_access_structp = (struct user64_ext_access_t *)ap->a_data;

    } else if (arg_size == sizeof(struct user32_access_t)) {
        struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;

        // convert an old style bulk-access struct to the new style
        tmp_user_access.flags = accessp->flags;
        tmp_user_access.num_files = accessp->num_files;
        tmp_user_access.map_size = 0;
        tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
        tmp_user_access.bitmap = USER_ADDR_NULL;
        tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
        tmp_user_access.num_parents = 0;
        user_access_structp = &tmp_user_access;

    } else if (arg_size == sizeof(struct user32_ext_access_t)) {
        struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;

        // up-cast from a 32-bit version of the struct
        tmp_user_access.flags = accessp->flags;
        tmp_user_access.num_files = accessp->num_files;
        tmp_user_access.map_size = accessp->map_size;
        tmp_user_access.num_parents = accessp->num_parents;

        tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
        tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
        tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
        tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);

        user_access_structp = &tmp_user_access;

        goto err_exit_bulk_access;
    map_size = user_access_structp->map_size;

    num_files = user_access_structp->num_files;

    num_parents = user_access_structp->num_parents;

    if (num_files < 1) {
        goto err_exit_bulk_access;
    }
    if (num_files > 1024) {
        goto err_exit_bulk_access;
    }
    if (num_parents > 1024) {
        goto err_exit_bulk_access;
    }

    file_ids = hfs_malloc(sizeof(int) * num_files);
    access = hfs_malloc(sizeof(short) * num_files);
    bitmap = hfs_mallocz(sizeof(char) * map_size);
    parents = hfs_malloc(sizeof(cnid_t) * num_parents);

    cache.acache = hfs_malloc(sizeof(int) * NUM_CACHE_ENTRIES);
    cache.haveaccess = hfs_malloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);

    if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
                        num_files * sizeof(int)))) {
        goto err_exit_bulk_access;
    }

    if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
                        num_parents * sizeof(cnid_t)))) {
        goto err_exit_bulk_access;
    }

    flags = user_access_structp->flags;
    if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
    /* check if we've been passed leaf node ids or parent ids */
    if (flags & PARENT_IDS_FLAG) {
    }

    /* Check access to each file_id passed in */
    for (i = 0; i < num_files; i++) {
        cnid = (cnid_t) file_ids[i];

        /* root always has access */
        if ((!parents) && (!suser(cred, NULL))) {
        }

        /* do the lookup (checks the cnode hash, then the catalog) */
        error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
        if (error) {
            access[i] = (short) error;
        }

        // Check if the leaf matches one of the parent scopes
        leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
        if (leaf_index >= 0 && parents[leaf_index] == cnid)
            prev_parent_check_ok = 0;
        else if (leaf_index >= 0)
            prev_parent_check_ok = 1;
        // if the thing has acl's, do the full permission check
        if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
            /* get the vnode for this cnid */
            myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);

            hfs_unlock(VTOC(cvp));

            if (vnode_vtype(cvp) == VDIR) {
                myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
            } else {
                myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
            }
        } else {
            /* before calling CheckAccess(), check the target file for read access */
            myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
                cnattr.ca_mode, hfsmp->hfs_mp, cred, p);

            /* fail fast if no access */
            if ((myPerms & flags) == 0) {
            }
        }

        /* we were passed an array of parent ids */
        catkey.hfsPlus.parentID = cnid;

        /* if the last guy had the same parent and had access, we're done */
        if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
        }

        myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
            skip_cp, p, cred, context, bitmap, map_size, parents, num_parents);

        if (myaccess || (error == ESRCH && leaf_index != -1)) {
            access[i] = 0;   // have access.. no errors to report
        } else {
            access[i] = (error != 0 ? (short) error : EACCES);
        }

        prevParent_cnid = catkey.hfsPlus.parentID;
    /* copyout the access array */
    if ((error = copyout((caddr_t)access, user_access_structp->access,
                         num_files * sizeof(short)))) {
        goto err_exit_bulk_access;
    }
    if (map_size && bitmap) {
        if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
                             map_size * sizeof(char)))) {
            goto err_exit_bulk_access;
        }
    }

err_exit_bulk_access:

    hfs_free(file_ids, sizeof(int) * num_files);
    hfs_free(parents, sizeof(cnid_t) * num_parents);
    hfs_free(bitmap, sizeof(char) * map_size);
    hfs_free(access, sizeof(short) * num_files);
    hfs_free(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
    hfs_free(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);

    return (error);
}

/* end "bulk-access" support */
/*
 * Control filesystem operating characteristics.
 */
int
hfs_vnop_ioctl( struct vnop_ioctl_args /* {
        vfs_context_t a_context;
    } */ *ap)
{
    struct vnode *vp = ap->a_vp;
    struct hfsmount *hfsmp = VTOHFS(vp);
    vfs_context_t context = ap->a_context;
    kauth_cred_t cred = vfs_context_ucred(context);
    proc_t p = vfs_context_proc(context);
    struct vfsstatfs *vfsp;
    off_t jnl_start, jnl_size;
    struct hfs_journal_info *jip;
#if HFS_COMPRESSION
    off_t uncompressed_size = -1;
    int decmpfs_error = 0;

    if (ap->a_command == F_RDADVISE) {
        /* we need to inspect the decmpfs state of the file as early as possible */
        compressed = hfs_file_is_compressed(VTOC(vp), 0);
        if (VNODE_IS_RSRC(vp)) {
            /* if this is the resource fork, treat it as if it were empty */
            uncompressed_size = 0;
        } else {
            decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
            if (decmpfs_error != 0) {
                /* failed to get the uncompressed size, we'll check for this later */
                uncompressed_size = -1;
            }
        }
    }
#endif /* HFS_COMPRESSION */
    is64bit = proc_is64bit(p);

#if CONFIG_PROTECT
#if HFS_CONFIG_KEY_ROLL
    // The HFSIOC_KEY_ROLL fsctl does its own access checks
    if (ap->a_command != HFSIOC_KEY_ROLL)
#endif // HFS_CONFIG_KEY_ROLL
    if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
    }
#endif /* CONFIG_PROTECT */

    switch (ap->a_command) {
    case HFSIOC_GETPATH:
    {
        struct vnode *file_vp;

#ifdef VN_GETPATH_NEW
#else // VN_GETPATH_NEW
#endif // VN_GETPATH_NEW

        /* Caller must be owner of file system. */
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
        }
        /* Target vnode must be file system's root. */
        if (!vnode_isvroot(vp)) {
        }
        bufptr = (char *)ap->a_data;
        cnid = strtoul(bufptr, NULL, 10);
        if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
            flags |= BUILDPATH_VOLUME_RELATIVE;
        }

        /* We need to call hfs_vfs_vget to leverage the code that will
         * fix the origin list for us if needed, as opposed to calling
         * hfs_vget, since we will need the parent for vn_getpath_ext call.
         */
        if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
        }

        outlen = sizeof(pathname_t);
        error = vn_getpath_ext(file_vp, NULLVP, bufptr, &outlen, flags);
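
        /*
         * Illustrative sketch (not part of the original source): from user
         * space this operation is reached through fsctl(2).  The command
         * name HFS_GETPATH (the user-level wrapper for HFSIOC_GETPATH in
         * <hfs/hfs_fsctl.h>) is an assumption here.  The caller writes the
         * decimal CNID into the buffer and gets the path back in place:
         *
         *	char buf[MAXPATHLEN];
         *	snprintf(buf, sizeof(buf), "%u", cnid);     // file ID to resolve
         *	if (fsctl("/Volumes/MyHFS", HFS_GETPATH, buf, 0) == 0)
         *		printf("cnid %u -> %s\n", cnid, buf);   // buf now holds the path
         */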
    case HFSIOC_SET_MAX_DEFRAG_SIZE:
    {
        int error = 0;          /* Assume success */
        u_int32_t maxsize = 0;

        if (vnode_vfsisrdonly(vp)) {
        }
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (!kauth_cred_issuser(cred)) {
            return (EACCES);    /* must be root */
        }

        maxsize = *(u_int32_t *)ap->a_data;

        hfs_lock_mount(hfsmp);
        if (maxsize > HFS_MAX_DEFRAG_SIZE) {
        }
        hfsmp->hfs_defrag_max = maxsize;
        hfs_unlock_mount(hfsmp);
    }

    case HFSIOC_FORCE_ENABLE_DEFRAG:
    {
        int error = 0;          /* Assume success */
        u_int32_t do_enable = 0;

        if (vnode_vfsisrdonly(vp)) {
        }
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (!kauth_cred_issuser(cred)) {
            return (EACCES);    /* must be root */
        }

        do_enable = *(u_int32_t *)ap->a_data;

        hfs_lock_mount(hfsmp);
        if (do_enable != 0) {
            hfsmp->hfs_defrag_nowait = 1;
        }
        hfs_unlock_mount(hfsmp);
    }
    case HFSIOC_TRANSFER_DOCUMENT_ID:
    {
        struct cnode *cp = NULL;
        u_int32_t to_fd = *(u_int32_t *)ap->a_data;
        struct fileproc *to_fp;
        struct vnode *to_vp;
        struct cnode *to_cp;

        if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
            //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
        }
        if ((error = vnode_getwithref(to_vp))) {
        }

        if (VTOHFS(to_vp) != hfsmp) {
            goto transfer_cleanup;
        }

        int need_unlock = 1;
        to_cp = VTOC(to_vp);
        error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
        if (error != 0) {
            //printf("could not lock the pair of cnodes (error %d)\n", error);
            goto transfer_cleanup;
        }

        if (!(cp->c_bsdflags & UF_TRACKED)) {
        } else if (to_cp->c_bsdflags & UF_TRACKED) {
            //
            // if the destination is already tracked, return an error
            // as otherwise it's a silent deletion of the target's
            //
        } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
            //
            // we can use the FndrExtendedFileInfo because the doc-id is the first
            // thing in both it and the ExtendedDirInfo struct which is fixed in
            // format and can not change layout
            //
            struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t *)cp->c_finderinfo + 16);
            struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t *)to_cp->c_finderinfo + 16);

            if (f_extinfo->document_id == 0) {
                hfs_unlockpair(cp, to_cp);  // have to unlock to be able to get a new-id

                if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
                    //
                    // re-lock the pair now that we have the document-id
                    //
                    hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
                    f_extinfo->document_id = new_id;
                } else {
                    goto transfer_cleanup;
                }
            }

            to_extinfo->document_id = f_extinfo->document_id;
            f_extinfo->document_id = 0;
            //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);

            // make sure the destination is also UF_TRACKED
            to_cp->c_bsdflags |= UF_TRACKED;
            cp->c_bsdflags &= ~UF_TRACKED;

            // mark the cnodes dirty
            cp->c_flag |= C_MODIFIED;
            to_cp->c_flag |= C_MODIFIED;

            if ((error = hfs_start_transaction(hfsmp)) == 0) {
                lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);

                (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
                (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);

                hfs_systemfile_unlock(hfsmp, lockflags);
                (void) hfs_end_transaction(hfsmp);
            }

            add_fsevent(FSE_DOCID_CHANGED, context,
                FSE_ARG_DEV,   hfsmp->hfs_raw_dev,
                FSE_ARG_INO,   (ino64_t)cp->c_fileid,       // src inode #
                FSE_ARG_INO,   (ino64_t)to_cp->c_fileid,    // dst inode #
                FSE_ARG_INT32, to_extinfo->document_id,

            hfs_unlockpair(cp, to_cp);  // unlock this so we can send the fsevents

            if (need_fsevent(FSE_STAT_CHANGED, vp)) {
                add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
            }
            if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
                add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
            }
        }

        hfs_unlockpair(cp, to_cp);
    case HFSIOC_PREV_LINK:
    case HFSIOC_NEXT_LINK:
    {
        /* Caller must be owner of file system. */
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
        }
        /* Target vnode must be file system's root. */
        if (!vnode_isvroot(vp)) {
        }
        linkfileid = *(cnid_t *)ap->a_data;
        if (linkfileid < kHFSFirstUserCatalogNodeID) {
        }
        if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
        }
        if (ap->a_command == HFSIOC_NEXT_LINK) {
            *(cnid_t *)ap->a_data = nextlinkid;
        } else {
            *(cnid_t *)ap->a_data = prevlinkid;
        }
    }
    case HFSIOC_RESIZE_PROGRESS: {
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
            return (EACCES);    /* must be owner of file system */
        }
        if (!vnode_isvroot(vp)) {
        }
        /* file system must not be mounted read-only */
        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }

        return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
    }

    case HFSIOC_RESIZE_VOLUME: {
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
            return (EACCES);    /* must be owner of file system */
        }
        if (!vnode_isvroot(vp)) {
        }

        /* filesystem must not be mounted read only */
        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }
        newsize = *(u_int64_t *)ap->a_data;
        cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;

        if (newsize == cursize) {
        }
        IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeWillResize);
        if (newsize > cursize) {
            ret = hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
        } else {
            ret = hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
        }
        IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeDidResize);
    }
    case HFSIOC_CHANGE_NEXT_ALLOCATION: {
        int error = 0;          /* Assume success */

        if (vnode_vfsisrdonly(vp)) {
        }
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
            return (EACCES);    /* must be owner of file system */
        }
        if (!vnode_isvroot(vp)) {
        }
        hfs_lock_mount(hfsmp);
        location = *(u_int32_t *)ap->a_data;
        if ((location >= hfsmp->allocLimit) &&
            (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
            goto fail_change_next_allocation;
        }
        /* Return previous value. */
        *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
        if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
            /* On magic value for location, set nextAllocation to next block
             * after metadata zone and set flag in mount structure to indicate
             * that nextAllocation should not be updated again.
             */
            if (hfsmp->hfs_metazone_end != 0) {
                HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
            }
            hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
        } else {
            hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
            HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
        }
        MarkVCBDirty(hfsmp);
fail_change_next_allocation:
        hfs_unlock_mount(hfsmp);
    }
#if HFS_SPARSE_DEV
    case HFSIOC_SETBACKINGSTOREINFO: {
        struct vnode *di_vp;
        struct hfs_backingstoreinfo *bsdata;

        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }
        if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
        }
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
            return (EACCES);    /* must be owner of file system */
        }
        bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
        if (bsdata == NULL) {
        }
        if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
        }
        if ((error = vnode_getwithref(di_vp))) {
            file_drop(bsdata->backingfd);
        }

        if (vnode_mount(vp) == vnode_mount(di_vp)) {
            (void)vnode_put(di_vp);
            file_drop(bsdata->backingfd);
        }

        // Dropped in unmount
        hfs_lock_mount(hfsmp);
        hfsmp->hfs_backingvp = di_vp;
        hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
        hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4;
        hfs_unlock_mount(hfsmp);

        /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */

        /*
         * If the sparse image is on a sparse image file (as opposed to a sparse
         * bundle), then we may need to limit the free space to the maximum size
         * of a file on that volume.  So we query (using pathconf), and if we get
         * a meaningful result, we cache the number of blocks for later use in
         */
        hfsmp->hfs_backingfs_maxblocks = 0;
        if (vnode_vtype(di_vp) == VREG) {
            terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
            if (terr == 0 && hostbits != 0 && hostbits < 64) {
                u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;

                hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
            }
        }

        /* The free extent cache is managed differently for sparse devices.
         * There is a window between which the volume is mounted and the
         * device is marked as sparse, so the free extent cache for this
         * volume is currently initialized as normal volume (sorted by block
         * count).  Reset the cache so that it will be rebuilt again
         * for sparse device (sorted by start block).
         */
        ResetVCBFreeExtCache(hfsmp);

        (void)vnode_put(di_vp);
        file_drop(bsdata->backingfd);
    }
    case HFSIOC_CLRBACKINGSTOREINFO: {
        struct vnode *tmpvp;

        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
            return (EACCES);    /* must be owner of file system */
        }
        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }

        if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
            hfsmp->hfs_backingvp) {
            hfs_lock_mount(hfsmp);
            hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
            tmpvp = hfsmp->hfs_backingvp;
            hfsmp->hfs_backingvp = NULLVP;
            hfsmp->hfs_sparsebandblks = 0;
            hfs_unlock_mount(hfsmp);
        }
    }
#endif /* HFS_SPARSE_DEV */
    /* Change the next CNID stored in the VH */
    case HFSIOC_CHANGE_NEXTCNID: {
        int error = 0;          /* Assume success */

        if (vnode_vfsisrdonly(vp)) {
        }
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
            return (EACCES);    /* must be owner of file system */
        }

        fileid = *(u_int32_t *)ap->a_data;

        /* Must have catalog lock excl. to advance the CNID pointer */
        lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);

        hfs_lock_mount(hfsmp);

        /* If it is less than the current next CNID, force the wraparound bit to be set */
        if (fileid < hfsmp->vcbNxtCNID) {
        }

        /* Return previous value. */
        *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;

        hfsmp->vcbNxtCNID = fileid;

        hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;

        MarkVCBDirty(hfsmp);
        hfs_unlock_mount(hfsmp);
        hfs_systemfile_unlock(hfsmp, lockflags);
    }
    case F_FREEZE_FS: {
        mp = vnode_mount(vp);
        hfsmp = VFSTOHFS(mp);

        vfsp = vfs_statfs(mp);

        if (kauth_cred_getuid(cred) != vfsp->f_owner &&
            !kauth_cred_issuser(cred))
            return (EACCES);

        return hfs_freeze(hfsmp);
    }

    case F_THAW_FS: {
        vfsp = vfs_statfs(vnode_mount(vp));
        if (kauth_cred_getuid(cred) != vfsp->f_owner &&
            !kauth_cred_issuser(cred))
            return (EACCES);

        return hfs_thaw(hfsmp, current_proc());
    }
    case HFSIOC_EXT_BULKACCESS32:
    case HFSIOC_EXT_BULKACCESS64: {
        if (hfsmp->hfs_flags & HFS_STANDARD) {
        }

        if (is64bit) {
            size = sizeof(struct user64_ext_access_t);
        } else {
            size = sizeof(struct user32_ext_access_t);
        }

        return do_bulk_access_check(hfsmp, vp, ap, size, context);
    }
    case HFSIOC_SET_XATTREXTENTS_STATE: {
        if (ap->a_data == NULL) {
        }

        state = *(int *)ap->a_data;

        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }

        /* Super-user can enable or disable extent-based extended
         * attribute support on a volume
         * Note: Starting Mac OS X 10.7, extent-based extended attributes
         * are enabled by default, so any change will be transient only
         * till the volume is remounted.
         */
        if (!kauth_cred_issuser(kauth_cred_get())) {
        }
        if (state == 0 || state == 1)
            return hfs_set_volxattr(hfsmp, HFSIOC_SET_XATTREXTENTS_STATE, state);
    }
    case F_SETSTATICCONTENT: {
        int enable_static = 0;
        struct cnode *cp = NULL;
        /*
         * lock the cnode, decorate the cnode flag, and bail out.
         * VFS should have already authenticated the caller for us.
         */

        /*
         * Note that even though ap->a_data is of type caddr_t,
         * the fcntl layer at the syscall handler will pass in NULL
         * or 1 depending on what the argument supplied to the fcntl
         * was.  So it is in fact correct to check the ap->a_data
         * argument for zero or non-zero value when deciding whether or not
         * to enable the static bit in the cnode.
         */

        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }

        error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
        if (enable_static) {
            cp->c_flag |= C_SSD_STATIC;
        } else {
            cp->c_flag &= ~C_SSD_STATIC;
        }
    }
    case F_SET_GREEDY_MODE: {
        int enable_greedy_mode = 0;
        struct cnode *cp = NULL;
        /*
         * lock the cnode, decorate the cnode flag, and bail out.
         * VFS should have already authenticated the caller for us.
         */

        /*
         * Note that even though ap->a_data is of type caddr_t,
         * the fcntl layer at the syscall handler will pass in NULL
         * or 1 depending on what the argument supplied to the fcntl
         * was.  So it is in fact correct to check the ap->a_data
         * argument for zero or non-zero value when deciding whether or not
         * to enable the greedy mode bit in the cnode.
         */
        enable_greedy_mode = 1;

        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }

        error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
        if (enable_greedy_mode) {
            cp->c_flag |= C_SSD_GREEDY_MODE;
        } else {
            cp->c_flag &= ~C_SSD_GREEDY_MODE;
        }
    }
        uint32_t iotypeflag = 0;

        struct cnode *cp = NULL;
        /*
         * lock the cnode, decorate the cnode flag, and bail out.
         * VFS should have already authenticated the caller for us.
         */

        if (ap->a_data == NULL) {
        }

        /*
         * Note that even though ap->a_data is of type caddr_t, we
         * can only use 32 bits of flag values.
         */
        iotypeflag = (uint32_t) ap->a_data;
        switch (iotypeflag) {
        case F_IOTYPE_ISOCHRONOUS:
        }

        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }

        error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
        switch (iotypeflag) {
        case F_IOTYPE_ISOCHRONOUS:
            cp->c_flag |= C_IO_ISOCHRONOUS;
        }
    case F_MAKECOMPRESSED: {
        uint32_t gen_counter;
        struct cnode *cp = NULL;
        int reset_decmp = 0;

        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }

        /*
         * acquire & lock the cnode.
         * VFS should have already authenticated the caller for us.
         */

        /*
         * Cast the pointer into a uint32_t so we can extract the
         * supplied generation counter.
         */
        gen_counter = *((uint32_t *)ap->a_data);

        /* Grab truncate lock first; we may truncate the file */
        hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);

        error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
        if (error) {
            hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
        }

        /* Are there any other usecounts/FDs? */
        if (vnode_isinuse(vp, 1)) {
            hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
        }

        /* now we have the cnode locked down; Validate arguments */
        if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
            /* EINVAL if you are trying to manipulate an IMMUTABLE file */
            hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
        }

        if ((hfs_get_gencount(cp)) == gen_counter) {
            /*
             * OK, the gen_counter matched.  Go for it:
             * Toggle state bits, truncate file, and suppress mtime update
             */
            cp->c_bsdflags |= UF_COMPRESSED;

            error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES,
        }

        /* Unlock cnode before executing decmpfs; they may need to get an EA */

        /*
         * Reset the decmp state while still holding the truncate lock.  We need to
         * serialize here against a listxattr on this node which may occur at any
         * time.
         *
         * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed,
         * that will still potentially require getting the com.apple.decmpfs EA.  If the
         * EA is required, then we can't hold the cnode lock, because the getxattr call is
         * generic (through VFS), and can't pass along any info telling it that we're already
         * holding it (the lock).  If we don't serialize, then we risk listxattr stopping
         * and trying to fill in the hfs_file_is_compressed info during the callback
         * operation, which will result in deadlock against the b-tree node.
         *
         * So, to serialize against listxattr (which will grab buf_t meta references on
         * the b-tree blocks), we hold the truncate lock as we're manipulating the
         */
        if ((reset_decmp) && (error == 0)) {
            decmpfs_cnode *dp = VTOCMP(vp);
            decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);

            /* Initialize the decmpfs node as needed */
            (void) hfs_file_is_compressed(cp, 0);   /* ok to take lock */
        }

        hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
    }
	case F_SETBACKINGSTORE: {
		int error = 0;

		/*
		 * See comment in F_SETSTATICCONTENT re: using
		 * a null check for a_data
		 */
		if (ap->a_data) {
			error = hfs_set_backingstore (vp, 1);
		}
		else {
			error = hfs_set_backingstore (vp, 0);
		}
		return error;
	}

	case F_GETPATH_MTMINFO: {
		int error = 0;

		int *data = (int*) ap->a_data;

		/* Ask if this is a backingstore vnode */
		error = hfs_is_backingstore (vp, data);

		return error;
	}
	case F_FULLFSYNC: {
		int error;

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}
		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		if (error == 0) {
			error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_FULL, p);
			hfs_unlock(VTOC(vp));
		}

		return error;
	}

	case F_BARRIERFSYNC: {
		int error;

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}
		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		if (error == 0) {
			error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_BARRIER, p);
			hfs_unlock(VTOC(vp));
		}

		return error;
	}
	case F_CHKCLEAN: {
		register struct cnode *cp;
		int error;

		if (!vnode_isreg(vp))
			return EINVAL;

		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		if (error == 0) {
			cp = VTOC(vp);
			/*
			 * used by regression test to determine if
			 * all the dirty pages (via write) have been cleaned
			 * after a call to 'fsync'.
			 */
			error = is_file_clean(vp, VTOF(vp)->ff_size);
			hfs_unlock(cp);
		}
		return (error);
	}
	case F_RDADVISE: {
		register struct radvisory *ra;
		struct filefork *fp;
		int error = 0;

		if (!vnode_isreg(vp))
			return EINVAL;

		ra = (struct radvisory *)(ap->a_data);
		fp = VTOF(vp);

		/* Protect against a size change. */
		hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);

#if HFS_COMPRESSION
		if (compressed) {
			if (uncompressed_size == -1) {
				/* fetching the uncompressed size failed above, so return the error */
				error = decmpfs_error;
			} else if (ra->ra_offset >= uncompressed_size) {
				error = EFBIG;
			} else {
				error = advisory_read(vp, uncompressed_size, ra->ra_offset, ra->ra_count);
			}
		} else
#endif /* HFS_COMPRESSION */
		if (ra->ra_offset >= fp->ff_size) {
			error = EFBIG;
		} else {
			error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
		}

		hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
		return (error);
	}
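	/*
	 * The selectors below report volume timestamps and report or adjust
	 * the free-space notification thresholds kept on the mount
	 * (danger < warning < near-warning < desired); the setters reject
	 * values that would invert that ordering.
	 *
	 * Illustrative userspace sketch (not part of this file): assuming the
	 * HFSIOC_* selectors from hfs_fsctl.h are visible to the caller and
	 * an HFS volume is mounted at the given path, the warning threshold
	 * could be read with fsctl(2) roughly like so:
	 *
	 *     uint32_t limit;
	 *     if (fsctl("/Volumes/MyHFS", HFSIOC_GET_LOW_DISK, &limit, 0) == 0)
	 *         printf("low-disk warning threshold: %u\n", limit);
	 */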
	case HFSIOC_GET_VOL_CREATE_TIME_32: {
		*(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
		break;
	}

	case HFSIOC_GET_VOL_CREATE_TIME_64: {
		*(user64_time_t *)(ap->a_data) = (user64_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
		break;
	}

	case SPOTLIGHT_IOC_GET_MOUNT_TIME:
		*(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
		break;

	case SPOTLIGHT_IOC_GET_LAST_MTIME:
		*(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
		break;

	case HFSIOC_GET_VERY_LOW_DISK:
		*(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
		break;

	case HFSIOC_SET_VERY_LOW_DISK:
		if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
			return EINVAL;
		}

		hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
		break;

	case HFSIOC_GET_LOW_DISK:
		*(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
		break;

	case HFSIOC_SET_LOW_DISK:
		if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
			|| *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {

			return EINVAL;
		}

		hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
		break;

	/* The following two fsctls were ported from apfs. */
	case APFSIOC_GET_NEAR_LOW_DISK:
		*(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_nearwarninglimit;
		break;

	case APFSIOC_SET_NEAR_LOW_DISK:
		if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
			|| *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
			return EINVAL;
		}

		hfsmp->hfs_freespace_notify_nearwarninglimit = *(uint32_t *)ap->a_data;
		break;

	case HFSIOC_GET_DESIRED_DISK:
		*(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
		break;

	case HFSIOC_SET_DESIRED_DISK:
		if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
			return EINVAL;
		}

		hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
		break;

	case HFSIOC_VOLUME_STATUS:
		*(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
		break;
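	/*
	 * HFS_SET_BOOT_INFO / HFS_GET_BOOT_INFO copy the Finder info words
	 * (vcbFndrInfo) into or out of the in-memory volume header.  The
	 * setter requires the superuser or the owner of the filesystem,
	 * clears the cached volume UUID, and flushes the volume header.
	 */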
	case HFS_SET_BOOT_INFO:
		if (!vnode_isvroot(vp))
			return(EINVAL);
		if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
			return(EACCES);	/* must be superuser or owner of filesystem */
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}
		hfs_lock_mount (hfsmp);
		bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
		/* Null out the cached UUID, to be safe */
		uuid_clear (hfsmp->hfs_full_uuid);
		hfs_unlock_mount (hfsmp);
		(void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT);
		break;

	case HFS_GET_BOOT_INFO:
		if (!vnode_isvroot(vp))
			return(EINVAL);
		hfs_lock_mount (hfsmp);
		bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
		hfs_unlock_mount(hfsmp);
		break;

	/* case HFS_MARK_BOOT_CORRUPT: _IO are the same */
	case HFSIOC_MARK_BOOT_CORRUPT:
		/* Mark the boot volume corrupt by setting
		 * kHFSVolumeInconsistentBit in the volume header.  This will
		 * force fsck_hfs on next mount.
		 */
		if (!kauth_cred_issuser(kauth_cred_get())) {
			return EACCES;
		}

		/* Allowed only on the root vnode of the boot volume */
		if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
		    !vnode_isvroot(vp)) {
			return EINVAL;
		}
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}
		printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
		hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED);
		break;
	case HFSIOC_GET_JOURNAL_INFO:
		jip = (struct hfs_journal_info*)ap->a_data;

		if (hfsmp->jnl == NULL) {
			jnl_start = 0;
			jnl_size  = 0;
		} else {
			jnl_start = hfs_blk_to_bytes(hfsmp->jnl_start, hfsmp->blockSize) + hfsmp->hfsPlusIOPosOffset;
			jnl_size  = hfsmp->jnl_size;
		}

		jip->jstart = jnl_start;
		jip->jsize = jnl_size;
		break;

	case HFSIOC_SET_ALWAYS_ZEROFILL: {
		struct cnode *cp = VTOC(vp);

		if (*(int *)ap->a_data) {
			cp->c_flag |= C_ALWAYS_ZEROFILL;
		} else {
			cp->c_flag &= ~C_ALWAYS_ZEROFILL;
		}
		break;
	}

	/* case HFS_DISABLE_METAZONE: _IO are the same */
	case HFSIOC_DISABLE_METAZONE: {
		/* Only root can disable metadata zone */
		if (!kauth_cred_issuser(kauth_cred_get())) {
			return EACCES;
		}
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}

		/* Disable metadata zone now */
		(void) hfs_metadatazone_init(hfsmp, true);
		printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
		break;
	}
	case HFSIOC_FSINFO_METADATA_BLOCKS: {
		int error;
		struct hfsinfo_metadata *hinfo;

		hinfo = (struct hfsinfo_metadata *)ap->a_data;

		/* Get information about number of metadata blocks */
		error = hfs_getinfo_metadata_blocks(hfsmp, hinfo);
		if (error) {
			return error;
		}

		break;
	}

	case HFSIOC_GET_FSINFO: {
		hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;

		/* Only root is allowed to get fsinfo */
		if (!kauth_cred_issuser(kauth_cred_get())) {
			return EACCES;
		}

		/*
		 * Make sure that the caller's version number matches with
		 * the kernel's version number.  This will make sure that
		 * if the structures being read/written into are changed
		 * by the kernel, the caller will not read incorrect data.
		 *
		 * The first three fields --- request_type, version and
		 * flags are same for all the hfs_fsinfo structures, so
		 * we can access the version number by assuming any
		 * structure for now.
		 */
		if (fsinfo->header.version != HFS_FSINFO_VERSION) {
			return ENOTSUP;
		}

		/* Make sure that the current file system is not marked inconsistent */
		if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
			return EIO;
		}

		return hfs_get_fsinfo(hfsmp, ap->a_data);
	}
	case HFSIOC_CS_FREESPACE_TRIM: {
		int error = 0;
		int lockflags = 0;

		/* Only root allowed */
		if (!kauth_cred_issuser(kauth_cred_get())) {
			return EACCES;
		}

		/*
		 * This core functionality is similar to hfs_scan_blocks().
		 * The main difference is that hfs_scan_blocks() is called
		 * as part of mount where we are assured that the journal is
		 * empty to start with.  This fcntl() can be called on a
		 * mounted volume, therefore it has to flush the content of
		 * the journal as well as ensure the state of summary table.
		 *
		 * This fcntl scans over the entire allocation bitmap,
		 * creates list of all the free blocks, and issues TRIM
		 * down to the underlying device.  This can take long time
		 * as it can generate up to 512MB of read I/O.
		 */

		if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
			error = hfs_init_summary(hfsmp);
			if (error) {
				printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN);
				return error;
			}
		}

		/*
		 * The journal maintains list of recently deallocated blocks to
		 * issue DKIOCUNMAPs when the corresponding journal transaction is
		 * flushed to the disk.  To avoid any race conditions, we only
		 * want one active trim list and only one thread issuing DKIOCUNMAPs.
		 * Therefore we make sure that the journal trim list is sync'ed,
		 * empty, and not modifiable for the duration of our scan.
		 *
		 * Take the journal lock before flushing the journal to the disk.
		 * We will keep on holding the journal lock till we don't get the
		 * bitmap lock to make sure that no new journal transactions can
		 * start.  This will make sure that the journal trim list is not
		 * modified after the journal flush and before getting bitmap lock.
		 * We can release the journal lock after we acquire the bitmap
		 * lock as it will prevent any further block deallocations.
		 */
		hfs_journal_lock(hfsmp);

		/* Flush the journal and wait for all I/Os to finish up */
		error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META);
		if (error) {
			hfs_journal_unlock(hfsmp);
			return error;
		}

		/* Take bitmap lock to ensure it is not being modified */
		lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);

		/* Release the journal lock */
		hfs_journal_unlock(hfsmp);

		/*
		 * ScanUnmapBlocks reads the bitmap in large block size
		 * (up to 1MB) unlike the runtime which reads the bitmap
		 * in the 4K block size.  This can cause buf_t collisions
		 * and potential data corruption.  To avoid this, we
		 * invalidate all the existing buffers associated with
		 * the bitmap vnode before scanning it.
		 *
		 * Note: ScanUnmapBlock() cleans up all the buffers
		 * after itself, so there won't be any large buffers left
		 * for us to clean up after it returns.
		 */
		error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
		if (error) {
			hfs_systemfile_unlock(hfsmp, lockflags);
			return error;
		}

		/* Traverse bitmap and issue DKIOCUNMAPs */
		error = ScanUnmapBlocks(hfsmp);
		hfs_systemfile_unlock(hfsmp, lockflags);
		if (error) {
			return error;
		}

		break;
	}
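	/*
	 * HFSIOC_SET_HOTFILE_STATE marks or unmarks the vnode as a hotfile
	 * (fast-device) candidate.  HFS_NEVER_FASTDEVCANDIDATE additionally
	 * sets the do-not-pin record flag, and any blocks unpinned as a side
	 * effect are returned to the hotfile free-block count.
	 */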
	case HFSIOC_SET_HOTFILE_STATE: {
		int error;
		struct cnode *cp = VTOC(vp);
		uint32_t hf_state = *((uint32_t*)ap->a_data);
		uint32_t num_unpinned = 0;

		error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		if (error) {
			return error;
		}

		// printf("hfs: setting hotfile state %d on %s\n", hf_state, vp->v_name);
		if (hf_state == HFS_MARK_FASTDEVCANDIDATE) {
			vnode_setfastdevicecandidate(vp);

			cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask;
			cp->c_attr.ca_recflags &= ~kHFSDoNotFastDevPinMask;
			cp->c_flag |= C_MODIFIED;

		} else if (hf_state == HFS_UNMARK_FASTDEVCANDIDATE || hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
			vnode_clearfastdevicecandidate(vp);
			hfs_removehotfile(vp);

			if (cp->c_attr.ca_recflags & kHFSFastDevPinnedMask) {
				hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &num_unpinned);
			}

			if (hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
				cp->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask;
			}
			cp->c_attr.ca_recflags &= ~(kHFSFastDevCandidateMask|kHFSFastDevPinnedMask);
			cp->c_flag |= C_MODIFIED;

		} else {
			error = EINVAL;
		}

		if (num_unpinned != 0) {
			lck_mtx_lock(&hfsmp->hfc_mutex);
			hfsmp->hfs_hotfile_freeblks += num_unpinned;
			lck_mtx_unlock(&hfsmp->hfc_mutex);
		}

		hfs_unlock(cp);
		return error;
	}
	case HFSIOC_REPIN_HOTFILE_STATE: {
		int error = 0;
		uint32_t repin_what = *((uint32_t*)ap->a_data);

		/* Only root allowed */
		if (!kauth_cred_issuser(kauth_cred_get())) {
			return EACCES;
		}

		if (!(hfsmp->hfs_flags & (HFS_CS_METADATA_PIN | HFS_CS_HOTFILE_PIN))) {
			// this system is neither regular Fusion or Cooperative Fusion
			// so this fsctl makes no sense.
			return EINVAL;
		}

		//
		// After converting a CoreStorage volume to be encrypted, the
		// extents could have moved around underneath us.  This call
		// allows corestoraged to re-pin everything that should be
		// pinned (it would happen on the next reboot too but that could
		// be a long time away).
		//
		if ((repin_what & HFS_REPIN_METADATA) && (hfsmp->hfs_flags & HFS_CS_METADATA_PIN)) {
			hfs_pin_fs_metadata(hfsmp);
		}
		if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) {
			hfs_repin_hotfiles(hfsmp);
		}
		if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_SWAPFILE_PIN)) {
			//XXX Swapfiles (marked SWAP_PINNED) may have moved too.
			//XXX Do we care?  They have a more transient/dynamic nature/lifetime.
		}

		return error;
	}
#if HFS_CONFIG_KEY_ROLL

	case HFSIOC_KEY_ROLL: {
		if (!kauth_cred_issuser(kauth_cred_get()))
			return EACCES;

		hfs_key_roll_args_t *args = (hfs_key_roll_args_t *)ap->a_data;

		return hfs_key_roll_op(ap->a_context, ap->a_vp, args);
	}

	case HFSIOC_GET_KEY_AUTO_ROLL: {
		if (!kauth_cred_issuser(kauth_cred_get()))
			return EACCES;

		hfs_key_auto_roll_args_t *args = (hfs_key_auto_roll_args_t *)ap->a_data;
		if (args->api_version != HFS_KEY_AUTO_ROLL_API_VERSION_1)
			return ENOTSUP;
		args->flags = (ISSET(hfsmp->cproot_flags, CP_ROOT_AUTO_ROLL_OLD_CLASS_GENERATION)
					   ? HFS_KEY_AUTO_ROLL_OLD_CLASS_GENERATION : 0);
		args->min_key_os_version = hfsmp->hfs_auto_roll_min_key_os_version;
		args->max_key_os_version = hfsmp->hfs_auto_roll_max_key_os_version;
		break;
	}

	case HFSIOC_SET_KEY_AUTO_ROLL: {
		if (!kauth_cred_issuser(kauth_cred_get()))
			return EACCES;

		hfs_key_auto_roll_args_t *args = (hfs_key_auto_roll_args_t *)ap->a_data;
		if (args->api_version != HFS_KEY_AUTO_ROLL_API_VERSION_1)
			return ENOTSUP;
		return cp_set_auto_roll(hfsmp, args);
	}

#endif // HFS_CONFIG_KEY_ROLL
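	/*
	 * The next few selectors are content-protection entry points
	 * (transcode the file key, query the root and default protection
	 * class versions); they are compiled in only under CONFIG_PROTECT.
	 */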
#if CONFIG_PROTECT
	case F_TRANSCODEKEY:
		/*
		 * This API is only supported when called via kernel so
		 * a_fflag must be set to 1 (it's not possible to get here
		 * with it set to 1 via fsctl).
		 */
		if (ap->a_fflag != 1)
			return ENOTTY;

		return cp_vnode_transcode(vp, (cp_key_t *)ap->a_data);

	case F_GETPROTECTIONLEVEL:
		return cp_get_root_major_vers (vp, (uint32_t *)ap->a_data);

	case F_GETDEFAULTPROTLEVEL:
		return cp_get_default_level(vp, (uint32_t *)ap->a_data);
#endif // CONFIG_PROTECT
		return hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT | HFS_DATALESS_PIN,
		                     NULL);
	case FSIOC_CAS_BSDFLAGS: {
		struct fsioc_cas_bsdflags *cas = (void *)ap->a_data;
		struct cnode *cp = VTOC(vp);
		u_int32_t document_id = 0;
		bool need_truncate = false;
		int decmpfs_reset_state = 0;
		int error = 0;

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}

		/* Don't allow modification of the journal. */
		if (hfs_is_journal_file(hfsmp, cp)) {
			return (EPERM);
		}

		// Check if we need to set UF_COMPRESSED.
		// If so, ask decmpfs if we're allowed to (and if so, if we need to truncate
		// the data fork to 0).
		if (!(cas->expected_flags & UF_COMPRESSED) && (cas->new_flags & UF_COMPRESSED)) {
			struct vnode_attr vap;
			VATTR_INIT(&vap);
			VATTR_SET(&vap, va_flags, cas->new_flags);

			error = decmpfs_update_attributes(vp, &vap);
			if (error) {
				return (error);
			}

			// Similar to hfs_vnop_setattr(), we call decmpfs_update_attributes()
			// as it is the ultimate arbiter of whether or not UF_COMPRESSED can be set.
			// (If the decmpfs xattr is not present or invalid, for example,
			// UF_COMPRESSED should *not* be set.)
			// It will also tell us if we need to truncate the data fork to 0.
			if (!(vap.va_flags & UF_COMPRESSED)) {
				// The request to update UF_COMPRESSED is denied.
				// (Note that decmpfs_update_attributes() won't touch va_active
				// in this case.)  Error out.
				return (EPERM);
			}

			if (VATTR_IS_ACTIVE(&vap, va_data_size) && (vap.va_data_size == 0)) {
				// We must also truncate this file's data fork to 0.
				need_truncate = true;
			}
		}

		if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
			return (error);
		}

		cas->actual_flags = cp->c_bsdflags;
		if (cas->actual_flags != cas->expected_flags) {
			hfs_unlock(cp);
			break;
		}

		//
		// Check if we'll need a document_id.  If so, we need to drop the lock
		// (to avoid any possible deadlock with the root vnode which has to get
		// locked to get the document id), generate the document_id, re-acquire
		// the lock, and perform the CAS check again.  We do it in this sequence
		// in order to avoid throwing away document_ids in the case where the
		// CAS check fails.  Note that it can still happen, but by performing
		// the check first, hopefully we can reduce the occurrence.
		//
		if ((cas->new_flags & UF_TRACKED) && !(VTOC(vp)->c_bsdflags & UF_TRACKED)) {
			struct FndrExtendedDirInfo *fip = (struct FndrExtendedDirInfo *)((char *)&(VTOC(vp)->c_attr.ca_finderinfo) + 16);

			//
			// If the document_id is not set, get a new one.  It will be set
			// on the file down below once we hold the cnode lock.
			//
			if (fip->document_id == 0) {
				//
				// Drat, we have to generate one.  Unlock the cnode, do the
				// deed, re-lock the cnode, and then to the CAS check again
				// to see if we lost the race.
				//
				hfs_unlock(cp);
				if (hfs_generate_document_id(hfsmp, &document_id) != 0) {
					document_id = 0;
				}
				if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
					return (error);
				}
				cas->actual_flags = cp->c_bsdflags;
				if (cas->actual_flags != cas->expected_flags) {
					hfs_unlock(cp);
					break;
				}
			}
		}

		// Attempt to truncate our data fork to 0 length, if necessary.
		if (need_truncate && (VTOF(vp)->ff_size)) {
			hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
			// hfs_truncate will deal with the cnode lock
			error = hfs_truncate(vp, 0, IO_NDELAY, 0, ap->a_context);
			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);

			if (error) {
				hfs_unlock(cp);
				return (error);
			}
		}

		error = hfs_set_bsd_flags(hfsmp, cp, cas->new_flags,
		                          document_id, ap->a_context,
		                          &decmpfs_reset_state);
		if (error == 0) {
			error = hfs_update(vp, 0);
		}

		hfs_unlock(cp);
		if (error) {
			return (error);
		}

#if HFS_COMPRESSION
		if (decmpfs_reset_state) {
			/*
			 * we've changed the UF_COMPRESSED flag, so reset the decmpfs state for this cnode
			 * but don't do it while holding the hfs cnode lock
			 */
			decmpfs_cnode *dp = VTOCMP(vp);
			if (!dp) {
				/*
				 * call hfs_lazy_init_decmpfs_cnode() to make sure that the decmpfs_cnode
				 * is filled in; we need a decmpfs_cnode to prevent decmpfs state changes
				 * on this file if it's locked
				 */
				dp = hfs_lazy_init_decmpfs_cnode(VTOC(vp));
				if (!dp) {
					/* failed to allocate a decmpfs_cnode */
					return ENOMEM; /* what should this be? */
				}
			}
			decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
		}
#endif /* HFS_COMPRESSION */

		break; // return 0 below
	}
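/*
 * hfs_vnop_select
 *
 * HFS does not track per-vnode readiness, so select support here is trivial.
 */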
int
hfs_vnop_select(__unused struct vnop_select_args *ap)
/*
	struct vnop_select_args {
		vnode_t a_vp;
		int a_which;
		int a_fflags;
		void *a_wql;
		vfs_context_t a_context;
	};
*/
{
	/*
	 * We should really check to see if I/O is possible.
	 */
	return (1);
}
/*
 * Converts a logical block number to a physical block, and optionally returns
 * the amount of remaining blocks in a run.  The logical block is based on hfsNode.logBlockSize.
 * The physical block number is based on the device block size, currently it's 512.
 * The block run is returned in logical blocks, and is the REMAINING amount of blocks.
 */
int
hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
{
	struct filefork *fp = VTOF(vp);
	struct hfsmount *hfsmp = VTOHFS(vp);
	int retval = E_NONE;
	u_int32_t logBlockSize;
	size_t bytesContAvail = 0;
	off_t blockposition;
	int lockExtBtree;
	int lockflags;

	/*
	 * Check for underlying vnode requests and ensure that logical
	 * to physical mapping is requested.
	 */
	if (vpp != NULL)
		*vpp = hfsmp->hfs_devvp;
	if (bnp == NULL)
		return (0);

	logBlockSize = GetLogicalBlockSize(vp);
	blockposition = (off_t)bn * logBlockSize;

	lockExtBtree = overflow_extents(fp);

	if (lockExtBtree)
		lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);

	retval = MacToVFSError(
			MapFileBlockC (HFSTOVCB(hfsmp),
					(FCB*)fp,
					MAXPHYSIO,
					blockposition,
					bnp,
					&bytesContAvail));

	if (lockExtBtree)
		hfs_systemfile_unlock(hfsmp, lockflags);

	if (retval == E_NONE) {
		/* Figure out how many read ahead blocks there are */
		if (runp != NULL) {
			if (can_cluster(logBlockSize)) {
				/* Make sure this result never goes negative: */
				*runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
			} else {
				*runp = 0;
			}
		}
	}
	return (retval);
}
/*
 * Convert logical block number to file offset.
 */
int
hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
/*
	struct vnop_blktooff_args {
		vnode_t a_vp;
		daddr64_t a_lblkno;
		off_t *a_offset;
	};
*/
{
	if (ap->a_vp == NULL)
		return (EINVAL);

	*ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);

	return(0);
}

/*
 * Convert file offset to logical block number.
 */
int
hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
/*
	struct vnop_offtoblk_args {
		vnode_t a_vp;
		off_t a_offset;
		daddr64_t *a_lblkno;
	};
*/
{
	if (ap->a_vp == NULL)
		return (EINVAL);

	*ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));

	return(0);
}
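/*
 * Worked example (for illustration only): with a logical block size of
 * 4096 bytes, hfs_vnop_blktooff maps logical block 3 to file offset
 * 3 * 4096 = 12288, and hfs_vnop_offtoblk maps any offset in
 * [12288, 16383] back to logical block 12288 / 4096 = 3 (integer division).
 */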
/*
 * Map file offset to physical block number.
 *
 * If this function is called for write operation, and if the file
 * had virtual blocks allocated (delayed allocation), real blocks
 * are allocated by calling ExtendFileC().
 *
 * If this function is called for read operation, and if the file
 * had virtual blocks allocated (delayed allocation), no change
 * to the size of file is done, and if required, rangelist is
 * searched for mapping.
 *
 * System file cnodes are expected to be locked (shared or exclusive).
 *
 * -- INVALID RANGES --
 *
 * Invalid ranges are used to keep track of where we have extended a
 * file, but have not yet written that data to disk.  In the past we
 * would clear up the invalid ranges as we wrote to those areas, but
 * before data was actually flushed to disk.  The problem with that
 * approach is that the data can be left in the cache and is therefore
 * still not valid on disk.  So now we clear up the ranges here, when
 * the flags field has VNODE_WRITE set, indicating a write is about to
 * occur.  This isn't ideal (ideally we want to clear them up when we
 * know the data has been successfully written), but it's the best we
 * can do.
 *
 * For reads, we use the invalid ranges here in block map to indicate
 * to the caller that the data should be zeroed (a_bpn == -1).  We
 * have to be careful about what ranges we return to the cluster code.
 * Currently the cluster code can only handle non-rounded values for
 * the EOF; it cannot handle funny sized ranges in the middle of the
 * file (the main problem is that it sends down odd sized I/Os to the
 * disk).  Our code currently works because whilst the very first
 * offset and the last offset in the invalid ranges are not aligned,
 * gaps in the invalid ranges between the first and last, have to be
 * aligned (because we always write page sized blocks).  For example,
 * consider this arrangement:
 *
 *         +-------------+-----+-------+------+
 *         |             |XXXXX|       |XXXXXX|
 *         +-------------+-----+-------+------+
 *                       a     b       c      d
 *
 * This shows two invalid ranges <a, b> and <c, d>.  Whilst a and d
 * are not necessarily aligned, b and c *must* be.
 *
 * Zero-filling occurs in a number of ways:
 *
 *   1. When a read occurs and we return with a_bpn == -1.
 *
 *   2. When hfs_fsync or hfs_filedone calls hfs_flush_invalid_ranges
 *      which will cause us to iterate over the ranges bringing in
 *      pages that are not present in the cache and zeroing them.  Any
 *      pages that are already in the cache are left untouched.  Note
 *      that hfs_fsync does not always flush invalid ranges.
 *
 *   3. When we extend a file we zero out from the old EOF to the end
 *      of the page.  It would be nice if we didn't have to do this if
 *      the page wasn't present (and could defer it), but because of
 *      the problem described above, we have to.
 *
 * The invalid ranges are also used to restrict the size that we write
 * out on disk: see hfs_prepare_fork_for_update.
 *
 * Note that invalid ranges are ignored when neither the VNODE_READ or
 * the VNODE_WRITE flag is specified.  This is useful for the
 * F_LOG2PHYS* fcntls which are not interested in invalid ranges: they
 * just want to know whether blocks are physically allocated or not.
 */
3251 hfs_vnop_blockmap(struct vnop_blockmap_args
*ap
)
3253 struct vnop_blockmap_args {
3261 vfs_context_t a_context;
3265 struct vnode
*vp
= ap
->a_vp
;
3267 struct filefork
*fp
;
3268 struct hfsmount
*hfsmp
;
3269 size_t bytesContAvail
= ap
->a_size
;
3270 int retval
= E_NONE
;
3273 struct rl_entry
*invalid_range
;
3274 enum rl_overlaptype overlaptype
;
3279 if (VNODE_IS_RSRC(vp
)) {
3280 /* allow blockmaps to the resource fork */
3282 if ( hfs_file_is_compressed(VTOC(vp
), 1) ) { /* 1 == don't take the cnode lock */
3283 int state
= decmpfs_cnode_get_vnode_state(VTOCMP(vp
));
3285 case FILE_IS_COMPRESSED
:
3287 case FILE_IS_CONVERTING
:
3288 /* if FILE_IS_CONVERTING, we allow blockmap */
3291 printf("invalid state %d for compressed file\n", state
);
3296 #endif /* HFS_COMPRESSION */
3298 /* Do not allow blockmap operation on a directory */
3299 if (vnode_isdir(vp
)) {
3304 * Check for underlying vnode requests and ensure that logical
3305 * to physical mapping is requested.
3307 if (ap
->a_bpn
== NULL
)
3314 if ( !vnode_issystem(vp
) && !vnode_islnk(vp
) && !vnode_isswap(vp
)) {
3315 if (cp
->c_lockowner
!= current_thread()) {
3316 hfs_lock(VTOC(vp
), HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
);
3320 // For reads, check the invalid ranges
3321 if (ISSET(ap
->a_flags
, VNODE_READ
)) {
3322 if (ap
->a_foffset
>= fp
->ff_size
) {
3327 overlaptype
= rl_scan(&fp
->ff_invalidranges
, ap
->a_foffset
,
3328 ap
->a_foffset
+ (off_t
)bytesContAvail
- 1,
3330 switch(overlaptype
) {
3331 case RL_MATCHINGOVERLAP
:
3332 case RL_OVERLAPCONTAINSRANGE
:
3333 case RL_OVERLAPSTARTSBEFORE
:
3334 /* There's no valid block for this byte offset */
3335 *ap
->a_bpn
= (daddr64_t
)-1;
3336 /* There's no point limiting the amount to be returned
3337 * if the invalid range that was hit extends all the way
3338 * to the EOF (i.e. there's no valid bytes between the
3339 * end of this range and the file's EOF):
3341 if (((off_t
)fp
->ff_size
> (invalid_range
->rl_end
+ 1)) &&
3342 ((size_t)(invalid_range
->rl_end
+ 1 - ap
->a_foffset
) < bytesContAvail
)) {
3343 bytesContAvail
= invalid_range
->rl_end
+ 1 - ap
->a_foffset
;
3349 case RL_OVERLAPISCONTAINED
:
3350 case RL_OVERLAPENDSAFTER
:
3351 /* The range of interest hits an invalid block before the end: */
3352 if (invalid_range
->rl_start
== ap
->a_foffset
) {
3353 /* There's actually no valid information to be had starting here: */
3354 *ap
->a_bpn
= (daddr64_t
)-1;
3355 if (((off_t
)fp
->ff_size
> (invalid_range
->rl_end
+ 1)) &&
3356 ((size_t)(invalid_range
->rl_end
+ 1 - ap
->a_foffset
) < bytesContAvail
)) {
3357 bytesContAvail
= invalid_range
->rl_end
+ 1 - ap
->a_foffset
;
3364 * Sadly, the lower layers don't like us to
3365 * return unaligned ranges, so we skip over
3366 * any invalid ranges here that are less than
3367 * a page: zeroing of those bits is not our
3368 * responsibility (it's dealt with elsewhere).
3371 off_t rounded_start
= round_page_64(invalid_range
->rl_start
);
3372 if ((off_t
)bytesContAvail
< rounded_start
- ap
->a_foffset
)
3374 if (rounded_start
< invalid_range
->rl_end
+ 1) {
3375 bytesContAvail
= rounded_start
- ap
->a_foffset
;
3378 } while ((invalid_range
= TAILQ_NEXT(invalid_range
,
3390 if (cp
->c_cpentry
) {
3391 const int direction
= (ISSET(ap
->a_flags
, VNODE_WRITE
)
3392 ? VNODE_WRITE
: VNODE_READ
);
3394 cp_io_params_t io_params
;
3395 cp_io_params(hfsmp
, cp
->c_cpentry
,
3396 off_rsrc_make(ap
->a_foffset
, VNODE_IS_RSRC(vp
)),
3397 direction
, &io_params
);
3399 if (io_params
.max_len
< (off_t
)bytesContAvail
)
3400 bytesContAvail
= io_params
.max_len
;
3402 if (io_params
.phys_offset
!= -1) {
3403 *ap
->a_bpn
= ((io_params
.phys_offset
+ hfsmp
->hfsPlusIOPosOffset
)
3404 / hfsmp
->hfs_logical_block_size
);
3414 /* Check virtual blocks only when performing write operation */
3415 if ((ap
->a_flags
& VNODE_WRITE
) && (fp
->ff_unallocblocks
!= 0)) {
3416 if (hfs_start_transaction(hfsmp
) != 0) {
3422 syslocks
= SFL_EXTENTS
| SFL_BITMAP
;
3424 } else if (overflow_extents(fp
)) {
3425 syslocks
= SFL_EXTENTS
;
3429 lockflags
= hfs_systemfile_lock(hfsmp
, syslocks
, HFS_EXCLUSIVE_LOCK
);
3432 * Check for any delayed allocations.
3434 if ((ap
->a_flags
& VNODE_WRITE
) && (fp
->ff_unallocblocks
!= 0)) {
3436 u_int32_t loanedBlocks
;
3439 // Make sure we have a transaction. It's possible
3440 // that we came in and fp->ff_unallocblocks was zero
3441 // but during the time we blocked acquiring the extents
3442 // btree, ff_unallocblocks became non-zero and so we
3443 // will need to start a transaction.
3445 if (started_tr
== 0) {
3447 hfs_systemfile_unlock(hfsmp
, lockflags
);
3454 * Note: ExtendFileC will Release any blocks on loan and
3455 * aquire real blocks. So we ask to extend by zero bytes
3456 * since ExtendFileC will account for the virtual blocks.
3459 loanedBlocks
= fp
->ff_unallocblocks
;
3460 retval
= ExtendFileC(hfsmp
, (FCB
*)fp
, 0, 0,
3461 kEFAllMask
| kEFNoClumpMask
, &actbytes
);
3464 fp
->ff_unallocblocks
= loanedBlocks
;
3465 cp
->c_blocks
+= loanedBlocks
;
3466 fp
->ff_blocks
+= loanedBlocks
;
3468 hfs_lock_mount (hfsmp
);
3469 hfsmp
->loanedBlocks
+= loanedBlocks
;
3470 hfs_unlock_mount (hfsmp
);
3472 hfs_systemfile_unlock(hfsmp
, lockflags
);
3473 cp
->c_flag
|= C_MODIFIED
;
3475 (void) hfs_update(vp
, 0);
3476 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3478 hfs_end_transaction(hfsmp
);
3486 // check for the alternate xattr vnode
3487 if (vp
== hfsmp
->hfs_attrdata_vp
) {
3488 HFSPlusExtentDescriptor real_fext
;
3489 size_t availableBytes
;
3490 u_int32_t sectorsPerBlock
; // Number of sectors per allocation block
3491 u_int32_t sectorSize
;
3494 if (!hfs_xattr_fext_find(&hfsmp
->hfs_xattr_io
, hfsmp
->blockSize
,
3495 ap
->a_foffset
, &real_fext
, &f_offset
)) {
3496 panic("cannot find xattr fext for %llu", f_offset
);
3499 sectorSize
= hfsmp
->hfs_logical_block_size
;
3500 // Compute the number of sectors in an allocation block
3501 sectorsPerBlock
= hfsmp
->blockSize
/ sectorSize
;
3503 *ap
->a_bpn
= (f_offset
/ hfsmp
->blockSize
) * sectorsPerBlock
;
3504 availableBytes
= real_fext
.blockCount
* hfsmp
->blockSize
- (f_offset
- (real_fext
.startBlock
* hfsmp
->blockSize
));
3505 if (availableBytes
< bytesContAvail
) {
3506 bytesContAvail
= availableBytes
;
3513 retval
= MapFileBlockC(hfsmp
, (FCB
*)fp
, bytesContAvail
, ap
->a_foffset
,
3514 ap
->a_bpn
, &bytesContAvail
);
3521 hfs_systemfile_unlock(hfsmp
, lockflags
);
3526 /* On write, always return error because virtual blocks, if any,
3527 * should have been allocated in ExtendFileC(). We do not
3528 * allocate virtual blocks on read, therefore return error
3529 * only if no virtual blocks are allocated. Otherwise we search
3530 * rangelist for zero-fills
3532 if ((MacToVFSError(retval
) != ERANGE
) ||
3533 (ap
->a_flags
& VNODE_WRITE
) ||
3534 ((ap
->a_flags
& VNODE_READ
) && (fp
->ff_unallocblocks
== 0))) {
3538 /* Validate if the start offset is within logical file size */
3539 if (ap
->a_foffset
>= fp
->ff_size
) {
3544 * At this point, we have encountered a failure during
3545 * MapFileBlockC that resulted in ERANGE, and we are not
3546 * servicing a write, and there are borrowed blocks.
3548 * However, the cluster layer will not call blockmap for
3549 * blocks that are borrowed and in-cache. We have to assume
3550 * that because we observed ERANGE being emitted from
3551 * MapFileBlockC, this extent range is not valid on-disk. So
3552 * we treat this as a mapping that needs to be zero-filled
3556 if (fp
->ff_size
- ap
->a_foffset
< (off_t
)bytesContAvail
)
3557 bytesContAvail
= fp
->ff_size
- ap
->a_foffset
;
3559 *ap
->a_bpn
= (daddr64_t
) -1;
3567 if (ISSET(ap
->a_flags
, VNODE_WRITE
)) {
3568 struct rl_entry
*r
= TAILQ_FIRST(&fp
->ff_invalidranges
);
3570 // See if we might be overlapping invalid ranges...
3571 if (r
&& (ap
->a_foffset
+ (off_t
)bytesContAvail
) > r
->rl_start
) {
3573 * Mark the file as needing an update if we think the
3574 * on-disk EOF has changed.
3576 if (ap
->a_foffset
<= r
->rl_start
)
3577 SET(cp
->c_flag
, C_MODIFIED
);
3580 * This isn't the ideal place to put this. Ideally, we
3581 * should do something *after* we have successfully
3582 * written to the range, but that's difficult to do
3583 * because we cannot take locks in the callback. At
3584 * present, the cluster code will call us with VNODE_WRITE
3585 * set just before it's about to write the data so we know
3586 * that data is about to be written. If we get an I/O
3587 * error at this point then chances are the metadata
3588 * update to follow will also have an I/O error so the
3589 * risk here is small.
3591 rl_remove(ap
->a_foffset
, ap
->a_foffset
+ bytesContAvail
- 1,
3592 &fp
->ff_invalidranges
);
3594 if (!TAILQ_FIRST(&fp
->ff_invalidranges
)) {
3595 cp
->c_flag
&= ~C_ZFWANTSYNC
;
3596 cp
->c_zftimeout
= 0;
3602 *ap
->a_run
= bytesContAvail
;
3605 *(int *)ap
->a_poff
= 0;
3609 hfs_update(vp
, TRUE
);
3610 hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3611 hfs_end_transaction(hfsmp
);
3618 return (MacToVFSError(retval
));
/*
 * prepare and issue the I/O
 * buf_strategy knows how to deal
 * with requests that require
 * fragmented I/Os
 */
int
hfs_vnop_strategy(struct vnop_strategy_args *ap)
{
	buf_t	bp = ap->a_bp;
	vnode_t	vp = buf_vnode(bp);
	int error = 0;

	/* Mark buffer as containing static data if cnode flag set */
	if (VTOC(vp)->c_flag & C_SSD_STATIC) {
		bufattr_markstatic(buf_attr(bp));
	}

	/* Mark buffer for greedy-mode writes if cnode flag set */
	if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
		bufattr_markgreedymode(buf_attr(bp));
	}

	/* mark buffer as containing burst mode data if cnode flag set */
	if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) {
		bufattr_markisochronous(buf_attr(bp));
	}

#if CONFIG_PROTECT
	error = cp_handle_strategy(bp);

	if (error)
		return error;
#endif

	error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);

	return error;
}
3662 do_hfs_truncate(struct vnode
*vp
, off_t length
, int flags
, int truncateflags
, vfs_context_t context
)
3664 register struct cnode
*cp
= VTOC(vp
);
3665 struct filefork
*fp
= VTOF(vp
);
3666 kauth_cred_t cred
= vfs_context_ucred(context
);
3669 off_t actualBytesAdded
;
3671 u_int32_t fileblocks
;
3673 struct hfsmount
*hfsmp
;
3675 int suppress_times
= (truncateflags
& HFS_TRUNCATE_SKIPTIMES
);
3677 blksize
= VTOVCB(vp
)->blockSize
;
3678 fileblocks
= fp
->ff_blocks
;
3679 filebytes
= (off_t
)fileblocks
* (off_t
)blksize
;
3681 KERNEL_DEBUG(HFSDBG_TRUNCATE
| DBG_FUNC_START
,
3682 (int)length
, (int)fp
->ff_size
, (int)filebytes
, 0, 0);
3687 /* This should only happen with a corrupt filesystem */
3688 if ((off_t
)fp
->ff_size
< 0)
3691 if ((!ISHFSPLUS(VTOVCB(vp
))) && (length
> (off_t
)MAXHFSFILESIZE
))
3698 /* Files that are changing size are not hot file candidates. */
3699 if (hfsmp
->hfc_stage
== HFC_RECORDING
) {
3700 fp
->ff_bytesread
= 0;
3704 * We cannot just check if fp->ff_size == length (as an optimization)
3705 * since there may be extra physical blocks that also need truncation.
3708 if ((retval
= hfs_getinoquota(cp
)))
3713 * Lengthen the size of the file. We must ensure that the
3714 * last byte of the file is allocated. Since the smallest
3715 * value of ff_size is 0, length will be at least 1.
3717 if (length
> (off_t
)fp
->ff_size
) {
3719 retval
= hfs_chkdq(cp
, (int64_t)(roundup(length
- filebytes
, blksize
)),
3725 * If we don't have enough physical space then
3726 * we need to extend the physical size.
3728 if (length
> filebytes
) {
3730 u_int32_t blockHint
= 0;
3732 /* All or nothing and don't round up to clumpsize. */
3733 eflags
= kEFAllMask
| kEFNoClumpMask
;
3735 if (cred
&& (suser(cred
, NULL
) != 0)) {
3736 eflags
|= kEFReserveMask
; /* keep a reserve */
3740 * Allocate Journal and Quota files in metadata zone.
3742 if (filebytes
== 0 &&
3743 hfsmp
->hfs_flags
& HFS_METADATA_ZONE
&&
3744 hfs_virtualmetafile(cp
)) {
3745 eflags
|= kEFMetadataMask
;
3746 blockHint
= hfsmp
->hfs_metazone_start
;
3748 if (hfs_start_transaction(hfsmp
) != 0) {
3753 /* Protect extents b-tree and allocation bitmap */
3754 lockflags
= SFL_BITMAP
;
3755 if (overflow_extents(fp
))
3756 lockflags
|= SFL_EXTENTS
;
3757 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
3760 * Keep growing the file as long as the current EOF is
3761 * less than the desired value.
3763 while ((length
> filebytes
) && (retval
== E_NONE
)) {
3764 bytesToAdd
= length
- filebytes
;
3765 retval
= MacToVFSError(ExtendFileC(VTOVCB(vp
),
3770 &actualBytesAdded
));
3772 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)blksize
;
3773 if (actualBytesAdded
== 0 && retval
== E_NONE
) {
3774 if (length
> filebytes
)
3780 hfs_systemfile_unlock(hfsmp
, lockflags
);
3784 hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3787 hfs_end_transaction(hfsmp
);
3792 KERNEL_DEBUG(HFSDBG_TRUNCATE
| DBG_FUNC_NONE
,
3793 (int)length
, (int)fp
->ff_size
, (int)filebytes
, 0, 0);
3796 if (ISSET(flags
, IO_NOZEROFILL
)) {
3797 // An optimisation for the hibernation file
3798 if (vnode_isswap(vp
))
3799 rl_remove_all(&fp
->ff_invalidranges
);
3801 if (!vnode_issystem(vp
) && retval
== E_NONE
) {
3802 if (length
> (off_t
)fp
->ff_size
) {
3805 /* Extending the file: time to fill out the current last page w. zeroes? */
3806 if (fp
->ff_size
& PAGE_MASK_64
) {
3807 /* There might be some valid data at the start of the (current) last page
3808 of the file, so zero out the remainder of that page to ensure the
3809 entire page contains valid data. */
3811 retval
= hfs_zero_eof_page(vp
, length
);
3812 hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
);
3813 if (retval
) goto Err_Exit
;
3816 rl_add(fp
->ff_size
, length
- 1, &fp
->ff_invalidranges
);
3817 cp
->c_zftimeout
= tv
.tv_sec
+ ZFTIMELIMIT
;
3820 panic("hfs_truncate: invoked on non-UBC object?!");
3823 if (suppress_times
== 0) {
3824 cp
->c_touch_modtime
= TRUE
;
3826 fp
->ff_size
= length
;
3828 } else { /* Shorten the size of the file */
3830 // An optimisation for the hibernation file
3831 if (ISSET(flags
, IO_NOZEROFILL
) && vnode_isswap(vp
)) {
3832 rl_remove_all(&fp
->ff_invalidranges
);
3833 } else if ((off_t
)fp
->ff_size
> length
) {
3834 /* Any space previously marked as invalid is now irrelevant: */
3835 rl_remove(length
, fp
->ff_size
- 1, &fp
->ff_invalidranges
);
3839 * Account for any unmapped blocks. Note that the new
3840 * file length can still end up with unmapped blocks.
3842 if (fp
->ff_unallocblocks
> 0) {
3843 u_int32_t finalblks
;
3844 u_int32_t loanedBlocks
;
3846 hfs_lock_mount(hfsmp
);
3847 loanedBlocks
= fp
->ff_unallocblocks
;
3848 cp
->c_blocks
-= loanedBlocks
;
3849 fp
->ff_blocks
-= loanedBlocks
;
3850 fp
->ff_unallocblocks
= 0;
3852 hfsmp
->loanedBlocks
-= loanedBlocks
;
3854 finalblks
= (length
+ blksize
- 1) / blksize
;
3855 if (finalblks
> fp
->ff_blocks
) {
3856 /* calculate required unmapped blocks */
3857 loanedBlocks
= finalblks
- fp
->ff_blocks
;
3858 hfsmp
->loanedBlocks
+= loanedBlocks
;
3860 fp
->ff_unallocblocks
= loanedBlocks
;
3861 cp
->c_blocks
+= loanedBlocks
;
3862 fp
->ff_blocks
+= loanedBlocks
;
3864 hfs_unlock_mount (hfsmp
);
3867 off_t savedbytes
= ((off_t
)fp
->ff_blocks
* (off_t
)blksize
);
3868 if (hfs_start_transaction(hfsmp
) != 0) {
3873 if (fp
->ff_unallocblocks
== 0) {
3874 /* Protect extents b-tree and allocation bitmap */
3875 lockflags
= SFL_BITMAP
;
3876 if (overflow_extents(fp
))
3877 lockflags
|= SFL_EXTENTS
;
3878 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
3880 retval
= MacToVFSError(TruncateFileC(VTOVCB(vp
), (FCB
*)fp
, length
, 0,
3881 FORK_IS_RSRC (fp
), FTOC(fp
)->c_fileid
, false));
3883 hfs_systemfile_unlock(hfsmp
, lockflags
);
3887 fp
->ff_size
= length
;
3890 hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3892 hfs_end_transaction(hfsmp
);
3894 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)blksize
;
3898 /* These are bytesreleased */
3899 (void) hfs_chkdq(cp
, (int64_t)-(savedbytes
- filebytes
), NOCRED
, 0);
3903 // Unlike when growing a file, we adjust the hotfile block count here
3904 // instead of deeper down in the block allocation code because we do
3905 // not necessarily have a vnode or "fcb" at the time we're deleting
3906 // the file and so we wouldn't know if it was hotfile cached or not
3908 hfs_hotfile_adjust_blocks(vp
, (int64_t)((savedbytes
- filebytes
) / blksize
));
3912 * Only set update flag if the logical length changes & we aren't
3913 * suppressing modtime updates.
3915 if (((off_t
)fp
->ff_size
!= length
) && (suppress_times
== 0)) {
3916 cp
->c_touch_modtime
= TRUE
;
3918 fp
->ff_size
= length
;
3920 if (cp
->c_mode
& (S_ISUID
| S_ISGID
)) {
3921 if (!vfs_context_issuser(context
))
3922 cp
->c_mode
&= ~(S_ISUID
| S_ISGID
);
3924 cp
->c_flag
|= C_MODIFIED
;
3925 cp
->c_touch_chgtime
= TRUE
; /* status changed */
3926 if (suppress_times
== 0) {
3927 cp
->c_touch_modtime
= TRUE
; /* file data was modified */
3930 * If we are not suppressing the modtime update, then
3931 * update the gen count as well.
3933 if (S_ISREG(cp
->c_attr
.ca_mode
) || S_ISLNK (cp
->c_attr
.ca_mode
)) {
3934 hfs_incr_gencount(cp
);
3938 retval
= hfs_update(vp
, 0);
3940 KERNEL_DEBUG(HFSDBG_TRUNCATE
| DBG_FUNC_NONE
,
3941 -1, -1, -1, retval
, 0);
3946 KERNEL_DEBUG(HFSDBG_TRUNCATE
| DBG_FUNC_END
,
3947 (int)length
, (int)fp
->ff_size
, (int)filebytes
, retval
, 0);
/*
 * Preparation which must be done prior to deleting the catalog record
 * of a file or directory.  In order to make the on-disk as safe as possible,
 * we remove the catalog entry before releasing the bitmap blocks and the
 * overflow extent records.  However, some work must be done prior to deleting
 * the catalog record.
 *
 * When calling this function, the cnode must exist both in memory and on-disk.
 * If there are both resource fork and data fork vnodes, this function should
 * be called on both.
 */
int
hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {

	struct filefork *fp = VTOF(vp);
	struct cnode *cp = VTOC(vp);
#if QUOTA
	int retval = 0;
#endif /* QUOTA */

	/* Cannot truncate an HFS directory! */
	if (vnode_isdir(vp)) {
		return (EISDIR);
	}

	/*
	 * See the comment below in hfs_truncate for why we need to call
	 * setsize here.  Essentially we want to avoid pending IO if we
	 * already know that the blocks are going to be released here.
	 * This function is only called when totally removing all storage for a file, so
	 * we can take a shortcut and immediately setsize (0);
	 */
	hfs_ubc_setsize(vp, 0, false);

	/* This should only happen with a corrupt filesystem */
	if ((off_t)fp->ff_size < 0)
		return (EINVAL);

	/*
	 * We cannot just check if fp->ff_size == length (as an optimization)
	 * since there may be extra physical blocks that also need truncation.
	 */
#if QUOTA
	if ((retval = hfs_getinoquota(cp))) {
		return (retval);
	}
#endif /* QUOTA */

	/* Wipe out any invalid ranges which have yet to be backed by disk */
	rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);

	/*
	 * Account for any unmapped blocks. Since we're deleting the
	 * entire file, we don't have to worry about just shrinking
	 * to a smaller number of borrowed blocks.
	 */
	if (fp->ff_unallocblocks > 0) {
		u_int32_t loanedBlocks;

		hfs_lock_mount (hfsmp);
		loanedBlocks = fp->ff_unallocblocks;
		cp->c_blocks -= loanedBlocks;
		fp->ff_blocks -= loanedBlocks;
		fp->ff_unallocblocks = 0;

		hfsmp->loanedBlocks -= loanedBlocks;

		hfs_unlock_mount (hfsmp);
	}

	return 0;
}
4028 * Special wrapper around calling TruncateFileC. This function is useable
4029 * even when the catalog record does not exist any longer, making it ideal
4030 * for use when deleting a file. The simplification here is that we know
4031 * that we are releasing all blocks.
4033 * Note that this function may be called when there is no vnode backing
4034 * the file fork in question. We may call this from hfs_vnop_inactive
4035 * to clear out resource fork data (and may not want to clear out the data
4036 * fork yet). As a result, we pointer-check both sets of inputs before
4037 * doing anything with them.
4039 * The caller is responsible for saving off a copy of the filefork(s)
4040 * embedded within the cnode prior to calling this function. The pointers
4041 * supplied as arguments must be valid even if the cnode is no longer valid.
4045 hfs_release_storage (struct hfsmount
*hfsmp
, struct filefork
*datafork
,
4046 struct filefork
*rsrcfork
, u_int32_t fileid
) {
4049 u_int32_t fileblocks
;
4054 blksize
= hfsmp
->blockSize
;
4058 off_t prev_filebytes
;
4060 datafork
->ff_size
= 0;
4062 fileblocks
= datafork
->ff_blocks
;
4063 filebytes
= (off_t
)fileblocks
* (off_t
)blksize
;
4064 prev_filebytes
= filebytes
;
4066 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
4068 while (filebytes
> 0) {
4069 if (filebytes
> HFS_BIGFILE_SIZE
) {
4070 filebytes
-= HFS_BIGFILE_SIZE
;
4075 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
4076 if (hfs_start_transaction(hfsmp
) != 0) {
4081 if (datafork
->ff_unallocblocks
== 0) {
4082 /* Protect extents b-tree and allocation bitmap */
4083 lockflags
= SFL_BITMAP
;
4084 if (overflow_extents(datafork
))
4085 lockflags
|= SFL_EXTENTS
;
4086 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
4088 error
= MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp
), datafork
, filebytes
, 1, 0, fileid
, false));
4090 hfs_systemfile_unlock(hfsmp
, lockflags
);
4092 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
4094 struct cnode
*cp
= datafork
? FTOC(datafork
) : NULL
;
4096 vp
= cp
? CTOV(cp
, 0) : NULL
;
4097 hfs_hotfile_adjust_blocks(vp
, (int64_t)((prev_filebytes
- filebytes
) / blksize
));
4098 prev_filebytes
= filebytes
;
4100 /* Finish the transaction and start over if necessary */
4101 hfs_end_transaction(hfsmp
);
4110 if (error
== 0 && rsrcfork
) {
4111 rsrcfork
->ff_size
= 0;
4113 fileblocks
= rsrcfork
->ff_blocks
;
4114 filebytes
= (off_t
)fileblocks
* (off_t
)blksize
;
4116 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
4118 while (filebytes
> 0) {
4119 if (filebytes
> HFS_BIGFILE_SIZE
) {
4120 filebytes
-= HFS_BIGFILE_SIZE
;
4125 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
4126 if (hfs_start_transaction(hfsmp
) != 0) {
4131 if (rsrcfork
->ff_unallocblocks
== 0) {
4132 /* Protect extents b-tree and allocation bitmap */
4133 lockflags
= SFL_BITMAP
;
4134 if (overflow_extents(rsrcfork
))
4135 lockflags
|= SFL_EXTENTS
;
4136 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
4138 error
= MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp
), rsrcfork
, filebytes
, 1, 1, fileid
, false));
4140 hfs_systemfile_unlock(hfsmp
, lockflags
);
4142 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
4144 /* Finish the transaction and start over if necessary */
4145 hfs_end_transaction(hfsmp
);
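/*
 * hfs_ubc_setsize
 *
 * Thin wrapper around ubc_setsize_ex().  When the caller already holds the
 * cnode lock, the first attempt is made with UBC_SETSIZE_NO_FS_REENTRY; if
 * that would require re-entering the filesystem (EAGAIN), the cnode lock is
 * dropped around a plain ubc_setsize_ex() call and then re-taken.  ENOENT
 * from the UBC (e.g. symlinks, which carry no UBC info) is treated as
 * success.
 */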
errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock)
{
	errno_t error;

	/*
	 * Call ubc_setsize to give the VM subsystem a chance to do
	 * whatever it needs to with existing pages before we delete
	 * blocks.  Note that symlinks don't use the UBC so we'll
	 * get back ENOENT in that case.
	 */
	if (have_cnode_lock) {
		error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY);
		if (error == EAGAIN) {
			cnode_t *cp = VTOC(vp);

			if (cp->c_truncatelockowner != current_thread())
				hfs_warn("hfs: hfs_ubc_setsize called without exclusive truncate lock!");

			hfs_unlock(cp);
			error = ubc_setsize_ex(vp, len, 0);
			hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
		}
	} else {
		error = ubc_setsize_ex(vp, len, 0);
	}

	return error == ENOENT ? 0 : error;
}
4185 * Truncate a cnode to at most length size, freeing (or adding) the
4189 hfs_truncate(struct vnode
*vp
, off_t length
, int flags
,
4190 int truncateflags
, vfs_context_t context
)
4192 struct filefork
*fp
= VTOF(vp
);
4194 u_int32_t fileblocks
;
4197 struct cnode
*cp
= VTOC(vp
);
4198 hfsmount_t
*hfsmp
= VTOHFS(vp
);
4200 /* Cannot truncate an HFS directory! */
4201 if (vnode_isdir(vp
)) {
4204 /* A swap file cannot change size. */
4205 if (vnode_isswap(vp
) && length
&& !ISSET(flags
, IO_NOAUTH
)) {
4209 blksize
= hfsmp
->blockSize
;
4210 fileblocks
= fp
->ff_blocks
;
4211 filebytes
= (off_t
)fileblocks
* (off_t
)blksize
;
4213 bool caller_has_cnode_lock
= (cp
->c_lockowner
== current_thread());
4215 error
= hfs_ubc_setsize(vp
, length
, caller_has_cnode_lock
);
4219 if (!caller_has_cnode_lock
) {
4220 error
= hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
4225 if (vnode_islnk(vp
) && cp
->c_datafork
->ff_symlinkptr
) {
4226 hfs_free(cp
->c_datafork
->ff_symlinkptr
, cp
->c_datafork
->ff_size
);
4227 cp
->c_datafork
->ff_symlinkptr
= NULL
;
4230 // have to loop truncating or growing files that are
4231 // really big because otherwise transactions can get
4232 // enormous and consume too many kernel resources.
4234 if (length
< filebytes
) {
4235 while (filebytes
> length
) {
4236 if ((filebytes
- length
) > HFS_BIGFILE_SIZE
) {
4237 filebytes
-= HFS_BIGFILE_SIZE
;
4241 error
= do_hfs_truncate(vp
, filebytes
, flags
, truncateflags
, context
);
4245 } else if (length
> filebytes
) {
4246 kauth_cred_t cred
= vfs_context_ucred(context
);
4247 const bool keep_reserve
= cred
&& suser(cred
, NULL
) != 0;
4249 if (hfs_freeblks(hfsmp
, keep_reserve
)
4250 < howmany(length
- filebytes
, blksize
)) {
4253 while (filebytes
< length
) {
4254 if ((length
- filebytes
) > HFS_BIGFILE_SIZE
) {
4255 filebytes
+= HFS_BIGFILE_SIZE
;
4259 error
= do_hfs_truncate(vp
, filebytes
, flags
, truncateflags
, context
);
4264 } else /* Same logical size */ {
4266 error
= do_hfs_truncate(vp
, length
, flags
, truncateflags
, context
);
4268 /* Files that are changing size are not hot file candidates. */
4269 if (VTOHFS(vp
)->hfc_stage
== HFC_RECORDING
) {
4270 fp
->ff_bytesread
= 0;
4273 #if HFS_CONFIG_KEY_ROLL
4274 if (!error
&& cp
->c_truncatelockowner
== current_thread()) {
4275 hfs_key_roll_check(cp
, true);
4279 if (!caller_has_cnode_lock
)
4282 // Make sure UBC's size matches up (in case we didn't completely succeed)
4283 errno_t err2
= hfs_ubc_setsize(vp
, fp
->ff_size
, caller_has_cnode_lock
);
4292 * Preallocate file storage space.
4295 hfs_vnop_allocate(struct vnop_allocate_args
/* {
4299 off_t *a_bytesallocated;
4301 vfs_context_t a_context;
4304 struct vnode
*vp
= ap
->a_vp
;
4306 struct filefork
*fp
;
4308 off_t length
= ap
->a_length
;
4310 off_t moreBytesRequested
;
4311 off_t actualBytesAdded
;
4313 u_int32_t fileblocks
;
4314 int retval
, retval2
;
4315 u_int32_t blockHint
;
4316 u_int32_t extendFlags
; /* For call to ExtendFileC */
4317 struct hfsmount
*hfsmp
;
4318 kauth_cred_t cred
= vfs_context_ucred(ap
->a_context
);
4322 *(ap
->a_bytesallocated
) = 0;
4324 if (!vnode_isreg(vp
))
4326 if (length
< (off_t
)0)
4331 orig_ctime
= VTOC(vp
)->c_ctime
;
4333 nspace_snapshot_event(vp
, orig_ctime
, ap
->a_length
== 0 ? NAMESPACE_HANDLER_TRUNCATE_OP
|NAMESPACE_HANDLER_DELETE_OP
: NAMESPACE_HANDLER_TRUNCATE_OP
, NULL
);
4335 hfs_lock_truncate(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
4337 if ((retval
= hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
))) {
4338 hfs_unlock_truncate(cp
, HFS_LOCK_DEFAULT
);
4346 fileblocks
= fp
->ff_blocks
;
4347 filebytes
= (off_t
)fileblocks
* (off_t
)vcb
->blockSize
;
4349 if ((ap
->a_flags
& ALLOCATEFROMVOL
) && (length
< filebytes
)) {
4354 /* Fill in the flags word for the call to Extend the file */
4356 extendFlags
= kEFNoClumpMask
;
4357 if (ap
->a_flags
& ALLOCATECONTIG
)
4358 extendFlags
|= kEFContigMask
;
4359 if (ap
->a_flags
& ALLOCATEALL
)
4360 extendFlags
|= kEFAllMask
;
4361 if (cred
&& suser(cred
, NULL
) != 0)
4362 extendFlags
|= kEFReserveMask
;
4363 if (hfs_virtualmetafile(cp
))
4364 extendFlags
|= kEFMetadataMask
;
4368 startingPEOF
= filebytes
;
4370 if (ap
->a_flags
& ALLOCATEFROMPEOF
)
4371 length
+= filebytes
;
4372 else if (ap
->a_flags
& ALLOCATEFROMVOL
)
4373 blockHint
= ap
->a_offset
/ VTOVCB(vp
)->blockSize
;
4375 /* If no changes are necesary, then we're done */
4376 if (filebytes
== length
)
4380 * Lengthen the size of the file. We must ensure that the
4381 * last byte of the file is allocated. Since the smallest
4382 * value of filebytes is 0, length will be at least 1.
4384 if (length
> filebytes
) {
4385 if (ISSET(extendFlags
, kEFAllMask
)
4386 && (hfs_freeblks(hfsmp
, ISSET(extendFlags
, kEFReserveMask
))
4387 < howmany(length
- filebytes
, hfsmp
->blockSize
))) {
4392 off_t total_bytes_added
= 0, orig_request_size
;
4394 orig_request_size
= moreBytesRequested
= length
- filebytes
;
4397 retval
= hfs_chkdq(cp
,
4398 (int64_t)(roundup(moreBytesRequested
, vcb
->blockSize
)),
4405 * Metadata zone checks.
4407 if (hfsmp
->hfs_flags
& HFS_METADATA_ZONE
) {
4409 * Allocate Journal and Quota files in metadata zone.
4411 if (hfs_virtualmetafile(cp
)) {
4412 blockHint
= hfsmp
->hfs_metazone_start
;
4413 } else if ((blockHint
>= hfsmp
->hfs_metazone_start
) &&
4414 (blockHint
<= hfsmp
->hfs_metazone_end
)) {
4416 * Move blockHint outside metadata zone.
4418 blockHint
= hfsmp
->hfs_metazone_end
+ 1;
4423 while ((length
> filebytes
) && (retval
== E_NONE
)) {
4424 off_t bytesRequested
;
4426 if (hfs_start_transaction(hfsmp
) != 0) {
4431 /* Protect extents b-tree and allocation bitmap */
4432 lockflags
= SFL_BITMAP
;
4433 if (overflow_extents(fp
))
4434 lockflags
|= SFL_EXTENTS
;
4435 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
4437 if (moreBytesRequested
>= HFS_BIGFILE_SIZE
) {
4438 bytesRequested
= HFS_BIGFILE_SIZE
;
4440 bytesRequested
= moreBytesRequested
;
4443 if (extendFlags
& kEFContigMask
) {
4444 // if we're on a sparse device, this will force it to do a
4445 // full scan to find the space needed.
4446 hfsmp
->hfs_flags
&= ~HFS_DID_CONTIG_SCAN
;
4449 retval
= MacToVFSError(ExtendFileC(vcb
,
4454 &actualBytesAdded
));
4456 if (retval
== E_NONE
) {
4457 *(ap
->a_bytesallocated
) += actualBytesAdded
;
4458 total_bytes_added
+= actualBytesAdded
;
4459 moreBytesRequested
-= actualBytesAdded
;
4460 if (blockHint
!= 0) {
4461 blockHint
+= actualBytesAdded
/ vcb
->blockSize
;
4464 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)vcb
->blockSize
;
4466 hfs_systemfile_unlock(hfsmp
, lockflags
);
4469 (void) hfs_update(vp
, 0);
4470 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
4473 hfs_end_transaction(hfsmp
);
4478 * if we get an error and no changes were made then exit
4479 * otherwise we must do the hfs_update to reflect the changes
4481 if (retval
&& (startingPEOF
== filebytes
))
4485 * Adjust actualBytesAdded to be allocation block aligned, not
4486 * clump size aligned.
4487 * NOTE: So what we are reporting does not affect reality
4488 * until the file is closed, when we truncate the file to allocation
4491 if (total_bytes_added
!= 0 && orig_request_size
< total_bytes_added
)
4492 *(ap
->a_bytesallocated
) =
4493 roundup(orig_request_size
, (off_t
)vcb
->blockSize
);
4495 } else { /* Shorten the size of the file */
4498 * N.B. At present, this code is never called. If and when we
4499 * do start using it, it looks like there might be slightly
4500 * strange semantics with the file size: it's possible for the
4501 * file size to *increase* e.g. if current file size is 5,
4502 * length is 1024 and filebytes is 4096, the file size will
4503 * end up being 1024 bytes. This isn't necessarily a problem
4504 * but it's not consistent with the code above which doesn't
4505 * change the file size.
4508 retval
= hfs_truncate(vp
, length
, 0, 0, ap
->a_context
);
4509 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)vcb
->blockSize
;
4512 * if we get an error and no changes were made then exit
4513 * otherwise we must do the hfs_update to reflect the changes
4515 if (retval
&& (startingPEOF
== filebytes
)) goto Err_Exit
;
4517 /* These are bytesreleased */
4518 (void) hfs_chkdq(cp
, (int64_t)-((startingPEOF
- filebytes
)), NOCRED
,0);
4521 if (fp
->ff_size
> filebytes
) {
4522 fp
->ff_size
= filebytes
;
4524 hfs_ubc_setsize(vp
, fp
->ff_size
, true);
4529 cp
->c_flag
|= C_MODIFIED
;
4530 cp
->c_touch_chgtime
= TRUE
;
4531 cp
->c_touch_modtime
= TRUE
;
4532 retval2
= hfs_update(vp
, 0);
4537 hfs_unlock_truncate(cp
, HFS_LOCK_DEFAULT
);
/*
 * Pagein for HFS filesystem
 */
int
hfs_vnop_pagein(struct vnop_pagein_args *ap)
/*
	struct vnop_pagein_args {
		vnode_t       a_vp,
		upl_t         a_pl,
		vm_offset_t   a_pl_offset,
		off_t         a_f_offset,
		size_t        a_size,
		int           a_flags,
		vfs_context_t a_context;
	};
*/
{
	vnode_t vp;
	struct cnode *cp;
	struct filefork *fp;
	int error = 0;
	upl_t upl;
	upl_page_info_t *pl;
	off_t f_offset;
	off_t page_needed_f_offset;
	int pg_index;
	int offset;
	int isize;
	int upl_size;
	boolean_t truncate_lock_held = FALSE;
	boolean_t file_converted = FALSE;
	kern_return_t kret;

	vp = ap->a_vp;
	cp = VTOC(vp);
	fp = VTOF(vp);

#if CONFIG_PROTECT
	if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
		/*
		 * If we errored here, then this means that one of two things occurred:
		 * 1. there was a problem with the decryption of the key.
		 * 2. the device is locked and we are not allowed to access this particular file.
		 *
		 * Either way, this means that we need to shut down this upl now.  As long as
		 * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves)
		 * then we create a upl and immediately abort it.
		 */
		if (ap->a_pl == NULL) {
			/* create the upl */
			ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl,
			               UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
			/* mark the range as needed so it doesn't immediately get discarded upon abort */
			ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);

			/* Abort the range */
			ubc_upl_abort_range(upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
		}
		return error;
	}
#endif /* CONFIG_PROTECT */

	if (ap->a_pl != NULL) {
		/*
		 * this can only happen for swap files now that
		 * we're asking for V2 paging behavior...
		 * so don't need to worry about decompression, or
		 * keeping track of blocks read or taking the truncate lock
		 */
		error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
		                       ap->a_size, (off_t)fp->ff_size, ap->a_flags);
		goto pagein_done;
	}

	page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset;

retry_pagein:
	/*
	 * take truncate lock (shared/recursive) to guard against
	 * zero-fill thru fsync interfering, but only for v2
	 *
	 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
	 * lock shared and we are allowed to recurse 1 level if this thread already
	 * owns the lock exclusively... this can legally occur
	 * if we are doing a shrinking ftruncate against a file
	 * that is mapped private, and the pages being truncated
	 * do not currently exist in the cache... in that case
	 * we will have to page-in the missing pages in order
	 * to provide them to the private mapping... we must
	 * also call hfs_unlock_truncate with a positive been_recursed
	 * arg to indicate that if we have recursed, there is no need to drop
	 * the lock. Allowing this simple recursion is necessary
	 * in order to avoid a certain deadlock... since the ftruncate
	 * already holds the truncate lock exclusively, if we try
	 * to acquire it shared to protect the pagein path, we will
	 * deadlock.
	 *
	 * NOTE: The if () block below is a workaround in order to prevent a
	 * VM deadlock. See rdar://7853471.
	 *
	 * If we are in a forced unmount, then launchd will still have the
	 * dyld_shared_cache file mapped as it is trying to reboot.  If we
	 * take the truncate lock here to service a page fault, then our
	 * thread could deadlock with the forced-unmount.  The forced unmount
	 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
	 * marked C_DELETED, it will call ubc_setsize(0).  As a result, the unmount
	 * thread will think it needs to copy all of the data out of the file
	 * and into a VM copy object.  If we hold the cnode lock here, then that
	 * VM operation will not be able to proceed, because we'll set a busy page
	 * before attempting to grab the lock.  Note that this isn't as simple as "don't
	 * call ubc_setsize" because doing that would just shift the problem to the
	 * ubc_msync done before the vnode is reclaimed.
	 *
	 * So, if a forced unmount on this volume is in flight AND the cnode is
	 * marked C_DELETED, then just go ahead and do the page in without taking
	 * the lock (thus suspending pagein_v2 semantics temporarily).  Since it's on a file
	 * that is not going to be available on the next mount, this seems like an
	 * OK solution from a correctness point of view, even though it is hacky.
	 */
	if (vfs_isforce(vnode_mount(vp))) {
		if (cp->c_flag & C_DELETED) {
			/* If we don't get it, then just go ahead and operate without the lock */
			truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
		}
	} else {
		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
		truncate_lock_held = TRUE;
	}

	kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);

	if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
		error = EINVAL;
		goto pagein_done;
	}
	ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);

	upl_size = isize = ap->a_size;

	/*
	 * Scan from the back to find the last page in the UPL, so that we
	 * aren't looking at a UPL that may have already been freed by the
	 * preceding aborts/completions.
	 */
	for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
		if (upl_page_present(pl, --pg_index))
			break;
		if (pg_index == 0) {
			/*
			 * no absent pages were found in the range specified
			 * just abort the UPL to get rid of it and then we're done
			 */
			ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
			goto pagein_done;
		}
	}

	/*
	 * initialize the offset variables before we touch the UPL.
	 * f_offset is the position into the file, in bytes
	 * offset is the position into the UPL, in bytes
	 * pg_index is the pg# of the UPL we're operating on
	 * isize is the offset into the UPL of the last page that is present.
	 */
	isize = ((pg_index + 1) * PAGE_SIZE);
	pg_index = 0;
	offset = 0;
	f_offset = ap->a_f_offset;

	while (isize) {
		int xsize;
		int num_of_pages;

		if ( !upl_page_present(pl, pg_index)) {
			/*
			 * we asked for RET_ONLY_ABSENT, so it's possible
			 * to get back empty slots in the UPL.
			 * just skip over them
			 */
			f_offset += PAGE_SIZE;
			offset   += PAGE_SIZE;
			isize    -= PAGE_SIZE;
			pg_index++;

			continue;
		}
		/*
		 * We know that we have at least one absent page.
		 * Now checking to see how many in a row we have
		 */
		num_of_pages = 1;
		xsize = isize - PAGE_SIZE;

		while (xsize) {
			if ( !upl_page_present(pl, pg_index + num_of_pages))
				break;
			num_of_pages++;
			xsize -= PAGE_SIZE;
		}
		xsize = num_of_pages * PAGE_SIZE;

		if (VNODE_IS_RSRC(vp)) {
			/* allow pageins of the resource fork */
		} else {
			int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */

			if (compressed) {
				if (truncate_lock_held) {
					/*
					 * can't hold the truncate lock when calling into the decmpfs layer
					 * since it calls back into this layer... even though we're only
					 * holding the lock in shared mode, and the re-entrant path only
					 * takes the lock shared, we can deadlock if some other thread
					 * tries to grab the lock exclusively in between.
					 */
					hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
					truncate_lock_held = FALSE;
				}
				ap->a_pl = upl;
				ap->a_pl_offset = offset;
				ap->a_f_offset = f_offset;
				ap->a_size = xsize;

				error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
				/*
				 * note that decmpfs_pagein_compressed can change the state of
				 * 'compressed'... it will set it to 0 if the file is no longer
				 * compressed once the compression lock is successfully taken
				 * i.e. we would block on that lock while the file is being inflated
				 */
				if (error == 0 && vnode_isfastdevicecandidate(vp)) {
					(void) hfs_addhotfile(vp);
				}
				if (compressed) {
					if (error == 0) {
						/* successful page-in, update the access time */
						VTOC(vp)->c_touch_acctime = TRUE;

						//
						// compressed files are not traditional hot file candidates
						// but they may be for CF (which ignores the ff_bytesread
						// field)
						//
						if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
							fp->ff_bytesread = 0;
						}
					} else if (error == EAGAIN) {
						/*
						 * EAGAIN indicates someone else already holds the compression lock...
						 * to avoid deadlocking, we'll abort this range of pages with an
						 * indication that the pagein needs to be redriven
						 */
						ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
					} else if (error == ENOSPC) {

						if (upl_size == PAGE_SIZE)
							panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n");

						ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);

						ap->a_size = PAGE_SIZE;
						ap->a_pl = NULL;
						ap->a_pl_offset = 0;
						ap->a_f_offset = page_needed_f_offset;

						goto retry_pagein;
					} else {
						ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
						goto pagein_done;
					}
					goto pagein_next_range;
				} else {
					/*
					 * Set file_converted only if the file became decompressed while we were
					 * paging in.  If it were still compressed, we would re-start the loop using the goto
					 * in the above block.  This avoids overloading truncate_lock_held as our retry_pagein
					 * condition below, since we could have avoided taking the truncate lock to prevent
					 * a deadlock in the force unmount case.
					 */
					file_converted = TRUE;
				}
			}
			if (file_converted == TRUE) {
				/*
				 * the file was converted back to a regular file after we first saw it as compressed
				 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
				 * reset a_size so that we consider what remains of the original request
				 * and null out a_upl and a_pl_offset.
				 *
				 * We should only be able to get into this block if the decmpfs_pagein_compressed
				 * successfully decompressed the range in question for this file.
				 */
				ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);

				ap->a_size = isize;
				ap->a_pl = NULL;
				ap->a_pl_offset = 0;

				/* Reset file_converted back to false so that we don't infinite-loop. */
				file_converted = FALSE;
				goto retry_pagein;
			}
		}

		error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);

		/*
		 * Keep track of blocks read.
		 */
		if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
			int bytesread;
			int took_cnode_lock = 0;

			if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
				bytesread = fp->ff_size;
			else
				bytesread = xsize;

			/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
			if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
				hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
				took_cnode_lock = 1;
			}
			/*
			 * If this file hasn't been seen since the start of
			 * the current sampling period then start over.
			 */
			if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
				struct timeval tv;

				fp->ff_bytesread = bytesread;
				microtime(&tv);
				cp->c_atime = tv.tv_sec;
			} else {
				fp->ff_bytesread += bytesread;
			}
			cp->c_touch_acctime = TRUE;

			if (vnode_isfastdevicecandidate(vp)) {
				(void) hfs_addhotfile(vp);
			}
			if (took_cnode_lock)
				hfs_unlock(cp);
		}
pagein_next_range:
		f_offset += xsize;
		offset   += xsize;
		isize    -= xsize;
		pg_index += num_of_pages;

		error = 0;
	}

pagein_done:
	if (truncate_lock_held == TRUE) {
		/* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
		hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
	}

	return (error);
}
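
/*
 * Illustrative sketch (never compiled) restating the lock-avoidance pattern
 * used by hfs_vnop_pagein() above for the forced-unmount case: only a
 * try-lock is attempted when the cnode is already marked for deletion, so a
 * page-in never blocks behind the unmount.  The helper name below is
 * hypothetical; it is a simplified restatement, not an alternate
 * implementation.
 */
#if 0
static boolean_t
example_pagein_trunclock(struct cnode *cp, vnode_t vp)
{
	boolean_t have_lock = FALSE;

	if (vfs_isforce(vnode_mount(vp)) && (cp->c_flag & C_DELETED)) {
		/* best effort only: never block behind a forced unmount */
		have_lock = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
	} else {
		/* normal V2 page-in path: shared truncate lock, recursion tolerated */
		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
		have_lock = TRUE;
	}
	return have_lock;
}
#endif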
/*
 * Pageout for HFS filesystem.
 */
int
hfs_vnop_pageout(struct vnop_pageout_args *ap)
/*
	struct vnop_pageout_args {
		vnode_t       a_vp,
		upl_t         a_pl,
		vm_offset_t   a_pl_offset,
		off_t         a_f_offset,
		size_t        a_size,
		int           a_flags,
		vfs_context_t a_context;
	};
*/
{
	vnode_t vp = ap->a_vp;
	struct cnode *cp;
	struct filefork *fp;
	int retval = 0;
	off_t filesize;
	upl_t upl;
	upl_page_info_t *pl = NULL;
	vm_offset_t a_pl_offset;
	int a_flags;
	int is_pageoutv2 = 0;
	kern_return_t kret;

	cp = VTOC(vp);
	fp = VTOF(vp);

	a_flags = ap->a_flags;
	a_pl_offset = ap->a_pl_offset;

	/*
	 * we can tell if we're getting the new or old behavior from the UPL
	 */
	if ((upl = ap->a_pl) == NULL) {
		int request_flags;

		is_pageoutv2 = 1;
		/*
		 * we're in control of any UPL we commit
		 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
		 */
		a_flags &= ~UPL_NOCOMMIT;
		a_pl_offset = 0;

		/*
		 * For V2 semantics, we want to take the cnode truncate lock
		 * shared to guard against the file size changing via zero-filling.
		 *
		 * However, we have to be careful because we may be invoked
		 * via the ubc_msync path to write out dirty mmap'd pages
		 * in response to a lock event on a content-protected
		 * filesystem (e.g. to write out class A files).
		 * As a result, we want to take the truncate lock 'SHARED' with
		 * the mini-recursion locktype so that we don't deadlock/panic
		 * because we may already be holding the truncate lock exclusive to force any other
		 * IOs to have blocked behind us.
		 */
		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);

		if (a_flags & UPL_MSYNC) {
			request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
		} else {
			request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
		}

		kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);

		if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
			retval = EINVAL;
			goto pageout_done;
		}
	}
	/*
	 * from this point forward upl points at the UPL we're working with
	 * it was either passed in or we successfully created it
	 */

	/*
	 * Figure out where the file ends, for pageout purposes.  If
	 * ff_new_size > ff_size, then we're in the middle of extending the
	 * file via a write, so it is safe (and necessary) that we be able
	 * to pageout up to that point.
	 */
	filesize = fp->ff_size;
	if (fp->ff_new_size > filesize)
		filesize = fp->ff_new_size;

	/*
	 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
	 * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
	 * scanning for dirty ranges.  We'll issue our own N cluster_pageout calls, for
	 * N dirty ranges in the UPL.  Note that this is almost a direct copy of the
	 * logic in vnode_pageout except that we need to do it after grabbing the truncate
	 * lock in HFS so that we don't lock invert ourselves.
	 *
	 * Note that we can still get into this function on behalf of the default pager with
	 * non-V2 behavior (swapfiles).  However in that case, we did not grab locks above
	 * since fsync and other writing threads will grab the locks, then mark the
	 * relevant pages as busy.  But the pageout codepath marks the pages as busy,
	 * and THEN would attempt to grab the truncate lock, which would result in deadlock.  So
	 * we do not try to grab anything for the pre-V2 case, which should only be accessed
	 * by the paging/VM system.
	 */
	if (is_pageoutv2) {
		off_t f_offset;
		int offset;
		int isize;
		int pg_index;
		int error;
		int error_ret = 0;

		isize = ap->a_size;
		f_offset = ap->a_f_offset;

		/*
		 * Scan from the back to find the last page in the UPL, so that we
		 * aren't looking at a UPL that may have already been freed by the
		 * preceding aborts/completions.
		 */
		for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
			if (upl_page_present(pl, --pg_index))
				break;
			if (pg_index == 0) {
				ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
				goto pageout_done;
			}
		}

		/*
		 * initialize the offset variables before we touch the UPL.
		 * a_f_offset is the position into the file, in bytes
		 * offset is the position into the UPL, in bytes
		 * pg_index is the pg# of the UPL we're operating on.
		 * isize is the offset into the UPL of the last non-clean page.
		 */
		isize = ((pg_index + 1) * PAGE_SIZE);

		offset = 0;
		pg_index = 0;

		while (isize) {
			int xsize;
			int num_of_pages;

			if ( !upl_page_present(pl, pg_index)) {
				/*
				 * we asked for RET_ONLY_DIRTY, so it's possible
				 * to get back empty slots in the UPL.
				 * just skip over them
				 */
				f_offset += PAGE_SIZE;
				offset   += PAGE_SIZE;
				isize    -= PAGE_SIZE;
				pg_index++;

				continue;
			}
			if ( !upl_dirty_page(pl, pg_index)) {
				panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
			}

			/*
			 * We know that we have at least one dirty page.
			 * Now checking to see how many in a row we have
			 */
			num_of_pages = 1;
			xsize = isize - PAGE_SIZE;

			while (xsize) {
				if ( !upl_dirty_page(pl, pg_index + num_of_pages))
					break;
				num_of_pages++;
				xsize -= PAGE_SIZE;
			}
			xsize = num_of_pages * PAGE_SIZE;

			if ((error = cluster_pageout(vp, upl, offset, f_offset,
			                             xsize, filesize, a_flags))) {
				if (error_ret == 0)
					error_ret = error;
			}
			f_offset += xsize;
			offset   += xsize;
			isize    -= xsize;
			pg_index += num_of_pages;
		}
		/* capture errnos bubbled out of cluster_pageout if they occurred */
		if (error_ret != 0) {
			retval = error_ret;
		}
	} /* end block for v2 pageout behavior */
	else {
		/*
		 * just call cluster_pageout for old pre-v2 behavior
		 */
		retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
		                         ap->a_size, filesize, a_flags);
	}

	/*
	 * If data was written, update the modification time of the file
	 * but only if it's mapped writable; we will have touched the
	 * modification time for direct writes.
	 */
	if (retval == 0 && (ubc_is_mapped_writable(vp)
	                    || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) {
		hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);

		// Check again with lock
		bool mapped_writable = ubc_is_mapped_writable(vp);
		if (mapped_writable
		    || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) {
			cp->c_touch_modtime = TRUE;
			cp->c_touch_chgtime = TRUE;

			/*
			 * We only need to increment the generation counter if
			 * it's currently mapped writable because we incremented
			 * the counter in hfs_vnop_mnomap.
			 */
			if (mapped_writable)
				hfs_incr_gencount(VTOC(vp));

			/*
			 * If setuid or setgid bits are set and this process is
			 * not the superuser then clear the setuid and setgid bits
			 * as a precaution against tampering.
			 */
			if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
			    (vfs_context_suser(ap->a_context) != 0)) {
				cp->c_mode &= ~(S_ISUID | S_ISGID);
			}
		}

		hfs_unlock(cp);
	}

pageout_done:
	if (is_pageoutv2) {
		/*
		 * Release the truncate lock.  Note that because
		 * we may have taken the lock recursively by
		 * being invoked via ubc_msync due to lockdown,
		 * we should release it recursively, too.
		 */
		hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
	}
	return (retval);
}
/*
 * Intercept B-Tree node writes to unswap them if necessary.
 */
int
hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
{
	int retval = 0;
	register struct buf *bp = ap->a_bp;
	register struct vnode *vp = buf_vnode(bp);
	BlockDescriptor block;

	/* Trap B-Tree writes */
	if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
	    (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
	    (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
	    (vp == VTOHFS(vp)->hfc_filevp)) {

		/*
		 * Swap and validate the node if it is in native byte order.
		 * This is always true on big endian, so we always validate
		 * before writing here.  On little endian, the node typically has
		 * been swapped and validated when it was written to the journal,
		 * so we won't do anything here.
		 */
		if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
			/* Prepare the block pointer */
			block.blockHeader = bp;
			block.buffer = (char *)buf_dataptr(bp);
			block.blockNum = buf_lblkno(bp);
			/* not found in cache ==> came from disk */
			block.blockReadFromDisk = (buf_fromcache(bp) == 0);
			block.blockSize = buf_count(bp);

			/* Endian un-swap B-Tree node */
			retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
			if (retval)
				panic("hfs_vnop_bwrite: about to write corrupt node!\n");
		}
	}

	/* This buffer shouldn't be locked anymore but if it is clear it */
	if ((buf_flags(bp) & B_LOCKED)) {
		if (VTOHFS(vp)->jnl) {
			panic("hfs: CLEARING the lock bit on bp %p\n", bp);
		}
		buf_clearflags(bp, B_LOCKED);
	}
	retval = vn_bwrite (ap);

	return (retval);
}
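
/*
 * Note on the 0x000e test above (an interpretation offered for readability,
 * not taken from other documentation): the last u_int16_t of a B-tree node is
 * the offset-table entry for record 0, and record 0 always begins immediately
 * after the 14-byte BTNodeDescriptor.  Reading 14 (0x000e) in host byte order
 * therefore indicates the node is still in native order and must be swapped
 * to big endian before it reaches disk.
 */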
int
hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks)
{
	_dk_cs_pin_t pin;
	unsigned ioc;
	int err;

	memset(&pin, 0, sizeof(pin));
	pin.cp_extent.offset = ((uint64_t)start_block) * HFSTOVCB(hfsmp)->blockSize;
	pin.cp_extent.length = ((uint64_t)nblocks) * HFSTOVCB(hfsmp)->blockSize;
	switch (pin_state) {
	case HFS_PIN_IT:
		ioc = _DKIOCCSPINEXTENT;
		pin.cp_flags = _DKIOCCSPINTOFASTMEDIA;
		break;
	case HFS_PIN_IT | HFS_TEMP_PIN:
		ioc = _DKIOCCSPINEXTENT;
		pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSTEMPORARYPIN;
		break;
	case HFS_PIN_IT | HFS_DATALESS_PIN:
		ioc = _DKIOCCSPINEXTENT;
		pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSPINFORSWAPFILE;
		break;
	case HFS_UNPIN_IT:
		ioc = _DKIOCCSUNPINEXTENT;
		pin.cp_flags = 0;
		break;
	case HFS_UNPIN_IT | HFS_EVICT_PIN:
		ioc = _DKIOCCSPINEXTENT;
		pin.cp_flags = _DKIOCCSPINTOSLOWMEDIA;
		break;
	default:
		return EINVAL;
	}

	err = VNOP_IOCTL(hfsmp->hfs_devvp, ioc, (caddr_t)&pin, 0, vfs_context_kernel());
	return err;
}
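
/*
 * Illustrative sketch (hypothetical extent, never compiled) of pinning a
 * single run of allocation blocks to the fast tier with the helper above.
 */
#if 0
static void
example_pin_one_extent(struct hfsmount *hfsmp)
{
	uint32_t start_block = 1024;	/* hypothetical allocation block number */
	uint32_t nblocks = 8;		/* hypothetical run length, in blocks */
	int err;

	err = hfs_pin_block_range(hfsmp, HFS_PIN_IT | HFS_TEMP_PIN, start_block, nblocks);
	if (err) {
		printf("hfs: example pin failed (%d)\n", err);
	}
}
#endif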
//
// The cnode lock should already be held on entry to this function
//
int
hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned)
{
	struct filefork *fp = VTOF(vp);
	int i, err=0, need_put=0;
	struct vnode *rsrc_vp=NULL;
	uint32_t npinned = 0;

	if (num_blocks_pinned) {
		*num_blocks_pinned = 0;
	}

	if (vnode_vtype(vp) != VREG) {
		/* Not allowed to pin directories or symlinks */
		printf("hfs: can't pin vnode of type %d\n", vnode_vtype(vp));
		return (EPERM);
	}

	if (fp->ff_unallocblocks) {
		printf("hfs: can't pin a vnode w/unalloced blocks (%d)\n", fp->ff_unallocblocks);
		return (EINVAL);
	}

	/*
	 * It is possible that if the caller unlocked/re-locked the cnode after checking
	 * for C_NOEXISTS|C_DELETED that the file could have been deleted while the
	 * cnode was unlocked.  So check the condition again and return ENOENT so that
	 * the caller knows why we failed to pin the vnode.
	 */
	if (VTOC(vp)->c_flag & (C_NOEXISTS|C_DELETED)) {
		// makes no sense to pin something that's pending deletion
		return ENOENT;
	}

	if (fp->ff_blocks == 0 && (VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
		if (!VNODE_IS_RSRC(vp) && hfs_vgetrsrc(hfsmp, vp, &rsrc_vp) == 0) {
			//printf("hfs: fileid %d resource fork nblocks: %d / size: %lld\n", VTOC(vp)->c_fileid,
			//       VTOC(rsrc_vp)->c_rsrcfork->ff_blocks,VTOC(rsrc_vp)->c_rsrcfork->ff_size);

			fp = VTOC(rsrc_vp)->c_rsrcfork;
			need_put = 1;
		}
	}
	if (fp->ff_blocks == 0) {
		if (need_put) {
			//
			// use a distinct error code for a compressed file that has no resource fork;
			// we return EALREADY to indicate that the data is already probably hot file
			// cached because it's in an EA and the attributes btree is on the ssd
			//
			err = EALREADY;
		} else {
			err = EINVAL;
		}
		goto out;
	}

	for (i = 0; i < kHFSPlusExtentDensity; i++) {
		if (fp->ff_extents[i].startBlock == 0) {
			break;
		}

		err = hfs_pin_block_range(hfsmp, pin_state, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount);
		if (err) {
			break;
		}
		npinned += fp->ff_extents[i].blockCount;
	}

	if (err || npinned == 0) {
		goto out;
	}

	if (fp->ff_extents[kHFSPlusExtentDensity-1].startBlock) {
		uint32_t pblocks;
		uint8_t forktype = 0;

		if (fp == VTOC(vp)->c_rsrcfork) {
			forktype = 0xff;
		}
		/*
		 * The file could have overflow extents, better pin them.
		 *
		 * We assume that since we are holding the cnode lock for this cnode,
		 * the file's extents cannot be manipulated, but the tree could, so we
		 * need to ensure that it doesn't change behind our back as we iterate it.
		 */
		int lockflags = hfs_systemfile_lock (hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
		err = hfs_pin_overflow_extents(hfsmp, VTOC(vp)->c_fileid, forktype, &pblocks);
		hfs_systemfile_unlock (hfsmp, lockflags);

		if (err) {
			goto out;
		}
		npinned += pblocks;
	}

out:
	if (num_blocks_pinned) {
		*num_blocks_pinned = npinned;
	}

	if (need_put && rsrc_vp) {
		//
		// have to unlock the cnode since it's shared between the
		// resource fork vnode and the data fork vnode (and the
		// vnode_put() may need to re-acquire the cnode lock to
		// reclaim the resource fork vnode)
		//
		hfs_unlock(VTOC(vp));
		vnode_put(rsrc_vp);
		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
	}
	return err;
}
/*
 * Relocate a file to a new location on disk
 *  cnode must be locked on entry
 *
 * Relocation occurs by cloning the file's data from its
 * current set of blocks to a new set of blocks. During
 * the relocation all of the blocks (old and new) are
 * owned by the file.
 *
 *  -----------------     -----------------
 *  |///////////////|     |               |     STEP 1 (acquire new blocks)
 *  -----------------     -----------------
 *
 *  -----------------     -----------------
 *  |///////////////|     |///////////////|     STEP 2 (clone data)
 *  -----------------     -----------------
 *
 *                        -----------------
 *                        |///////////////|     STEP 3 (head truncate blocks)
 *                        -----------------
 *
 * During steps 2 and 3 page-outs to file offsets less
 * than or equal to N are suspended.
 *
 * During step 3 page-ins to the file get suspended.
 */
int
hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
	struct proc *p)
{
	struct cnode *cp;
	struct filefork *fp;
	struct hfsmount *hfsmp;
	u_int32_t headblks;
	u_int32_t datablks;
	u_int32_t blksize;
	u_int32_t growsize;
	u_int32_t nextallocsave;
	daddr64_t sector_a, sector_b;
	int eflags;
	off_t newbytes;
	int retval;
	int lockflags = 0;
	int took_trunc_lock = 0;
	int started_tr = 0;
	enum vtype vnodetype;

	vnodetype = vnode_vtype(vp);
	if (vnodetype != VREG) {
		/* Not allowed to move symlinks. */
		return (EPERM);
	}

	hfsmp = VTOHFS(vp);
	if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
		return (ENOSPC);
	}

	cp = VTOC(vp);
	fp = VTOF(vp);
	if (fp->ff_unallocblocks)
		return (EINVAL);

	/*
	 * <rdar://problem/9118426>
	 * Disable HFS file relocation on content-protected filesystems
	 */
	if (cp_fs_protected (hfsmp->hfs_mp)) {
		return EINVAL;
	}

	/* If it's an SSD, also disable HFS relocation */
	if (hfsmp->hfs_flags & HFS_SSD) {
		return EINVAL;
	}

	blksize = hfsmp->blockSize;
	if (blockHint == 0)
		blockHint = hfsmp->nextAllocation;

	if (fp->ff_size > 0x7fffffff) {
		return (EFBIG);
	}

	if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
		hfs_unlock(cp);
		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		/* Force lock since caller expects lock to be held. */
		if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
			return (retval);
		}
		/* No need to continue if file was removed. */
		if (cp->c_flag & C_NOEXISTS) {
			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
			return (ENOENT);
		}
		took_trunc_lock = 1;
	}
	headblks = fp->ff_blocks;
	datablks = howmany(fp->ff_size, blksize);
	growsize = datablks * blksize;
	eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
	if (blockHint >= hfsmp->hfs_metazone_start &&
	    blockHint <= hfsmp->hfs_metazone_end)
		eflags |= kEFMetadataMask;

	if (hfs_start_transaction(hfsmp) != 0) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
		return (EINVAL);
	}
	started_tr = 1;
	/*
	 * Protect the extents b-tree and the allocation bitmap
	 * during MapFileBlockC and ExtendFileC operations.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

	retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
	if (retval) {
		retval = MacToVFSError(retval);
		goto out;
	}

	/*
	 * STEP 1 - acquire new allocation blocks.
	 */
	nextallocsave = hfsmp->nextAllocation;
	retval = ExtendFileC(hfsmp, (FCB *)fp, growsize, blockHint, eflags, &newbytes);
	if (eflags & kEFMetadataMask) {
		hfs_lock_mount(hfsmp);
		HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
		MarkVCBDirty(hfsmp);
		hfs_unlock_mount(hfsmp);
	}

	retval = MacToVFSError(retval);
	if (retval == 0) {
		cp->c_flag |= C_MODIFIED;
		if (newbytes < growsize) {
			retval = ENOSPC;
			goto restore;
		} else if (fp->ff_blocks < (headblks + datablks)) {
			printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
			retval = ENOSPC;
			goto restore;
		}

		retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
		if (retval) {
			retval = MacToVFSError(retval);
		} else if ((sector_a + 1) == sector_b) {
			retval = ENOSPC;
			goto restore;
		} else if ((eflags & kEFMetadataMask) &&
		           ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
		              hfsmp->hfs_metazone_end)) {
			const char * filestr;
			char emptystr = '\0';

			if (cp->c_desc.cd_nameptr != NULL) {
				filestr = (const char *)&cp->c_desc.cd_nameptr[0];
			} else if (vnode_name(vp) != NULL) {
				filestr = vnode_name(vp);
			} else {
				filestr = &emptystr;
			}
			retval = ENOSPC;
			goto restore;
		}
	}
	/* Done with system locks and journal for now. */
	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;
	hfs_end_transaction(hfsmp);
	started_tr = 0;

	if (retval) {
		/*
		 * Check to see if failure is due to excessive fragmentation.
		 */
		if ((retval == ENOSPC) &&
		    (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
			hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
		}
		goto out;
	}
	/*
	 * STEP 2 - clone file data into the new allocation blocks.
	 */

	if (vnodetype == VLNK)
		retval = EPERM;
	else if (vnode_issystem(vp))
		retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
	else
		retval = hfs_clonefile(vp, headblks, datablks, blksize);

	/* Start transaction for step 3 or for a restore. */
	if (hfs_start_transaction(hfsmp) != 0) {
		retval = EINVAL;
		goto out;
	}
	started_tr = 1;
	if (retval)
		goto restore;

	/*
	 * STEP 3 - switch to cloned data and remove old blocks.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

	retval = HeadTruncateFile(hfsmp, (FCB *)fp, headblks);

	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;
	if (retval)
		goto restore;
out:
	if (took_trunc_lock)
		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);

	if (lockflags) {
		hfs_systemfile_unlock(hfsmp, lockflags);
		lockflags = 0;
	}

	/* Push cnode's new extent data to disk. */
	if (retval == 0) {
		hfs_update(vp, 0);
	}
	if (hfsmp->jnl) {
		if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
			(void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT);
		else
			(void) hfs_flushvolumeheader(hfsmp, 0);
	}
exit:
	if (started_tr)
		hfs_end_transaction(hfsmp);

	return (retval);

restore:
	if (fp->ff_blocks == headblks) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
		goto exit;
	}
	/*
	 * Give back any newly allocated space.
	 */
	if (lockflags == 0) {
		lockflags = SFL_BITMAP;
		if (overflow_extents(fp))
			lockflags |= SFL_EXTENTS;
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
	}

	(void) TruncateFileC(hfsmp, (FCB *)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
	                     FTOC(fp)->c_fileid, false);

	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;

	if (took_trunc_lock)
		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
	goto exit;
}
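
/*
 * Illustrative caller sketch (hypothetical helper, never compiled):
 * hfs_relocate() expects the cnode locked exclusive on entry and leaves it
 * locked on return; a caller might drive it like this.
 */
#if 0
static int
example_relocate_out_of_metazone(struct vnode *vp, vfs_context_t ctx)
{
	struct hfsmount *hfsmp = VTOHFS(vp);
	int error;

	hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
	error = hfs_relocate(vp, hfsmp->hfs_metazone_end + 1,
	                     vfs_context_ucred(ctx), vfs_context_proc(ctx));
	hfs_unlock(VTOC(vp));
	return error;
}
#endif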
/*
 * Clone a file's data within the file.
 *
 */
static int
hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
{
	caddr_t  bufp;
	size_t bufsize;
	size_t copysize;
	size_t iosize;
	size_t offset;
	off_t writebase;
	uio_t auio;
	int error = 0;

	writebase = blkstart * blksize;
	copysize = blkcnt * blksize;
	iosize = bufsize = MIN(copysize, 128 * 1024);
	offset = 0;

	hfs_unlock(VTOC(vp));

#if CONFIG_PROTECT
	if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
		return (error);
	}
#endif /* CONFIG_PROTECT */

	bufp = hfs_malloc(bufsize);

	auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);

	while (offset < copysize) {
		iosize = MIN(copysize - offset, iosize);

		uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, (uintptr_t)bufp, iosize);

		error = cluster_read(vp, auio, copysize, IO_NOCACHE);
		if (error) {
			printf("hfs_clonefile: cluster_read failed - %d\n", error);
			break;
		}
		if (uio_resid(auio) != 0) {
			printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
			error = EIO;
			break;
		}

		uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
		uio_addiov(auio, (uintptr_t)bufp, iosize);

		error = cluster_write(vp, auio, writebase + offset,
		                      writebase + offset + iosize,
		                      uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
		if (error) {
			printf("hfs_clonefile: cluster_write failed - %d\n", error);
			break;
		}
		if (uio_resid(auio) != 0) {
			printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
			error = EIO;
			break;
		}
		offset += iosize;
	}
	uio_free(auio);

	if ((blksize & PAGE_MASK)) {
		/*
		 * since the copy may not have started on a PAGE
		 * boundary (or may not have ended on one), we
		 * may have pages left in the cache since NOCACHE
		 * will let partially written pages linger...
		 * lets just flush the entire range to make sure
		 * we don't have any pages left that are beyond
		 * (or intersect) the real LEOF of this file
		 */
		ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
	} else {
		/*
		 * No need to call ubc_msync or hfs_invalbuf
		 * since the file was copied using IO_NOCACHE and
		 * the copy was done starting and ending on a page
		 * boundary in the file.
		 */
	}
	hfs_free(bufp, bufsize);

	hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
	return (error);
}
/*
 * Clone a system (metadata) file.
 *
 */
static int
hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
                 kauth_cred_t cred, struct proc *p)
{
	caddr_t  bufp;
	char * offset;
	size_t bufsize;
	size_t iosize;
	struct buf *bp = NULL;
	daddr64_t blkno;
	daddr64_t blk;
	daddr64_t start_blk;
	daddr64_t last_blk;
	int breadcnt;
	int i;
	int error = 0;

	iosize = GetLogicalBlockSize(vp);
	bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
	breadcnt = bufsize / iosize;

	bufp = hfs_malloc(bufsize);

	start_blk = ((daddr64_t)blkstart * blksize) / iosize;
	last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;
	blkno = 0;

	while (blkno < last_blk) {
		/*
		 * Read up to a megabyte
		 */
		offset = bufp;
		for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
			error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
			if (error) {
				printf("hfs_clonesysfile: meta_bread error %d\n", error);
				break;
			}
			if (buf_count(bp) != iosize) {
				printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
				break;
			}
			bcopy((char *)buf_dataptr(bp), offset, iosize);

			buf_markinvalid(bp);
			buf_brelse(bp);
			bp = NULL;

			offset += iosize;
		}
		if (error)
			break;

		/*
		 * Write up to a megabyte
		 */
		offset = bufp;
		for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
			bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
			if (bp == NULL) {
				printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
				error = EIO;
				break;
			}
			bcopy(offset, (char *)buf_dataptr(bp), iosize);
			error = (int)buf_bwrite(bp);
			bp = NULL;
			if (error)
				break;
			offset += iosize;
		}
		if (error)
			break;
	}
	if (bp) {
		buf_brelse(bp);
	}

	hfs_free(bufp, bufsize);

	error = hfs_fsync(vp, MNT_WAIT, 0, p);

	return (error);
}
errno_t hfs_flush_invalid_ranges(vnode_t vp)
{
	cnode_t *cp = VTOC(vp);

	hfs_assert(cp->c_lockowner == current_thread());
	hfs_assert(cp->c_truncatelockowner == current_thread());

	if (!ISSET(cp->c_flag, C_ZFWANTSYNC) && !cp->c_zftimeout)
		return 0;

	filefork_t *fp = VTOF(vp);

	/*
	 * We can't hold the cnode lock whilst we call cluster_write so we
	 * need to copy the extents into a local buffer.
	 */
	int max_exts = 16;
	struct ext {
		off_t start, end;
	} exts_buf[max_exts];		// 256 bytes
	struct ext *exts = exts_buf;
	int ext_count = 0;
	errno_t ret = 0;

	struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);

	while (r) {
		/* If we have more than can fit in our stack buffer, switch
		   to a heap buffer. */
		if (exts == exts_buf && ext_count == max_exts) {
			max_exts = 256;
			exts = hfs_malloc(sizeof(struct ext) * max_exts);
			memcpy(exts, exts_buf, ext_count * sizeof(struct ext));
		}

		struct rl_entry *next = TAILQ_NEXT(r, rl_link);

		exts[ext_count++] = (struct ext){ r->rl_start, r->rl_end };

		if (!next || (ext_count == max_exts && exts != exts_buf)) {
			hfs_unlock(cp);
			for (int i = 0; i < ext_count; ++i) {
				ret = cluster_write(vp, NULL, fp->ff_size, exts[i].end + 1,
				                    exts[i].start, 0,
				                    IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE);
				if (ret) {
					hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
					goto exit;
				}
			}

			if (!next) {
				hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
				break;
			}

			/* Push any existing clusters which should clean up our invalid
			   ranges as they go through hfs_vnop_blockmap. */
			cluster_push(vp, 0);

			hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);

			/*
			 * Get back to where we were (given we dropped the lock).
			 * This shouldn't be many because we pushed above.
			 */
			TAILQ_FOREACH(r, &fp->ff_invalidranges, rl_link) {
				if (r->rl_end > exts[ext_count - 1].end)
					break;
			}

			ext_count = 0;
		} else
			r = next;
	}

exit:

	if (exts != exts_buf)
		hfs_free(exts, sizeof(struct ext) * max_exts);

	return ret;
}