core/hfs_readwrite.c

   1 /*
   2  * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*      @(#)hfs_readwrite.c     1.0
  29  *
  30  *      (c) 1998-2001 Apple Inc.  All Rights Reserved
  31  *
  32  *      hfs_readwrite.c -- vnode operations to deal with reading and writing files.
  33  *
  34  */
  35
  36 #include <sys/param.h>
  37 #include <sys/systm.h>
  38 #include <sys/kernel.h>
  39 #include <sys/fcntl.h>
  40 #include <sys/stat.h>
  41 #include <sys/buf.h>
  42 #include <sys/proc.h>
  43 #include <sys/kauth.h>
  44 #include <sys/vnode.h>
  45 #include <sys/uio.h>
  46 #include <sys/vfs_context.h>
  47 #include <sys/disk.h>
  48 #include <sys/sysctl.h>
  49 #include <sys/fsctl.h>
  50 #include <sys/ubc.h>
  51 #include <sys/fsevents.h>
  52 #include <uuid/uuid.h>
  53
  54 #include <libkern/OSDebug.h>
  55
  56 #include <miscfs/specfs/specdev.h>
  57
  58 #include <sys/ubc.h>
  59
  60 #include <vm/vm_pageout.h>
  61 #include <vm/vm_kern.h>
  62
  63 #include <IOKit/IOBSD.h>
  64
  65 #include <sys/kdebug.h>
  66
  67 #include        "hfs.h"
  68 #include        "hfs_attrlist.h"
  69 #include        "hfs_endian.h"
  70 #include        "hfs_fsctl.h"
  71 #include        "hfs_quota.h"
  72 #include        "FileMgrInternal.h"
  73 #include        "BTreesInternal.h"
  74 #include        "hfs_cnode.h"
  75 #include        "hfs_dbg.h"
  76
  77 #if HFS_CONFIG_KEY_ROLL
  78 #include        "hfs_key_roll.h"
  79 #endif
  80
  81 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
  82
   83 enum {
              /*
               * 0x7FFFFFFF == 2^31 - 1.  hfs_vnop_read() compares the read
               * offset against this value on HFS Standard volumes and returns
               * EFBIG when a read starts beyond both EOF and this limit.
               */
   84         MAXHFSFILESIZE = 0x7FFFFFFF             /* this needs to go in the mount structure */
   85 };
  86
   87 /* from bsd/hfs/hfs_vfsops.c */
   88 extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
   89
   90 /* from hfs_hotfiles.c */
   91 extern int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid,
   92                                               uint8_t forktype, uint32_t *pinned);
   93
      /*
       * Forward declarations of file-local (static) helpers.  Their definitions
       * are not part of this section of the file.
       */
   94 static int  hfs_clonefile(struct vnode *, int, int, int);
   95 static int  hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
   96 static int  do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
  97
  98
   99 /*
  100  * Read data from a file.
       *
       * Preflight results: EISDIR for a directory, EPERM for any other vnode
       * that is not a regular file, 0 for a zero-length request and EINVAL for
       * a negative offset.
       *
       * The data is read with cluster_read() while the truncate lock is held
       * shared, using the fork's ff_size as the file size.  A read that starts
       * beyond EOF returns 0; on an HFS Standard volume it returns EFBIG when
       * the offset also exceeds MAXHFSFILESIZE.
       *
       * Requests flagged IO_SYSCALL_DISPATCH pass IO_RETURN_ON_THROTTLE to
       * cluster_read().  Whenever the result is EAGAIN the truncate lock is
       * dropped, throttle_lowpri_io(1) is called and the read is retried from
       * the read_again label.
       *
       * While hot file recording is active (hfc_stage == HFC_RECORDING) a
       * successful read also updates the fork's ff_bytesread counter.
       *
       * Returns 0 on success or an errno value.
  101  */
  102 int
  103 hfs_vnop_read(struct vnop_read_args *ap)
  104 {
  105         /*
  106            struct vnop_read_args {
  107            struct vnodeop_desc *a_desc;
  108            vnode_t a_vp;
  109            struct uio *a_uio;
  110            int a_ioflag;
  111            vfs_context_t a_context;
  112            };
  113          */
  114
  115         uio_t uio = ap->a_uio;
  116         struct vnode *vp = ap->a_vp;
  117         struct cnode *cp;
  118         struct filefork *fp;
  119         struct hfsmount *hfsmp;
  120         off_t filesize;
  121         off_t filebytes;
              /* start_resid and offset are captured once here and never refreshed. */
  122         off_t start_resid = uio_resid(uio);
  123         off_t offset = uio_offset(uio);
  124         int retval = 0;
  125         int took_truncate_lock = 0;
  126         int io_throttle = 0;
  127         int throttled_count = 0;
  128
  129         /* Preflight checks */
  130         if (!vnode_isreg(vp)) {
  131                 /* can only read regular files */
  132                 if (vnode_isdir(vp))
  133                         return (EISDIR);
  134                 else
  135                         return (EPERM);
  136         }
  137         if (start_resid == 0)
  138                 return (0);             /* Nothing left to do */
  139         if (offset < 0)
  140                 return (EINVAL);        /* can't read from a negative offset */
  141
  142 #if SECURE_KERNEL
  143         if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
  144                                                 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
  145                 /* Don't allow unencrypted io request from user space */
  146                 return EPERM;
  147         }
  148 #endif
  149
  150 #if HFS_COMPRESSION
  151         if (VNODE_IS_RSRC(vp)) {
  152                 if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
                              /* The resource fork is hidden from this caller: report an empty read. */
  153                         return 0;
  154                 }
  155                 /* otherwise read the resource fork normally */
  156         } else {
  157                 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
  158                 if (compressed) {
                              /*
                               * decmpfs_read_compressed() is given &compressed; the
                               * value is re-tested below to see whether the file was
                               * still compressed when the read completed.
                               */
  159                         retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
  160                         if (retval == 0 && !(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
  161                                 (void) hfs_addhotfile(vp);
  162                         }
  163                         if (compressed) {
  164                                 if (retval == 0) {
  165                                         /* successful read, update the access time */
  166                                         VTOC(vp)->c_touch_acctime = TRUE;
  167
  168                                         //
  169                                         // compressed files are not traditional hot file candidates
  170                                         // but they may be for CF (which ignores the ff_bytesread
  171                                         // field)
  172                                         //
  173                                         if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
  174                                                 VTOF(vp)->ff_bytesread = 0;
  175                                         }
  176                                 }
  177                                 return retval;
  178                         }
  179                         /* otherwise the file was converted back to a regular file while we were reading it */
  180                         retval = 0;
  181                 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
                              /*
                               * UF_COMPRESSED is set although hfs_file_is_compressed()
                               * returned 0; a non-zero result from
                               * check_for_dataless_file() fails the read.
                               */
  182                         int error;
  183
  184                         error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
  185                         if (error) {
  186                                 return error;
  187                         }
  188
  189                 }
  190         }
  191 #endif /* HFS_COMPRESSION */
  192
  193         cp = VTOC(vp);
  194         fp = VTOF(vp);
  195         hfsmp = VTOHFS(vp);
  196
  197 #if CONFIG_PROTECT
              /* No lock is held yet, so "exit" only returns retval on these paths. */
  198         if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
  199                 goto exit;
  200         }
  201
  202 #if HFS_CONFIG_KEY_ROLL
  203         if (ISSET(ap->a_ioflag, IO_ENCRYPTED)) {
  204                 off_rsrc_t off_rsrc = off_rsrc_make(offset + start_resid,
  205                                                                                         VNODE_IS_RSRC(vp));
  206
  207                 retval = hfs_key_roll_up_to(ap->a_context, vp, off_rsrc);
  208                 if (retval)
  209                         goto exit;
  210         }
  211 #endif // HFS_CONFIG_KEY_ROLL
  212 #endif // CONFIG_PROTECT
  213
  214         /*
  215          * If this read request originated from a syscall (as opposed to
  216          * an in-kernel page fault or something), then set it up for
  217          * throttle checks
  218          */
  219         if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
  220                 io_throttle = IO_RETURN_ON_THROTTLE;
  221         }
  222
  223 read_again:
              /* Re-entered from the exit path whenever retval is EAGAIN. */
  224
  225         /* Protect against a size change. */
  226         hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
  227         took_truncate_lock = 1;
  228
  229         filesize = fp->ff_size;
              /* filebytes (allocated size) is only used by the KERNEL_DEBUG traces below. */
  230         filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
  231
  232         /*
  233          * Check the file size. Note that per POSIX spec, we return 0 at
  234          * file EOF, so attempting a read at an offset that is too big
  235          * should just return 0 on HFS+. Since the return value was initialized
  236          * to 0 above, we just jump to exit.  HFS Standard has its own behavior.
  237          */
  238         if (offset > filesize) {
  239 #if CONFIG_HFS_STD
  240                 if ((hfsmp->hfs_flags & HFS_STANDARD) &&
  241                     (offset > (off_t)MAXHFSFILESIZE)) {
  242                         retval = EFBIG;
  243                 }
  244 #endif
  245                 goto exit;
  246         }
  247
  248         KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START,
  249                 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
  250
  251         retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle);
  252
              /* The access time is flagged for update whatever cluster_read() returned. */
  253         cp->c_touch_acctime = TRUE;
  254
  255         KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END,
  256                 (int)uio_offset(uio), uio_resid(uio), (int)filesize,  (int)filebytes, 0);
  257
  258         /*
  259          * Keep track of the bytes read (hot file recording only)
  260          */
  261         if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
  262                 int took_cnode_lock = 0;
  263                 off_t bytesread;
  264
                      /*
                       * Measured against the resid captured at function entry, so
                       * after a throttled retry this covers every pass, not just
                       * the last one.
                       */
  265                 bytesread = start_resid - uio_resid(uio);
  266
  267                 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
  268                 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
  269                         hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
  270                         took_cnode_lock = 1;
  271                 }
  272                 /*
  273                  * If this file hasn't been seen since the start of
  274                  * the current sampling period then start over.
  275                  */
  276                 if (cp->c_atime < hfsmp->hfc_timebase) {
  277                         struct timeval tv;
  278
  279                         fp->ff_bytesread = bytesread;
  280                         microtime(&tv);
  281                         cp->c_atime = tv.tv_sec;
  282                 } else {
  283                         fp->ff_bytesread += bytesread;
  284                 }
  285
  286                 if (!(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
  287                         //
  288                         // We don't add hotfiles for processes doing IO_EVTONLY I/O
  289                         // on the assumption that they're system processes such as
  290                         // mdworker which scan everything in the system (and thus
  291                         // do not represent user-initiated access to files)
  292                         //
  293                         (void) hfs_addhotfile(vp);
  294                 }
  295                 if (took_cnode_lock)
  296                         hfs_unlock(cp);
  297         }
  298 exit:
              /*
               * Common exit: drop the truncate lock if it was taken.  On EAGAIN,
               * call throttle_lowpri_io(1) and retry; took_truncate_lock stays 1,
               * and read_again takes the lock again.
               */
  299         if (took_truncate_lock) {
  300                 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
  301         }
  302         if (retval == EAGAIN) {
  303                 throttle_lowpri_io(1);
  304                 throttled_count++;
  305
  306                 retval = 0;
  307                 goto read_again;
  308         }
              /* Called once on the way out if at least one retry happened. */
  309         if (throttled_count)
  310                 throttle_info_reset_window(NULL);
  311         return (retval);
  312 }
 313
 314 /*
 315  * Ideally, this wouldn't be necessary; the cluster code should be
 316  * able to handle this on the read-side.  See <rdar://20420068>.
 317  */
 318 static errno_t hfs_zero_eof_page(vnode_t vp, off_t zero_up_to)
 319 {
 320         hfs_assert(VTOC(vp)->c_lockowner != current_thread());
 321         hfs_assert(VTOC(vp)->c_truncatelockowner == current_thread());
 322
 323         struct filefork *fp = VTOF(vp);
 324
 325         if (!(fp->ff_size & PAGE_MASK_64) || zero_up_to <= fp->ff_size) {
 326                 // Nothing to do
 327                 return 0;
 328         }
 329
 330         zero_up_to = MIN(zero_up_to, (off_t)round_page_64(fp->ff_size));
 331
 332         /* N.B. At present, @zero_up_to is not important because the cluster
 333            code will always zero up to the end of the page anyway. */
 334         return cluster_write(vp, NULL, fp->ff_size, zero_up_to,
 335                                                  fp->ff_size, 0, IO_HEADZEROFILL);
 336 }
 337
  338 /*
  339  * Write data to a file.
       *
       * Preflight results: EINVAL for a negative offset, E_NONE for a
       * zero-length request, EPERM for a vnode that is not a regular file and,
       * with HFS_COMPRESSION, EACCES for a file whose decmpfs state is
       * FILE_IS_COMPRESSED.
       *
       * Outline:
       *   1. Take the truncate lock (exclusive for IO_APPEND or on a retry,
       *      otherwise shared and upgraded when the fork has delayed-allocation
       *      blocks or the write ends beyond the current ff_size), then the
       *      cnode lock.
       *   2. If the write ends beyond the allocated blocks, grow the fork with
       *      ExtendFileC() inside a transaction.  ENOSPC becomes a partial
       *      write when the allocation reaches past the write offset.
       *   3. Drop the cnode lock and hand the data to cluster_write().
       *   4. Retake the cnode lock, flag the change/modification times, call
       *      hfs_incr_gencount(), and, for IO_UNIT, undo the write on failure.
       *
       * Whenever the result is EAGAIN the locks are dropped,
       * throttle_lowpri_io(1) is called and the write is retried from the
       * "again" label.
       *
       * Returns 0 on success or an errno value.
  340  */
  341 int
  342 hfs_vnop_write(struct vnop_write_args *ap)
  343 {
  344         uio_t uio = ap->a_uio;
  345         struct vnode *vp = ap->a_vp;
  346         struct cnode *cp;
  347         struct filefork *fp;
  348         struct hfsmount *hfsmp;
  349         kauth_cred_t cred = NULL;
  350         off_t origFileSize;
  351         off_t writelimit;
  352         off_t bytesToAdd = 0;
  353         off_t actualBytesAdded;
  354         off_t filebytes;
  355         off_t offset;
  356         ssize_t resid;
  357         int eflags;
  358         int ioflag = ap->a_ioflag;
  359         int retval = 0;
  360         int lockflags;
  361         int cnode_locked = 0;
  362         int partialwrite = 0;
  363         int do_snapshot = 1;
  364         time_t orig_ctime=VTOC(vp)->c_ctime;
  365         int took_truncate_lock = 0;
  366         int io_return_on_throttle = 0;
  367         int throttled_count = 0;
  368
  369 #if HFS_COMPRESSION
  370         if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
  371                 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
  372                 switch(state) {
  373                         case FILE_IS_COMPRESSED:
  374                                 return EACCES;
  375                         case FILE_IS_CONVERTING:
  376                                 /* if FILE_IS_CONVERTING, we allow writes but do not
  377                                    bother with snapshots or else we will deadlock.
  378                                 */
  379                                 do_snapshot = 0;
  380                                 break;
  381                         default:
  382                                 printf("invalid state %d for compressed file\n", state);
  383                                 /* fall through */
  384                 }
  385         } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
                      /*
                       * UF_COMPRESSED is set although hfs_file_is_compressed()
                       * returned 0; a non-zero result from
                       * check_for_dataless_file() fails the write.
                       */
  386                 int error;
  387
  388                 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
  389                 if (error != 0) {
  390                         return error;
  391                 }
  392         }
  393
  394         if (do_snapshot) {
  395                 nspace_snapshot_event(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
  396         }
  397
  398 #endif
  399
  400 #if SECURE_KERNEL
  401         if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
  402                                                 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
  403                 /* Don't allow unencrypted io request from user space */
  404                 return EPERM;
  405         }
  406 #endif
  407
  408         resid = uio_resid(uio);
  409         offset = uio_offset(uio);
  410
              /* Preflight checks; no lock is held yet, so these return directly. */
  411         if (offset < 0)
  412                 return (EINVAL);
  413         if (resid == 0)
  414                 return (E_NONE);
  415         if (!vnode_isreg(vp))
  416                 return (EPERM);  /* Can only write regular files */
  417
  418         cp = VTOC(vp);
  419         fp = VTOF(vp);
  420         hfsmp = VTOHFS(vp);
  421
  422 #if CONFIG_PROTECT
  423         if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
  424                 goto exit;
  425         }
  426 #endif
  427
  428         eflags = kEFDeferMask;  /* defer file block allocations */
  429 #if HFS_SPARSE_DEV
  430         /*
  431          * When the underlying device is sparse and space
  432          * is low (< 8MB), stop doing delayed allocations
  433          * and begin doing synchronous I/O.
  434          */
  435         if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
  436             (hfs_freeblks(hfsmp, 0) < 2048)) {
  437                 eflags &= ~kEFDeferMask;
  438                 ioflag |= IO_SYNC;
  439         }
  440 #endif /* HFS_SPARSE_DEV */
  441
              /*
               * Only a request that carries both IO_SINGLE_WRITER and
               * IO_SYSCALL_DISPATCH passes IO_RETURN_ON_THROTTLE to cluster_write().
               */
  442         if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) ==
  443                         (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
  444                 io_return_on_throttle = IO_RETURN_ON_THROTTLE;
  445         }
  446
  447 again:
  448         /*
  449          * Protect against a size change.
  450          *
  451          * Note: If took_truncate_lock is true, then we previously got the lock shared
  452          * but needed to upgrade to exclusive.  So try getting it exclusive from the
  453          * start.
  454          */
  455         if (ioflag & IO_APPEND || took_truncate_lock) {
  456                 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
  457         }
  458         else {
  459                 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
  460         }
  461         took_truncate_lock = 1;
  462
  463         /* Update UIO */
              /* IO_APPEND: the write always starts at the current EOF. */
  464         if (ioflag & IO_APPEND) {
  465                 uio_setoffset(uio, fp->ff_size);
  466                 offset = fp->ff_size;
  467         }
              /* With APPEND set in c_bsdflags only a write that starts at EOF is allowed. */
  468         if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
  469                 retval = EPERM;
  470                 goto exit;
  471         }
  472
              /* Callers for which suser() returns non-zero extend with kEFReserveMask set. */
  473         cred = vfs_context_ucred(ap->a_context);
  474         if (cred && suser(cred, NULL) != 0)
  475                 eflags |= kEFReserveMask;
  476
  477         origFileSize = fp->ff_size;
  478         writelimit = offset + resid;
  479
  480         /*
  481          * We may need an exclusive truncate lock for several reasons, all
  482          * of which are because we may be writing to a (portion of a) block
  483          * for the first time, and we need to make sure no readers see the
  484          * prior, uninitialized contents of the block.  The cases are:
  485          *
  486          * 1. We have unallocated (delayed allocation) blocks.  We may be
  487          *    allocating new blocks to the file and writing to them.
  488          *    (A more precise check would be whether the range we're writing
  489          *    to contains delayed allocation blocks.)
  490          * 2. We need to extend the file.  The bytes between the old EOF
  491          *    and the new EOF are not yet initialized.  This is important
  492          *    even if we're not allocating new blocks to the file.  If the
  493          *    old EOF and new EOF are in the same block, we still need to
  494          *    protect that range of bytes until they are written for the
  495          *    first time.
  496          *
  497          * If we had a shared lock with the above cases, we need to try to upgrade
  498          * to an exclusive lock.  If the upgrade fails, we will lose the shared
  499          * lock, and will need to take the truncate lock again; the took_truncate_lock
  500          * flag will still be set, causing us to try for an exclusive lock next time.
  501          */
  502         if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
  503             ((fp->ff_unallocblocks != 0) ||
  504              (writelimit > origFileSize))) {
  505                 if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
  506                         /*
  507                          * Lock upgrade failed and we lost our shared lock, try again.
  508                          * Note: we do not set took_truncate_lock=0 here.  Leaving it
  509                          * set to 1 will cause us to try to get the lock exclusive.
  510                          */
  511                         goto again;
  512                 }
  513                 else {
  514                         /* Store the owner in the c_truncatelockowner field if we successfully upgrade */
  515                         cp->c_truncatelockowner = current_thread();
  516                 }
  517         }
  518
  519         if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
  520                 goto exit;
  521         }
  522         cnode_locked = 1;
  523
  524         filebytes = hfs_blk_to_bytes(fp->ff_blocks, hfsmp->blockSize);
  525
              /*
               * The write starts beyond the allocated end: fail early with ENOSPC
               * when the free space cannot even cover the gap between the
               * allocation and the write offset.
               */
  526         if (offset > filebytes
  527                 && (hfs_blk_to_bytes(hfs_freeblks(hfsmp, ISSET(eflags, kEFReserveMask)),
  528                                                          hfsmp->blockSize) < offset - filebytes)) {
  529                 retval = ENOSPC;
  530                 goto exit;
  531         }
  532
  533         KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START,
  534                      (int)offset, uio_resid(uio), (int)fp->ff_size,
  535                      (int)filebytes, 0);
  536
  537         /* Check if we do not need to extend the file */
  538         if (writelimit <= filebytes) {
  539                 goto sizeok;
  540         }
  541
  542         bytesToAdd = writelimit - filebytes;
  543
  544 #if QUOTA
  545         retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
  546                            cred, 0);
  547         if (retval)
  548                 goto exit;
  549 #endif /* QUOTA */
  550
              /* Any failure to start the transaction is reported as EINVAL. */
  551         if (hfs_start_transaction(hfsmp) != 0) {
  552                 retval = EINVAL;
  553                 goto exit;
  554         }
  555
              /*
               * Keep extending until the allocation covers writelimit; each pass
               * asks for whatever is still missing.
               */
  556         while (writelimit > filebytes) {
  557                 bytesToAdd = writelimit - filebytes;
  558
  559                 /* Protect extents b-tree and allocation bitmap */
  560                 lockflags = SFL_BITMAP;
  561                 if (overflow_extents(fp))
  562                         lockflags |= SFL_EXTENTS;
  563                 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
  564
  565                 /* Files that are changing size are not hot file candidates. */
  566                 if (hfsmp->hfc_stage == HFC_RECORDING) {
  567                         fp->ff_bytesread = 0;
  568                 }
  569                 retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
  570                                 0, eflags, &actualBytesAdded));
  571
  572                 hfs_systemfile_unlock(hfsmp, lockflags);
  573
                      /* No error but no progress either: report ENOSPC so the loop ends. */
  574                 if ((actualBytesAdded == 0) && (retval == E_NONE))
  575                         retval = ENOSPC;
  576                 if (retval != E_NONE)
  577                         break;
  578                 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
  579                 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE,
  580                         (int)offset, uio_resid(uio), (int)fp->ff_size,  (int)filebytes, 0);
  581         }
              /* These run whatever the loop outcome; their results are ignored. */
  582         (void) hfs_update(vp, 0);
  583         (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
  584         (void) hfs_end_transaction(hfsmp);
  585
  586         /*
  587          * If we didn't grow the file enough try a partial write.
  588          * POSIX expects this behavior.
  589          */
  590         if ((retval == ENOSPC) && (filebytes > offset)) {
                      /*
                       * Shorten the uio by the bytes that could not be allocated;
                       * the same amount is added back after cluster_write() (see
                       * the partialwrite check below).
                       */
  591                 retval = 0;
  592                 partialwrite = 1;
  593                 uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
  594                 resid -= bytesToAdd;
  595                 writelimit = filebytes;
  596         }
  597 sizeok:
  598         if (retval == E_NONE) {
  599                 off_t filesize;
  600                 off_t head_off;
  601                 int lflag;
  602
                      /*
                       * When the write ends beyond the current EOF, record
                       * [ff_size, writelimit - 1] in ff_invalidranges and set
                       * c_zftimeout to the current uptime plus ZFTIMELIMIT.
                       */
  603                 if (writelimit > fp->ff_size) {
  604                         filesize = writelimit;
  605                         struct timeval tv;
  606                         rl_add(fp->ff_size, writelimit - 1 , &fp->ff_invalidranges);
  607                         microuptime(&tv);
  608                         cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
  609                 } else
  610                         filesize = fp->ff_size;
  611
                      /* Strip the caller's zero-fill flags; the needed ones are re-added below. */
  612                 lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
  613
  614                 /*
  615                  * We no longer use IO_HEADZEROFILL or IO_TAILZEROFILL (except
  616                  * for one case below).  For the regions that lie before the
  617                  * beginning and after the end of this write that are in the
  618                  * same page, we let the cluster code handle zeroing that out
  619                  * if necessary.  If those areas are not cached, the cluster
  620                  * code will try and read those areas in, and in the case
  621                  * where those regions have never been written to,
  622                  * hfs_vnop_blockmap will consult the invalid ranges and then
  623                  * indicate that.  The cluster code will zero out those areas.
  624                  */
  625
  626                 head_off = trunc_page_64(offset);
  627
  628                 if (head_off < offset && head_off >= fp->ff_size) {
  629                         /*
  630                          * The first page is beyond current EOF, so as an
  631                          * optimisation, we can pass IO_HEADZEROFILL.
  632                          */
  633                         lflag |= IO_HEADZEROFILL;
  634                 }
  635
                      /* cluster_write() runs without the cnode lock; it is retaken at ioerr_exit. */
  636                 hfs_unlock(cp);
  637                 cnode_locked = 0;
  638
  639                 /*
  640                  * We need to tell UBC the fork's new size BEFORE calling
  641                  * cluster_write, in case any of the new pages need to be
  642                  * paged out before cluster_write completes (which does happen
  643                  * in embedded systems due to extreme memory pressure).
  644                  * Similarly, we need to tell hfs_vnop_pageout what the new EOF
  645                  * will be, so that it can pass that on to cluster_pageout, and
  646                  * allow those pageouts.
  647                  *
  648                  * We don't update ff_size yet since we don't want pageins to
  649                  * be able to see uninitialized data between the old and new
  650                  * EOF, until cluster_write has completed and initialized that
  651                  * part of the file.
  652                  *
  653                  * The vnode pager relies on the file size last given to UBC via
  654                  * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
  655                  * ff_size (whichever is larger).  NOTE: ff_new_size is always
  656                  * zero, unless we are extending the file via write.
  657                  */
  658                 if (filesize > fp->ff_size) {
  659                         retval = hfs_zero_eof_page(vp, offset);
  660                         if (retval)
  661                                 goto exit;
  662                         fp->ff_new_size = filesize;
  663                         ubc_setsize(vp, filesize);
  664                 }
  665                 retval = cluster_write(vp, uio, fp->ff_size, filesize, head_off,
  666                                                            0, lflag | IO_NOZERODIRTY | io_return_on_throttle);
  667                 if (retval) {
  668                         fp->ff_new_size = 0;    /* no longer extending; use ff_size */
  669
  670                         if (retval == EAGAIN) {
  671                                 /*
  672                                  * EAGAIN indicates that we still have I/O to do, but
  673                                  * that we now need to be throttled
  674                                  */
  675                                 if (resid != uio_resid(uio)) {
  676                                         /*
  677                                          * did manage to do some I/O before returning EAGAIN
  678                                          */
  679                                         resid = uio_resid(uio);
  680                                         offset = uio_offset(uio);
  681
  682                                         cp->c_touch_chgtime = TRUE;
  683                                         cp->c_touch_modtime = TRUE;
  684                                         hfs_incr_gencount(cp);
  685                                 }
  686                                 if (filesize > fp->ff_size) {
  687                                         /*
  688                                          * we called ubc_setsize before the call to
  689                                          * cluster_write... since we only partially
  690                                          * completed the I/O, we need to
  691                                          * re-adjust our idea of the filesize based
  692                                          * on our interim EOF
  693                                          */
  694                                         ubc_setsize(vp, offset);
  695
  696                                         fp->ff_size = offset;
  697                                 }
                                      /* "exit" drops the locks, throttles and jumps back to "again". */
  698                                 goto exit;
  699                         }
                              /* Any other error: hand UBC the original size back if it was raised. */
  700                         if (filesize > origFileSize) {
  701                                 ubc_setsize(vp, origFileSize);
  702                         }
  703                         goto ioerr_exit;
  704                 }
  705
  706                 if (filesize > origFileSize) {
  707                         fp->ff_size = filesize;
  708
  709                         /* Files that are changing size are not hot file candidates. */
  710                         if (hfsmp->hfc_stage == HFC_RECORDING) {
  711                                 fp->ff_bytesread = 0;
  712                         }
  713                 }
  714                 fp->ff_new_size = 0;    /* ff_size now has the correct size */
  715         }
              /* Give back the bytes that were trimmed from the uio for the partial write. */
  716         if (partialwrite) {
  717                 uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
  718                 resid += bytesToAdd;
  719         }
  720
  721         if (vnode_should_flush_after_write(vp, ioflag))
  722                 hfs_flush(hfsmp, HFS_FLUSH_CACHE);
  723
  724 ioerr_exit:
              /*
               * Reached by fall-through, and directly when cluster_write() fails
               * with anything other than EAGAIN.  Retake the cnode lock if it
               * was dropped before cluster_write().
               */
  725         if (!cnode_locked) {
  726                 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
  727                 cnode_locked = 1;
  728         }
  729
              /* resid > uio_resid(uio) means at least one byte was consumed from the uio. */
  730         if (resid > uio_resid(uio)) {
  731                 cp->c_touch_chgtime = TRUE;
  732                 cp->c_touch_modtime = TRUE;
  733                 hfs_incr_gencount(cp);
  734
  735                 /*
  736                  * If we successfully wrote any data, and we are not the superuser
  737                  * we clear the setuid and setgid bits as a precaution against
  738                  * tampering.
  739                  */
  740                 if (cp->c_mode & (S_ISUID | S_ISGID)) {
  741                         cred = vfs_context_ucred(ap->a_context);
  742                         if (cred && suser(cred, NULL)) {
  743                                 cp->c_mode &= ~(S_ISUID | S_ISGID);
  744                         }
  745                 }
  746         }
  747         if (retval) {
                      /*
                       * With IO_UNIT a failed write is undone: truncate back to the
                       * original size and restore the uio offset and resid.
                       */
  748                 if (ioflag & IO_UNIT) {
  749                         (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
  750                                            0, ap->a_context);
  751                         uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
  752                         uio_setresid(uio, resid);
  753                         filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
  754                 }
  755         } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio)))
  756                 retval = hfs_update(vp, 0);
  757
  758         /* Updating vcbWrCnt doesn't need to be atomic. */
  759         hfsmp->vcbWrCnt++;
  760
  761         KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END,
  762                 (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
  763 exit:
              /*
               * Common exit.  On error, while this thread owns the truncate lock,
               * clear ff_new_size and remove any invalid range from ff_size
               * onwards.  Then drop the cnode and truncate locks; on EAGAIN
               * throttle and retry from "again" (took_truncate_lock stays 1, so
               * the retry takes the truncate lock exclusive).
               */
  764         if (retval && took_truncate_lock
  765                 && cp->c_truncatelockowner == current_thread()) {
  766                 fp->ff_new_size = 0;
  767                 rl_remove(fp->ff_size, RL_INFINITY, &fp->ff_invalidranges);
  768         }
  769
  770         if (cnode_locked)
  771                 hfs_unlock(cp);
  772
  773         if (took_truncate_lock) {
  774                 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
  775         }
  776         if (retval == EAGAIN) {
  777                 throttle_lowpri_io(1);
  778                 throttled_count++;
  779
  780                 retval = 0;
  781                 goto again;
  782         }
              /* Called once on the way out if at least one retry happened. */
  783         if (throttled_count)
  784                 throttle_info_reset_window(NULL);
  785         return (retval);
  786 }
 787
/* support for the "bulk-access" fcntl */

/* Maximum number of ancestor ids do_access_check() remembers per walk so it can cache them afterwards. */
#define CACHE_LEVELS 16
/* Number of entries allocated for the access_cache acache[] / haveaccess[] arrays. */
#define NUM_CACHE_ENTRIES (64*16)
/* Request flag bit: file_ids holds parent (directory) ids, so the per-leaf check is skipped. */
#define PARENT_IDS_FLAG 0x100
 793
/*
 * Per-request cache of access results, keyed by catalog node id.
 * acache[] is kept sorted (it is binary-searched by lookup_bucket()) and
 * haveaccess[] is a parallel array holding the cached result for each id.
 */
struct access_cache {
       int numcached;              /* number of valid entries in acache[] / haveaccess[] */
       int cachehits; /* these two for statistics gathering */
       int lookups;
       unsigned int *acache;       /* sorted node ids */
       unsigned char *haveaccess;  /* cached result per id: 0 (access), an errno, or ESRCH (scope miss) */
};
 801
/*
 * Native-pointer shape of the original (non-extended) bulk-access request.
 * Marked unavailable: the kernel code uses the user32_/user64_ variants.
 */
struct access_t {
        uid_t     uid;              /* IN: effective user id */
        short     flags;            /* IN: access requested (i.e. R_OK) */
        short     num_groups;       /* IN: number of groups user belongs to */
        int       num_files;        /* IN: number of files to process */
        int       *file_ids;        /* IN: array of file ids */
        gid_t     *groups;          /* IN: array of groups */
        short     *access;          /* OUT: access info for each file (0 for 'has access') */
} __attribute__((unavailable)); // this structure is for reference purposes only
 811
/*
 * Original (non-extended) bulk-access request as laid out by a 32-bit
 * process; do_bulk_access_check() converts it to the extended form.
 */
struct user32_access_t {
        uid_t     uid;              /* IN: effective user id */
        short     flags;            /* IN: access requested (i.e. R_OK) */
        short     num_groups;       /* IN: number of groups user belongs to */
        int       num_files;        /* IN: number of files to process */
        user32_addr_t      file_ids;        /* IN: array of file ids */
        user32_addr_t      groups;          /* IN: array of groups */
        user32_addr_t      access;          /* OUT: access info for each file (0 for 'has access') */
};
 821
/*
 * Original (non-extended) bulk-access request with 64-bit user pointers.
 * NOTE(review): do_bulk_access_check() only accepts the extended struct
 * from 64-bit callers, so this layout is not consumed there.
 */
struct user64_access_t {
        uid_t           uid;                    /* IN: effective user id */
        short           flags;                  /* IN: access requested (i.e. R_OK) */
        short           num_groups;             /* IN: number of groups user belongs to */
        int             num_files;              /* IN: number of files to process */
        user64_addr_t   file_ids;               /* IN: array of file ids */
        user64_addr_t   groups;                 /* IN: array of groups */
        user64_addr_t   access;                 /* OUT: access info for each file (0 for 'has access') */
};
 831
 832
 833 // these are the "extended" versions of the above structures
 834 // note that it is crucial that they be different sized than
 835 // the regular version
/*
 * Native-pointer shape of the extended bulk-access request.
 * Marked unavailable: the kernel code uses the user32_/user64_ variants.
 */
struct ext_access_t {
        uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
        uint32_t   num_files;       /* IN: number of files to process */
        uint32_t   map_size;        /* IN: size of the bit map */
        uint32_t  *file_ids;        /* IN: Array of file ids */
        char      *bitmap;          /* OUT: hash-bitmap of interesting directory ids */
        short     *access;          /* OUT: access info for each file (0 for 'has access') */
        uint32_t   num_parents;   /* future use */
        cnid_t      *parents;   /* future use */
} __attribute__((unavailable)); // this structure is for reference purposes only
 846
/*
 * Extended bulk-access request as laid out by a 32-bit process;
 * do_bulk_access_check() up-casts it to struct user64_ext_access_t.
 */
struct user32_ext_access_t {
        uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
        uint32_t   num_files;       /* IN: number of files to process */
        uint32_t   map_size;        /* IN: size of the bit map */
        user32_addr_t  file_ids;        /* IN: Array of file ids */
        user32_addr_t     bitmap;          /* OUT: hash-bitmap of interesting directory ids */
        user32_addr_t access;          /* OUT: access info for each file (0 for 'has access') */
        uint32_t   num_parents;   /* future use */
        user32_addr_t parents;   /* future use */
};
 857
/*
 * Extended bulk-access request with 64-bit user pointers. This is the
 * form do_bulk_access_check() works on internally.
 */
struct user64_ext_access_t {
        uint32_t      flags;        /* IN: access requested (i.e. R_OK) */
        uint32_t      num_files;    /* IN: number of files to process */
        uint32_t      map_size;     /* IN: size of the bit map */
        user64_addr_t   file_ids;     /* IN: array of file ids */
        user64_addr_t   bitmap;       /* OUT: hash-bitmap of interesting directory ids */
        user64_addr_t   access;       /* OUT: access info for each file (0 for 'has access') */
        uint32_t      num_parents;/* IN: number of entries in parents (0 disables scope checking) */
        user64_addr_t   parents;/* IN: array of scope parent ids, binary-searched (so expected sorted) */
};
 868
 869
 870 /*
 871  * Perform a binary search for the given parent_id. Return value is
 872  * the index if there is a match.  If no_match_indexp is non-NULL it
 873  * will be assigned with the index to insert the item (even if it was
 874  * not found).
 875  */
 876 static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
 877 {
 878     int index=-1;
 879     unsigned int lo=0;
 880
 881     do {
 882         unsigned int mid = ((hi - lo)/2) + lo;
 883         unsigned int this_id = array[mid];
 884
 885         if (parent_id == this_id) {
 886             hi = mid;
 887             break;
 888         }
 889
 890         if (parent_id < this_id) {
 891             hi = mid;
 892             continue;
 893         }
 894
 895         if (parent_id > this_id) {
 896             lo = mid + 1;
 897             continue;
 898         }
 899     } while(lo < hi);
 900
 901     /* check if lo and hi converged on the match */
 902     if (parent_id == array[hi]) {
 903         index = hi;
 904     }
 905
 906     if (no_match_indexp) {
 907         *no_match_indexp = hi;
 908     }
 909
 910     return index;
 911 }
 912
 913
 914 static int
 915 lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
 916 {
 917     unsigned int hi;
 918     int matches = 0;
 919     int index, no_match_index;
 920
 921     if (cache->numcached == 0) {
 922         *indexp = 0;
 923         return 0; // table is empty, so insert at index=0 and report no match
 924     }
 925
 926     if (cache->numcached > NUM_CACHE_ENTRIES) {
 927         cache->numcached = NUM_CACHE_ENTRIES;
 928     }
 929
 930     hi = cache->numcached - 1;
 931
 932     index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
 933
 934     /* if no existing entry found, find index for new one */
 935     if (index == -1) {
 936         index = no_match_index;
 937         matches = 0;
 938     } else {
 939         matches = 1;
 940     }
 941
 942     *indexp = index;
 943     return matches;
 944 }
 945
 946 /*
 947  * Add a node to the access_cache at the given index (or do a lookup first
 948  * to find the index if -1 is passed in). We currently do a replace rather
 949  * than an insert if the cache is full.
 950  */
 951 static void
 952 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
 953 {
 954     int lookup_index = -1;
 955
 956     /* need to do a lookup first if -1 passed for index */
 957     if (index == -1) {
 958         if (lookup_bucket(cache, &lookup_index, nodeID)) {
 959             if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
 960                 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
 961                 cache->haveaccess[lookup_index] = access;
 962             }
 963
 964             /* mission accomplished */
 965             return;
 966         } else {
 967             index = lookup_index;
 968         }
 969
 970     }
 971
 972     /* if the cache is full, do a replace rather than an insert */
 973     if (cache->numcached >= NUM_CACHE_ENTRIES) {
 974         cache->numcached = NUM_CACHE_ENTRIES-1;
 975
 976         if (index > cache->numcached) {
 977             index = cache->numcached;
 978         }
 979     }
 980
 981     if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
 982         index++;
 983     }
 984
 985     if (index >= 0 && index < cache->numcached) {
 986         /* only do bcopy if we're inserting */
 987         bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
 988         bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
 989     }
 990
 991     cache->acache[index] = nodeID;
 992     cache->haveaccess[index] = access;
 993     cache->numcached++;
 994 }
 995
 996
/* Snapshot of the cnode fields that snoop_callback() copies out for do_attr_lookup(). */
struct cinfo {
    uid_t   uid;            /* from c_uid */
    gid_t   gid;            /* from c_gid */
    mode_t  mode;           /* from c_mode */
    cnid_t  parentcnid;     /* from c_parentcnid */
    u_int16_t recflags;     /* from c_attr.ca_recflags */
};
1004
1005 static int
1006 snoop_callback(const cnode_t *cp, void *arg)
1007 {
1008     struct cinfo *cip = arg;
1009
1010     cip->uid = cp->c_uid;
1011     cip->gid = cp->c_gid;
1012     cip->mode = cp->c_mode;
1013     cip->parentcnid = cp->c_parentcnid;
1014     cip->recflags = cp->c_attr.ca_recflags;
1015
1016     return (0);
1017 }
1018
1019 /*
1020  * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
1021  * isn't incore, then go to the catalog.
1022  */
1023 static int
1024 do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
1025     struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
1026 {
1027     int error = 0;
1028
1029     /* if this id matches the one the fsctl was called with, skip the lookup */
1030     if (cnid == skip_cp->c_cnid) {
1031                 cnattrp->ca_uid = skip_cp->c_uid;
1032                 cnattrp->ca_gid = skip_cp->c_gid;
1033                 cnattrp->ca_mode = skip_cp->c_mode;
1034                 cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
1035                 keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
1036     } else {
1037                 struct cinfo c_info;
1038
1039                 /* otherwise, check the cnode hash incase the file/dir is incore */
1040                 error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info);
1041
1042                 if (error == EACCES) {
1043                         // File is deleted
1044                         return ENOENT;
1045                 } else if (!error) {
1046                         cnattrp->ca_uid = c_info.uid;
1047                         cnattrp->ca_gid = c_info.gid;
1048                         cnattrp->ca_mode = c_info.mode;
1049                         cnattrp->ca_recflags = c_info.recflags;
1050                         keyp->hfsPlus.parentID = c_info.parentcnid;
1051                 } else {
1052                         int lockflags;
1053
1054                         if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
1055                                 throttle_lowpri_io(1);
1056
1057                         lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
1058
1059                         /* lookup this cnid in the catalog */
1060                         error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
1061
1062                         hfs_systemfile_unlock(hfsmp, lockflags);
1063
1064                         cache->lookups++;
1065                 }
1066     }
1067
1068     return (error);
1069 }
1070
1071
/*
 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
 * up to CACHE_LEVELS as we progress towards the root.
 *
 * Walks from nodeID upwards via each node's parent id until the id drops
 * below kRootDirID, an error occurs, or a cached ancestor is found.
 *
 *   err          OUT: 0, the errno that denied access, or ESRCH when
 *                `parents` was supplied and no visited node matched it.
 *   cache        per-request result cache; consulted first and updated on exit.
 *   skip_cp      cnode the fsctl was issued on (passed through to do_attr_lookup()).
 *   bitmap/map_size  optional; a bit is set for every directory id visited.
 *   parents/num_parents  optional scope list, binary-searched for each
 *                visited id.
 *
 * Returns 1 when access is granted (and *err is 0), 0 otherwise.
 */
static int
do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
    struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
    struct vfs_context *my_context,
    char *bitmap,
    uint32_t map_size,
    cnid_t* parents,
    uint32_t num_parents)
{
    int                     myErr = 0;
    int                     myResult;
    HFSCatalogNodeID        thisNodeID;
    unsigned int            myPerms;
    struct cat_attr         cnattr;
    /*
     * scope_index:     -1 until a visited node matches `parents` (or a cache hit decides).
     * scope_idx_start: position in parent_ids[] of the first in-scope node, -1 if none.
     */
    int                     cache_index = -1, scope_index = -1, scope_idx_start = -1;
    CatalogKey              catkey;

    int i = 0, ids_to_cache = 0;
    int parent_ids[CACHE_LEVELS];   /* ids visited on this walk, cached on exit */

    thisNodeID = nodeID;
    while (thisNodeID >=  kRootDirID) {
        myResult = 0;   /* default to "no access" */

        /* check the cache before resorting to hitting the catalog */

        /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
         * to look any further after hitting cached dir */

        if (lookup_bucket(cache, &cache_index, thisNodeID)) {
            cache->cachehits++;
            myErr = cache->haveaccess[cache_index];
            if (scope_index != -1) {
                /* a scope parent was already matched lower down, so a cached scope miss no longer matters */
                if (myErr == ESRCH) {
                    myErr = 0;
                }
            } else {
                scope_index = 0;   // so we'll just use the cache result
                scope_idx_start = ids_to_cache;
            }
            myResult = (myErr == 0) ? 1 : 0;
            goto ExitThisRoutine;
        }


        if (parents) {
            int tmp;
            /* is this node one of the requested scope parents? (-1 if not) */
            tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
            if (scope_index == -1)
                scope_index = tmp;
            if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
                scope_idx_start = ids_to_cache;
            }
        }

        /* remember which parents we want to cache */
        if (ids_to_cache < CACHE_LEVELS) {
            parent_ids[ids_to_cache] = thisNodeID;
            ids_to_cache++;
        }
        // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
        if (bitmap && map_size) {
            bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
        }


        /* do the lookup (checks the cnode hash, then the catalog) */
        myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
        if (myErr) {
            goto ExitThisRoutine; /* no access */
        }

        /* Root always gets access. */
        /* The walk still continues upward so the bitmap and scope bookkeeping above run for every ancestor. */
        if (suser(myp_ucred, NULL) == 0) {
                thisNodeID = catkey.hfsPlus.parentID;
                myResult = 1;
                continue;
        }

        // if the thing has acl's, do the full permission check
        if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
            struct vnode *vp;

            /* get the vnode for this cnid */
            myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
            if ( myErr ) {
                myResult = 0;
                goto ExitThisRoutine;
            }

            /* pick up the next id to visit from the cnode before dropping its lock */
            thisNodeID = VTOC(vp)->c_parentcnid;

            hfs_unlock(VTOC(vp));

            if (vnode_vtype(vp) == VDIR) {
                myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
            } else {
                myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
            }

            vnode_put(vp);
            if (myErr) {
                myResult = 0;
                goto ExitThisRoutine;
            }
        } else {
            unsigned int flags;
                int mode = cnattr.ca_mode & S_IFMT;
                myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);

                /* directories need read + search, anything else just read */
                if (mode == S_IFDIR) {
                        flags = R_OK | X_OK;
                } else {
                        flags = R_OK;
                }
                if ( (myPerms & flags) != flags) {
                        myResult = 0;
                        myErr = EACCES;
                        goto ExitThisRoutine;   /* no access */
                }

            /* up the hierarchy we go */
            thisNodeID = catkey.hfsPlus.parentID;
        }
    }

    /* if here, we have access to this node */
    myResult = 1;

  ExitThisRoutine:
    /* permissions were fine but nothing on the path was inside the requested scope */
    if (parents && myErr == 0 && scope_index == -1) {
        myErr = ESRCH;
    }

    if (myErr) {
        myResult = 0;
    }
    *err = myErr;

    /* cache the parent directory(ies) */
    for (i = 0; i < ids_to_cache; i++) {
        /*
         * On success with a scope list, ids visited after the first in-scope
         * node (or all ids, if none was in scope) are cached as ESRCH;
         * everything else is cached with the walk's result.
         */
        if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
            add_node(cache, -1, parent_ids[i], ESRCH);
        } else {
            add_node(cache, -1, parent_ids[i], myErr);
        }
    }

    return (myResult);
}
1226
1227 static int
1228 do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
1229     struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
1230 {
1231     boolean_t is64bit;
1232
1233     /*
1234      * NOTE: on entry, the vnode has an io_ref. In case this vnode
1235      * happens to be in our list of file_ids, we'll note it
1236      * avoid calling hfs_chashget_nowait() on that id as that
1237      * will cause a "locking against myself" panic.
1238      */
1239     Boolean check_leaf = true;
1240
1241     struct user64_ext_access_t *user_access_structp;
1242     struct user64_ext_access_t tmp_user_access;
1243     struct access_cache cache;
1244
1245     int error = 0, prev_parent_check_ok=1;
1246     unsigned int i;
1247
1248     short flags;
1249     unsigned int num_files = 0;
1250     int map_size = 0;
1251     int num_parents = 0;
1252     int *file_ids=NULL;
1253     short *access=NULL;
1254     char *bitmap=NULL;
1255     cnid_t *parents=NULL;
1256     int leaf_index;
1257
1258     cnid_t cnid;
1259     cnid_t prevParent_cnid = 0;
1260     unsigned int myPerms;
1261     short myaccess = 0;
1262     struct cat_attr cnattr;
1263     CatalogKey catkey;
1264     struct cnode *skip_cp = VTOC(vp);
1265     kauth_cred_t cred = vfs_context_ucred(context);
1266     proc_t p = vfs_context_proc(context);
1267
1268     is64bit = proc_is64bit(p);
1269
1270     /* initialize the local cache and buffers */
1271     cache.numcached = 0;
1272     cache.cachehits = 0;
1273     cache.lookups = 0;
1274     cache.acache = NULL;
1275     cache.haveaccess = NULL;
1276
1277     /* struct copyin done during dispatch... need to copy file_id array separately */
1278     if (ap->a_data == NULL) {
1279         error = EINVAL;
1280         goto err_exit_bulk_access;
1281     }
1282
1283     if (is64bit) {
1284         if (arg_size != sizeof(struct user64_ext_access_t)) {
1285             error = EINVAL;
1286             goto err_exit_bulk_access;
1287         }
1288
1289         user_access_structp = (struct user64_ext_access_t *)ap->a_data;
1290
1291     } else if (arg_size == sizeof(struct user32_access_t)) {
1292         struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
1293
1294         // convert an old style bulk-access struct to the new style
1295         tmp_user_access.flags     = accessp->flags;
1296         tmp_user_access.num_files = accessp->num_files;
1297         tmp_user_access.map_size  = 0;
1298         tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
1299         tmp_user_access.bitmap    = USER_ADDR_NULL;
1300         tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
1301         tmp_user_access.num_parents = 0;
1302         user_access_structp = &tmp_user_access;
1303
1304     } else if (arg_size == sizeof(struct user32_ext_access_t)) {
1305         struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
1306
1307         // up-cast from a 32-bit version of the struct
1308         tmp_user_access.flags     = accessp->flags;
1309         tmp_user_access.num_files = accessp->num_files;
1310         tmp_user_access.map_size  = accessp->map_size;
1311         tmp_user_access.num_parents  = accessp->num_parents;
1312
1313         tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
1314         tmp_user_access.bitmap    = CAST_USER_ADDR_T(accessp->bitmap);
1315         tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
1316         tmp_user_access.parents    = CAST_USER_ADDR_T(accessp->parents);
1317
1318         user_access_structp = &tmp_user_access;
1319     } else {
1320         error = EINVAL;
1321         goto err_exit_bulk_access;
1322     }
1323
1324     map_size = user_access_structp->map_size;
1325
1326     num_files = user_access_structp->num_files;
1327
1328     num_parents= user_access_structp->num_parents;
1329
1330     if (num_files < 1) {
1331         goto err_exit_bulk_access;
1332     }
1333     if (num_files > 1024) {
1334         error = EINVAL;
1335         goto err_exit_bulk_access;
1336     }
1337
1338     if (num_parents > 1024) {
1339         error = EINVAL;
1340         goto err_exit_bulk_access;
1341     }
1342
1343     file_ids = hfs_malloc(sizeof(int) * num_files);
1344     access = hfs_malloc(sizeof(short) * num_files);
1345     if (map_size) {
1346                 bitmap = hfs_mallocz(sizeof(char) * map_size);
1347     }
1348
1349     if (num_parents) {
1350                 parents = hfs_malloc(sizeof(cnid_t) * num_parents);
1351     }
1352
1353     cache.acache = hfs_malloc(sizeof(int) * NUM_CACHE_ENTRIES);
1354     cache.haveaccess = hfs_malloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1355
1356     if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1357                 num_files * sizeof(int)))) {
1358         goto err_exit_bulk_access;
1359     }
1360
1361     if (num_parents) {
1362         if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1363                     num_parents * sizeof(cnid_t)))) {
1364             goto err_exit_bulk_access;
1365         }
1366     }
1367
1368     flags = user_access_structp->flags;
1369     if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1370         flags = R_OK;
1371     }
1372
1373     /* check if we've been passed leaf node ids or parent ids */
1374     if (flags & PARENT_IDS_FLAG) {
1375         check_leaf = false;
1376     }
1377
1378     /* Check access to each file_id passed in */
1379     for (i = 0; i < num_files; i++) {
1380         leaf_index=-1;
1381         cnid = (cnid_t) file_ids[i];
1382
1383         /* root always has access */
1384         if ((!parents) && (!suser(cred, NULL))) {
1385             access[i] = 0;
1386             continue;
1387         }
1388
1389         if (check_leaf) {
1390             /* do the lookup (checks the cnode hash, then the catalog) */
1391             error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
1392             if (error) {
1393                 access[i] = (short) error;
1394                 continue;
1395             }
1396
1397             if (parents) {
1398                 // Check if the leaf matches one of the parent scopes
1399                 leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1400                 if (leaf_index >= 0 && parents[leaf_index] == cnid)
1401                     prev_parent_check_ok = 0;
1402                 else if (leaf_index >= 0)
1403                     prev_parent_check_ok = 1;
1404             }
1405
1406             // if the thing has acl's, do the full permission check
1407             if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1408                 struct vnode *cvp;
1409                 int myErr = 0;
1410                 /* get the vnode for this cnid */
1411                 myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
1412                 if ( myErr ) {
1413                     access[i] = myErr;
1414                     continue;
1415                 }
1416
1417                 hfs_unlock(VTOC(cvp));
1418
1419                 if (vnode_vtype(cvp) == VDIR) {
1420                     myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1421                 } else {
1422                     myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1423                 }
1424
1425                 vnode_put(cvp);
1426                 if (myErr) {
1427                     access[i] = myErr;
1428                     continue;
1429                 }
1430             } else {
1431                 /* before calling CheckAccess(), check the target file for read access */
1432                 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1433                     cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1434
1435                 /* fail fast if no access */
1436                 if ((myPerms & flags) == 0) {
1437                     access[i] = EACCES;
1438                     continue;
1439                 }
1440             }
1441         } else {
1442             /* we were passed an array of parent ids */
1443             catkey.hfsPlus.parentID = cnid;
1444         }
1445
1446         /* if the last guy had the same parent and had access, we're done */
1447         if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
1448             cache.cachehits++;
1449             access[i] = 0;
1450             continue;
1451         }
1452
1453         myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1454             skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
1455
1456         if (myaccess || (error == ESRCH && leaf_index != -1)) {
1457             access[i] = 0; // have access.. no errors to report
1458         } else {
1459             access[i] = (error != 0 ? (short) error : EACCES);
1460         }
1461
1462         prevParent_cnid = catkey.hfsPlus.parentID;
1463     }
1464
1465     /* copyout the access array */
1466     if ((error = copyout((caddr_t)access, user_access_structp->access,
1467                 num_files * sizeof (short)))) {
1468         goto err_exit_bulk_access;
1469     }
1470     if (map_size && bitmap) {
1471         if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1472                     map_size * sizeof (char)))) {
1473             goto err_exit_bulk_access;
1474         }
1475     }
1476
1477
1478   err_exit_bulk_access:
1479
1480         hfs_free(file_ids, sizeof(int) * num_files);
1481         hfs_free(parents, sizeof(cnid_t) * num_parents);
1482         hfs_free(bitmap, sizeof(char) * map_size);
1483         hfs_free(access, sizeof(short) * num_files);
1484         hfs_free(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1485         hfs_free(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1486
1487     return (error);
1488 }
1489
1490
1491 /* end "bulk-access" support */
1492
1493
1494 /*
1495  * Control filesystem operating characteristics.
1496  */
1497 int
1498 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1499                 vnode_t a_vp;
1500                 long  a_command;
1501                 caddr_t  a_data;
1502                 int  a_fflag;
1503                 vfs_context_t a_context;
1504         } */ *ap)
1505 {
1506         struct vnode * vp = ap->a_vp;
1507         struct hfsmount *hfsmp = VTOHFS(vp);
1508         vfs_context_t context = ap->a_context;
1509         kauth_cred_t cred = vfs_context_ucred(context);
1510         proc_t p = vfs_context_proc(context);
1511         struct vfsstatfs *vfsp;
1512         boolean_t is64bit;
1513         off_t jnl_start, jnl_size;
1514         struct hfs_journal_info *jip;
1515 #if HFS_COMPRESSION
1516         int compressed = 0;
1517         off_t uncompressed_size = -1;
1518         int decmpfs_error = 0;
1519
1520         if (ap->a_command == F_RDADVISE) {
1521                 /* we need to inspect the decmpfs state of the file as early as possible */
1522                 compressed = hfs_file_is_compressed(VTOC(vp), 0);
1523                 if (compressed) {
1524                         if (VNODE_IS_RSRC(vp)) {
1525                                 /* if this is the resource fork, treat it as if it were empty */
1526                                 uncompressed_size = 0;
1527                         } else {
1528                                 decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
1529                                 if (decmpfs_error != 0) {
1530                                         /* failed to get the uncompressed size, we'll check for this later */
1531                                         uncompressed_size = -1;
1532                                 }
1533                         }
1534                 }
1535         }
1536 #endif /* HFS_COMPRESSION */
1537
1538         is64bit = proc_is64bit(p);
1539
1540 #if CONFIG_PROTECT
1541 #if HFS_CONFIG_KEY_ROLL
1542         // The HFSIOC_KEY_ROLL fsctl does its own access checks
1543         if (ap->a_command != HFSIOC_KEY_ROLL)
1544 #endif
1545         {
1546                 int error = 0;
1547                 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
1548                         return error;
1549                 }
1550         }
1551 #endif /* CONFIG_PROTECT */
1552
1553         switch (ap->a_command) {
1554
1555         case HFSIOC_GETPATH:
             /*
              * HFSIOC_GETPATH: build the path of the object with a given catalog
              * node ID.
              *
              * ap->a_data is used in both directions: on entry it is parsed with
              * strtoul() as a base-10 string giving the cnid; build_path() is then
              * handed the same buffer (size limit sizeof(pathname_t)) for its output.
              * The HFS_GETPATH_VOLUME_RELATIVE option is taken from ap->a_fflag.
              *
              * Returns EACCES unless suser(cred, NULL) returns 0 or the caller's uid
              * equals the volume's f_owner; EINVAL if vp is not the volume root;
              * otherwise the result of hfs_vfs_vget() (on failure) or build_path().
              */
1556         {
1557                 struct vnode *file_vp;
1558                 cnid_t  cnid;
1559                 int  outlen;            /* filled in by build_path(); not read afterwards in this case */
1560                 char *bufptr;
1561                 int error;
1562                 int flags = 0;
1563
1564                 /* Caller must be superuser or the owner of the file system. */
1565                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1566                 if (suser(cred, NULL) &&
1567                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1568                         return (EACCES);
1569                 }
1570                 /* Target vnode must be file system's root. */
1571                 if (!vnode_isvroot(vp)) {
1572                         return (EINVAL);
1573                 }
1574                 bufptr = (char *)ap->a_data;
1575                 cnid = strtoul(bufptr, NULL, 10);       /* no end-pointer check: parsing stops at the first non-digit */
1576                 if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
1577                         flags |= BUILDPATH_VOLUME_RELATIVE;
1578                 }
1579
1580                 /* We need to call hfs_vfs_vget to leverage the code that will
1581                  * fix the origin list for us if needed, as opposed to calling
1582                  * hfs_vget, since we will need the parent for build_path call.
1583                  */
1584
1585                 if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1586                         return (error);
1587                 }
1588
1589                 error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context);
1590                 vnode_put(file_vp);     /* release file_vp (from hfs_vfs_vget above) whether or not build_path succeeded */
1591
1592                 return (error);
1593         }
1594
1595         case HFSIOC_SET_MAX_DEFRAG_SIZE:
             /*
              * HFSIOC_SET_MAX_DEFRAG_SIZE: store a new value in hfsmp->hfs_defrag_max.
              *
              * Input: ap->a_data is read as a u_int32_t.
              * Returns EROFS if vnode_vfsisrdonly(vp), EACCES if the caller fails
              * kauth_cred_issuser(), EINVAL if the value exceeds HFS_MAX_DEFRAG_SIZE
              * (hfs_defrag_max is then left unchanged), 0 otherwise.
              */
1596         {
1597                 int error = 0;          /* Assume success */
1598                 u_int32_t maxsize = 0;
1599
1600                 if (vnode_vfsisrdonly(vp)) {
1601                         return (EROFS);
1602                 }
1603                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));     /* assigned but not read in this case */
1604                 if (!kauth_cred_issuser(cred)) {
1605                         return (EACCES); /* must be root */
1606                 }
1607
1608                 maxsize = *(u_int32_t *)ap->a_data;
1609
1610                 hfs_lock_mount(hfsmp);  /* range check and store are both done under the mount lock */
1611                 if (maxsize > HFS_MAX_DEFRAG_SIZE) {
1612                         error = EINVAL;
1613                 }
1614                 else {
1615                         hfsmp->hfs_defrag_max = maxsize;
1616                 }
1617                 hfs_unlock_mount(hfsmp);
1618
1619                 return (error);
1620         }
1621
1622         case HFSIOC_FORCE_ENABLE_DEFRAG:
             /*
              * HFSIOC_FORCE_ENABLE_DEFRAG: set hfsmp->hfs_defrag_nowait to 1.
              *
              * Input: ap->a_data is read as a u_int32_t.  A non-zero value sets the
              * flag; zero is rejected with EINVAL, so this command can only turn the
              * flag on, never off.
              * Returns EROFS if vnode_vfsisrdonly(vp), EACCES if the caller fails
              * kauth_cred_issuser(), EINVAL for a zero argument, 0 otherwise.
              */
1623         {
1624                 int error = 0;          /* Assume success */
1625                 u_int32_t do_enable = 0;
1626
1627                 if (vnode_vfsisrdonly(vp)) {
1628                         return (EROFS);
1629                 }
1630                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));     /* assigned but not read in this case */
1631                 if (!kauth_cred_issuser(cred)) {
1632                         return (EACCES); /* must be root */
1633                 }
1634
1635                 do_enable = *(u_int32_t *)ap->a_data;
1636
1637                 hfs_lock_mount(hfsmp);
1638                 if (do_enable != 0) {
1639                         hfsmp->hfs_defrag_nowait = 1;
1640                 }
1641                 else {
1642                         error = EINVAL;
1643                 }
1644
1645                 hfs_unlock_mount(hfsmp);
1646
1647                 return (error);
1648         }
1649
1650
1651         case HFSIOC_TRANSFER_DOCUMENT_ID:
             /*
              * HFSIOC_TRANSFER_DOCUMENT_ID: move the document-id from this vnode's
              * cnode (cp) to the cnode behind a second, already-open file (to_cp).
              *
              * Input: ap->a_data is read as a u_int32_t file descriptor naming the
              * destination.
              *
              * Outcome:
              *   - error from fp_getfvp() / vnode_getwithref() / hfs_lockpair() /
              *     hfs_generate_document_id() / hfs_start_transaction() is returned
              *     as-is
              *   - EXDEV  if the destination is not on this HFS mount
              *   - EINVAL if the source does not have UF_TRACKED set
              *   - EEXIST if the destination already has UF_TRACKED set
              *   - 0 with nothing changed if the source is not a directory, regular
              *     file or symlink
              *   - otherwise the id is copied to the destination, zeroed on the
              *     source, UF_TRACKED moves with it, both catalog records are
              *     updated and FSE_DOCID_CHANGED / FSE_STAT_CHANGED events are
              *     posted.
              *
              * Locking invariant: no cnode lock is held when control reaches
              * transfer_cleanup; that label only drops the vnode and file references
              * taken at the top.
              */
1652         {
1653                 struct cnode *cp = NULL;
1654                 int error;
1655                 u_int32_t to_fd = *(u_int32_t *)ap->a_data;
1656                 struct fileproc *to_fp;
1657                 struct vnode *to_vp;
1658                 struct cnode *to_cp;
1659
1660                 cp = VTOC(vp);
1661
1662                 if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
1663                         //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
1664                         return error;
1665                 }
1666                 if ( (error = vnode_getwithref(to_vp)) ) {
1667                         file_drop(to_fd);       /* undo fp_getfvp; transfer_cleanup would also vnode_put, which is not owed yet */
1668                         return error;
1669                 }
1670
1671                 if (VTOHFS(to_vp) != hfsmp) {
1672                         error = EXDEV;
1673                         goto transfer_cleanup;
1674                 }
1675
1676                 int need_unlock = 1;    /* cleared once the success path has already dropped the pair lock */
1677                 to_cp = VTOC(to_vp);
1678                 error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1679                 if (error != 0) {
1680                         //printf("could not lock the pair of cnodes (error %d)\n", error);
1681                         goto transfer_cleanup;
1682                 }
1683
1684                 if (!(cp->c_bsdflags & UF_TRACKED)) {
1685                         error = EINVAL;
1686                 } else if (to_cp->c_bsdflags & UF_TRACKED) {
1687                         //
1688                         // if the destination is already tracked, return an error
1689                         // as otherwise it's a silent deletion of the target's
1690                         // document-id
1691                         //
1692                         error = EEXIST;
1693                 } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
1694                         //
1695                         // we can use the FndrExtendedFileInfo because the doc-id is the first
1696                         // thing in both it and the ExtendedDirInfo struct which is fixed in
1697                         // format and can not change layout
1698                         //
1699                         struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
1700                         struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16);
1701
1702                         if (f_extinfo->document_id == 0) {
1703                                 uint32_t new_id;
1704
1705                                 hfs_unlockpair(cp, to_cp);  // have to unlock to be able to get a new-id
1706
1707                                 if ((error = hfs_generate_document_id(hfsmp, &new_id)) != 0) {
1708                                         goto transfer_cleanup;  // pair is already unlocked here
1709                                 }
1710                                 // re-lock the pair now that we have the document-id; as with the first
1711                                 // hfs_lockpair above, a failure leaves nothing locked, so skip the unlock
1712                                 if ((error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK)) != 0) {
1713                                         goto transfer_cleanup;
1714                                 }
1715                                 f_extinfo->document_id = new_id;
                                     // NOTE(review): the UF_TRACKED checks above are not repeated after
                                     // the re-lock - confirm nothing can change them in the unlocked window
1716                         }
1717
1718                         to_extinfo->document_id = f_extinfo->document_id;
1719                         f_extinfo->document_id = 0;
1720                         //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
1721
1722                         // make sure the destination is also UF_TRACKED
1723                         to_cp->c_bsdflags |= UF_TRACKED;
1724                         cp->c_bsdflags &= ~UF_TRACKED;
1725
1726                         // mark the cnodes dirty
1727                         cp->c_flag |= C_MODIFIED;
1728                         to_cp->c_flag |= C_MODIFIED;
1729
1730                         int lockflags;
                             // if the transaction cannot be started, the catalog update is skipped and
                             // that error becomes the return value; the in-memory changes above stay
1731                         if ((error = hfs_start_transaction(hfsmp)) == 0) {
1732
1733                                 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
1734
1735                                 (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
1736                                 (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);
1737
1738                                 hfs_systemfile_unlock (hfsmp, lockflags);
1739                                 (void) hfs_end_transaction(hfsmp);
1740                         }
1741
1742                         add_fsevent(FSE_DOCID_CHANGED, context,
1743                                     FSE_ARG_DEV,   hfsmp->hfs_raw_dev,
1744                                     FSE_ARG_INO,   (ino64_t)cp->c_fileid,       // src inode #
1745                                     FSE_ARG_INO,   (ino64_t)to_cp->c_fileid,    // dst inode #
1746                                     FSE_ARG_INT32, to_extinfo->document_id,
1747                                     FSE_ARG_DONE);
1748
1749                         hfs_unlockpair(cp, to_cp);    // unlock this so we can send the fsevents
1750                         need_unlock = 0;
1751
1752                         if (need_fsevent(FSE_STAT_CHANGED, vp)) {
1753                                 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
1754                         }
1755                         if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
1756                                 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
1757                         }
1758                 }
1759
                     // EINVAL / EEXIST / unsupported-type paths arrive here still holding the pair lock
1760                 if (need_unlock) {
1761                         hfs_unlockpair(cp, to_cp);
1762                 }
1763
1764         transfer_cleanup:
1765                 vnode_put(to_vp);
1766                 file_drop(to_fd);
1767
1768                 return error;
1769         }
1770
1771
1772
1773         case HFSIOC_PREV_LINK:
1774         case HFSIOC_NEXT_LINK:
             /*
              * HFSIOC_PREV_LINK / HFSIOC_NEXT_LINK: look up a sibling of a link.
              *
              * ap->a_data is in/out: on entry it is read as the cnid_t of the link
              * to start from; on success it is overwritten with the next-link id
              * (HFSIOC_NEXT_LINK) or the previous-link id (HFSIOC_PREV_LINK) that
              * hfs_lookup_siblinglinks() reported.
              *
              * Returns EACCES unless suser(cred, NULL) returns 0 or the caller's uid
              * equals the volume's f_owner; EINVAL if vp is not the volume root or
              * the id is below kHFSFirstUserCatalogNodeID; the lookup's error on
              * failure; 0 on success.
              */
1775         {
1776                 cnid_t linkfileid;
1777                 cnid_t nextlinkid;
1778                 cnid_t prevlinkid;
1779                 int error;
1780
1781                 /* Caller must be superuser or the owner of the file system. */
1782                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1783                 if (suser(cred, NULL) &&
1784                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1785                         return (EACCES);
1786                 }
1787                 /* Target vnode must be file system's root. */
1788                 if (!vnode_isvroot(vp)) {
1789                         return (EINVAL);
1790                 }
1791                 linkfileid = *(cnid_t *)ap->a_data;
1792                 if (linkfileid < kHFSFirstUserCatalogNodeID) {
1793                         return (EINVAL);
1794                 }
1795                 if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1796                         return (error);
1797                 }
                     /* both commands share the lookup; only the value handed back differs */
1798                 if (ap->a_command == HFSIOC_NEXT_LINK) {
1799                         *(cnid_t *)ap->a_data = nextlinkid;
1800                 } else {
1801                         *(cnid_t *)ap->a_data = prevlinkid;
1802                 }
1803                 return (0);
1804         }
1805
1806         case HFSIOC_RESIZE_PROGRESS: {
             /*
              * HFSIOC_RESIZE_PROGRESS: after the permission and state checks below,
              * hand ap->a_data (as a u_int32_t *) to hfs_resize_progress() and return
              * its result.
              *
              * Returns EACCES unless suser(cred, NULL) returns 0 or the caller's uid
              * equals the volume's f_owner; EINVAL if vp is not the volume root;
              * EROFS if the mount has HFS_READ_ONLY set.
              */
1807
1808                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1809                 if (suser(cred, NULL) &&
1810                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1811                         return (EACCES); /* must be owner of file system */
1812                 }
1813                 if (!vnode_isvroot(vp)) {
1814                         return (EINVAL);
1815                 }
1816                 /* file system must not be mounted read-only */
1817                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1818                         return (EROFS);
1819                 }
1820
1821                 return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1822         }
1823
1824         case HFSIOC_RESIZE_VOLUME: {
             /*
              * HFSIOC_RESIZE_VOLUME: grow or shrink the volume to a requested size.
              *
              * Input: ap->a_data is read as a u_int64_t and compared against
              * totalBlocks * blockSize.  An equal size returns 0 immediately without
              * any mount-change notification.  Otherwise hfs_extendfs() (larger) or
              * hfs_truncatefs() (smaller) is called, bracketed by
              * kIOMountChangeWillResize / kIOMountChangeDidResize notifications, and
              * its result is returned.
              *
              * Returns EACCES unless suser(cred, NULL) returns 0 or the caller's uid
              * equals the volume's f_owner; EINVAL if vp is not the volume root;
              * EROFS if the mount has HFS_READ_ONLY set.
              */
1825                 u_int64_t newsize;
1826                 u_int64_t cursize;
1827                 int ret;
1828
1829                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1830                 if (suser(cred, NULL) &&
1831                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1832                         return (EACCES); /* must be owner of file system */
1833                 }
1834                 if (!vnode_isvroot(vp)) {
1835                         return (EINVAL);
1836                 }
1837
1838                 /* filesystem must not be mounted read only */
1839                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1840                         return (EROFS);
1841                 }
1842                 newsize = *(u_int64_t *)ap->a_data;
                     /* both factors are widened to u_int64_t before the multiply */
1843                 cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1844
1845                 if (newsize == cursize) {
1846                         return (0);
1847                 }
1848                 IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeWillResize);
1849                 if (newsize > cursize) {
1850                         ret = hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1851                 } else {
1852                         ret = hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1853                 }
                     /* the "did resize" notification is sent whether or not the resize succeeded */
1854                 IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeDidResize);
1855                 return (ret);
1856         }
1857         case HFSIOC_CHANGE_NEXT_ALLOCATION: {
             /*
              * HFSIOC_CHANGE_NEXT_ALLOCATION: change hfsmp->nextAllocation.
              *
              * ap->a_data is in/out (u_int32_t): on entry the requested block
              * number, or the magic value HFS_NO_UPDATE_NEXT_ALLOCATION; on success
              * it is overwritten with the previous nextAllocation.
              *
              * Returns EROFS if vnode_vfsisrdonly(vp); EACCES unless
              * suser(cred, NULL) returns 0 or the caller's uid equals the volume's
              * f_owner; EINVAL if vp is not the volume root or the location is at or
              * beyond allocLimit (and is not the magic value); 0 otherwise.
              */
1858                 int error = 0;          /* Assume success */
1859                 u_int32_t location;
1860
1861                 if (vnode_vfsisrdonly(vp)) {
1862                         return (EROFS);
1863                 }
1864                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1865                 if (suser(cred, NULL) &&
1866                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1867                         return (EACCES); /* must be owner of file system */
1868                 }
1869                 if (!vnode_isvroot(vp)) {
1870                         return (EINVAL);
1871                 }
                     /* everything from here to the label runs under the mount lock */
1872                 hfs_lock_mount(hfsmp);
1873                 location = *(u_int32_t *)ap->a_data;
1874                 if ((location >= hfsmp->allocLimit) &&
1875                         (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1876                         error = EINVAL;
1877                         goto fail_change_next_allocation;
1878                 }
1879                 /* Return previous value. */
1880                 *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1881                 if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1882                         /* On magic value for location, set nextAllocation to next block
1883                          * after metadata zone and set flag in mount structure to indicate
1884                          * that nextAllocation should not be updated again.
1885                          */
1886                         if (hfsmp->hfs_metazone_end != 0) {
1887                                 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1888                         }
1889                         hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1890                 } else {
                             /* an explicit location also clears any earlier "skip update" request */
1891                         hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1892                         HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1893                 }
1894                 MarkVCBDirty(hfsmp);
1895 fail_change_next_allocation:
1896                 hfs_unlock_mount(hfsmp);
1897                 return (error);
1898         }
1899
1900 #if HFS_SPARSE_DEV
1901         case HFSIOC_SETBACKINGSTOREINFO: {
             /*
              * HFSIOC_SETBACKINGSTOREINFO: record the vnode that backs this volume
              * and mark the mount as having a sparse device.
              *
              * Input: ap->a_data points to a struct hfs_backingstoreinfo; its
              * backingfd names the backing file and bandsize feeds
              * hfs_sparsebandblks.
              *
              * Returns EROFS if the mount has HFS_READ_ONLY set; EALREADY if
              * HFS_HAS_SPARSE_DEVICE is already set; EACCES unless
              * suser(cred, NULL) returns 0 or the caller's uid equals the volume's
              * f_owner; EINVAL if a_data is NULL or the backing vnode lives on the
              * same mount as vp; the error from file_vnode() / vnode_getwithref();
              * 0 on success.
              *
              * NOTE(review): the HFS_HAS_SPARSE_DEVICE test is made before the
              * mount lock is taken - confirm callers are serialized elsewhere.
              */
1902                 struct vnode * di_vp;
1903                 struct hfs_backingstoreinfo *bsdata;
1904                 int error = 0;
1905
1906                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1907                         return (EROFS);
1908                 }
1909                 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1910                         return (EALREADY);
1911                 }
1912                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1913                 if (suser(cred, NULL) &&
1914                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1915                         return (EACCES); /* must be owner of file system */
1916                 }
1917                 bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1918                 if (bsdata == NULL) {
1919                         return (EINVAL);
1920                 }
1921                 if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1922                         return (error);
1923                 }
1924                 if ((error = vnode_getwithref(di_vp))) {
1925                         file_drop(bsdata->backingfd);
1926                         return(error);
1927                 }
1928
                     /* the backing file must not live on the volume it is backing */
1929                 if (vnode_mount(vp) == vnode_mount(di_vp)) {
1930                         (void)vnode_put(di_vp);
1931                         file_drop(bsdata->backingfd);
1932                         return (EINVAL);
1933                 }
1934
1935                 // Dropped in unmount
                     // (HFSIOC_CLRBACKINGSTOREINFO below also calls vnode_rele() on hfs_backingvp)
1936                 vnode_ref(di_vp);
1937
1938                 hfs_lock_mount(hfsmp);
1939                 hfsmp->hfs_backingvp = di_vp;
1940                 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1941                 hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4;
1942                 hfs_unlock_mount(hfsmp);
1943
1944                 /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
1945
1946                 /*
1947                  * If the sparse image is on a sparse image file (as opposed to a sparse
1948                  * bundle), then we may need to limit the free space to the maximum size
1949                  * of a file on that volume.  So we query (using pathconf), and if we get
1950                  * a meaningful result, we cache the number of blocks for later use in
1951                  * hfs_freeblks().
1952                  */
                     /* NOTE(review): hfs_backingfs_maxblocks is written after the mount lock was dropped - confirm that is intended */
1953                 hfsmp->hfs_backingfs_maxblocks = 0;
1954                 if (vnode_vtype(di_vp) == VREG) {
1955                         int terr;
1956                         int hostbits;
1957                         terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
                             /* hostbits must be in 1..63 so the shift below stays within a u_int64_t */
1958                         if (terr == 0 && hostbits != 0 && hostbits < 64) {
1959                                 u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
1960
1961                                 hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
1962                         }
1963                 }
1964
1965                 /* The free extent cache is managed differently for sparse devices.
1966                  * There is a window between which the volume is mounted and the
1967                  * device is marked as sparse, so the free extent cache for this
1968                  * volume is currently initialized as normal volume (sorted by block
1969                  * count).  Reset the cache so that it will be rebuilt again
1970                  * for sparse device (sorted by start block).
1971                  */
1972                 ResetVCBFreeExtCache(hfsmp);
1973
                     /* drop the references from vnode_getwithref() / file_vnode(); the vnode_ref() taken above is kept */
1974                 (void)vnode_put(di_vp);
1975                 file_drop(bsdata->backingfd);
1976                 return (0);
1977         }
1978
1979         case HFSIOC_CLRBACKINGSTOREINFO: {
             /*
              * HFSIOC_CLRBACKINGSTOREINFO: forget the backing vnode recorded by
              * HFSIOC_SETBACKINGSTOREINFO.
              *
              * If HFS_HAS_SPARSE_DEVICE is set and hfs_backingvp is non-NULL, the
              * flag is cleared, hfs_backingvp is set to NULLVP and
              * hfs_sparsebandblks to 0 under the mount lock, and the old vnode is
              * released with vnode_rele().  Otherwise nothing is changed.
              *
              * Returns EACCES unless suser(cred, NULL) returns 0 or the caller's uid
              * equals the volume's f_owner; EROFS if the mount has HFS_READ_ONLY
              * set; 0 in every other case, including when there was nothing to clear.
              */
1980                 struct vnode * tmpvp;
1981
1982                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1983                 if (suser(cred, NULL) &&
1984                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1985                         return (EACCES); /* must be owner of file system */
1986                 }
1987                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1988                         return (EROFS);
1989                 }
1990
1991                 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
1992                     hfsmp->hfs_backingvp) {
1993
1994                         hfs_lock_mount(hfsmp);
1995                         hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
1996                         tmpvp = hfsmp->hfs_backingvp;
1997                         hfsmp->hfs_backingvp = NULLVP;
1998                         hfsmp->hfs_sparsebandblks = 0;
1999                         hfs_unlock_mount(hfsmp);
2000
                             /* released only after the mount lock has been dropped */
2001                         vnode_rele(tmpvp);
2002                 }
2003                 return (0);
2004         }
2005 #endif /* HFS_SPARSE_DEV */
2006
2007         /* Change the next CNID stored in the VH */
2008         case HFSIOC_CHANGE_NEXTCNID: {
             /*
              * HFSIOC_CHANGE_NEXTCNID: overwrite hfsmp->vcbNxtCNID.
              *
              * ap->a_data is in/out (u_int32_t): on entry the new next-CNID value;
              * on return the value vcbNxtCNID held before the change.  If the new
              * value is lower than the current one, kHFSCatalogNodeIDsReusedMask is
              * also set in vcbAtrb.
              *
              * Returns EROFS if vnode_vfsisrdonly(vp); EACCES unless
              * suser(cred, NULL) returns 0 or the caller's uid equals the volume's
              * f_owner; otherwise 0 (`error` is never changed after its
              * initialisation).  The supplied value is not range-checked here.
              */
2009                 int error = 0;          /* Assume success */
2010                 u_int32_t fileid;
2011                 int wraparound = 0;
2012                 int lockflags = 0;
2013
2014                 if (vnode_vfsisrdonly(vp)) {
2015                         return (EROFS);
2016                 }
2017                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2018                 if (suser(cred, NULL) &&
2019                         kauth_cred_getuid(cred) != vfsp->f_owner) {
2020                         return (EACCES); /* must be owner of file system */
2021                 }
2022
2023                 fileid = *(u_int32_t *)ap->a_data;
2024
2025                 /* Must have catalog lock excl. to advance the CNID pointer */
2026                 lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK);
2027
                     /* lock order: catalog system-file lock, then mount lock; released in reverse below */
2028                 hfs_lock_mount(hfsmp);
2029
2030                 /* If it is less than the current next CNID, force the wraparound bit to be set */
2031                 if (fileid < hfsmp->vcbNxtCNID) {
2032                         wraparound=1;
2033                 }
2034
2035                 /* Return previous value. */
2036                 *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
2037
2038                 hfsmp->vcbNxtCNID = fileid;
2039
2040                 if (wraparound) {
2041                         hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
2042                 }
2043
2044                 MarkVCBDirty(hfsmp);
2045                 hfs_unlock_mount(hfsmp);
2046                 hfs_systemfile_unlock (hfsmp, lockflags);
2047
2048                 return (error);
2049         }
2050
2051         case F_FREEZE_FS: {
             /*
              * F_FREEZE_FS: call hfs_freeze() on the volume that vp belongs to and
              * return its result.
              *
              * Returns ENOTSUP if the mount has no journal (hfsmp->jnl is NULL);
              * EACCES unless the caller's uid equals the volume's f_owner or
              * kauth_cred_issuser(cred) is true.
              */
2052                 struct mount *mp;
2053
2054                 mp = vnode_mount(vp);
2055                 hfsmp = VFSTOHFS(mp);   /* re-derived from the vnode's mount for this case */
2056
2057                 if (!(hfsmp->jnl))
2058                         return (ENOTSUP);
2059
2060                 vfsp = vfs_statfs(mp);
2061
2062                 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2063                         !kauth_cred_issuser(cred))
2064                         return (EACCES);
2065
2066                 return hfs_freeze(hfsmp);
2067         }
2068
2069         case F_THAW_FS: {
             /*
              * F_THAW_FS: call hfs_thaw(hfsmp, current_proc()) and return its result.
              *
              * Returns EACCES unless the caller's uid equals the volume's f_owner or
              * kauth_cred_issuser(cred) is true.  Unlike F_FREEZE_FS above, no
              * journal check is made in this case.
              */
2070                 vfsp = vfs_statfs(vnode_mount(vp));
2071                 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2072                         !kauth_cred_issuser(cred))
2073                         return (EACCES);
2074
2075                 return hfs_thaw(hfsmp, current_proc());
2076         }
2077
2078         case HFSIOC_EXT_BULKACCESS32:
2079         case HFSIOC_EXT_BULKACCESS64: {
             /*
              * HFSIOC_EXT_BULKACCESS32 / HFSIOC_EXT_BULKACCESS64: forward to
              * do_bulk_access_check() and return its result.
              *
              * The structure size passed along is chosen from is64bit (the calling
              * process's bitness, computed before the switch), not from which of the
              * two command values was used.
              *
              * Returns EINVAL on an HFS_STANDARD volume when CONFIG_HFS_STD is built in.
              */
2080             int size;
2081 #if CONFIG_HFS_STD
2082             if (hfsmp->hfs_flags & HFS_STANDARD) {
2083                         return EINVAL;
2084             }
2085 #endif
2086
2087             if (is64bit) {
2088                 size = sizeof(struct user64_ext_access_t);
2089             } else {
2090                 size = sizeof(struct user32_ext_access_t);
2091             }
2092
2093             return do_bulk_access_check(hfsmp, vp, ap, size, context);
2094         }
2095
2096         case HFSIOC_SET_XATTREXTENTS_STATE: {
             /*
              * HFSIOC_SET_XATTREXTENTS_STATE: enable (1) or disable (0) extent-based
              * extended attributes via hfs_set_volxattr() and return its result.
              *
              * Input: ap->a_data is read as an int.
              * Returns EINVAL if a_data is NULL or the value is neither 0 nor 1;
              * EROFS if the mount has HFS_READ_ONLY set; EPERM if the caller is not
              * superuser.
              */
2097                 int state;
2098
2099                 if (ap->a_data == NULL) {
2100                         return (EINVAL);
2101                 }
2102
2103                 state = *(int *)ap->a_data;
2104
2105                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2106                         return (EROFS);
2107                 }
2108
2109                 /* Super-user can enable or disable extent-based extended
2110                  * attribute support on a volume
2111                  * Note: Starting Mac OS X 10.7, extent-based extended attributes
2112                  * are enabled by default, so any change will be transient only
2113                  * till the volume is remounted.
2114                  */
                     /* this check uses kauth_cred_get() rather than the `cred` variable the other cases test */
2115                 if (!kauth_cred_issuser(kauth_cred_get())) {
2116                         return (EPERM);
2117                 }
2118                 if (state == 0 || state == 1)
2119                         return hfs_set_volxattr(hfsmp, HFSIOC_SET_XATTREXTENTS_STATE, state);
2120                 else
2121                         return (EINVAL);
2122         }
2123
2124         case F_SETSTATICCONTENT: {
             /*
              * F_SETSTATICCONTENT: set or clear C_SSD_STATIC in the cnode's c_flag.
              *
              * Input: ap->a_data is treated as a boolean, not dereferenced:
              * non-NULL sets the flag, NULL clears it.
              * Returns EROFS if the mount has HFS_READ_ONLY set; the error from
              * hfs_lock() if the cnode could not be locked (flag left untouched);
              * 0 otherwise.
              */
2125                 int error;
2126                 int enable_static = 0;
2127                 struct cnode *cp = NULL;
2128                 /*
2129                  * lock the cnode, decorate the cnode flag, and bail out.
2130                  * VFS should have already authenticated the caller for us.
2131                  */
2132
2133                 if (ap->a_data) {
2134                         /*
2135                          * Note that even though ap->a_data is of type caddr_t,
2136                          * the fcntl layer at the syscall handler will pass in NULL
2137                          * or 1 depending on what the argument supplied to the fcntl
2138                          * was.  So it is in fact correct to check the ap->a_data
2139                          * argument for zero or non-zero value when deciding whether or not
2140                          * to enable the static bit in the cnode.
2141                          */
2142                         enable_static = 1;
2143                 }
2144                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2145                         return EROFS;
2146                 }
2147                 cp = VTOC(vp);
2148
2149                 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2150                 if (error == 0) {
2151                         if (enable_static) {
2152                                 cp->c_flag |= C_SSD_STATIC;
2153                         }
2154                         else {
2155                                 cp->c_flag &= ~C_SSD_STATIC;
2156                         }
2157                         hfs_unlock (cp);
2158                 }
2159                 return error;
2160         }
2161
2162         case F_SET_GREEDY_MODE: {
             /*
              * F_SET_GREEDY_MODE: set or clear C_SSD_GREEDY_MODE in the cnode's
              * c_flag.
              *
              * Input: ap->a_data is treated as a boolean, not dereferenced:
              * non-NULL sets the flag, NULL clears it.
              * Returns EROFS if the mount has HFS_READ_ONLY set; the error from
              * hfs_lock() if the cnode could not be locked (flag left untouched);
              * 0 otherwise.
              */
2163                 int error;
2164                 int enable_greedy_mode = 0;
2165                 struct cnode *cp = NULL;
2166                 /*
2167                  * lock the cnode, decorate the cnode flag, and bail out.
2168                  * VFS should have already authenticated the caller for us.
2169                  */
2170
2171                 if (ap->a_data) {
2172                         /*
2173                          * Note that even though ap->a_data is of type caddr_t,
2174                          * the fcntl layer at the syscall handler will pass in NULL
2175                          * or 1 depending on what the argument supplied to the fcntl
2176                          * was.  So it is in fact correct to check the ap->a_data
2177                          * argument for zero or non-zero value when deciding whether or not
2178                          * to enable the greedy mode bit in the cnode.
2179                          */
2180                         enable_greedy_mode = 1;
2181                 }
2182                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2183                         return EROFS;
2184                 }
2185                 cp = VTOC(vp);
2186
2187                 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2188                 if (error == 0) {
2189                         if (enable_greedy_mode) {
2190                                 cp->c_flag |= C_SSD_GREEDY_MODE;
2191                         }
2192                         else {
2193                                 cp->c_flag &= ~C_SSD_GREEDY_MODE;
2194                         }
2195                         hfs_unlock (cp);
2196                 }
2197                 return error;
2198         }
2199
2200         case F_SETIOTYPE: {
             /*
              * F_SETIOTYPE: tag the cnode with an I/O type.
              *
              * Input: ap->a_data is not dereferenced; its pointer value itself
              * carries the flag (only the low 32 bits are used).  The only value
              * accepted is F_IOTYPE_ISOCHRONOUS, which sets C_IO_ISOCHRONOUS in the
              * cnode's c_flag.  This case never clears the flag.
              *
              * Returns EINVAL if a_data is NULL or names any other I/O type; EROFS
              * if the mount has HFS_READ_ONLY set; the error from hfs_lock() if the
              * cnode could not be locked; 0 otherwise.
              */
2201                 int error;
2202                 uint32_t iotypeflag = 0;
2203
2204                 struct cnode *cp = NULL;
2205                 /*
2206                  * lock the cnode, decorate the cnode flag, and bail out.
2207                  * VFS should have already authenticated the caller for us.
2208                  */
2209
2210                 if (ap->a_data == NULL) {
2211                         return EINVAL;
2212                 }
2213
2214                 /*
2215                  * Note that even though ap->a_data is of type caddr_t, we
2216                  * can only use 32 bits of flag values.  Convert through uintptr_t
2217                  * so the pointer is narrowed as an integer, not cast directly.
2218                  */
2218                 iotypeflag = (uint32_t)(uintptr_t) ap->a_data;
                     /* validate before taking any lock */
2219                 switch (iotypeflag) {
2220                         case F_IOTYPE_ISOCHRONOUS:
2221                                 break;
2222                         default:
2223                                 return EINVAL;
2224                 }
2225
2226
2227                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2228                         return EROFS;
2229                 }
2230                 cp = VTOC(vp);
2231
2232                 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2233                 if (error == 0) {
2234                         switch (iotypeflag) {
2235                                 case F_IOTYPE_ISOCHRONOUS:
2236                                         cp->c_flag |= C_IO_ISOCHRONOUS;
2237                                         break;
2238                                 default:
2239                                         break;
2240                         }
2241                         hfs_unlock (cp);
2242                 }
2243                 return error;
2244         }
2245
2246         case F_MAKECOMPRESSED: {
2247                 int error = 0;
2248                 uint32_t gen_counter;
2249                 struct cnode *cp = NULL;
2250                 int reset_decmp = 0;
2251
2252                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2253                         return EROFS;
2254                 }
2255
2256                 /*
2257                  * acquire & lock the cnode.
2258                  * VFS should have already authenticated the caller for us.
2259                  */
2260
2261                 if (ap->a_data) {
2262                         /*
2263                          * Cast the pointer into a uint32_t so we can extract the
2264                          * supplied generation counter.
2265                          */
2266                         gen_counter = *((uint32_t*)ap->a_data);
2267                 }
2268                 else {
2269                         return EINVAL;
2270                 }
2271
2272 #if HFS_COMPRESSION
2273                 cp = VTOC(vp);
2274                 /* Grab truncate lock first; we may truncate the file */
2275                 hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2276
2277                 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2278                 if (error) {
2279                         hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2280                         return error;
2281                 }
2282
2283                 /* Are there any other usecounts/FDs? */
2284                 if (vnode_isinuse(vp, 1)) {
2285                         hfs_unlock(cp);
2286                         hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2287                         return EBUSY;
2288                 }
2289
2290                 /* now we have the cnode locked down; Validate arguments */
2291                 if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
2292                         /* EINVAL if you are trying to manipulate an IMMUTABLE file */
2293                         hfs_unlock(cp);
2294                         hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2295                         return EINVAL;
2296                 }
2297
2298                 if ((hfs_get_gencount (cp)) == gen_counter) {
2299                         /*
2300                          * OK, the gen_counter matched.  Go for it:
2301                          * Toggle state bits, truncate file, and suppress mtime update
2302                          */
2303                         reset_decmp = 1;
2304                         cp->c_bsdflags |= UF_COMPRESSED;
2305
2306                         error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES,
2307                                                                  ap->a_context);
2308                 }
2309                 else {
2310                         error = ESTALE;
2311                 }
2312
2313                 /* Unlock cnode before executing decmpfs ; they may need to get an EA */
2314                 hfs_unlock(cp);
2315
2316                 /*
2317                  * Reset the decmp state while still holding the truncate lock. We need to
2318                  * serialize here against a listxattr on this node which may occur at any
2319                  * time.
2320                  *
2321                  * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed,
2322                  * that will still potentially require getting the com.apple.decmpfs EA. If the
2323                  * EA is required, then we can't hold the cnode lock, because the getxattr call is
2324                  * generic(through VFS), and can't pass along any info telling it that we're already
2325                  * holding it (the lock). If we don't serialize, then we risk listxattr stopping
2326                  * and trying to fill in the hfs_file_is_compressed info during the callback
2327                  * operation, which will result in deadlock against the b-tree node.
2328                  *
2329                  * So, to serialize against listxattr (which will grab buf_t meta references on
2330                  * the b-tree blocks), we hold the truncate lock as we're manipulating the
2331                  * decmpfs payload.
2332                  */
2333                 if ((reset_decmp) && (error == 0)) {
2334                         decmpfs_cnode *dp = VTOCMP (vp);
2335                         if (dp != NULL) {
2336                                 decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
2337                         }
2338
2339                         /* Initialize the decmpfs node as needed */
2340                         (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */
2341                 }
2342
2343                 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2344
2345 #endif
2346                 return error;
2347         }
2348
2349         case F_SETBACKINGSTORE: {
2350
2351                 int error = 0;
2352
2353                 /*
2354                  * See comment in F_SETSTATICCONTENT re: using
2355              * a null check for a_data
2356                  */
2357                 if (ap->a_data) {
2358                         error = hfs_set_backingstore (vp, 1);
2359                 }
2360                 else {
2361                         error = hfs_set_backingstore (vp, 0);
2362                 }
2363
2364                 return error;
2365         }
2366
2367         case F_GETPATH_MTMINFO: {
2368                 int error = 0;
2369
2370                 int *data = (int*) ap->a_data;
2371
2372                 /* Ask if this is a backingstore vnode */
2373                 error = hfs_is_backingstore (vp, data);
2374
2375                 return error;
2376         }
2377
2378         case F_FULLFSYNC: {
2379                 int error;
2380
2381                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2382                         return (EROFS);
2383                 }
2384                 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2385                 if (error == 0) {
2386                         error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_FULL, p);
2387                         hfs_unlock(VTOC(vp));
2388                 }
2389
2390                 return error;
2391         }
2392
2393         case F_BARRIERFSYNC: {
2394                 int error;
2395
2396                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2397                         return (EROFS);
2398                 }
2399                 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2400                 if (error == 0) {
2401                         error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_BARRIER, p);
2402                         hfs_unlock(VTOC(vp));
2403                 }
2404
2405                 return error;
2406         }
2407
2408         case F_CHKCLEAN: {
2409                 register struct cnode *cp;
2410                 int error;
2411
2412                 if (!vnode_isreg(vp))
2413                         return EINVAL;
2414
2415                 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2416                 if (error == 0) {
2417                         cp = VTOC(vp);
2418                         /*
2419                          * used by regression test to determine if
2420                          * all the dirty pages (via write) have been cleaned
2421                          * after a call to 'fsync'.
2422                          */
2423                         error = is_file_clean(vp, VTOF(vp)->ff_size);
2424                         hfs_unlock(cp);
2425                 }
2426                 return (error);
2427         }
2428
2429         case F_RDADVISE: {
2430                 register struct radvisory *ra;
2431                 struct filefork *fp;
2432                 int error;
2433
2434                 if (!vnode_isreg(vp))
2435                         return EINVAL;
2436
2437                 ra = (struct radvisory *)(ap->a_data);
2438                 fp = VTOF(vp);
2439
2440                 /* Protect against a size change. */
2441                 hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2442
2443 #if HFS_COMPRESSION
2444                 if (compressed) {
2445                         if (uncompressed_size == -1) {
2446                                 /* fetching the uncompressed size failed above, so return the error */
2447                                 error = decmpfs_error;
2448                         } else if (ra->ra_offset >= uncompressed_size) {
2449                                 error = EFBIG;
2450                         } else {
2451                                 error = advisory_read(vp, uncompressed_size, ra->ra_offset, ra->ra_count);
2452                         }
2453                 } else
2454 #endif /* HFS_COMPRESSION */
2455                 if (ra->ra_offset >= fp->ff_size) {
2456                         error = EFBIG;
2457                 } else {
2458                         error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
2459                 }
2460
2461                 hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
2462                 return (error);
2463         }
2464
2465         case HFSIOC_GET_VOL_CREATE_TIME_32: {
2466                 *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2467                 return 0;
2468         }
2469
2470         case HFSIOC_GET_VOL_CREATE_TIME_64: {
2471                 *(user64_time_t *)(ap->a_data) = (user64_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2472                 return 0;
2473         }
2474
2475         case SPOTLIGHT_IOC_GET_MOUNT_TIME:
2476             *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
2477             break;
2478
2479         case SPOTLIGHT_IOC_GET_LAST_MTIME:
2480             *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
2481             break;
2482
2483         case HFSIOC_GET_VERY_LOW_DISK:
2484             *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
2485             break;
2486
2487         case HFSIOC_SET_VERY_LOW_DISK:
2488             if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
2489                 return EINVAL;
2490             }
2491
2492             hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
2493             break;
2494
2495         case HFSIOC_GET_LOW_DISK:
2496             *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
2497             break;
2498
2499         case HFSIOC_SET_LOW_DISK:
2500             if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
2501                 || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
2502
2503                 return EINVAL;
2504             }
2505
2506             hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
2507             break;
2508
2509         /* The following two fsctls were ported from apfs. */
2510         case APFSIOC_GET_NEAR_LOW_DISK:
2511                 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_nearwarninglimit;
2512                 break;
2513
2514         case APFSIOC_SET_NEAR_LOW_DISK:
2515                 if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
2516                 || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
2517                         return EINVAL;
2518                 }
2519
2520                 hfsmp->hfs_freespace_notify_nearwarninglimit = *(uint32_t *)ap->a_data;
2521                 break;
2522
2523         case HFSIOC_GET_DESIRED_DISK:
2524             *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
2525             break;
2526
2527         case HFSIOC_SET_DESIRED_DISK:
2528             if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
2529                 return EINVAL;
2530             }
2531
2532             hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
2533             break;
2534
2535         case HFSIOC_VOLUME_STATUS:
2536             *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
2537             break;
2538
2539         case HFS_SET_BOOT_INFO:
2540                 if (!vnode_isvroot(vp))
2541                         return(EINVAL);
2542                 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
2543                         return(EACCES); /* must be superuser or owner of filesystem */
2544                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2545                         return (EROFS);
2546                 }
2547                 hfs_lock_mount (hfsmp);
2548                 bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
2549                 /* Null out the cached UUID, to be safe */
2550                 uuid_clear (hfsmp->hfs_full_uuid);
2551                 hfs_unlock_mount (hfsmp);
2552                 (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT);
2553                 break;
2554
2555         case HFS_GET_BOOT_INFO:
2556                 if (!vnode_isvroot(vp))
2557                         return(EINVAL);
2558                 hfs_lock_mount (hfsmp);
2559                 bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
2560                 hfs_unlock_mount(hfsmp);
2561                 break;
2562
2563         /* case HFS_MARK_BOOT_CORRUPT: _IO are the same */
2564         case HFSIOC_MARK_BOOT_CORRUPT:
2565                 /* Mark the boot volume corrupt by setting
2566                  * kHFSVolumeInconsistentBit in the volume header.  This will
2567                  * force fsck_hfs on next mount.
2568                  */
2569                 if (!kauth_cred_issuser(kauth_cred_get())) {
2570                         return EACCES;
2571                 }
2572
2573                 /* Allowed only on the root vnode of the boot volume */
2574                 if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
2575                     !vnode_isvroot(vp)) {
2576                         return EINVAL;
2577                 }
2578                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2579                         return (EROFS);
2580                 }
2581                 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
2582                 hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED);
2583                 break;
2584
2585         case HFSIOC_GET_JOURNAL_INFO:
2586                 jip = (struct hfs_journal_info*)ap->a_data;
2587
2588                 if (vp == NULLVP)
2589                         return EINVAL;
2590
2591             if (hfsmp->jnl == NULL) {
2592                         jnl_start = 0;
2593                         jnl_size  = 0;
2594             } else {
2595                         jnl_start = hfs_blk_to_bytes(hfsmp->jnl_start, hfsmp->blockSize) + hfsmp->hfsPlusIOPosOffset;
2596                         jnl_size  = hfsmp->jnl_size;
2597             }
2598
2599                 jip->jstart = jnl_start;
2600                 jip->jsize = jnl_size;
2601                 break;
2602
2603         case HFSIOC_SET_ALWAYS_ZEROFILL: {
2604             struct cnode *cp = VTOC(vp);
2605
2606             if (*(int *)ap->a_data) {
2607                 cp->c_flag |= C_ALWAYS_ZEROFILL;
2608             } else {
2609                 cp->c_flag &= ~C_ALWAYS_ZEROFILL;
2610             }
2611             break;
2612         }
2613
2614         /* case HFS_DISABLE_METAZONE: _IO are the same */
2615         case HFSIOC_DISABLE_METAZONE: {
2616                 /* Only root can disable metadata zone */
2617                 if (!kauth_cred_issuser(kauth_cred_get())) {
2618                         return EACCES;
2619                 }
2620                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2621                         return (EROFS);
2622                 }
2623
2624                 /* Disable metadata zone now */
2625                 (void) hfs_metadatazone_init(hfsmp, true);
2626                 printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
2627                 break;
2628         }
2629
2630
2631         case HFSIOC_FSINFO_METADATA_BLOCKS: {
2632                 int error;
2633                 struct hfsinfo_metadata *hinfo;
2634
2635                 hinfo = (struct hfsinfo_metadata *)ap->a_data;
2636
2637                 /* Get information about number of metadata blocks */
2638                 error = hfs_getinfo_metadata_blocks(hfsmp, hinfo);
2639                 if (error) {
2640                         return error;
2641                 }
2642
2643                 break;
2644         }
2645
2646         case HFSIOC_GET_FSINFO: {
2647                 hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;
2648
2649                 /* Only root is allowed to get fsinfo */
2650                 if (!kauth_cred_issuser(kauth_cred_get())) {
2651                         return EACCES;
2652                 }
2653
2654                 /*
2655                  * Make sure that the caller's version number matches with
2656                  * the kernel's version number.  This will make sure that
2657                  * if the structures being read/written into are changed
2658                  * by the kernel, the caller will not read incorrect data.
2659                  *
2660                  * The first three fields --- request_type, version and
2661                  * flags are same for all the hfs_fsinfo structures, so
2662                  * we can access the version number by assuming any
2663                  * structure for now.
2664                  */
2665                 if (fsinfo->header.version != HFS_FSINFO_VERSION) {
2666                         return ENOTSUP;
2667                 }
2668
2669                 /* Make sure that the current file system is not marked inconsistent */
2670                 if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
2671                         return EIO;
2672                 }
2673
2674                 return hfs_get_fsinfo(hfsmp, ap->a_data);
2675         }
2676
2677         case HFSIOC_CS_FREESPACE_TRIM: {
2678                 int error = 0;
2679                 int lockflags = 0;
2680
2681                 /* Only root allowed */
2682                 if (!kauth_cred_issuser(kauth_cred_get())) {
2683                         return EACCES;
2684                 }
2685
2686                 /*
2687                  * This core functionality is similar to hfs_scan_blocks().
2688                  * The main difference is that hfs_scan_blocks() is called
2689                  * as part of mount where we are assured that the journal is
2690                  * empty to start with.  This fcntl() can be called on a
2691                  * mounted volume, therefore it has to flush the content of
2692                  * the journal as well as ensure the state of summary table.
2693                  *
2694                  * This fcntl scans over the entire allocation bitmap,
2695                  * creates list of all the free blocks, and issues TRIM
2696                  * down to the underlying device.  This can take long time
2697                  * as it can generate up to 512MB of read I/O.
2698                  */
2699
2700                 if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
2701                         error = hfs_init_summary(hfsmp);
2702                         if (error) {
2703                                 printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN);
2704                                 return error;
2705                         }
2706                 }
2707
2708                 /*
2709                  * The journal maintains list of recently deallocated blocks to
2710                  * issue DKIOCUNMAPs when the corresponding journal transaction is
2711                  * flushed to the disk.  To avoid any race conditions, we only
2712                  * want one active trim list and only one thread issuing DKIOCUNMAPs.
2713                  * Therefore we make sure that the journal trim list is sync'ed,
2714                  * empty, and not modifiable for the duration of our scan.
2715                  *
2716                  * Take the journal lock before flushing the journal to the disk.
2717                  * We will keep holding the journal lock until we acquire the
2718                  * bitmap lock to make sure that no new journal transactions can
2719                  * start.  This will make sure that the journal trim list is not
2720                  * modified after the journal flush and before getting bitmap lock.
2721                  * We can release the journal lock after we acquire the bitmap
2722                  * lock as it will prevent any further block deallocations.
2723                  */
2724                 hfs_journal_lock(hfsmp);
2725
2726                 /* Flush the journal and wait for all I/Os to finish up */
2727                 error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META);
2728                 if (error) {
2729                         hfs_journal_unlock(hfsmp);
2730                         return error;
2731                 }
2732
2733                 /* Take bitmap lock to ensure it is not being modified */
2734                 lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
2735
2736                 /* Release the journal lock */
2737                 hfs_journal_unlock(hfsmp);
2738
2739                 /*
2740                  * ScanUnmapBlocks reads the bitmap in large block size
2741                  * (up to 1MB) unlike the runtime which reads the bitmap
2742                  * in the 4K block size.  This can cause buf_t collisions
2743                  * and potential data corruption.  To avoid this, we
2744                  * invalidate all the existing buffers associated with
2745                  * the bitmap vnode before scanning it.
2746                  *
2747                  * Note: ScanUnmapBlocks() cleans up all the buffers
2748                  * after itself, so there won't be any large buffers left
2749                  * for us to clean up after it returns.
2750                  */
2751                 error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
2752                 if (error) {
2753                         hfs_systemfile_unlock(hfsmp, lockflags);
2754                         return error;
2755                 }
2756
2757                 /* Traverse bitmap and issue DKIOCUNMAPs */
2758                 error = ScanUnmapBlocks(hfsmp);
2759                 hfs_systemfile_unlock(hfsmp, lockflags);
2760                 if (error) {
2761                         return error;
2762                 }
2763
2764                 break;
2765         }
2766
2767         case HFSIOC_SET_HOTFILE_STATE: {
2768                 int error;
2769                 struct cnode *cp = VTOC(vp);
2770                 uint32_t hf_state = *((uint32_t*)ap->a_data);
2771                 uint32_t num_unpinned = 0;
2772
2773                 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2774                 if (error) {
2775                         return error;
2776                 }
2777
2778                 // printf("hfs: setting hotfile state %d on %s\n", hf_state, vp->v_name);
2779                 if (hf_state == HFS_MARK_FASTDEVCANDIDATE) {
2780                         vnode_setfastdevicecandidate(vp);
2781
2782                         cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask;
2783                         cp->c_attr.ca_recflags &= ~kHFSDoNotFastDevPinMask;
2784                         cp->c_flag |= C_MODIFIED;
2785                 } else if (hf_state == HFS_UNMARK_FASTDEVCANDIDATE || hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
2786                         vnode_clearfastdevicecandidate(vp);
2787                         hfs_removehotfile(vp);
2788
2789                         if (cp->c_attr.ca_recflags & kHFSFastDevPinnedMask) {
2790                                 hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &num_unpinned);
2791                         }
2792
2793                         if (hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
2794                                 cp->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask;
2795                         }
2796                         cp->c_attr.ca_recflags &= ~(kHFSFastDevCandidateMask|kHFSFastDevPinnedMask);
2797                         cp->c_flag |= C_MODIFIED;
2798
2799                 } else {
2800                         error = EINVAL;
2801                 }
2802
2803                 if (num_unpinned != 0) {
2804                         lck_mtx_lock(&hfsmp->hfc_mutex);
2805                         hfsmp->hfs_hotfile_freeblks += num_unpinned;
2806                         lck_mtx_unlock(&hfsmp->hfc_mutex);
2807                 }
2808
2809                 hfs_unlock(cp);
2810                 return error;
2811         }
2812
2813         case HFSIOC_REPIN_HOTFILE_STATE: {
2814                 int error=0;
2815                 uint32_t repin_what = *((uint32_t*)ap->a_data);
2816
2817                 /* Only root allowed */
2818                 if (!kauth_cred_issuser(kauth_cred_get())) {
2819                         return EACCES;
2820                 }
2821
2822                 if (!(hfsmp->hfs_flags & (HFS_CS_METADATA_PIN | HFS_CS_HOTFILE_PIN))) {
2823                         // this system is neither regular Fusion or Cooperative Fusion
2824                         // so this fsctl makes no sense.
2825                         return EINVAL;
2826                 }
2827
2828                 //
2829                 // After converting a CoreStorage volume to be encrypted, the
2830                 // extents could have moved around underneath us.  This call
2831                 // allows corestoraged to re-pin everything that should be
2832                 // pinned (it would happen on the next reboot too but that could
2833                 // be a long time away).
2834                 //
2835                 if ((repin_what & HFS_REPIN_METADATA) && (hfsmp->hfs_flags & HFS_CS_METADATA_PIN)) {
2836                         hfs_pin_fs_metadata(hfsmp);
2837                 }
2838                 if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) {
2839                         hfs_repin_hotfiles(hfsmp);
2840                 }
2841                 if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_SWAPFILE_PIN)) {
2842                         //XXX Swapfiles (marked SWAP_PINNED) may have moved too.
2843                         //XXX Do we care? They have a more transient/dynamic nature/lifetime.
2844                 }
2845
2846                 return error;
2847         }
2848
2849 #if HFS_CONFIG_KEY_ROLL
2850
2851         case HFSIOC_KEY_ROLL: {
2852                 if (!kauth_cred_issuser(kauth_cred_get()))
2853                         return EACCES;
2854
2855                 hfs_key_roll_args_t *args = (hfs_key_roll_args_t *)ap->a_data;
2856
2857                 return hfs_key_roll_op(ap->a_context, ap->a_vp, args);
2858         }
2859
2860         case HFSIOC_GET_KEY_AUTO_ROLL: {
2861                 if (!kauth_cred_issuser(kauth_cred_get()))
2862                         return EACCES;
2863
2864                 hfs_key_auto_roll_args_t *args = (hfs_key_auto_roll_args_t *)ap->a_data;
2865                 if (args->api_version != HFS_KEY_AUTO_ROLL_API_VERSION_1)
2866                         return ENOTSUP;
2867                 args->flags = (ISSET(hfsmp->cproot_flags, CP_ROOT_AUTO_ROLL_OLD_CLASS_GENERATION)
2868                                            ? HFS_KEY_AUTO_ROLL_OLD_CLASS_GENERATION : 0);
2869                 args->min_key_os_version = hfsmp->hfs_auto_roll_min_key_os_version;
2870                 args->max_key_os_version = hfsmp->hfs_auto_roll_max_key_os_version;
2871                 break;
2872         }
2873
2874         case HFSIOC_SET_KEY_AUTO_ROLL: {
2875                 if (!kauth_cred_issuser(kauth_cred_get()))
2876                         return EACCES;
2877
2878                 hfs_key_auto_roll_args_t *args = (hfs_key_auto_roll_args_t *)ap->a_data;
2879                 if (args->api_version != HFS_KEY_AUTO_ROLL_API_VERSION_1)
2880                         return ENOTSUP;
2881                 return cp_set_auto_roll(hfsmp, args);
2882         }
2883
2884 #endif // HFS_CONFIG_KEY_ROLL
2885
2886 #if CONFIG_PROTECT
2887         case F_TRANSCODEKEY:
2888                 /*
2889                  * This API is only supported when called via kernel so
2890                  * a_fflag must be set to 1 (it's not possible to get here
2891                  * with it set to 1 via fsctl).
2892                  */
2893                 if (ap->a_fflag != 1)
2894                         return ENOTTY;
2895                 return cp_vnode_transcode(vp, (cp_key_t *)ap->a_data);
2896
2897         case F_GETPROTECTIONLEVEL:
2898                 return cp_get_root_major_vers (vp, (uint32_t *)ap->a_data);
2899
2900         case F_GETDEFAULTPROTLEVEL:
2901                 return cp_get_default_level(vp, (uint32_t *)ap->a_data);
2902 #endif // CONFIG_PROTECT
2903
2904         case FIOPINSWAP:
2905                 return hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT | HFS_DATALESS_PIN,
2906                                                          NULL);
2907
2908         case FSIOC_CAS_BSDFLAGS: {
2909                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2910                         return (EROFS);
2911                 }
2912
2913 #if 0
2914                 struct fsioc_cas_bsdflags *cas = (void *)ap->a_data;
2915                 struct cnode *cp = VTOC(vp);
2916                 u_int32_t document_id = 0;
2917                 int decmpfs_reset_state = 0;
2918                 int error;
2919
2920                 /* Don't allow modification of the journal. */
2921                 if (hfs_is_journal_file(hfsmp, cp)) {
2922                         return (EPERM);
2923                 }
2924
2925                 if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
2926                         return (error);
2927                 }
2928
2929                 cas->actual_flags = cp->c_bsdflags;
2930                 if (cas->actual_flags != cas->expected_flags) {
2931                         hfs_unlock(cp);
2932                         return (0);
2933                 }
2934
2935                 //
2936                 // Check if we'll need a document_id.  If so, we need to drop the lock
2937                 // (to avoid any possible deadlock with the root vnode which has to get
2938                 // locked to get the document id), generate the document_id, re-acquire
2939                 // the lock, and perform the CAS check again.  We do it in this sequence
2940                 // in order to avoid throwing away document_ids in the case where the
2941                 // CAS check fails.  Note that it can still happen, but by performing
2942                 // the check first, hopefully we can reduce the occurrence.
2943                 //
2944                 if ((cas->new_flags & UF_TRACKED) && !(VTOC(vp)->c_bsdflags & UF_TRACKED)) {
2945                         struct FndrExtendedDirInfo *fip = (struct FndrExtendedDirInfo *)((char *)&(VTOC(vp)->c_attr.ca_finderinfo) + 16);
2946                         //
2947                         // If the document_id is not set, get a new one.  It will be set
2948                         // on the file down below once we hold the cnode lock.
2949                         //
2950                         if (fip->document_id == 0) {
2951                                 //
2952                                 // Drat, we have to generate one.  Unlock the cnode, do the
2953                                 // deed, re-lock the cnode, and then do the CAS check again
2954                                 // to see if we lost the race.
2955                                 //
2956                                 hfs_unlock(cp);
2957                                 if (hfs_generate_document_id(hfsmp, &document_id) != 0) {
2958                                         document_id = 0;
2959                                 }
2960                                 if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
2961                                         return (error);
2962                                 }
2963                                 cas->actual_flags = cp->c_bsdflags;
2964                                 if (cas->actual_flags != cas->expected_flags) {
2965                                         hfs_unlock(cp);
2966                                         return (0);
2967                                 }
2968                         }
2969                 }
2970
2971                 bool setting_compression = false;
2972
2973                 if (!(cas->actual_flags & UF_COMPRESSED) && (cas->new_flags & UF_COMPRESSED))
2974                         setting_compression = true;
2975
2976                 if (setting_compression) {
2977                         hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2978                         if (VTOF(vp)->ff_size) {
2979                                 // hfs_truncate will deal with the cnode lock
2980                                 error = hfs_truncate(vp, 0, IO_NDELAY, 0, ap->a_context);
2981                         }
2982                         hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2983                 }
2984
2985                 if (!error)
2986                         error = hfs_set_bsd_flags(hfsmp, cp, cas->new_flags,
2987                                                                   document_id, ap->a_context,
2988                                                                   &decmpfs_reset_state);
2989                 if (error == 0) {
2990                         error = hfs_update(vp, 0);
2991                 }
2992                 hfs_unlock(cp);
2993                 if (error) {
2994                         return (error);
2995                 }
2996
2997 #if HFS_COMPRESSION
2998                 if (decmpfs_reset_state) {
2999                         /*
3000                          * we've changed the UF_COMPRESSED flag, so reset the decmpfs state for this cnode
3001                          * but don't do it while holding the hfs cnode lock
3002                          */
3003                         decmpfs_cnode *dp = VTOCMP(vp);
3004                         if (!dp) {
3005                                 /*
3006                                  * call hfs_lazy_init_decmpfs_cnode() to make sure that the decmpfs_cnode
3007                                  * is filled in; we need a decmpfs_cnode to prevent decmpfs state changes
3008                                  * on this file if it's locked
3009                                  */
3010                                 dp = hfs_lazy_init_decmpfs_cnode(VTOC(vp));
3011                                 if (!dp) {
3012                                         /* failed to allocate a decmpfs_cnode */
3013                                         return ENOMEM; /* what should this be? */
3014                                 }
3015                         }
3016                         decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
3017                 }
3018 #endif
3019                 break;
3020 #endif
3021                 return ENOTSUP;
3022         }
3023
3024         default:
3025                 return (ENOTTY);
3026         }
3027
3028         return 0;
3029 }
3030
3031 /*
3032  * select
3033  */
3034 int
3035 hfs_vnop_select(__unused struct vnop_select_args *ap)
3036 /*
3037         struct vnop_select_args {
3038                 vnode_t a_vp;
3039                 int  a_which;
3040                 int  a_fflags;
3041                 void *a_wql;
3042                 vfs_context_t a_context;
3043         };
3044 */
3045 {
3046         /*
3047          * We should really check to see if I/O is possible.
3048          */
3049         return (1);
3050 }
3051
3052 /*
3053  * Converts a logical block number to a physical block, and optionally returns
3054  * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
3055  * The physical block number is based on the device block size, currently its 512.
3056  * The block run is returned in logical blocks, and is the REMAINING amount of blocks
3057  */
3058 int
3059 hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
3060 {
3061         struct filefork *fp = VTOF(vp);
3062         struct hfsmount *hfsmp = VTOHFS(vp);
3063         int  retval = E_NONE;
3064         u_int32_t  logBlockSize;
3065         size_t  bytesContAvail = 0;
3066         off_t  blockposition;
3067         int lockExtBtree;
3068         int lockflags = 0;
3069
3070         /*
3071          * Check for underlying vnode requests and ensure that logical
3072          * to physical mapping is requested.
3073          */
3074         if (vpp != NULL)
3075                 *vpp = hfsmp->hfs_devvp;
3076         if (bnp == NULL)
3077                 return (0);
3078
3079         logBlockSize = GetLogicalBlockSize(vp);
3080         blockposition = (off_t)bn * logBlockSize;
3081
3082         lockExtBtree = overflow_extents(fp);
3083
3084         if (lockExtBtree)
3085                 lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
3086
3087         retval = MacToVFSError(
3088                             MapFileBlockC (HFSTOVCB(hfsmp),
3089                                             (FCB*)fp,
3090                                             MAXPHYSIO,
3091                                             blockposition,
3092                                             bnp,
3093                                             &bytesContAvail));
3094
3095         if (lockExtBtree)
3096                 hfs_systemfile_unlock(hfsmp, lockflags);
3097
3098         if (retval == E_NONE) {
3099                 /* Figure out how many read ahead blocks there are */
3100                 if (runp != NULL) {
3101                         if (can_cluster(logBlockSize)) {
3102                                 /* Make sure this result never goes negative: */
3103                                 *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
3104                         } else {
3105                                 *runp = 0;
3106                         }
3107                 }
3108         }
3109         return (retval);
3110 }
3111
3112 /*
3113  * Convert logical block number to file offset.
3114  */
3115 int
3116 hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
3117 /*
3118         struct vnop_blktooff_args {
3119                 vnode_t a_vp;
3120                 daddr64_t a_lblkno;
3121                 off_t *a_offset;
3122         };
3123 */
3124 {
3125         if (ap->a_vp == NULL)
3126                 return (EINVAL);
3127         *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
3128
3129         return(0);
3130 }
3131
3132 /*
3133  * Convert file offset to logical block number.
3134  */
3135 int
3136 hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
3137 /*
3138         struct vnop_offtoblk_args {
3139                 vnode_t a_vp;
3140                 off_t a_offset;
3141                 daddr64_t *a_lblkno;
3142         };
3143 */
3144 {
3145         if (ap->a_vp == NULL)
3146                 return (EINVAL);
3147         *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
3148
3149         return(0);
3150 }
3151
/*
 * Map file offset to physical block number.
 *
 * If this function is called for write operation, and if the file
 * had virtual blocks allocated (delayed allocation), real blocks
 * are allocated by calling ExtendFileC().
 *
 * If this function is called for read operation, and if the file
 * had virtual blocks allocated (delayed allocation), no change
 * to the size of file is done, and if required, rangelist is
 * searched for mapping.
 *
 * System file cnodes are expected to be locked (shared or exclusive).
 *
 * -- ARGUMENTS AND RESULT --
 *
 *   a_vp       vnode to map.  Directories are rejected with ENOTSUP, as
 *              are (with HFS_COMPRESSION) data forks of compressed files
 *              whose decmpfs state is FILE_IS_COMPRESSED.
 *   a_foffset  byte offset in the file at which the mapping starts.
 *   a_size     number of bytes the caller would like mapped.
 *   a_bpn      receives the physical block number, or -1 when the range
 *              must be zero-filled.  If NULL, the function returns 0
 *              without mapping anything.
 *   a_run      if non-NULL, receives on success the number of contiguous
 *              bytes covered by the returned mapping.
 *   a_poff     if non-NULL, written on success as an int with value 0.
 *   a_flags    VNODE_READ / VNODE_WRITE select the behaviour described
 *              below.
 *
 * The return value is the internal status passed through MacToVFSError().
 * A read that starts at or beyond ff_size yields ERANGE.
 *
 * -- INVALID RANGES --
 *
 * Invalid ranges are used to keep track of where we have extended a
 * file, but have not yet written that data to disk.  In the past we
 * would clear up the invalid ranges as we wrote to those areas, but
 * before data was actually flushed to disk.  The problem with that
 * approach is that the data can be left in the cache and is therefore
 * still not valid on disk.  So now we clear up the ranges here, when
 * the flags field has VNODE_WRITE set, indicating a write is about to
 * occur.  This isn't ideal (ideally we want to clear them up when we
 * know the data has been successfully written), but it's the best we
 * can do.
 *
 * For reads, we use the invalid ranges here in block map to indicate
 * to the caller that the data should be zeroed (a_bpn == -1).  We
 * have to be careful about what ranges we return to the cluster code.
 * Currently the cluster code can only handle non-rounded values for
 * the EOF; it cannot handle funny sized ranges in the middle of the
 * file (the main problem is that it sends down odd sized I/Os to the
 * disk).  Our code currently works because whilst the very first
 * offset and the last offset in the invalid ranges are not aligned,
 * gaps in the invalid ranges between the first and last, have to be
 * aligned (because we always write page sized blocks).  For example,
 * consider this arrangement:
 *
 *         +-------------+-----+-------+------+
 *         |             |XXXXX|       |XXXXXX|
 *         +-------------+-----+-------+------+
 *                       a     b       c      d
 *
 * This shows two invalid ranges <a, b> and <c, d>.  Whilst a and d
 * are not necessarily aligned, b and c *must* be.
 *
 * Zero-filling occurs in a number of ways:
 *
 *   1. When a read occurs and we return with a_bpn == -1.
 *
 *   2. When hfs_fsync or hfs_filedone calls hfs_flush_invalid_ranges
 *      which will cause us to iterate over the ranges bringing in
 *      pages that are not present in the cache and zeroing them.  Any
 *      pages that are already in the cache are left untouched.  Note
 *      that hfs_fsync does not always flush invalid ranges.
 *
 *   3. When we extend a file we zero out from the old EOF to the end
 *      of the page.  It would be nice if we didn't have to do this if
 *      the page wasn't present (and could defer it), but because of
 *      the problem described above, we have to.
 *
 * The invalid ranges are also used to restrict the size that we write
 * out on disk: see hfs_prepare_fork_for_update.
 *
 * Note that invalid ranges are ignored when neither the VNODE_READ or
 * the VNODE_WRITE flag is specified.  This is useful for the
 * F_LOG2PHYS* fcntls which are not interested in invalid ranges: they
 * just want to know whether blocks are physically allocated or not.
 */
int
hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
/*
        struct vnop_blockmap_args {
                vnode_t a_vp;
                off_t a_foffset;
                size_t a_size;
                daddr64_t *a_bpn;
                size_t *a_run;
                void *a_poff;
                int a_flags;
                vfs_context_t a_context;
        };
*/
{
        struct vnode *vp = ap->a_vp;
        struct cnode *cp;
        struct filefork *fp;
        struct hfsmount *hfsmp;
        /* Starts as the requested size and is only ever narrowed below. */
        size_t bytesContAvail = ap->a_size;
        int retval = E_NONE;
        int syslocks = 0;       /* SFL_* mask of system files to lock */
        int lockflags = 0;      /* token returned by hfs_systemfile_lock */
        struct rl_entry *invalid_range;
        enum rl_overlaptype overlaptype;
        int started_tr = 0;     /* non-zero while a transaction is open */
        int tooklock = 0;       /* non-zero if this call locked the cnode */

#if HFS_COMPRESSION
        if (VNODE_IS_RSRC(vp)) {
                /* allow blockmaps to the resource fork */
        } else {
                if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
                        int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
                        switch(state) {
                                case FILE_IS_COMPRESSED:
                                        return ENOTSUP;
                                case FILE_IS_CONVERTING:
                                        /* if FILE_IS_CONVERTING, we allow blockmap */
                                        break;
                                default:
                                        /* Unexpected state: log it and carry on with the mapping. */
                                        printf("invalid state %d for compressed file\n", state);
                                        /* fall through */
                        }
                }
        }
#endif /* HFS_COMPRESSION */

        /* Do not allow blockmap operation on a directory */
        if (vnode_isdir(vp)) {
                return (ENOTSUP);
        }

        /*
         * Check for underlying vnode requests and ensure that logical
         * to physical mapping is requested.
         */
        if (ap->a_bpn == NULL)
                return (0);

        hfsmp = VTOHFS(vp);
        cp = VTOC(vp);
        fp = VTOF(vp);

        /*
         * System files, symlinks and swap files skip both the cnode lock
         * and the invalid-range handling below.
         */
        if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
                /* Only lock the cnode if this thread does not already own it. */
                if (cp->c_lockowner != current_thread()) {
                        /*
                         * NOTE(review): the return value of hfs_lock() is not
                         * checked and tooklock is set unconditionally.
                         */
                        hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
                        tooklock = 1;
                }

                // For reads, check the invalid ranges
                if (ISSET(ap->a_flags, VNODE_READ)) {
                        /* Reads starting at or past the logical EOF are out of range. */
                        if (ap->a_foffset >= fp->ff_size) {
                                retval = ERANGE;
                                goto exit;
                        }

                        overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
                                                                  ap->a_foffset + (off_t)bytesContAvail - 1,
                                                                  &invalid_range);
                        switch(overlaptype) {
                                case RL_MATCHINGOVERLAP:
                                case RL_OVERLAPCONTAINSRANGE:
                                case RL_OVERLAPSTARTSBEFORE:
                                        /* There's no valid block for this byte offset */
                                        *ap->a_bpn = (daddr64_t)-1;
                                        /* There's no point limiting the amount to be returned
                                         * if the invalid range that was hit extends all the way
                                         * to the EOF (i.e. there's no valid bytes between the
                                         * end of this range and the file's EOF):
                                         */
                                        if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
                                                ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
                                                bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
                                        }

                                        retval = 0;
                                        goto exit;

                                case RL_OVERLAPISCONTAINED:
                                case RL_OVERLAPENDSAFTER:
                                        /* The range of interest hits an invalid block before the end: */
                                        if (invalid_range->rl_start == ap->a_foffset) {
                                                /* There's actually no valid information to be had starting here: */
                                                *ap->a_bpn = (daddr64_t)-1;
                                                if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
                                                        ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
                                                        bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
                                                }

                                                retval = 0;
                                                goto exit;
                                        } else {
                                                /*
                                                 * Sadly, the lower layers don't like us to
                                                 * return unaligned ranges, so we skip over
                                                 * any invalid ranges here that are less than
                                                 * a page: zeroing of those bits is not our
                                                 * responsibility (it's dealt with elsewhere).
                                                 *
                                                 * The walk stops at the first invalid range
                                                 * whose page-rounded start lies beyond the
                                                 * request, or that still has bytes left after
                                                 * rounding its start up to a page; in the
                                                 * latter case the request is trimmed to end
                                                 * at that rounded start.
                                                 */
                                                do {
                                                        off_t rounded_start = round_page_64(invalid_range->rl_start);
                                                        if ((off_t)bytesContAvail < rounded_start - ap->a_foffset)
                                                                break;
                                                        if (rounded_start < invalid_range->rl_end + 1) {
                                                                bytesContAvail = rounded_start - ap->a_foffset;
                                                                break;
                                                        }
                                                } while ((invalid_range = TAILQ_NEXT(invalid_range,
                                                                                                                         rl_link)));
                                        }
                                        break;

                                case RL_NOOVERLAP:
                                        break;
                        } // switch
                }
        }

#if CONFIG_PROTECT
        /*
         * If the cnode has a content-protection entry, let cp_io_params()
         * limit the length of the mapping.  When it also supplies a physical
         * offset (phys_offset != -1) the block number is computed directly
         * from it and the extent lookup below is skipped.
         */
        if (cp->c_cpentry) {
                const int direction = (ISSET(ap->a_flags, VNODE_WRITE)
                                                           ? VNODE_WRITE : VNODE_READ);

                cp_io_params_t io_params;
                cp_io_params(hfsmp, cp->c_cpentry,
                                         off_rsrc_make(ap->a_foffset, VNODE_IS_RSRC(vp)),
                                         direction, &io_params);

                if (io_params.max_len < (off_t)bytesContAvail)
                        bytesContAvail = io_params.max_len;

                if (io_params.phys_offset != -1) {
                        *ap->a_bpn = ((io_params.phys_offset + hfsmp->hfsPlusIOPosOffset)
                                                  / hfsmp->hfs_logical_block_size);

                        retval = 0;
                        goto exit;
                }
        }
#endif

        /*
         * Jumped back to from below when delayed allocations are discovered
         * after the system files were locked without a transaction open.
         */
retry:

        /* Check virtual blocks only when performing write operation */
        if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
                if (hfs_start_transaction(hfsmp) != 0) {
                        retval = EINVAL;
                        goto exit;
                } else {
                        started_tr = 1;
                }
                syslocks = SFL_EXTENTS | SFL_BITMAP;

        } else if (overflow_extents(fp)) {
                syslocks = SFL_EXTENTS;
        }

        if (syslocks)
                lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);

        /*
         * Check for any delayed allocations.
         */
        if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
                int64_t actbytes;
                u_int32_t loanedBlocks;

                //
                // Make sure we have a transaction.  It's possible
                // that we came in and fp->ff_unallocblocks was zero
                // but during the time we blocked acquiring the extents
                // btree, ff_unallocblocks became non-zero and so we
                // will need to start a transaction.
                //
                if (started_tr == 0) {
                        if (syslocks) {
                                hfs_systemfile_unlock(hfsmp, lockflags);
                                syslocks = 0;
                        }
                        goto retry;
                }

                /*
                 * Note: ExtendFileC will release any blocks on loan and
                 * acquire real blocks.  So we ask to extend by zero bytes
                 * since ExtendFileC will account for the virtual blocks.
                 */

                /* Remember the loan so it can be reinstated if the extend fails. */
                loanedBlocks = fp->ff_unallocblocks;
                retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
                                     kEFAllMask | kEFNoClumpMask, &actbytes);

                if (retval) {
                        /*
                         * The extend failed: put the loaned-block counts back
                         * on the fork, the cnode and the mount, drop the
                         * system-file locks, and close the transaction before
                         * leaving with the error.
                         */
                        fp->ff_unallocblocks = loanedBlocks;
                        cp->c_blocks += loanedBlocks;
                        fp->ff_blocks += loanedBlocks;

                        hfs_lock_mount (hfsmp);
                        hfsmp->loanedBlocks += loanedBlocks;
                        hfs_unlock_mount (hfsmp);

                        hfs_systemfile_unlock(hfsmp, lockflags);
                        cp->c_flag |= C_MODIFIED;
                        if (started_tr) {
                                (void) hfs_update(vp, 0);
                                (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

                                hfs_end_transaction(hfsmp);
                                started_tr = 0;
                        }
                        goto exit;
                }
        }

        /*
         * Look the offset up in the extents.  bytesContAvail is both the
         * requested length going in and the contiguous length coming out.
         */
        retval = MapFileBlockC(hfsmp, (FCB *)fp, bytesContAvail, ap->a_foffset,
                               ap->a_bpn, &bytesContAvail);
        if (syslocks) {
                hfs_systemfile_unlock(hfsmp, lockflags);
                syslocks = 0;
        }

        if (retval) {
                /* On write, always return error because virtual blocks, if any,
                 * should have been allocated in ExtendFileC().  We do not
                 * allocate virtual blocks on read, therefore return error
                 * only if no virtual blocks are allocated.  Otherwise we search
                 * rangelist for zero-fills
                 */
                if ((MacToVFSError(retval) != ERANGE) ||
                    (ap->a_flags & VNODE_WRITE) ||
                    ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
                        goto exit;
                }

                /* Validate if the start offset is within logical file size */
                if (ap->a_foffset >= fp->ff_size) {
                        goto exit;
                }

                /*
                 * At this point, we have encountered a failure during
                 * MapFileBlockC that resulted in ERANGE, and we are not
                 * servicing a write, and there are borrowed blocks.
                 *
                 * However, the cluster layer will not call blockmap for
                 * blocks that are borrowed and in-cache.  We have to assume
                 * that because we observed ERANGE being emitted from
                 * MapFileBlockC, this extent range is not valid on-disk.  So
                 * we treat this as a mapping that needs to be zero-filled
                 * prior to reading.
                 */

                /* Never report a run that extends past the logical EOF. */
                if (fp->ff_size - ap->a_foffset < (off_t)bytesContAvail)
                        bytesContAvail = fp->ff_size - ap->a_foffset;

                *ap->a_bpn = (daddr64_t) -1;
                retval = 0;

                goto exit;
        }

        /*
         * Common exit path: on success, retire invalid ranges for writes and
         * publish the run length; then close any open transaction and drop
         * the cnode lock if this call took it.
         */
exit:
        if (retval == 0) {
                if (ISSET(ap->a_flags, VNODE_WRITE)) {
                        struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);

                        // See if we might be overlapping invalid ranges...
                        if (r && (ap->a_foffset + (off_t)bytesContAvail) > r->rl_start) {
                                /*
                                 * Mark the file as needing an update if we think the
                                 * on-disk EOF has changed.
                                 */
                                if (ap->a_foffset <= r->rl_start)
                                        SET(cp->c_flag, C_MODIFIED);

                                /*
                                 * This isn't the ideal place to put this.  Ideally, we
                                 * should do something *after* we have successfully
                                 * written to the range, but that's difficult to do
                                 * because we cannot take locks in the callback.  At
                                 * present, the cluster code will call us with VNODE_WRITE
                                 * set just before it's about to write the data so we know
                                 * that data is about to be written.  If we get an I/O
                                 * error at this point then chances are the metadata
                                 * update to follow will also have an I/O error so the
                                 * risk here is small.
                                 */
                                rl_remove(ap->a_foffset, ap->a_foffset + bytesContAvail - 1,
                                                  &fp->ff_invalidranges);

                                /* No invalid ranges left: clear the zero-fill sync request. */
                                if (!TAILQ_FIRST(&fp->ff_invalidranges)) {
                                        cp->c_flag &= ~C_ZFWANTSYNC;
                                        cp->c_zftimeout = 0;
                                }
                        }
                }

                if (ap->a_run)
                        *ap->a_run = bytesContAvail;

                /* a_poff is declared void *; it is written here as an int. */
                if (ap->a_poff)
                        *(int *)ap->a_poff = 0;
        }

        if (started_tr) {
                hfs_update(vp, TRUE);
                hfs_volupdate(hfsmp, VOL_UPDATE, 0);
                hfs_end_transaction(hfsmp);
                started_tr = 0;
        }

        if (tooklock)
                hfs_unlock(cp);

        return (MacToVFSError(retval));
}
3559
3560 /*
3561  * prepare and issue the I/O
3562  * buf_strategy knows how to deal
3563  * with requests that require
3564  * fragmented I/Os
3565  */
3566 int
3567 hfs_vnop_strategy(struct vnop_strategy_args *ap)
3568 {
3569         buf_t   bp = ap->a_bp;
3570         vnode_t vp = buf_vnode(bp);
3571         int error = 0;
3572
3573         /* Mark buffer as containing static data if cnode flag set */
3574         if (VTOC(vp)->c_flag & C_SSD_STATIC) {
3575                 buf_markstatic(bp);
3576         }
3577
3578         /* Mark buffer as containing static data if cnode flag set */
3579         if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
3580                 bufattr_markgreedymode(buf_attr(bp));
3581         }
3582
3583         /* mark buffer as containing burst mode data if cnode flag set */
3584         if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) {
3585                 bufattr_markisochronous(buf_attr(bp));
3586         }
3587
3588 #if CONFIG_PROTECT
3589         error = cp_handle_strategy(bp);
3590
3591         if (error)
3592                 return error;
3593 #endif
3594
3595         error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
3596
3597         return error;
3598 }
3599
3600 int
3601 do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
3602 {
3603         register struct cnode *cp = VTOC(vp);
3604         struct filefork *fp = VTOF(vp);
3605         kauth_cred_t cred = vfs_context_ucred(context);
3606         int retval;
3607         off_t bytesToAdd;
3608         off_t actualBytesAdded;
3609         off_t filebytes;
3610         u_int32_t fileblocks;
3611         int blksize;
3612         struct hfsmount *hfsmp;
3613         int lockflags;
3614         int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);
3615
3616         blksize = VTOVCB(vp)->blockSize;
3617         fileblocks = fp->ff_blocks;
3618         filebytes = (off_t)fileblocks * (off_t)blksize;
3619
3620         KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START,
3621                  (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3622
3623         if (length < 0)
3624                 return (EINVAL);
3625
3626         /* This should only happen with a corrupt filesystem */
3627         if ((off_t)fp->ff_size < 0)
3628                 return (EINVAL);
3629
3630         if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
3631                 return (EFBIG);
3632
3633         hfsmp = VTOHFS(vp);
3634
3635         retval = E_NONE;
3636
3637         /* Files that are changing size are not hot file candidates. */
3638         if (hfsmp->hfc_stage == HFC_RECORDING) {
3639                 fp->ff_bytesread = 0;
3640         }
3641
3642         /*
3643          * We cannot just check if fp->ff_size == length (as an optimization)
3644          * since there may be extra physical blocks that also need truncation.
3645          */
3646 #if QUOTA
3647         if ((retval = hfs_getinoquota(cp)))
3648                 return(retval);
3649 #endif /* QUOTA */
3650
3651         /*
3652          * Lengthen the size of the file. We must ensure that the
3653          * last byte of the file is allocated. Since the smallest
3654          * value of ff_size is 0, length will be at least 1.
3655          */
3656         if (length > (off_t)fp->ff_size) {
3657 #if QUOTA
3658                 retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
3659                                    cred, 0);
3660                 if (retval)
3661                         goto Err_Exit;
3662 #endif /* QUOTA */
3663                 /*
3664                  * If we don't have enough physical space then
3665                  * we need to extend the physical size.
3666                  */
3667                 if (length > filebytes) {
3668                         int eflags;
3669                         u_int32_t blockHint = 0;
3670
3671                         /* All or nothing and don't round up to clumpsize. */
3672                         eflags = kEFAllMask | kEFNoClumpMask;
3673
3674                         if (cred && (suser(cred, NULL) != 0)) {
3675                                 eflags |= kEFReserveMask;  /* keep a reserve */
3676                         }
3677
3678                         /*
3679                          * Allocate Journal and Quota files in metadata zone.
3680                          */
3681                         if (filebytes == 0 &&
3682                             hfsmp->hfs_flags & HFS_METADATA_ZONE &&
3683                             hfs_virtualmetafile(cp)) {
3684                                 eflags |= kEFMetadataMask;
3685                                 blockHint = hfsmp->hfs_metazone_start;
3686                         }
3687                         if (hfs_start_transaction(hfsmp) != 0) {
3688                             retval = EINVAL;
3689                             goto Err_Exit;
3690                         }
3691
3692                         /* Protect extents b-tree and allocation bitmap */
3693                         lockflags = SFL_BITMAP;
3694                         if (overflow_extents(fp))
3695                                 lockflags |= SFL_EXTENTS;
3696                         lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3697
3698                         /*
3699                          * Keep growing the file as long as the current EOF is
3700                          * less than the desired value.
3701                          */
3702                         while ((length > filebytes) && (retval == E_NONE)) {
3703                                 bytesToAdd = length - filebytes;
3704                                 retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
3705                                                     (FCB*)fp,
3706                                                     bytesToAdd,
3707                                                     blockHint,
3708                                                     eflags,
3709                                                     &actualBytesAdded));
3710
3711                                 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3712                                 if (actualBytesAdded == 0 && retval == E_NONE) {
3713                                         if (length > filebytes)
3714                                                 length = filebytes;
3715                                         break;
3716                                 }
3717                         } /* endwhile */
3718
3719                         hfs_systemfile_unlock(hfsmp, lockflags);
3720
3721                         if (hfsmp->jnl) {
3722                                 hfs_update(vp, 0);
3723                                 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3724                         }
3725
3726                         hfs_end_transaction(hfsmp);
3727
3728                         if (retval)
3729                                 goto Err_Exit;
3730
3731                         KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3732                                 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3733                 }
3734
3735                 if (ISSET(flags, IO_NOZEROFILL)) {
3736                         // An optimisation for the hibernation file
3737                         if (vnode_isswap(vp))
3738                                 rl_remove_all(&fp->ff_invalidranges);
3739                 } else {
3740                         if (!vnode_issystem(vp) && retval == E_NONE) {
3741                                 if (length > (off_t)fp->ff_size) {
3742                                         struct timeval tv;
3743
3744                                         /* Extending the file: time to fill out the current last page w. zeroes? */
3745                                         if (fp->ff_size & PAGE_MASK_64) {
3746                                                 /* There might be some valid data at the start of the (current) last page
3747                                                    of the file, so zero out the remainder of that page to ensure the
3748                                                    entire page contains valid data. */
3749                                                 hfs_unlock(cp);
3750                                                 retval = hfs_zero_eof_page(vp, length);
3751                                                 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3752                                                 if (retval) goto Err_Exit;
3753                                         }
3754                                         microuptime(&tv);
3755                                         rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
3756                                         cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3757                                 }
3758                         } else {
3759                                         panic("hfs_truncate: invoked on non-UBC object?!");
3760                         };
3761                 }
3762                 if (suppress_times == 0) {
3763                         cp->c_touch_modtime = TRUE;
3764                 }
3765                 fp->ff_size = length;
3766
3767         } else { /* Shorten the size of the file */
3768
3769                 // An optimisation for the hibernation file
3770                 if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) {
3771                         rl_remove_all(&fp->ff_invalidranges);
3772                 } else if ((off_t)fp->ff_size > length) {
3773                         /* Any space previously marked as invalid is now irrelevant: */
3774                         rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
3775                 }
3776
3777                 /*
3778                  * Account for any unmapped blocks. Note that the new
3779                  * file length can still end up with unmapped blocks.
3780                  */
3781                 if (fp->ff_unallocblocks > 0) {
3782                         u_int32_t finalblks;
3783                         u_int32_t loanedBlocks;
3784
3785                         hfs_lock_mount(hfsmp);
3786                         loanedBlocks = fp->ff_unallocblocks;
3787                         cp->c_blocks -= loanedBlocks;
3788                         fp->ff_blocks -= loanedBlocks;
3789                         fp->ff_unallocblocks = 0;
3790
3791                         hfsmp->loanedBlocks -= loanedBlocks;
3792
3793                         finalblks = (length + blksize - 1) / blksize;
3794                         if (finalblks > fp->ff_blocks) {
3795                                 /* calculate required unmapped blocks */
3796                                 loanedBlocks = finalblks - fp->ff_blocks;
3797                                 hfsmp->loanedBlocks += loanedBlocks;
3798
3799                                 fp->ff_unallocblocks = loanedBlocks;
3800                                 cp->c_blocks += loanedBlocks;
3801                                 fp->ff_blocks += loanedBlocks;
3802                         }
3803                         hfs_unlock_mount (hfsmp);
3804                 }
3805
3806                 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
3807                 if (hfs_start_transaction(hfsmp) != 0) {
3808                         retval = EINVAL;
3809                         goto Err_Exit;
3810                 }
3811
3812                 if (fp->ff_unallocblocks == 0) {
3813                         /* Protect extents b-tree and allocation bitmap */
3814                         lockflags = SFL_BITMAP;
3815                         if (overflow_extents(fp))
3816                                 lockflags |= SFL_EXTENTS;
3817                         lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3818
3819                         retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0,
3820                                                                                                  FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
3821
3822                         hfs_systemfile_unlock(hfsmp, lockflags);
3823                 }
3824                 if (hfsmp->jnl) {
3825                         if (retval == 0) {
3826                                 fp->ff_size = length;
3827                         }
3828                         hfs_update(vp, 0);
3829                         hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3830                 }
3831                 hfs_end_transaction(hfsmp);
3832
3833                 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3834                 if (retval)
3835                         goto Err_Exit;
3836 #if QUOTA
3837                 /* These are bytesreleased */
3838                 (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
3839 #endif /* QUOTA */
3840
3841                 //
3842                 // Unlike when growing a file, we adjust the hotfile block count here
3843                 // instead of deeper down in the block allocation code because we do
3844                 // not necessarily have a vnode or "fcb" at the time we're deleting
3845                 // the file and so we wouldn't know if it was hotfile cached or not
3846                 //
3847                 hfs_hotfile_adjust_blocks(vp, (int64_t)((savedbytes - filebytes) / blksize));
3848
3849
3850                 /*
3851                  * Only set update flag if the logical length changes & we aren't
3852                  * suppressing modtime updates.
3853                  */
3854                 if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
3855                         cp->c_touch_modtime = TRUE;
3856                 }
3857                 fp->ff_size = length;
3858         }
3859         if (cp->c_mode & (S_ISUID | S_ISGID)) {
3860                 if (!vfs_context_issuser(context))
3861                         cp->c_mode &= ~(S_ISUID | S_ISGID);
3862         }
3863         cp->c_flag |= C_MODIFIED;
3864         cp->c_touch_chgtime = TRUE;     /* status changed */
3865         if (suppress_times == 0) {
3866                 cp->c_touch_modtime = TRUE;     /* file data was modified */
3867
3868                 /*
3869                  * If we are not suppressing the modtime update, then
3870                  * update the gen count as well.
3871                  */
3872                 if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
3873                         hfs_incr_gencount(cp);
3874                 }
3875         }
3876
3877         retval = hfs_update(vp, 0);
3878         if (retval) {
3879                 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3880                      -1, -1, -1, retval, 0);
3881         }
3882
3883 Err_Exit:
3884
3885         KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END,
3886                  (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
3887
3888         return (retval);
3889 }
3890
3891 /*
3892  * Preparation which must be done prior to deleting the catalog record
3893  * of a file or directory.  In order to make the on-disk as safe as possible,
3894  * we remove the catalog entry before releasing the bitmap blocks and the
3895  * overflow extent records.  However, some work must be done prior to deleting
3896  * the catalog record.
3897  *
3898  * When calling this function, the cnode must exist both in memory and on-disk.
3899  * If there are both resource fork and data fork vnodes, this function should
3900  * be called on both.
3901  */
3902
3903 int
3904 hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
3905
3906         struct filefork *fp = VTOF(vp);
3907         struct cnode *cp = VTOC(vp);
3908 #if QUOTA
3909         int retval = 0;
3910 #endif /* QUOTA */
3911
3912         /* Cannot truncate an HFS directory! */
3913         if (vnode_isdir(vp)) {
3914                 return (EISDIR);
3915         }
3916
3917         /*
3918          * See the comment below in hfs_truncate for why we need to call
3919          * setsize here.  Essentially we want to avoid pending IO if we
3920          * already know that the blocks are going to be released here.
3921          * This function is only called when totally removing all storage for a file, so
3922          * we can take a shortcut and immediately setsize (0);
3923          */
3924         ubc_setsize(vp, 0);
3925
3926         /* This should only happen with a corrupt filesystem */
3927         if ((off_t)fp->ff_size < 0)
3928                 return (EINVAL);
3929
3930         /*
3931          * We cannot just check if fp->ff_size == length (as an optimization)
3932          * since there may be extra physical blocks that also need truncation.
3933          */
3934 #if QUOTA
3935         if ((retval = hfs_getinoquota(cp))) {
3936                 return(retval);
3937         }
3938 #endif /* QUOTA */
3939
3940         /* Wipe out any invalid ranges which have yet to be backed by disk */
3941         rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);
3942
3943         /*
3944          * Account for any unmapped blocks. Since we're deleting the
3945          * entire file, we don't have to worry about just shrinking
3946          * to a smaller number of borrowed blocks.
3947          */
3948         if (fp->ff_unallocblocks > 0) {
3949                 u_int32_t loanedBlocks;
3950
3951                 hfs_lock_mount (hfsmp);
3952                 loanedBlocks = fp->ff_unallocblocks;
3953                 cp->c_blocks -= loanedBlocks;
3954                 fp->ff_blocks -= loanedBlocks;
3955                 fp->ff_unallocblocks = 0;
3956
3957                 hfsmp->loanedBlocks -= loanedBlocks;
3958
3959                 hfs_unlock_mount (hfsmp);
3960         }
3961
3962         return 0;
3963 }
3964
3965
3966 /*
3967  * Special wrapper around calling TruncateFileC.  This function is useable
3968  * even when the catalog record does not exist any longer, making it ideal
3969  * for use when deleting a file.  The simplification here is that we know
3970  * that we are releasing all blocks.
3971  *
3972  * Note that this function may be called when there is no vnode backing
3973  * the file fork in question.  We may call this from hfs_vnop_inactive
3974  * to clear out resource fork data (and may not want to clear out the data
3975  * fork yet).  As a result, we pointer-check both sets of inputs before
3976  * doing anything with them.
3977  *
3978  * The caller is responsible for saving off a copy of the filefork(s)
3979  * embedded within the cnode prior to calling this function.  The pointers
3980  * supplied as arguments must be valid even if the cnode is no longer valid.
3981  */
3982
3983 int
3984 hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
3985                                          struct filefork *rsrcfork, u_int32_t fileid) {
3986
3987         off_t filebytes;
3988         u_int32_t fileblocks;
3989         int blksize = 0;
3990         int error = 0;
3991         int lockflags;
3992
3993         blksize = hfsmp->blockSize;
3994
3995         /* Data Fork */
3996         if (datafork) {
3997                 off_t prev_filebytes;
3998
3999                 datafork->ff_size = 0;
4000
4001                 fileblocks = datafork->ff_blocks;
4002                 filebytes = (off_t)fileblocks * (off_t)blksize;
4003                 prev_filebytes = filebytes;
4004
4005                 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
4006
4007                 while (filebytes > 0) {
4008                         if (filebytes > HFS_BIGFILE_SIZE) {
4009                                 filebytes -= HFS_BIGFILE_SIZE;
4010                         } else {
4011                                 filebytes = 0;
4012                         }
4013
4014                         /* Start a transaction, and wipe out as many blocks as we can in this iteration */
4015                         if (hfs_start_transaction(hfsmp) != 0) {
4016                                 error = EINVAL;
4017                                 break;
4018                         }
4019
4020                         if (datafork->ff_unallocblocks == 0) {
4021                                 /* Protect extents b-tree and allocation bitmap */
4022                                 lockflags = SFL_BITMAP;
4023                                 if (overflow_extents(datafork))
4024                                         lockflags |= SFL_EXTENTS;
4025                                 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4026
4027                                 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));
4028
4029                                 hfs_systemfile_unlock(hfsmp, lockflags);
4030                         }
4031                         (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
4032
4033                         struct cnode *cp = datafork ? FTOC(datafork) : NULL;
4034                         struct vnode *vp;
4035                         vp = cp ? CTOV(cp, 0) : NULL;
4036                         hfs_hotfile_adjust_blocks(vp, (int64_t)((prev_filebytes - filebytes) / blksize));
4037                         prev_filebytes = filebytes;
4038
4039                         /* Finish the transaction and start over if necessary */
4040                         hfs_end_transaction(hfsmp);
4041
4042                         if (error) {
4043                                 break;
4044                         }
4045                 }
4046         }
4047
4048         /* Resource fork */
4049         if (error == 0 && rsrcfork) {
4050                 rsrcfork->ff_size = 0;
4051
4052                 fileblocks = rsrcfork->ff_blocks;
4053                 filebytes = (off_t)fileblocks * (off_t)blksize;
4054
4055                 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
4056
4057                 while (filebytes > 0) {
4058                         if (filebytes > HFS_BIGFILE_SIZE) {
4059                                 filebytes -= HFS_BIGFILE_SIZE;
4060                         } else {
4061                                 filebytes = 0;
4062                         }
4063
4064                         /* Start a transaction, and wipe out as many blocks as we can in this iteration */
4065                         if (hfs_start_transaction(hfsmp) != 0) {
4066                                 error = EINVAL;
4067                                 break;
4068                         }
4069
4070                         if (rsrcfork->ff_unallocblocks == 0) {
4071                                 /* Protect extents b-tree and allocation bitmap */
4072                                 lockflags = SFL_BITMAP;
4073                                 if (overflow_extents(rsrcfork))
4074                                         lockflags |= SFL_EXTENTS;
4075                                 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4076
4077                                 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));
4078
4079                                 hfs_systemfile_unlock(hfsmp, lockflags);
4080                         }
4081                         (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
4082
4083                         /* Finish the transaction and start over if necessary */
4084                         hfs_end_transaction(hfsmp);
4085
4086                         if (error) {
4087                                 break;
4088                         }
4089                 }
4090         }
4091
4092         return error;
4093 }
4094
4095 errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock)
4096 {
4097         errno_t error;
4098
4099         /*
4100          * Call ubc_setsize to give the VM subsystem a chance to do
4101          * whatever it needs to with existing pages before we delete
4102          * blocks.  Note that symlinks don't use the UBC so we'll
4103          * get back ENOENT in that case.
4104          */
4105         if (have_cnode_lock) {
4106                 error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY);
4107                 if (error == EAGAIN) {
4108                         cnode_t *cp = VTOC(vp);
4109
4110                         if (cp->c_truncatelockowner != current_thread())
4111                                 hfs_warn("hfs: hfs_ubc_setsize called without exclusive truncate lock!");
4112
4113                         hfs_unlock(cp);
4114                         error = ubc_setsize_ex(vp, len, 0);
4115                         hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
4116                 }
4117         } else
4118                 error = ubc_setsize_ex(vp, len, 0);
4119
4120         return error == ENOENT ? 0 : error;
4121 }
4122
4123 /*
4124  * Truncate a cnode to at most length size, freeing (or adding) the
4125  * disk blocks.
4126  */
4127 int
4128 hfs_truncate(struct vnode *vp, off_t length, int flags,
4129                          int truncateflags, vfs_context_t context)
4130 {
4131         struct filefork *fp = VTOF(vp);
4132         off_t filebytes;
4133         u_int32_t fileblocks;
4134         int blksize;
4135         errno_t error = 0;
4136         struct cnode *cp = VTOC(vp);
4137         hfsmount_t *hfsmp = VTOHFS(vp);
4138
4139         /* Cannot truncate an HFS directory! */
4140         if (vnode_isdir(vp)) {
4141                 return (EISDIR);
4142         }
4143         /* A swap file cannot change size. */
4144         if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) {
4145                 return (EPERM);
4146         }
4147
4148         blksize = hfsmp->blockSize;
4149         fileblocks = fp->ff_blocks;
4150         filebytes = (off_t)fileblocks * (off_t)blksize;
4151
4152         bool caller_has_cnode_lock = (cp->c_lockowner == current_thread());
4153
4154         error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock);
4155         if (error)
4156                 return error;
4157
4158         if (!caller_has_cnode_lock) {
4159                 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
4160                 if (error)
4161                         return error;
4162         }
4163
4164         if (vnode_islnk(vp) && cp->c_datafork->ff_symlinkptr) {
4165                 hfs_free(cp->c_datafork->ff_symlinkptr, cp->c_datafork->ff_size);
4166                 cp->c_datafork->ff_symlinkptr = NULL;
4167         }
4168
4169         // have to loop truncating or growing files that are
4170         // really big because otherwise transactions can get
4171         // enormous and consume too many kernel resources.
4172
4173         if (length < filebytes) {
4174                 while (filebytes > length) {
4175                         if ((filebytes - length) > HFS_BIGFILE_SIZE) {
4176                                 filebytes -= HFS_BIGFILE_SIZE;
4177                         } else {
4178                                 filebytes = length;
4179                         }
4180                         error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
4181                         if (error)
4182                                 break;
4183                 }
4184         } else if (length > filebytes) {
4185                 kauth_cred_t cred = vfs_context_ucred(context);
4186                 const bool keep_reserve = cred && suser(cred, NULL) != 0;
4187
4188                 if (hfs_freeblks(hfsmp, keep_reserve)
4189                         < howmany(length - filebytes, blksize)) {
4190                         error = ENOSPC;
4191                 } else {
4192                         while (filebytes < length) {
4193                                 if ((length - filebytes) > HFS_BIGFILE_SIZE) {
4194                                         filebytes += HFS_BIGFILE_SIZE;
4195                                 } else {
4196                                         filebytes = length;
4197                                 }
4198                                 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
4199                                 if (error)
4200                                         break;
4201                         }
4202                 }
4203         } else /* Same logical size */ {
4204
4205                 error = do_hfs_truncate(vp, length, flags, truncateflags, context);
4206         }
4207         /* Files that are changing size are not hot file candidates. */
4208         if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4209                 fp->ff_bytesread = 0;
4210         }
4211
4212 #if HFS_CONFIG_KEY_ROLL
4213         if (!error && cp->c_truncatelockowner == current_thread()) {
4214                 hfs_key_roll_check(cp, true);
4215         }
4216 #endif
4217
4218         if (!caller_has_cnode_lock)
4219                 hfs_unlock(cp);
4220
4221         // Make sure UBC's size matches up (in case we didn't completely succeed)
4222         errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock);
4223         if (!error)
4224                 error = err2;
4225
4226         return error;
4227 }
4228
4229
4230 /*
4231  * Preallocate file storage space.
4232  */
4233 int
4234 hfs_vnop_allocate(struct vnop_allocate_args /* {
4235                 vnode_t a_vp;
4236                 off_t a_length;
4237                 u_int32_t  a_flags;
4238                 off_t *a_bytesallocated;
4239                 off_t a_offset;
4240                 vfs_context_t a_context;
4241         } */ *ap)
4242 {
4243         struct vnode *vp = ap->a_vp;
4244         struct cnode *cp;
4245         struct filefork *fp;
4246         ExtendedVCB *vcb;
4247         off_t length = ap->a_length;
4248         off_t startingPEOF;
4249         off_t moreBytesRequested;
4250         off_t actualBytesAdded;
4251         off_t filebytes;
4252         u_int32_t fileblocks;
4253         int retval, retval2;
4254         u_int32_t blockHint;
4255         u_int32_t extendFlags;   /* For call to ExtendFileC */
4256         struct hfsmount *hfsmp;
4257         kauth_cred_t cred = vfs_context_ucred(ap->a_context);
4258         int lockflags;
4259         time_t orig_ctime;
4260
4261         *(ap->a_bytesallocated) = 0;
4262
4263         if (!vnode_isreg(vp))
4264                 return (EISDIR);
4265         if (length < (off_t)0)
4266                 return (EINVAL);
4267
4268         cp = VTOC(vp);
4269
4270         orig_ctime = VTOC(vp)->c_ctime;
4271
4272         nspace_snapshot_event(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
4273
4274         hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
4275
4276         if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4277                 goto Err_Exit;
4278         }
4279
4280         fp = VTOF(vp);
4281         hfsmp = VTOHFS(vp);
4282         vcb = VTOVCB(vp);
4283
4284         fileblocks = fp->ff_blocks;
4285         filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
4286
4287         if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
4288                 retval = EINVAL;
4289                 goto Err_Exit;
4290         }
4291
4292         /* Fill in the flags word for the call to Extend the file */
4293
4294         extendFlags = kEFNoClumpMask;
4295         if (ap->a_flags & ALLOCATECONTIG)
4296                 extendFlags |= kEFContigMask;
4297         if (ap->a_flags & ALLOCATEALL)
4298                 extendFlags |= kEFAllMask;
4299         if (cred && suser(cred, NULL) != 0)
4300                 extendFlags |= kEFReserveMask;
4301         if (hfs_virtualmetafile(cp))
4302                 extendFlags |= kEFMetadataMask;
4303
4304         retval = E_NONE;
4305         blockHint = 0;
4306         startingPEOF = filebytes;
4307
4308         if (ap->a_flags & ALLOCATEFROMPEOF)
4309                 length += filebytes;
4310         else if (ap->a_flags & ALLOCATEFROMVOL)
4311                 blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
4312
4313         /* If no changes are necessary, then we're done */
4314         if (filebytes == length)
4315                 goto Std_Exit;
4316
4317         /*
4318          * Lengthen the size of the file. We must ensure that the
4319          * last byte of the file is allocated. Since the smallest
4320          * value of filebytes is 0, length will be at least 1.
4321          */
4322         if (length > filebytes) {
4323                 if (ISSET(extendFlags, kEFAllMask)
4324                         && (hfs_freeblks(hfsmp, ISSET(extendFlags, kEFReserveMask))
4325                                 < howmany(length - filebytes, hfsmp->blockSize))) {
4326                         retval = ENOSPC;
4327                         goto Err_Exit;
4328                 }
4329
4330                 off_t total_bytes_added = 0, orig_request_size;
4331
4332                 orig_request_size = moreBytesRequested = length - filebytes;
4333
4334 #if QUOTA
4335                 retval = hfs_chkdq(cp,
4336                                 (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
4337                                 cred, 0);
4338                 if (retval)
4339                         goto Err_Exit;
4340
4341 #endif /* QUOTA */
4342                 /*
4343                  * Metadata zone checks.
4344                  */
4345                 if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
4346                         /*
4347                          * Allocate Journal and Quota files in metadata zone.
4348                          */
4349                         if (hfs_virtualmetafile(cp)) {
4350                                 blockHint = hfsmp->hfs_metazone_start;
4351                         } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
4352                                    (blockHint <= hfsmp->hfs_metazone_end)) {
4353                                 /*
4354                                  * Move blockHint outside metadata zone.
4355                                  */
4356                                 blockHint = hfsmp->hfs_metazone_end + 1;
4357                         }
4358                 }
4359
4360
4361                 while ((length > filebytes) && (retval == E_NONE)) {
4362                     off_t bytesRequested;
4363
4364                     if (hfs_start_transaction(hfsmp) != 0) {
4365                         retval = EINVAL;
4366                         goto Err_Exit;
4367                     }
4368
4369                     /* Protect extents b-tree and allocation bitmap */
4370                     lockflags = SFL_BITMAP;
4371                     if (overflow_extents(fp))
4372                                 lockflags |= SFL_EXTENTS;
4373                     lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4374
4375                     if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
4376                                 bytesRequested = HFS_BIGFILE_SIZE;
4377                     } else {
4378                                 bytesRequested = moreBytesRequested;
4379                     }
4380
4381                     if (extendFlags & kEFContigMask) {
4382                             // if we're on a sparse device, this will force it to do a
4383                             // full scan to find the space needed.
4384                             hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
4385                     }
4386
4387                     retval = MacToVFSError(ExtendFileC(vcb,
4388                                                 (FCB*)fp,
4389                                                 bytesRequested,
4390                                                 blockHint,
4391                                                 extendFlags,
4392                                                 &actualBytesAdded));
4393
4394                     if (retval == E_NONE) {
4395                         *(ap->a_bytesallocated) += actualBytesAdded;
4396                         total_bytes_added += actualBytesAdded;
4397                         moreBytesRequested -= actualBytesAdded;
4398                         if (blockHint != 0) {
4399                             blockHint += actualBytesAdded / vcb->blockSize;
4400                         }
4401                     }
4402                     filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4403
4404                     hfs_systemfile_unlock(hfsmp, lockflags);
4405
4406                     if (hfsmp->jnl) {
4407                         (void) hfs_update(vp, 0);
4408                         (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
4409                     }
4410
4411                     hfs_end_transaction(hfsmp);
4412                 }
4413
4414
4415                 /*
4416                  * if we get an error and no changes were made then exit
4417                  * otherwise we must do the hfs_update to reflect the changes
4418                  */
4419                 if (retval && (startingPEOF == filebytes))
4420                         goto Err_Exit;
4421
4422                 /*
4423                  * Adjust actualBytesAdded to be allocation block aligned, not
4424                  * clump size aligned.
4425                  * NOTE: So what we are reporting does not affect reality
4426                  * until the file is closed, when we truncate the file to allocation
4427                  * block size.
4428                  */
4429                 if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
4430                         *(ap->a_bytesallocated) =
4431                                 roundup(orig_request_size, (off_t)vcb->blockSize);
4432
4433         } else { /* Shorten the size of the file */
4434
4435                 /*
4436                  * N.B. At present, this code is never called.  If and when we
4437                  * do start using it, it looks like there might be slightly
4438                  * strange semantics with the file size: it's possible for the
4439                  * file size to *increase* e.g. if current file size is 5,
4440                  * length is 1024 and filebytes is 4096, the file size will
4441                  * end up being 1024 bytes.  This isn't necessarily a problem
4442                  * but it's not consistent with the code above which doesn't
4443                  * change the file size.
4444                  */
4445
4446                 retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
4447                 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4448
4449                 /*
4450                  * if we get an error and no changes were made then exit
4451                  * otherwise we must do the hfs_update to reflect the changes
4452                  */
4453                 if (retval && (startingPEOF == filebytes)) goto Err_Exit;
4454 #if QUOTA
4455                 /* These are  bytesreleased */
4456                 (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
4457 #endif /* QUOTA */
4458
4459                 if (fp->ff_size > filebytes) {
4460                         fp->ff_size = filebytes;
4461
4462                         hfs_ubc_setsize(vp, fp->ff_size, true);
4463                 }
4464         }
4465
4466 Std_Exit:
4467         cp->c_flag |= C_MODIFIED;
4468         cp->c_touch_chgtime = TRUE;
4469         cp->c_touch_modtime = TRUE;
4470         retval2 = hfs_update(vp, 0);
4471
4472         if (retval == 0)
4473                 retval = retval2;
4474 Err_Exit:
4475         hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4476         hfs_unlock(cp);
4477         return (retval);
4478 }
4479
4480
4481 /*
4482  * Pagein for HFS filesystem
4483  */
4484 int
4485 hfs_vnop_pagein(struct vnop_pagein_args *ap)
4486 /*
4487         struct vnop_pagein_args {
4488                 vnode_t a_vp,
4489                 upl_t         a_pl,
4490                 vm_offset_t   a_pl_offset,
4491                 off_t         a_f_offset,
4492                 size_t        a_size,
4493                 int           a_flags
4494                 vfs_context_t a_context;
4495         };
4496 */
4497 {
4498         vnode_t         vp;
4499         struct cnode    *cp;
4500         struct filefork *fp;
4501         int             error = 0;
4502         upl_t           upl;
4503         upl_page_info_t *pl;
4504         off_t           f_offset;
4505         off_t           page_needed_f_offset;
4506         int             offset;
4507         int             isize;
4508         int             upl_size;
4509         int             pg_index;
4510         boolean_t       truncate_lock_held = FALSE;
4511         boolean_t       file_converted = FALSE;
4512         kern_return_t   kret;
4513
4514         vp = ap->a_vp;
4515         cp = VTOC(vp);
4516         fp = VTOF(vp);
4517
4518 #if CONFIG_PROTECT
4519         if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
4520                 /*
4521                  * If we errored here, then this means that one of two things occurred:
4522                  * 1. there was a problem with the decryption of the key.
4523                  * 2. the device is locked and we are not allowed to access this particular file.
4524                  *
4525                  * Either way, this means that we need to shut down this upl now.  As long as
4526                  * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves)
4527                  * then we create a upl and immediately abort it.
4528                  */
4529                 if (ap->a_pl == NULL) {
4530                         /* create the upl */
4531                         ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl,
4532                                         UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4533                         /* mark the range as needed so it doesn't immediately get discarded upon abort */
4534                         ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);
4535
4536                         /* Abort the range */
4537                         ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4538                 }
4539
4540
4541                 return error;
4542         }
4543 #endif /* CONFIG_PROTECT */
4544
4545         if (ap->a_pl != NULL) {
4546                 /*
4547                  * this can only happen for swap files now that
4548                  * we're asking for V2 paging behavior...
4549                  * so don't need to worry about decompression, or
4550                  * keeping track of blocks read or taking the truncate lock
4551                  */
4552                 error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
4553                                        ap->a_size, (off_t)fp->ff_size, ap->a_flags);
4554                 goto pagein_done;
4555         }
4556
4557         page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset;
4558
4559 retry_pagein:
4560         /*
4561          * take truncate lock (shared/recursive) to guard against
4562          * zero-fill thru fsync interfering, but only for v2
4563          *
4564          * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
4565          * lock shared and we are allowed to recurse 1 level if this thread already
4566          * owns the lock exclusively... this can legally occur
4567          * if we are doing a shrinking ftruncate against a file
4568          * that is mapped private, and the pages being truncated
4569          * do not currently exist in the cache... in that case
4570          * we will have to page-in the missing pages in order
4571          * to provide them to the private mapping... we must
4572          * also call hfs_unlock_truncate with a postive been_recursed
4573          * arg to indicate that if we have recursed, there is no need to drop
4574          * the lock.  Allowing this simple recursion is necessary
4575          * in order to avoid a certain deadlock... since the ftruncate
4576          * already holds the truncate lock exclusively, if we try
4577          * to acquire it shared to protect the pagein path, we will
4578          * hang this thread
4579          *
4580          * NOTE: The if () block below is a workaround in order to prevent a
4581          * VM deadlock. See rdar://7853471.
4582          *
4583          * If we are in a forced unmount, then launchd will still have the
4584          * dyld_shared_cache file mapped as it is trying to reboot.  If we
4585          * take the truncate lock here to service a page fault, then our
4586          * thread could deadlock with the forced-unmount.  The forced unmount
4587          * thread will try to reclaim the dyld_shared_cache vnode, but since it's
4588          * marked C_DELETED, it will call ubc_setsize(0).  As a result, the unmount
4589          * thread will think it needs to copy all of the data out of the file
4590          * and into a VM copy object.  If we hold the cnode lock here, then that
4591          * VM operation will not be able to proceed, because we'll set a busy page
4592          * before attempting to grab the lock.  Note that this isn't as simple as "don't
4593          * call ubc_setsize" because doing that would just shift the problem to the
4594          * ubc_msync done before the vnode is reclaimed.
4595          *
4596          * So, if a forced unmount on this volume is in flight AND the cnode is
4597          * marked C_DELETED, then just go ahead and do the page in without taking
4598          * the lock (thus suspending pagein_v2 semantics temporarily).  Since it's on a file
4599          * that is not going to be available on the next mount, this seems like a
4600          * OK solution from a correctness point of view, even though it is hacky.
4601          */
4602         if (vfs_isforce(vnode_mount(vp))) {
4603                 if (cp->c_flag & C_DELETED) {
4604                         /* If we don't get it, then just go ahead and operate without the lock */
4605                         truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4606                 }
4607         }
4608         else {
4609                 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4610                 truncate_lock_held = TRUE;
4611         }
4612
4613         kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4614
4615         if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4616                 error = EINVAL;
4617                 goto pagein_done;
4618         }
4619         ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
4620
4621         upl_size = isize = ap->a_size;
4622
4623         /*
4624          * Scan from the back to find the last page in the UPL, so that we
4625          * aren't looking at a UPL that may have already been freed by the
4626          * preceding aborts/completions.
4627          */
4628         for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4629                 if (upl_page_present(pl, --pg_index))
4630                         break;
4631                 if (pg_index == 0) {
4632                         /*
4633                          * no absent pages were found in the range specified
4634                          * just abort the UPL to get rid of it and then we're done
4635                          */
4636                         ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4637                         goto pagein_done;
4638                 }
4639         }
4640         /*
4641          * initialize the offset variables before we touch the UPL.
4642          * f_offset is the position into the file, in bytes
4643          * offset is the position into the UPL, in bytes
4644          * pg_index is the pg# of the UPL we're operating on
4645          * isize is the offset into the UPL of the last page that is present.
4646          */
4647         isize = ((pg_index + 1) * PAGE_SIZE);
4648         pg_index = 0;
4649         offset = 0;
4650         f_offset = ap->a_f_offset;
4651
4652         while (isize) {
4653                 int  xsize;
4654                 int  num_of_pages;
4655
4656                 if ( !upl_page_present(pl, pg_index)) {
4657                         /*
4658                          * we asked for RET_ONLY_ABSENT, so it's possible
4659                          * to get back empty slots in the UPL.
4660                          * just skip over them
4661                          */
4662                         f_offset += PAGE_SIZE;
4663                         offset   += PAGE_SIZE;
4664                         isize    -= PAGE_SIZE;
4665                         pg_index++;
4666
4667                         continue;
4668                 }
4669                 /*
4670                  * We know that we have at least one absent page.
4671                  * Now checking to see how many in a row we have
4672                  */
4673                 num_of_pages = 1;
4674                 xsize = isize - PAGE_SIZE;
4675
4676                 while (xsize) {
4677                         if ( !upl_page_present(pl, pg_index + num_of_pages))
4678                                 break;
4679                         num_of_pages++;
4680                         xsize -= PAGE_SIZE;
4681                 }
4682                 xsize = num_of_pages * PAGE_SIZE;
4683
4684 #if HFS_COMPRESSION
4685                 if (VNODE_IS_RSRC(vp)) {
4686                         /* allow pageins of the resource fork */
4687                 } else {
4688                         int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
4689
4690                         if (compressed) {
4691
4692                                 if (truncate_lock_held) {
4693                                         /*
4694                                          * can't hold the truncate lock when calling into the decmpfs layer
4695                                          * since it calls back into this layer... even though we're only
4696                                          * holding the lock in shared mode, and the re-entrant path only
4697                                          * takes the lock shared, we can deadlock if some other thread
4698                                          * tries to grab the lock exclusively in between.
4699                                          */
4700                                         hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4701                                         truncate_lock_held = FALSE;
4702                                 }
4703                                 ap->a_pl = upl;
4704                                 ap->a_pl_offset = offset;
4705                                 ap->a_f_offset = f_offset;
4706                                 ap->a_size = xsize;
4707
4708                                 error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
4709                                 /*
4710                                  * note that decpfs_pagein_compressed can change the state of
4711                                  * 'compressed'... it will set it to 0 if the file is no longer
4712                                  * compressed once the compression lock is successfully taken
4713                                  * i.e. we would block on that lock while the file is being inflated
4714                                  */
4715                                 if (error == 0 && vnode_isfastdevicecandidate(vp)) {
4716                                         (void) hfs_addhotfile(vp);
4717                                 }
4718                                 if (compressed) {
4719                                         if (error == 0) {
4720                                                 /* successful page-in, update the access time */
4721                                                 VTOC(vp)->c_touch_acctime = TRUE;
4722
4723                                                 //
4724                                                 // compressed files are not traditional hot file candidates
4725                                                 // but they may be for CF (which ignores the ff_bytesread
4726                                                 // field)
4727                                                 //
4728                                                 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4729                                                         fp->ff_bytesread = 0;
4730                                                 }
4731                                         } else if (error == EAGAIN) {
4732                                                 /*
4733                                                  * EAGAIN indicates someone else already holds the compression lock...
4734                                                  * to avoid deadlocking, we'll abort this range of pages with an
4735                                                  * indication that the pagein needs to be redriven
4736                                                  */
4737                                                 ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
4738                                         } else if (error == ENOSPC) {
4739
4740                                                 if (upl_size == PAGE_SIZE)
4741                                                         panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n");
4742
4743                                                 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4744
4745                                                 ap->a_size = PAGE_SIZE;
4746                                                 ap->a_pl = NULL;
4747                                                 ap->a_pl_offset = 0;
4748                                                 ap->a_f_offset = page_needed_f_offset;
4749
4750                                                 goto retry_pagein;
4751                                         } else {
4752                                                 ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4753                                                 goto pagein_done;
4754                                         }
4755                                         goto pagein_next_range;
4756                                 }
4757                                 else {
4758                                         /*
4759                                          * Set file_converted only if the file became decompressed while we were
4760                                          * paging in.  If it were still compressed, we would re-start the loop using the goto
4761                                          * in the above block.  This avoid us overloading truncate_lock_held as our retry_pagein
4762                                          * condition below, since we could have avoided taking the truncate lock to prevent
4763                                          * a deadlock in the force unmount case.
4764                                          */
4765                                         file_converted = TRUE;
4766                                 }
4767                         }
4768                         if (file_converted == TRUE) {
4769                                 /*
4770                                  * the file was converted back to a regular file after we first saw it as compressed
4771                                  * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
4772                                  * reset a_size so that we consider what remains of the original request
4773                                  * and null out a_upl and a_pl_offset.
4774                                  *
4775                                  * We should only be able to get into this block if the decmpfs_pagein_compressed
4776                                  * successfully decompressed the range in question for this file.
4777                                  */
4778                                 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4779
4780                                 ap->a_size = isize;
4781                                 ap->a_pl = NULL;
4782                                 ap->a_pl_offset = 0;
4783
4784                                 /* Reset file_converted back to false so that we don't infinite-loop. */
4785                                 file_converted = FALSE;
4786                                 goto retry_pagein;
4787                         }
4788                 }
4789 #endif
4790                 error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);
4791
4792                 /*
4793                  * Keep track of blocks read.
4794                  */
4795                 if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
4796                         int bytesread;
4797                         int took_cnode_lock = 0;
4798
4799                         if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
4800                                 bytesread = fp->ff_size;
4801                         else
4802                                 bytesread = xsize;
4803
4804                         /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
4805                         if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
4806                                 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4807                                 took_cnode_lock = 1;
4808                         }
4809                         /*
4810                          * If this file hasn't been seen since the start of
4811                          * the current sampling period then start over.
4812                          */
4813                         if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
4814                                 struct timeval tv;
4815
4816                                 fp->ff_bytesread = bytesread;
4817                                 microtime(&tv);
4818                                 cp->c_atime = tv.tv_sec;
4819                         } else {
4820                                 fp->ff_bytesread += bytesread;
4821                         }
4822                         cp->c_touch_acctime = TRUE;
4823
4824                         if (vnode_isfastdevicecandidate(vp)) {
4825                                 (void) hfs_addhotfile(vp);
4826                         }
4827                         if (took_cnode_lock)
4828                                 hfs_unlock(cp);
4829                 }
4830 pagein_next_range:
4831                 f_offset += xsize;
4832                 offset   += xsize;
4833                 isize    -= xsize;
4834                 pg_index += num_of_pages;
4835
4836                 error = 0;
4837         }
4838
4839 pagein_done:
4840         if (truncate_lock_held == TRUE) {
4841                 /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
4842                 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4843         }
4844
4845         return (error);
4846 }
4847
4848 /*
4849  * Pageout for HFS filesystem.
4850  */
4851 int
4852 hfs_vnop_pageout(struct vnop_pageout_args *ap)
4853 /*
4854         struct vnop_pageout_args {
4855            vnode_t a_vp,
4856            upl_t         a_pl,
4857            vm_offset_t   a_pl_offset,
4858            off_t         a_f_offset,
4859            size_t        a_size,
4860            int           a_flags
4861            vfs_context_t a_context;
4862         };
4863 */
4864 {
4865         vnode_t vp = ap->a_vp;
4866         struct cnode *cp;
4867         struct filefork *fp;
4868         int retval = 0;
4869         off_t filesize;
4870         upl_t           upl;
4871         upl_page_info_t* pl = NULL;
4872         vm_offset_t     a_pl_offset;
4873         int             a_flags;
4874         int is_pageoutv2 = 0;
4875         kern_return_t kret;
4876
4877         cp = VTOC(vp);
4878         fp = VTOF(vp);
4879
4880         a_flags = ap->a_flags;
4881         a_pl_offset = ap->a_pl_offset;
4882
4883         /*
4884          * we can tell if we're getting the new or old behavior from the UPL
4885          */
4886         if ((upl = ap->a_pl) == NULL) {
4887                 int request_flags;
4888
4889                 is_pageoutv2 = 1;
4890                 /*
4891                  * we're in control of any UPL we commit
4892                  * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
4893                  */
4894                 a_flags &= ~UPL_NOCOMMIT;
4895                 a_pl_offset = 0;
4896
4897                 /*
4898                  * For V2 semantics, we want to take the cnode truncate lock
4899                  * shared to guard against the file size changing via zero-filling.
4900                  *
4901                  * However, we have to be careful because we may be invoked
4902                  * via the ubc_msync path to write out dirty mmap'd pages
4903                  * in response to a lock event on a content-protected
4904                  * filesystem (e.g. to write out class A files).
4905                  * As a result, we want to take the truncate lock 'SHARED' with
4906                  * the mini-recursion locktype so that we don't deadlock/panic
4907                  * because we may be already holding the truncate lock exclusive to force any other
4908                  * IOs to have blocked behind us.
4909                  */
4910                 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4911
4912                 if (a_flags & UPL_MSYNC) {
4913                         request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
4914                 }
4915                 else {
4916                         request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
4917                 }
4918
4919                 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
4920
4921                 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4922                         retval = EINVAL;
4923                         goto pageout_done;
4924                 }
4925         }
4926         /*
4927          * from this point forward upl points at the UPL we're working with
4928          * it was either passed in or we succesfully created it
4929          */
4930
4931         /*
4932          * Figure out where the file ends, for pageout purposes.  If
4933          * ff_new_size > ff_size, then we're in the middle of extending the
4934          * file via a write, so it is safe (and necessary) that we be able
4935          * to pageout up to that point.
4936          */
4937         filesize = fp->ff_size;
4938         if (fp->ff_new_size > filesize)
4939                 filesize = fp->ff_new_size;
4940
4941         /*
4942          * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
4943          * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
4944          * scanning for dirty ranges.  We'll issue our own N cluster_pageout calls, for
4945          * N dirty ranges in the UPL.  Note that this is almost a direct copy of the
4946          * logic in vnode_pageout except that we need to do it after grabbing the truncate
4947          * lock in HFS so that we don't lock invert ourselves.
4948          *
4949          * Note that we can still get into this function on behalf of the default pager with
4950          * non-V2 behavior (swapfiles).  However in that case, we did not grab locks above
4951          * since fsync and other writing threads will grab the locks, then mark the
4952          * relevant pages as busy.  But the pageout codepath marks the pages as busy,
4953          * and THEN would attempt to grab the truncate lock, which would result in deadlock.  So
4954          * we do not try to grab anything for the pre-V2 case, which should only be accessed
4955          * by the paging/VM system.
4956          */
4957
4958         if (is_pageoutv2) {
4959                 off_t f_offset;
4960                 int offset;
4961                 int isize;
4962                 int pg_index;
4963                 int error;
4964                 int error_ret = 0;
4965
4966                 isize = ap->a_size;
4967                 f_offset = ap->a_f_offset;
4968
4969                 /*
4970                  * Scan from the back to find the last page in the UPL, so that we
4971                  * aren't looking at a UPL that may have already been freed by the
4972                  * preceding aborts/completions.
4973                  */
4974                 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4975                         if (upl_page_present(pl, --pg_index))
4976                                 break;
4977                         if (pg_index == 0) {
4978                                 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4979                                 goto pageout_done;
4980                         }
4981                 }
4982
4983                 /*
4984                  * initialize the offset variables before we touch the UPL.
4985                  * a_f_offset is the position into the file, in bytes
4986                  * offset is the position into the UPL, in bytes
4987                  * pg_index is the pg# of the UPL we're operating on.
4988                  * isize is the offset into the UPL of the last non-clean page.
4989                  */
4990                 isize = ((pg_index + 1) * PAGE_SIZE);
4991
4992                 offset = 0;
4993                 pg_index = 0;
4994
4995                 while (isize) {
4996                         int  xsize;
4997                         int  num_of_pages;
4998
4999                         if ( !upl_page_present(pl, pg_index)) {
5000                                 /*
5001                                  * we asked for RET_ONLY_DIRTY, so it's possible
5002                                  * to get back empty slots in the UPL.
5003                                  * just skip over them
5004                                  */
5005                                 f_offset += PAGE_SIZE;
5006                                 offset   += PAGE_SIZE;
5007                                 isize    -= PAGE_SIZE;
5008                                 pg_index++;
5009
5010                                 continue;
5011                         }
5012                         if ( !upl_dirty_page(pl, pg_index)) {
5013                                 panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
5014                         }
5015
5016                         /*
5017                          * We know that we have at least one dirty page.
5018                          * Now checking to see how many in a row we have
5019                          */
5020                         num_of_pages = 1;
5021                         xsize = isize - PAGE_SIZE;
5022
5023                         while (xsize) {
5024                                 if ( !upl_dirty_page(pl, pg_index + num_of_pages))
5025                                         break;
5026                                 num_of_pages++;
5027                                 xsize -= PAGE_SIZE;
5028                         }
5029                         xsize = num_of_pages * PAGE_SIZE;
5030
5031                         if ((error = cluster_pageout(vp, upl, offset, f_offset,
5032                                                         xsize, filesize, a_flags))) {
5033                                 if (error_ret == 0)
5034                                         error_ret = error;
5035                         }
5036                         f_offset += xsize;
5037                         offset   += xsize;
5038                         isize    -= xsize;
5039                         pg_index += num_of_pages;
5040                 }
5041                 /* capture errnos bubbled out of cluster_pageout if they occurred */
5042                 if (error_ret != 0) {
5043                         retval = error_ret;
5044                 }
5045         } /* end block for v2 pageout behavior */
5046         else {
5047                 /*
5048                  * just call cluster_pageout for old pre-v2 behavior
5049                  */
5050                 retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
5051                                 ap->a_size, filesize, a_flags);
5052         }
5053
5054         /*
5055          * If data was written, update the modification time of the file
5056          * but only if it's mapped writable; we will have touched the
5057          * modifcation time for direct writes.
5058          */
5059         if (retval == 0 && (ubc_is_mapped_writable(vp)
5060                                                 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) {
5061                 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5062
5063                 // Check again with lock
5064                 bool mapped_writable = ubc_is_mapped_writable(vp);
5065                 if (mapped_writable
5066                         || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) {
5067                         cp->c_touch_modtime = TRUE;
5068                         cp->c_touch_chgtime = TRUE;
5069
5070                         /*
5071                          * We only need to increment the generation counter if
5072                          * it's currently mapped writable because we incremented
5073                          * the counter in hfs_vnop_mnomap.
5074                          */
5075                         if (mapped_writable)
5076                                 hfs_incr_gencount(VTOC(vp));
5077
5078                         /*
5079                          * If setuid or setgid bits are set and this process is
5080                          * not the superuser then clear the setuid and setgid bits
5081                          * as a precaution against tampering.
5082                          */
5083                         if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
5084                                 (vfs_context_suser(ap->a_context) != 0)) {
5085                                 cp->c_mode &= ~(S_ISUID | S_ISGID);
5086                         }
5087                 }
5088
5089                 hfs_unlock(cp);
5090         }
5091
5092 pageout_done:
5093         if (is_pageoutv2) {
5094                 /*
5095                  * Release the truncate lock.  Note that because
5096                  * we may have taken the lock recursively by
5097                  * being invoked via ubc_msync due to lockdown,
5098                  * we should release it recursively, too.
5099                  */
5100                 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
5101         }
5102         return (retval);
5103 }
5104
5105 /*
5106  * Intercept B-Tree node writes to unswap them if necessary.
5107  */
5108 int
5109 hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
5110 {
5111         int retval = 0;
5112         register struct buf *bp = ap->a_bp;
5113         register struct vnode *vp = buf_vnode(bp);
5114         BlockDescriptor block;
5115
5116         /* Trap B-Tree writes */
5117         if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
5118             (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
5119             (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
5120             (vp == VTOHFS(vp)->hfc_filevp)) {
5121
5122                 /*
5123                  * Swap and validate the node if it is in native byte order.
5124                  * This is always be true on big endian, so we always validate
5125                  * before writing here.  On little endian, the node typically has
5126                  * been swapped and validated when it was written to the journal,
5127                  * so we won't do anything here.
5128                  */
5129                 if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
5130                         /* Prepare the block pointer */
5131                         block.blockHeader = bp;
5132                         block.buffer = (char *)buf_dataptr(bp);
5133                         block.blockNum = buf_lblkno(bp);
5134                         /* not found in cache ==> came from disk */
5135                         block.blockReadFromDisk = (buf_fromcache(bp) == 0);
5136                         block.blockSize = buf_count(bp);
5137
5138                         /* Endian un-swap B-Tree node */
5139                         retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
5140                         if (retval)
5141                                 panic("hfs_vnop_bwrite: about to write corrupt node!\n");
5142                 }
5143         }
5144
5145         /* This buffer shouldn't be locked anymore but if it is clear it */
5146         if ((buf_flags(bp) & B_LOCKED)) {
5147                 // XXXdbg
5148                 if (VTOHFS(vp)->jnl) {
5149                         panic("hfs: CLEARING the lock bit on bp %p\n", bp);
5150                 }
5151                 buf_clearflags(bp, B_LOCKED);
5152         }
5153         retval = vn_bwrite (ap);
5154
5155         return (retval);
5156 }
5157
5158
5159 int
5160 hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks)
5161 {
5162         _dk_cs_pin_t pin;
5163         unsigned ioc;
5164         int err;
5165
5166         memset(&pin, 0, sizeof(pin));
5167         pin.cp_extent.offset = ((uint64_t)start_block) * HFSTOVCB(hfsmp)->blockSize;
5168         pin.cp_extent.length = ((uint64_t)nblocks) * HFSTOVCB(hfsmp)->blockSize;
5169         switch (pin_state) {
5170         case HFS_PIN_IT:
5171                 ioc = _DKIOCCSPINEXTENT;
5172                 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA;
5173                 break;
5174         case HFS_PIN_IT | HFS_TEMP_PIN:
5175                 ioc = _DKIOCCSPINEXTENT;
5176                 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSTEMPORARYPIN;
5177                 break;
5178         case HFS_PIN_IT | HFS_DATALESS_PIN:
5179                 ioc = _DKIOCCSPINEXTENT;
5180                 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSPINFORSWAPFILE;
5181                 break;
5182         case HFS_UNPIN_IT:
5183                 ioc = _DKIOCCSUNPINEXTENT;
5184                 pin.cp_flags = 0;
5185                 break;
5186         case HFS_UNPIN_IT | HFS_EVICT_PIN:
5187                 ioc = _DKIOCCSPINEXTENT;
5188                 pin.cp_flags = _DKIOCCSPINTOSLOWMEDIA;
5189                 break;
5190         default:
5191                 return EINVAL;
5192         }
5193         err = VNOP_IOCTL(hfsmp->hfs_devvp, ioc, (caddr_t)&pin, 0, vfs_context_kernel());
5194         return err;
5195 }
5196
5197 //
5198 // The cnode lock should already be held on entry to this function
5199 //
5200 int
5201 hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned)
5202 {
5203         struct filefork *fp = VTOF(vp);
5204         int i, err=0, need_put=0;
5205         struct vnode *rsrc_vp=NULL;
5206         uint32_t npinned = 0;
5207         off_t               offset;
5208
5209         if (num_blocks_pinned) {
5210                 *num_blocks_pinned = 0;
5211         }
5212
5213         if (vnode_vtype(vp) != VREG) {
5214                 /* Not allowed to pin directories or symlinks */
5215                 printf("hfs: can't pin vnode of type %d\n", vnode_vtype(vp));
5216                 return (EPERM);
5217         }
5218
5219         if (fp->ff_unallocblocks) {
5220                 printf("hfs: can't pin a vnode w/unalloced blocks (%d)\n", fp->ff_unallocblocks);
5221                 return (EINVAL);
5222         }
5223
5224         /*
5225          * It is possible that if the caller unlocked/re-locked the cnode after checking
5226          * for C_NOEXISTS|C_DELETED that the file could have been deleted while the
5227          * cnode was unlocked.  So check the condition again and return ENOENT so that
5228          * the caller knows why we failed to pin the vnode.
5229          */
5230         if (VTOC(vp)->c_flag & (C_NOEXISTS|C_DELETED)) {
5231                 // makes no sense to pin something that's pending deletion
5232                 return ENOENT;
5233         }
5234
5235         if (fp->ff_blocks == 0 && (VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
5236                 if (!VNODE_IS_RSRC(vp) && hfs_vgetrsrc(hfsmp, vp, &rsrc_vp) == 0) {
5237                         //printf("hfs: fileid %d resource fork nblocks: %d / size: %lld\n", VTOC(vp)->c_fileid,
5238                         //       VTOC(rsrc_vp)->c_rsrcfork->ff_blocks,VTOC(rsrc_vp)->c_rsrcfork->ff_size);
5239
5240                         fp = VTOC(rsrc_vp)->c_rsrcfork;
5241                         need_put = 1;
5242                 }
5243         }
5244         if (fp->ff_blocks == 0) {
5245                 if (need_put) {
5246                         //
5247                         // use a distinct error code for a compressed file that has no resource fork;
5248                         // we return EALREADY to indicate that the data is already probably hot file
5249                         // cached because it's in an EA and the attributes btree is on the ssd
5250                         //
5251                         err = EALREADY;
5252                 } else {
5253                         err = EINVAL;
5254                 }
5255                 goto out;
5256         }
5257
5258         offset = 0;
5259         for (i = 0; i < kHFSPlusExtentDensity; i++) {
5260                 if (fp->ff_extents[i].startBlock == 0) {
5261                         break;
5262                 }
5263
5264                 err = hfs_pin_block_range(hfsmp, pin_state, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount);
5265                 if (err) {
5266                         break;
5267                 } else {
5268                         npinned += fp->ff_extents[i].blockCount;
5269                 }
5270         }
5271
5272         if (err || npinned == 0) {
5273                 goto out;
5274         }
5275
5276         if (fp->ff_extents[kHFSPlusExtentDensity-1].startBlock) {
5277                 uint32_t pblocks;
5278                 uint8_t forktype = 0;
5279
5280                 if (fp == VTOC(vp)->c_rsrcfork) {
5281                         forktype = 0xff;
5282                 }
5283                 /*
5284                  * The file could have overflow extents, better pin them.
5285                  *
5286                  * We assume that since we are holding the cnode lock for this cnode,
5287                  * the files extents cannot be manipulated, but the tree could, so we
5288                  * need to ensure that it doesn't change behind our back as we iterate it.
5289                  */
5290                 int lockflags = hfs_systemfile_lock (hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
5291                 err = hfs_pin_overflow_extents(hfsmp, VTOC(vp)->c_fileid, forktype, &pblocks);
5292                 hfs_systemfile_unlock (hfsmp, lockflags);
5293
5294                 if (err) {
5295                         goto out;
5296                 }
5297                 npinned += pblocks;
5298         }
5299
5300 out:
5301         if (num_blocks_pinned) {
5302                 *num_blocks_pinned = npinned;
5303         }
5304
5305         if (need_put && rsrc_vp) {
5306                 //
5307                 // have to unlock the cnode since it's shared between the
5308                 // resource fork vnode and the data fork vnode (and the
5309                 // vnode_put() may need to re-acquire the cnode lock to
5310                 // reclaim the resource fork vnode)
5311                 //
5312                 hfs_unlock(VTOC(vp));
5313                 vnode_put(rsrc_vp);
5314                 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5315         }
5316         return err;
5317 }
5318
5319
5320 /*
5321  * Relocate a file to a new location on disk
5322  *  cnode must be locked on entry
5323  *
5324  * Relocation occurs by cloning the file's data from its
5325  * current set of blocks to a new set of blocks. During
5326  * the relocation all of the blocks (old and new) are
5327  * owned by the file.
5328  *
5329  * -----------------
5330  * |///////////////|
5331  * -----------------
5332  * 0               N (file offset)
5333  *
5334  * -----------------     -----------------
5335  * |///////////////|     |               |     STEP 1 (acquire new blocks)
5336  * -----------------     -----------------
5337  * 0               N     N+1             2N
5338  *
5339  * -----------------     -----------------
5340  * |///////////////|     |///////////////|     STEP 2 (clone data)
5341  * -----------------     -----------------
5342  * 0               N     N+1             2N
5343  *
5344  *                       -----------------
5345  *                       |///////////////|     STEP 3 (head truncate blocks)
5346  *                       -----------------
5347  *                       0               N
5348  *
5349  * During steps 2 and 3 page-outs to file offsets less
5350  * than or equal to N are suspended.
5351  *
5352  * During step 3 page-ins to the file get suspended.
5353  */
5354 int
5355 hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
5356         struct  proc *p)
5357 {
5358         struct  cnode *cp;
5359         struct  filefork *fp;
5360         struct  hfsmount *hfsmp;
5361         u_int32_t  headblks;
5362         u_int32_t  datablks;
5363         u_int32_t  blksize;
5364         u_int32_t  growsize;
5365         u_int32_t  nextallocsave;
5366         daddr64_t  sector_a,  sector_b;
5367         int eflags;
5368         off_t  newbytes;
5369         int  retval;
5370         int lockflags = 0;
5371         int took_trunc_lock = 0;
5372         int started_tr = 0;
5373         enum vtype vnodetype;
5374
5375         vnodetype = vnode_vtype(vp);
5376         if (vnodetype != VREG) {
5377                 /* Not allowed to move symlinks. */
5378                 return (EPERM);
5379         }
5380
5381         hfsmp = VTOHFS(vp);
5382         if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
5383                 return (ENOSPC);
5384         }
5385
5386         cp = VTOC(vp);
5387         fp = VTOF(vp);
5388         if (fp->ff_unallocblocks)
5389                 return (EINVAL);
5390
5391 #if CONFIG_PROTECT
5392         /*
5393          * <rdar://problem/9118426>
5394          * Disable HFS file relocation on content-protected filesystems
5395          */
5396         if (cp_fs_protected (hfsmp->hfs_mp)) {
5397                 return EINVAL;
5398         }
5399 #endif
5400         /* If it's an SSD, also disable HFS relocation */
5401         if (hfsmp->hfs_flags & HFS_SSD) {
5402                 return EINVAL;
5403         }
5404
5405
5406         blksize = hfsmp->blockSize;
5407         if (blockHint == 0)
5408                 blockHint = hfsmp->nextAllocation;
5409
5410         if (fp->ff_size > 0x7fffffff) {
5411                 return (EFBIG);
5412         }
5413
5414         if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
5415                 hfs_unlock(cp);
5416                 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
5417                 /* Force lock since callers expects lock to be held. */
5418                 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
5419                         hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5420                         return (retval);
5421                 }
5422                 /* No need to continue if file was removed. */
5423                 if (cp->c_flag & C_NOEXISTS) {
5424                         hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5425                         return (ENOENT);
5426                 }
5427                 took_trunc_lock = 1;
5428         }
5429         headblks = fp->ff_blocks;
5430         datablks = howmany(fp->ff_size, blksize);
5431         growsize = datablks * blksize;
5432         eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
5433         if (blockHint >= hfsmp->hfs_metazone_start &&
5434             blockHint <= hfsmp->hfs_metazone_end)
5435                 eflags |= kEFMetadataMask;
5436
5437         if (hfs_start_transaction(hfsmp) != 0) {
5438                 if (took_trunc_lock)
5439                         hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5440             return (EINVAL);
5441         }
5442         started_tr = 1;
5443         /*
5444          * Protect the extents b-tree and the allocation bitmap
5445          * during MapFileBlockC and ExtendFileC operations.
5446          */
5447         lockflags = SFL_BITMAP;
5448         if (overflow_extents(fp))
5449                 lockflags |= SFL_EXTENTS;
5450         lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5451
5452         retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
5453         if (retval) {
5454                 retval = MacToVFSError(retval);
5455                 goto out;
5456         }
5457
5458         /*
5459          * STEP 1 - acquire new allocation blocks.
5460          */
5461         nextallocsave = hfsmp->nextAllocation;
5462         retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
5463         if (eflags & kEFMetadataMask) {
5464                 hfs_lock_mount(hfsmp);
5465                 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
5466                 MarkVCBDirty(hfsmp);
5467                 hfs_unlock_mount(hfsmp);
5468         }
5469
5470         retval = MacToVFSError(retval);
5471         if (retval == 0) {
5472                 cp->c_flag |= C_MODIFIED;
5473                 if (newbytes < growsize) {
5474                         retval = ENOSPC;
5475                         goto restore;
5476                 } else if (fp->ff_blocks < (headblks + datablks)) {
5477                         printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
5478                         retval = ENOSPC;
5479                         goto restore;
5480                 }
5481
5482                 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
5483                 if (retval) {
5484                         retval = MacToVFSError(retval);
5485                 } else if ((sector_a + 1) == sector_b) {
5486                         retval = ENOSPC;
5487                         goto restore;
5488                 } else if ((eflags & kEFMetadataMask) &&
5489                            ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
5490                               hfsmp->hfs_metazone_end)) {
5491 #if 0
5492                         const char * filestr;
5493                         char emptystr = '\0';
5494
5495                         if (cp->c_desc.cd_nameptr != NULL) {
5496                                 filestr = (const char *)&cp->c_desc.cd_nameptr[0];
5497                         } else if (vnode_name(vp) != NULL) {
5498                                 filestr = vnode_name(vp);
5499                         } else {
5500                                 filestr = &emptystr;
5501                         }
5502 #endif
5503                         retval = ENOSPC;
5504                         goto restore;
5505                 }
5506         }
5507         /* Done with system locks and journal for now. */
5508         hfs_systemfile_unlock(hfsmp, lockflags);
5509         lockflags = 0;
5510         hfs_end_transaction(hfsmp);
5511         started_tr = 0;
5512
5513         if (retval) {
5514                 /*
5515                  * Check to see if failure is due to excessive fragmentation.
5516                  */
5517                 if ((retval == ENOSPC) &&
5518                     (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
5519                         hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
5520                 }
5521                 goto out;
5522         }
5523         /*
5524          * STEP 2 - clone file data into the new allocation blocks.
5525          */
5526
5527         if (vnodetype == VLNK)
5528                 retval = EPERM;
5529         else if (vnode_issystem(vp))
5530                 retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
5531         else
5532                 retval = hfs_clonefile(vp, headblks, datablks, blksize);
5533
5534         /* Start transaction for step 3 or for a restore. */
5535         if (hfs_start_transaction(hfsmp) != 0) {
5536                 retval = EINVAL;
5537                 goto out;
5538         }
5539         started_tr = 1;
5540         if (retval)
5541                 goto restore;
5542
5543         /*
5544          * STEP 3 - switch to cloned data and remove old blocks.
5545          */
5546         lockflags = SFL_BITMAP;
5547         if (overflow_extents(fp))
5548                 lockflags |= SFL_EXTENTS;
5549         lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5550
5551         retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
5552
5553         hfs_systemfile_unlock(hfsmp, lockflags);
5554         lockflags = 0;
5555         if (retval)
5556                 goto restore;
5557 out:
5558         if (took_trunc_lock)
5559                 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5560
5561         if (lockflags) {
5562                 hfs_systemfile_unlock(hfsmp, lockflags);
5563                 lockflags = 0;
5564         }
5565
5566         /* Push cnode's new extent data to disk. */
5567         if (retval == 0) {
5568                 hfs_update(vp, 0);
5569         }
5570         if (hfsmp->jnl) {
5571                 if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
5572                         (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT);
5573                 else
5574                         (void) hfs_flushvolumeheader(hfsmp, 0);
5575         }
5576 exit:
5577         if (started_tr)
5578                 hfs_end_transaction(hfsmp);
5579
5580         return (retval);
5581
5582 restore:
5583         if (fp->ff_blocks == headblks) {
5584                 if (took_trunc_lock)
5585                         hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5586                 goto exit;
5587         }
5588         /*
5589          * Give back any newly allocated space.
5590          */
5591         if (lockflags == 0) {
5592                 lockflags = SFL_BITMAP;
5593                 if (overflow_extents(fp))
5594                         lockflags |= SFL_EXTENTS;
5595                 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5596         }
5597
5598         (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
5599                                                  FTOC(fp)->c_fileid, false);
5600
5601         hfs_systemfile_unlock(hfsmp, lockflags);
5602         lockflags = 0;
5603
5604         if (took_trunc_lock)
5605                 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5606         goto exit;
5607 }
5608
5609
5610 /*
5611  * Clone a file's data within the file.
5612  *
5613  */
5614 static int
5615 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
5616 {
5617         caddr_t  bufp;
5618         size_t  bufsize;
5619         size_t  copysize;
5620         size_t  iosize;
5621         size_t  offset;
5622         off_t   writebase;
5623         uio_t auio;
5624         int  error = 0;
5625
5626         writebase = blkstart * blksize;
5627         copysize = blkcnt * blksize;
5628         iosize = bufsize = MIN(copysize, 128 * 1024);
5629         offset = 0;
5630
5631         hfs_unlock(VTOC(vp));
5632
5633 #if CONFIG_PROTECT
5634         if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
5635                 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5636                 return (error);
5637         }
5638 #endif /* CONFIG_PROTECT */
5639
5640     bufp = hfs_malloc(bufsize);
5641
5642         auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
5643
5644         while (offset < copysize) {
5645                 iosize = MIN(copysize - offset, iosize);
5646
5647                 uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
5648                 uio_addiov(auio, (uintptr_t)bufp, iosize);
5649
5650                 error = cluster_read(vp, auio, copysize, IO_NOCACHE);
5651                 if (error) {
5652                         printf("hfs_clonefile: cluster_read failed - %d\n", error);
5653                         break;
5654                 }
5655                 if (uio_resid(auio) != 0) {
5656                         printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
5657                         error = EIO;
5658                         break;
5659                 }
5660
5661                 uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
5662                 uio_addiov(auio, (uintptr_t)bufp, iosize);
5663
5664                 error = cluster_write(vp, auio, writebase + offset,
5665                                       writebase + offset + iosize,
5666                                       uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
5667                 if (error) {
5668                         printf("hfs_clonefile: cluster_write failed - %d\n", error);
5669                         break;
5670                 }
5671                 if (uio_resid(auio) != 0) {
5672                         printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
5673                         error = EIO;
5674                         break;
5675                 }
5676                 offset += iosize;
5677         }
5678         uio_free(auio);
5679
5680         if ((blksize & PAGE_MASK)) {
5681                 /*
5682                  * since the copy may not have started on a PAGE
5683                  * boundary (or may not have ended on one), we
5684                  * may have pages left in the cache since NOCACHE
5685                  * will let partially written pages linger...
5686                  * lets just flush the entire range to make sure
5687                  * we don't have any pages left that are beyond
5688                  * (or intersect) the real LEOF of this file
5689                  */
5690                 ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
5691         } else {
5692                 /*
5693                  * No need to call ubc_msync or hfs_invalbuf
5694                  * since the file was copied using IO_NOCACHE and
5695                  * the copy was done starting and ending on a page
5696                  * boundary in the file.
5697                  */
5698         }
5699     hfs_free(bufp, bufsize);
5700
5701         hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5702         return (error);
5703 }
5704
5705 /*
5706  * Clone a system (metadata) file.
5707  *
5708  */
5709 static int
5710 hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
5711                  kauth_cred_t cred, struct proc *p)
5712 {
5713         caddr_t  bufp;
5714         char * offset;
5715         size_t  bufsize;
5716         size_t  iosize;
5717         struct buf *bp = NULL;
5718         daddr64_t  blkno;
5719         daddr64_t  blk;
5720         daddr64_t  start_blk;
5721         daddr64_t  last_blk;
5722         int  breadcnt;
5723         int  i;
5724         int  error = 0;
5725
5726
5727         iosize = GetLogicalBlockSize(vp);
5728         bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
5729         breadcnt = bufsize / iosize;
5730
5731     bufp = hfs_malloc(bufsize);
5732
5733         start_blk = ((daddr64_t)blkstart * blksize) / iosize;
5734         last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;
5735         blkno = 0;
5736
5737         while (blkno < last_blk) {
5738                 /*
5739                  * Read up to a megabyte
5740                  */
5741                 offset = bufp;
5742                 for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
5743                         error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
5744                         if (error) {
5745                                 printf("hfs_clonesysfile: meta_bread error %d\n", error);
5746                                 goto out;
5747                         }
5748                         if (buf_count(bp) != iosize) {
5749                                 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
5750                                 goto out;
5751                         }
5752                         bcopy((char *)buf_dataptr(bp), offset, iosize);
5753
5754                         buf_markinvalid(bp);
5755                         buf_brelse(bp);
5756                         bp = NULL;
5757
5758                         offset += iosize;
5759                 }
5760
5761                 /*
5762                  * Write up to a megabyte
5763                  */
5764                 offset = bufp;
5765                 for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
5766                         bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
5767                         if (bp == NULL) {
5768                                 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
5769                                 error = EIO;
5770                                 goto out;
5771                         }
5772                         bcopy(offset, (char *)buf_dataptr(bp), iosize);
5773                         error = (int)buf_bwrite(bp);
5774                         bp = NULL;
5775                         if (error)
5776                                 goto out;
5777                         offset += iosize;
5778                 }
5779         }
5780 out:
5781         if (bp) {
5782                 buf_brelse(bp);
5783         }
5784
5785     hfs_free(bufp, bufsize);
5786
5787         error = hfs_fsync(vp, MNT_WAIT, 0, p);
5788
5789         return (error);
5790 }
5791
5792 errno_t hfs_flush_invalid_ranges(vnode_t vp)
5793 {
5794         cnode_t *cp = VTOC(vp);
5795
5796         hfs_assert(cp->c_lockowner == current_thread());
5797         hfs_assert(cp->c_truncatelockowner == current_thread());
5798
5799         if (!ISSET(cp->c_flag, C_ZFWANTSYNC) && !cp->c_zftimeout)
5800                 return 0;
5801
5802         filefork_t *fp = VTOF(vp);
5803
5804         /*
5805          * We can't hold the cnode lock whilst we call cluster_write so we
5806          * need to copy the extents into a local buffer.
5807          */
5808         int max_exts = 16;
5809         struct ext {
5810                 off_t start, end;
5811         } exts_buf[max_exts];           // 256 bytes
5812         struct ext *exts = exts_buf;
5813         int ext_count = 0;
5814         errno_t ret;
5815
5816         struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
5817
5818         while (r) {
5819                 /* If we have more than can fit in our stack buffer, switch
5820                    to a heap buffer. */
5821                 if (exts == exts_buf && ext_count == max_exts) {
5822                         max_exts = 256;
5823                         exts = hfs_malloc(sizeof(struct ext) * max_exts);
5824                         memcpy(exts, exts_buf, ext_count * sizeof(struct ext));
5825                 }
5826
5827                 struct rl_entry *next = TAILQ_NEXT(r, rl_link);
5828
5829                 exts[ext_count++] = (struct ext){ r->rl_start, r->rl_end };
5830
5831                 if (!next || (ext_count == max_exts && exts != exts_buf)) {
5832                         hfs_unlock(cp);
5833                         for (int i = 0; i < ext_count; ++i) {
5834                                 ret = cluster_write(vp, NULL, fp->ff_size, exts[i].end + 1,
5835                                                                         exts[i].start, 0,
5836                                                                         IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE);
5837                                 if (ret) {
5838                                         hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5839                                         goto exit;
5840                                 }
5841                         }
5842
5843                         if (!next) {
5844                                 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5845                                 break;
5846                         }
5847
5848                         /* Push any existing clusters which should clean up our invalid
5849                            ranges as they go through hfs_vnop_blockmap. */
5850                         cluster_push(vp, 0);
5851
5852                         hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5853
5854                         /*
5855                          * Get back to where we were (given we dropped the lock).
5856                          * This shouldn't be many because we pushed above.
5857                          */
5858                         TAILQ_FOREACH(r, &fp->ff_invalidranges, rl_link) {
5859                                 if (r->rl_end > exts[ext_count - 1].end)
5860                                         break;
5861                         }
5862
5863                         ext_count = 0;
5864                 } else
5865                         r = next;
5866         }
5867
5868         ret = 0;
5869
5870 exit:
5871
5872         if (exts != exts_buf)
5873                 hfs_free(exts, sizeof(struct ext) * max_exts);
5874
5875         return ret;
5876 }