/*
 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*	@(#)hfs_readwrite.c	1.0
 *
 *	(c) 1998-2001 Apple Computer, Inc.  All Rights Reserved
 *
 *	hfs_readwrite.c -- vnode operations to deal with reading and writing files.
 *
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/buf_internal.h>
#include <sys/kauth.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/vfs_context.h>
#include <sys/fsevents.h>
#include <kern/kalloc.h>
#include <sys/sysctl.h>
#include <sys/fsctl.h>
#include <sys/mount_internal.h>
#include <miscfs/specfs/specdev.h>
#include <sys/ubc_internal.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <sys/kdebug.h>

#include "hfs_attrlist.h"
#include "hfs_endian.h"
#include "hfs_fsctl.h"
#include "hfs_quota.h"
#include "hfscommon/headers/FileMgrInternal.h"
#include "hfscommon/headers/BTreesInternal.h"
#include "hfs_cnode.h"
#define can_cluster(size)  ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))

enum {
	MAXHFSFILESIZE = 0x7FFFFFFF	/* this needs to go in the mount structure */
};

/* from bsd/hfs/hfs_vfsops.c */
extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);

static int  hfs_clonefile(struct vnode *, int, int, int);
static int  hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
static int  hfs_minorupdate(struct vnode *vp);
static int  do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);

/* from bsd/hfs/hfs_vnops.c */
extern decmpfs_cnode * hfs_lazy_init_decmpfs_cnode (struct cnode *cp);

int flush_cache_on_write = 0;
SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
/*
 * Read data from a file.
 */
int
hfs_vnop_read(struct vnop_read_args *ap)
{
	/*
	   struct vnop_read_args {
		struct vnodeop_desc *a_desc;
		...
		vfs_context_t a_context;
	   };
	 */
	uio_t uio = ap->a_uio;
	struct vnode *vp = ap->a_vp;
	struct hfsmount *hfsmp;
	off_t start_resid = uio_resid(uio);
	off_t offset = uio_offset(uio);
	int took_truncate_lock = 0;
	/* Preflight checks */
	if (!vnode_isreg(vp)) {
		/* can only read regular files */
	if (start_resid == 0)
		return (0);		/* Nothing left to do */
		return (EINVAL);	/* can't read from a negative offset */

	if (VNODE_IS_RSRC(vp)) {
		if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
		/* otherwise read the resource fork normally */
		int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
		retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
			/* successful read, update the access time */
			VTOC(vp)->c_touch_acctime = TRUE;

			/* compressed files are not hot file candidates */
			if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
				VTOF(vp)->ff_bytesread = 0;
		/* otherwise the file was converted back to a regular file while we were reading it */
	} else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
		error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
#endif /* HFS_COMPRESSION */
	if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {

	/*
	 * If this read request originated from a syscall (as opposed to
	 * an in-kernel page fault or something), then set it up for
	 * throttle checking.
	 */
	if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
		io_throttle = IO_RETURN_ON_THROTTLE;

	/* Protect against a size change. */
	hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
	took_truncate_lock = 1;

	filesize = fp->ff_size;
	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
	if (offset > filesize) {
		if ((hfsmp->hfs_flags & HFS_STANDARD) &&
		    (offset > (off_t)MAXHFSFILESIZE)) {

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
		(int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);

	retval = cluster_read(vp, uio, filesize, ap->a_ioflag | io_throttle);

	cp->c_touch_acctime = TRUE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
		(int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
	/*
	 * Keep track of blocks read.
	 */
	if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
		int took_cnode_lock = 0;

		bytesread = start_resid - uio_resid(uio);

		/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
		if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
			hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);

		/*
		 * If this file hasn't been seen since the start of
		 * the current sampling period then start over.
		 */
		if (cp->c_atime < hfsmp->hfc_timebase) {
			fp->ff_bytesread = bytesread;
			cp->c_atime = tv.tv_sec;
		fp->ff_bytesread += bytesread;

	if (took_truncate_lock) {
		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
	if (retval == EAGAIN) {
		throttle_lowpri_io(1);
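	/*
	 * Illustrative note (assumption): EAGAIN can only come back from
	 * cluster_read() here when IO_RETURN_ON_THROTTLE was requested above
	 * (i.e. the read arrived via IO_SYSCALL_DISPATCH), so throttle_lowpri_io(1)
	 * is expected to block for the throttling window before the remainder of
	 * the read is re-driven.
	 */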
/*
 * Write data to a file.
 */
int
hfs_vnop_write(struct vnop_write_args *ap)
{
	uio_t uio = ap->a_uio;
	struct vnode *vp = ap->a_vp;
	struct hfsmount *hfsmp;
	kauth_cred_t cred = NULL;
	off_t bytesToAdd = 0;
	off_t actualBytesAdded;
	int ioflag = ap->a_ioflag;
	int cnode_locked = 0;
	int partialwrite = 0;
	time_t orig_ctime = VTOC(vp)->c_ctime;
	int took_truncate_lock = 0;
	int io_return_on_throttle = 0;
	struct rl_entry *invalid_range;
	if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
		int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
			case FILE_IS_COMPRESSED:
			case FILE_IS_CONVERTING:
				/* if FILE_IS_CONVERTING, we allow writes but do not
				   bother with snapshots or else we will deadlock. */
				printf("invalid state %d for compressed file\n", state);
	} else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
		error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);

	check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
	resid = uio_resid(uio);
	offset = uio_offset(uio);

	if (!vnode_isreg(vp))
		return (EPERM);		/* Can only write regular files */

	if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {

	eflags = kEFDeferMask;	/* defer file block allocations */
#if HFS_SPARSE_DEV
	/*
	 * When the underlying device is sparse and space
	 * is low (< 8MB), stop doing delayed allocations
	 * and begin doing synchronous I/O.
	 */
	if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
	    (hfs_freeblks(hfsmp, 0) < 2048)) {
		eflags &= ~kEFDeferMask;
#endif /* HFS_SPARSE_DEV */

	if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) ==
		      (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
		io_return_on_throttle = IO_RETURN_ON_THROTTLE;
	/* Protect against a size change. */
	/*
	 * Protect against a size change.
	 *
	 * Note: If took_truncate_lock is true, then we previously got the lock shared
	 * but needed to upgrade to exclusive.  So try getting it exclusive from the
	 * start.
	 */
	if (ioflag & IO_APPEND || took_truncate_lock) {
		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
	took_truncate_lock = 1;

	if (ioflag & IO_APPEND) {
		uio_setoffset(uio, fp->ff_size);
		offset = fp->ff_size;
	if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
	origFileSize = fp->ff_size;
	writelimit = offset + resid;
	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;

	/*
	 * We may need an exclusive truncate lock for several reasons, all
	 * of which are because we may be writing to a (portion of a) block
	 * for the first time, and we need to make sure no readers see the
	 * prior, uninitialized contents of the block.  The cases are:
	 *
	 * 1. We have unallocated (delayed allocation) blocks.  We may be
	 *    allocating new blocks to the file and writing to them.
	 *    (A more precise check would be whether the range we're writing
	 *    to contains delayed allocation blocks.)
	 * 2. We need to extend the file.  The bytes between the old EOF
	 *    and the new EOF are not yet initialized.  This is important
	 *    even if we're not allocating new blocks to the file.  If the
	 *    old EOF and new EOF are in the same block, we still need to
	 *    protect that range of bytes until they are written for the
	 *    first time.
	 * 3. The write overlaps some invalid ranges (delayed zero fill; that
	 *    part of the file has been allocated, but not yet written).
	 *
	 * If we had a shared lock with the above cases, we need to try to upgrade
	 * to an exclusive lock.  If the upgrade fails, we will lose the shared
	 * lock, and will need to take the truncate lock again; the took_truncate_lock
	 * flag will still be set, causing us to try for an exclusive lock next time.
	 *
	 * NOTE: Testing for #3 (delayed zero fill) needs to be done while the cnode
	 * lock is held, since it protects the range lists.
	 */
	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
	    ((fp->ff_unallocblocks != 0) ||
	     (writelimit > origFileSize))) {
		if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
			/*
			 * Lock upgrade failed and we lost our shared lock, try again.
			 * Note: we do not set took_truncate_lock=0 here.  Leaving it
			 * set to 1 will cause us to try to get the lock exclusive.
			 */
		/* Store the owner in the c_truncatelockowner field if we successfully upgrade */
		cp->c_truncatelockowner = current_thread();
	if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {

	if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
		hfs_incr_gencount (cp);

	/*
	 * Now that we have the cnode lock, see if there are delayed zero fill ranges
	 * overlapping our write.  If so, we need the truncate lock exclusive (see above).
	 */
	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
	    (rl_scan(&fp->ff_invalidranges, offset, writelimit-1, &invalid_range) != RL_NOOVERLAP)) {
		/*
		 * When testing, it appeared that calling lck_rw_lock_shared_to_exclusive() causes
		 * a deadlock, rather than simply returning failure.  (That is, it apparently does
		 * not behave like a "try_lock").  Since this condition is rare, just drop the
		 * cnode lock and try again.  Since took_truncate_lock is set, we will
		 * automatically take the truncate lock exclusive.
		 */
		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
		(int)offset, uio_resid(uio), (int)fp->ff_size,
	/* Check if we do not need to extend the file */
	if (writelimit <= filebytes) {

	cred = vfs_context_ucred(ap->a_context);
	bytesToAdd = writelimit - filebytes;

	retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),

	if (hfs_start_transaction(hfsmp) != 0) {

	while (writelimit > filebytes) {
		bytesToAdd = writelimit - filebytes;
		if (cred && suser(cred, NULL) != 0)
			eflags |= kEFReserveMask;

		/* Protect extents b-tree and allocation bitmap */
		lockflags = SFL_BITMAP;
		if (overflow_extents(fp))
			lockflags |= SFL_EXTENTS;
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

		/* Files that are changing size are not hot file candidates. */
		if (hfsmp->hfc_stage == HFC_RECORDING) {
			fp->ff_bytesread = 0;
		retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
						    0, eflags, &actualBytesAdded));

		hfs_systemfile_unlock(hfsmp, lockflags);

		if ((actualBytesAdded == 0) && (retval == E_NONE))
		if (retval != E_NONE)
		filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
			(int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
	(void) hfs_update(vp, TRUE);
	(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
	(void) hfs_end_transaction(hfsmp);

	/*
	 * If we didn't grow the file enough try a partial write.
	 * POSIX expects this behavior.
	 */
	if ((retval == ENOSPC) && (filebytes > offset)) {
		uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
		writelimit = filebytes;

	if (retval == E_NONE) {
		if (writelimit > fp->ff_size)
			filesize = writelimit;
			filesize = fp->ff_size;

		lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);

		if (offset <= fp->ff_size) {
			zero_off = offset & ~PAGE_MASK_64;

			/* Check to see whether the area between the zero_offset and the start
			   of the transfer is invalid and should be zero-filled
			   as part of the transfer:
			 */
			if (offset > zero_off) {
				if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
					lflag |= IO_HEADZEROFILL;
			off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;

			/* The bytes between fp->ff_size and uio->uio_offset must never be
			   read without being zeroed.  The current last block is filled with zeroes
			   if it holds valid data but in all cases merely do a little bookkeeping
			   to track the area from the end of the current last page to the start of
			   the area actually written.  For the same reason only the bytes up to the
			   start of the page where this write will start are invalidated; any remainder
			   before uio->uio_offset is explicitly zeroed as part of the cluster_write.

			   Note that inval_start, the start of the page after the current EOF,
			   may be past the start of the write, in which case the zeroing
			   will be handled by the cluster_write of the actual data.
			 */
			inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
			inval_end = offset & ~PAGE_MASK_64;
			zero_off = fp->ff_size;

			if ((fp->ff_size & PAGE_MASK_64) &&
			    (rl_scan(&fp->ff_invalidranges,
				     &invalid_range) != RL_NOOVERLAP)) {
				/* The page containing the EOF is not valid, so the
				   entire page must be made inaccessible now.  If the write
				   starts on a page beyond the page containing the eof
				   (inval_end > eof_page_base), add the
				   whole page to the range to be invalidated.  Otherwise
				   (i.e. if the write starts on the same page), zero-fill
				   the entire page explicitly now:
				 */
				if (inval_end > eof_page_base) {
					inval_start = eof_page_base;
					zero_off = eof_page_base;

			if (inval_start < inval_end) {
				/* There's some range of data that's going to be marked invalid */

				if (zero_off < inval_start) {
					/* The pages between inval_start and inval_end are going to be invalidated,
					   and the actual write will start on a page past inval_end.  Now's the last
					   chance to zero-fill the page containing the EOF:
					 */
					retval = cluster_write(vp, (uio_t) 0,
							fp->ff_size, inval_start,
							lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
					hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
					if (retval) goto ioerr_exit;
					offset = uio_offset(uio);

				/* Mark the remaining area of the newly allocated space as invalid: */
				rl_add(inval_start, inval_end - 1, &fp->ff_invalidranges);
				cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
				zero_off = fp->ff_size = inval_end;

			if (offset > zero_off) lflag |= IO_HEADZEROFILL;
		/* Check to see whether the area between the end of the write and the end of
		   the page it falls in is invalid and should be zero-filled as part of the transfer:
		 */
		tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
		if (tail_off > filesize) tail_off = filesize;
		if (tail_off > writelimit) {
			if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
				lflag |= IO_TAILZEROFILL;

		/*
		 * if the write starts beyond the current EOF (possibly advanced in the
		 * zeroing of the last block, above), then we'll zero fill from the current EOF
		 * to where the write begins:
		 *
		 * NOTE: If (and ONLY if) the portion of the file about to be written is
		 * before the current EOF it might be marked as invalid now and must be
		 * made readable (removed from the invalid ranges) before cluster_write
		 * tries to write it.
		 */
		io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
		if (io_start < fp->ff_size) {
			io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
			rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
		/*
		 * We need to tell UBC the fork's new size BEFORE calling
		 * cluster_write, in case any of the new pages need to be
		 * paged out before cluster_write completes (which does happen
		 * in embedded systems due to extreme memory pressure).
		 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
		 * will be, so that it can pass that on to cluster_pageout, and
		 * allow those pageouts.
		 *
		 * We don't update ff_size yet since we don't want pageins to
		 * be able to see uninitialized data between the old and new
		 * EOF, until cluster_write has completed and initialized that
		 * part of the file.
		 *
		 * The vnode pager relies on the file size last given to UBC via
		 * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
		 * ff_size (whichever is larger).  NOTE: ff_new_size is always
		 * zero, unless we are extending the file via write.
		 */
		if (filesize > fp->ff_size) {
			fp->ff_new_size = filesize;
			ubc_setsize(vp, filesize);
		retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
				tail_off, lflag | IO_NOZERODIRTY | io_return_on_throttle);

		fp->ff_new_size = 0;	/* no longer extending; use ff_size */
		if (retval == EAGAIN) {
			/*
			 * EAGAIN indicates that we still have I/O to do, but
			 * that we now need to be throttled
			 */
			if (resid != uio_resid(uio)) {
				/*
				 * did manage to do some I/O before returning EAGAIN
				 */
				resid = uio_resid(uio);
				offset = uio_offset(uio);

				cp->c_touch_chgtime = TRUE;
				cp->c_touch_modtime = TRUE;
			if (filesize > fp->ff_size) {
				/*
				 * we called ubc_setsize before the call to
				 * cluster_write... since we only partially
				 * completed the I/O, we need to
				 * re-adjust our idea of the filesize based
				 * on what was actually transferred.
				 */
				ubc_setsize(vp, offset);

				fp->ff_size = offset;
			if (filesize > origFileSize) {
				ubc_setsize(vp, origFileSize);
		if (filesize > origFileSize) {
			fp->ff_size = filesize;

			/* Files that are changing size are not hot file candidates. */
			if (hfsmp->hfc_stage == HFC_RECORDING) {
				fp->ff_bytesread = 0;
		fp->ff_new_size = 0;	/* ff_size now has the correct size */

	/* If we wrote some bytes, then touch the change and mod times */
	if (resid > uio_resid(uio)) {
		cp->c_touch_chgtime = TRUE;
		cp->c_touch_modtime = TRUE;

	uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
	// XXXdbg - see radar 4871353 for more info
	if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
		VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);

	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (cp->c_mode & (S_ISUID | S_ISGID)) {
		cred = vfs_context_ucred(ap->a_context);
		if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
			hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
			cp->c_mode &= ~(S_ISUID | S_ISGID);
	if (ioflag & IO_UNIT) {
		hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
		(void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
				   0, 0, ap->a_context);
		uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
		uio_setresid(uio, resid);
		filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
	} else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
		hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
		retval = hfs_update(vp, TRUE);

	/* Updating vcbWrCnt doesn't need to be atomic. */

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
		(int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);

	if (took_truncate_lock) {
		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
	if (retval == EAGAIN) {
		throttle_lowpri_io(1);
/* support for the "bulk-access" fcntl */

#define CACHE_LEVELS 16
#define NUM_CACHE_ENTRIES (64*16)
#define PARENT_IDS_FLAG 0x100
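
/*
 * Illustrative note: NUM_CACHE_ENTRIES works out to 64*16 == 1024 slots, so
 * the access cache below can remember results for up to 1024 directory cnids
 * before add_node() starts replacing entries.
 */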
struct access_cache {
	int cachehits; /* these two for statistics gathering */
	unsigned int *acache;
	unsigned char *haveaccess;
};

struct access_t {
	uid_t     uid;         /* IN: effective user id */
	short     flags;       /* IN: access requested (i.e. R_OK) */
	short     num_groups;  /* IN: number of groups user belongs to */
	int       num_files;   /* IN: number of files to process */
	int       *file_ids;   /* IN: array of file ids */
	gid_t     *groups;     /* IN: array of groups */
	short     *access;     /* OUT: access info for each file (0 for 'has access') */
} __attribute__((unavailable)); // this structure is for reference purposes only
struct user32_access_t {
	uid_t          uid;         /* IN: effective user id */
	short          flags;       /* IN: access requested (i.e. R_OK) */
	short          num_groups;  /* IN: number of groups user belongs to */
	int            num_files;   /* IN: number of files to process */
	user32_addr_t  file_ids;    /* IN: array of file ids */
	user32_addr_t  groups;      /* IN: array of groups */
	user32_addr_t  access;      /* OUT: access info for each file (0 for 'has access') */
};

struct user64_access_t {
	uid_t          uid;         /* IN: effective user id */
	short          flags;       /* IN: access requested (i.e. R_OK) */
	short          num_groups;  /* IN: number of groups user belongs to */
	int            num_files;   /* IN: number of files to process */
	user64_addr_t  file_ids;    /* IN: array of file ids */
	user64_addr_t  groups;      /* IN: array of groups */
	user64_addr_t  access;      /* OUT: access info for each file (0 for 'has access') */
};
// these are the "extended" versions of the above structures
// note that it is crucial that they be different sized than
// the regular version
struct ext_access_t {
	uint32_t   flags;        /* IN: access requested (i.e. R_OK) */
	uint32_t   num_files;    /* IN: number of files to process */
	uint32_t   map_size;     /* IN: size of the bit map */
	uint32_t   *file_ids;    /* IN: Array of file ids */
	char       *bitmap;      /* OUT: hash-bitmap of interesting directory ids */
	short      *access;      /* OUT: access info for each file (0 for 'has access') */
	uint32_t   num_parents;  /* future use */
	cnid_t     *parents;     /* future use */
} __attribute__((unavailable)); // this structure is for reference purposes only

struct user32_ext_access_t {
	uint32_t       flags;        /* IN: access requested (i.e. R_OK) */
	uint32_t       num_files;    /* IN: number of files to process */
	uint32_t       map_size;     /* IN: size of the bit map */
	user32_addr_t  file_ids;     /* IN: Array of file ids */
	user32_addr_t  bitmap;       /* OUT: hash-bitmap of interesting directory ids */
	user32_addr_t  access;       /* OUT: access info for each file (0 for 'has access') */
	uint32_t       num_parents;  /* future use */
	user32_addr_t  parents;      /* future use */
};

struct user64_ext_access_t {
	uint32_t       flags;        /* IN: access requested (i.e. R_OK) */
	uint32_t       num_files;    /* IN: number of files to process */
	uint32_t       map_size;     /* IN: size of the bit map */
	user64_addr_t  file_ids;     /* IN: array of file ids */
	user64_addr_t  bitmap;       /* OUT: hash-bitmap of interesting directory ids */
	user64_addr_t  access;       /* OUT: access info for each file (0 for 'has access') */
	uint32_t       num_parents;  /* future use */
	user64_addr_t  parents;      /* future use */
};
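
/*
 * Illustrative sketch (assumption -- the exact user-level spelling of the
 * request is defined in hfs_fsctl.h and may differ): user space reaches the
 * bulk-access check through fsctl(2), handing in one of the structures above.
 * Roughly:
 *
 *	struct user32_ext_access_t args = { 0 };
 *	args.flags     = R_OK;
 *	args.num_files = nfiles;
 *	args.file_ids  = (user32_addr_t)(uintptr_t)ids;      // cnid array
 *	args.access    = (user32_addr_t)(uintptr_t)results;  // short per file, 0 == has access
 *	// fsctl(path, <bulk-access command>, &args, 0);
 *
 * do_bulk_access_check() below tells the plain and extended forms apart purely
 * by the size of the structure passed to it (arg_size).
 */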
/*
 * Perform a binary search for the given parent_id.  Return value is
 * the index if there is a match.  If no_match_indexp is non-NULL it
 * will be assigned with the index to insert the item (even if it was
 * not found).
 */
static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
{
		unsigned int mid = ((hi - lo)/2) + lo;
		unsigned int this_id = array[mid];

		if (parent_id == this_id) {
		if (parent_id < this_id) {
		if (parent_id > this_id) {

	/* check if lo and hi converged on the match */
	if (parent_id == array[hi]) {
	if (no_match_indexp) {
		*no_match_indexp = hi;
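
/*
 * Illustrative example: with array = { 3, 9, 17 } (hi == 2), a search for
 * parent_id 9 matches at index 1.  A search for 10 finds no match; per the
 * contract above, *no_match_indexp (when non-NULL) receives the index at
 * which 10 would have to be inserted to keep the array sorted (2 here).
 */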
static int
lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
{
	int index, no_match_index;

	if (cache->numcached == 0) {
		return 0; // table is empty, so insert at index=0 and report no match

	if (cache->numcached > NUM_CACHE_ENTRIES) {
		cache->numcached = NUM_CACHE_ENTRIES;

	hi = cache->numcached - 1;

	index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);

	/* if no existing entry found, find index for new one */
		index = no_match_index;
/*
 * Add a node to the access_cache at the given index (or do a lookup first
 * to find the index if -1 is passed in).  We currently do a replace rather
 * than an insert if the cache is full.
 */
static void
add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
{
	int lookup_index = -1;

	/* need to do a lookup first if -1 passed for index */
		if (lookup_bucket(cache, &lookup_index, nodeID)) {
			if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
				// only update an entry if the previous access was ESRCH (i.e. a scope checking error)
				cache->haveaccess[lookup_index] = access;

			/* mission accomplished */
			index = lookup_index;

	/* if the cache is full, do a replace rather than an insert */
	if (cache->numcached >= NUM_CACHE_ENTRIES) {
		cache->numcached = NUM_CACHE_ENTRIES-1;

		if (index > cache->numcached) {
			index = cache->numcached;

	if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {

	if (index >= 0 && index < cache->numcached) {
		/* only do bcopy if we're inserting */
		bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
		bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );

	cache->acache[index] = nodeID;
	cache->haveaccess[index] = access;
static int
snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
{
	struct cinfo *cip = (struct cinfo *)arg;

	cip->uid = attrp->ca_uid;
	cip->gid = attrp->ca_gid;
	cip->mode = attrp->ca_mode;
	cip->parentcnid = descp->cd_parentcnid;
	cip->recflags = attrp->ca_recflags;
/*
 * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id.  If the item
 * isn't incore, then go to the catalog.
 */
static int
do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
	       struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
{
	/* if this id matches the one the fsctl was called with, skip the lookup */
	if (cnid == skip_cp->c_cnid) {
		cnattrp->ca_uid = skip_cp->c_uid;
		cnattrp->ca_gid = skip_cp->c_gid;
		cnattrp->ca_mode = skip_cp->c_mode;
		cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
		keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
		struct cinfo c_info;

		/* otherwise, check the cnode hash in case the file/dir is incore */
		if (hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info) == 0) {
			cnattrp->ca_uid = c_info.uid;
			cnattrp->ca_gid = c_info.gid;
			cnattrp->ca_mode = c_info.mode;
			cnattrp->ca_recflags = c_info.recflags;
			keyp->hfsPlus.parentID = c_info.parentcnid;

		if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
			throttle_lowpri_io(1);

		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);

		/* lookup this cnid in the catalog */
		error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);

		hfs_systemfile_unlock(hfsmp, lockflags);
1096 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
1097 * up to CACHE_LEVELS as we progress towards the root.
1100 do_access_check(struct hfsmount
*hfsmp
, int *err
, struct access_cache
*cache
, HFSCatalogNodeID nodeID
,
1101 struct cnode
*skip_cp
, struct proc
*theProcPtr
, kauth_cred_t myp_ucred
,
1102 struct vfs_context
*my_context
,
1106 uint32_t num_parents
)
1110 HFSCatalogNodeID thisNodeID
;
1111 unsigned int myPerms
;
1112 struct cat_attr cnattr
;
1113 int cache_index
= -1, scope_index
= -1, scope_idx_start
= -1;
1116 int i
= 0, ids_to_cache
= 0;
1117 int parent_ids
[CACHE_LEVELS
];
1119 thisNodeID
= nodeID
;
1120 while (thisNodeID
>= kRootDirID
) {
1121 myResult
= 0; /* default to "no access" */
1123 /* check the cache before resorting to hitting the catalog */
1125 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
1126 * to look any further after hitting cached dir */
1128 if (lookup_bucket(cache
, &cache_index
, thisNodeID
)) {
1130 myErr
= cache
->haveaccess
[cache_index
];
1131 if (scope_index
!= -1) {
1132 if (myErr
== ESRCH
) {
1136 scope_index
= 0; // so we'll just use the cache result
1137 scope_idx_start
= ids_to_cache
;
1139 myResult
= (myErr
== 0) ? 1 : 0;
1140 goto ExitThisRoutine
;
1146 tmp
= cache_binSearch(parents
, num_parents
-1, thisNodeID
, NULL
);
1147 if (scope_index
== -1)
1149 if (tmp
!= -1 && scope_idx_start
== -1 && ids_to_cache
< CACHE_LEVELS
) {
1150 scope_idx_start
= ids_to_cache
;
1154 /* remember which parents we want to cache */
1155 if (ids_to_cache
< CACHE_LEVELS
) {
1156 parent_ids
[ids_to_cache
] = thisNodeID
;
1159 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
1160 if (bitmap
&& map_size
) {
1161 bitmap
[(thisNodeID
/8)%(map_size
)]|=(1<<(thisNodeID
&7));
1165 /* do the lookup (checks the cnode hash, then the catalog) */
1166 myErr
= do_attr_lookup(hfsmp
, cache
, thisNodeID
, skip_cp
, &catkey
, &cnattr
);
1168 goto ExitThisRoutine
; /* no access */
1171 /* Root always gets access. */
1172 if (suser(myp_ucred
, NULL
) == 0) {
1173 thisNodeID
= catkey
.hfsPlus
.parentID
;
1178 // if the thing has acl's, do the full permission check
1179 if ((cnattr
.ca_recflags
& kHFSHasSecurityMask
) != 0) {
1182 /* get the vnode for this cnid */
1183 myErr
= hfs_vget(hfsmp
, thisNodeID
, &vp
, 0, 0);
1186 goto ExitThisRoutine
;
1189 thisNodeID
= VTOC(vp
)->c_parentcnid
;
1191 hfs_unlock(VTOC(vp
));
1193 if (vnode_vtype(vp
) == VDIR
) {
1194 myErr
= vnode_authorize(vp
, NULL
, (KAUTH_VNODE_SEARCH
| KAUTH_VNODE_LIST_DIRECTORY
), my_context
);
1196 myErr
= vnode_authorize(vp
, NULL
, KAUTH_VNODE_READ_DATA
, my_context
);
1202 goto ExitThisRoutine
;
1206 int mode
= cnattr
.ca_mode
& S_IFMT
;
1207 myPerms
= DerivePermissionSummary(cnattr
.ca_uid
, cnattr
.ca_gid
, cnattr
.ca_mode
, hfsmp
->hfs_mp
,myp_ucred
, theProcPtr
);
1209 if (mode
== S_IFDIR
) {
1210 flags
= R_OK
| X_OK
;
1214 if ( (myPerms
& flags
) != flags
) {
1217 goto ExitThisRoutine
; /* no access */
1220 /* up the hierarchy we go */
1221 thisNodeID
= catkey
.hfsPlus
.parentID
;
1225 /* if here, we have access to this node */
1229 if (parents
&& myErr
== 0 && scope_index
== -1) {
1238 /* cache the parent directory(ies) */
1239 for (i
= 0; i
< ids_to_cache
; i
++) {
1240 if (myErr
== 0 && parents
&& (scope_idx_start
== -1 || i
> scope_idx_start
)) {
1241 add_node(cache
, -1, parent_ids
[i
], ESRCH
);
1243 add_node(cache
, -1, parent_ids
[i
], myErr
);
static int
do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
		     struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
{
	/*
	 * NOTE: on entry, the vnode has an io_ref.  In case this vnode
	 * happens to be in our list of file_ids, we'll note it to
	 * avoid calling hfs_chashget_nowait() on that id as that
	 * will cause a "locking against myself" panic.
	 */
1262 Boolean check_leaf
= true;
1264 struct user64_ext_access_t
*user_access_structp
;
1265 struct user64_ext_access_t tmp_user_access
;
1266 struct access_cache cache
;
1268 int error
= 0, prev_parent_check_ok
=1;
1272 unsigned int num_files
= 0;
1274 int num_parents
= 0;
1278 cnid_t
*parents
=NULL
;
1282 cnid_t prevParent_cnid
= 0;
1283 unsigned int myPerms
;
1285 struct cat_attr cnattr
;
1287 struct cnode
*skip_cp
= VTOC(vp
);
1288 kauth_cred_t cred
= vfs_context_ucred(context
);
1289 proc_t p
= vfs_context_proc(context
);
1291 is64bit
= proc_is64bit(p
);
1293 /* initialize the local cache and buffers */
1294 cache
.numcached
= 0;
1295 cache
.cachehits
= 0;
1297 cache
.acache
= NULL
;
1298 cache
.haveaccess
= NULL
;
1300 /* struct copyin done during dispatch... need to copy file_id array separately */
1301 if (ap
->a_data
== NULL
) {
1303 goto err_exit_bulk_access
;
1307 if (arg_size
!= sizeof(struct user64_ext_access_t
)) {
1309 goto err_exit_bulk_access
;
1312 user_access_structp
= (struct user64_ext_access_t
*)ap
->a_data
;
1314 } else if (arg_size
== sizeof(struct user32_access_t
)) {
1315 struct user32_access_t
*accessp
= (struct user32_access_t
*)ap
->a_data
;
1317 // convert an old style bulk-access struct to the new style
1318 tmp_user_access
.flags
= accessp
->flags
;
1319 tmp_user_access
.num_files
= accessp
->num_files
;
1320 tmp_user_access
.map_size
= 0;
1321 tmp_user_access
.file_ids
= CAST_USER_ADDR_T(accessp
->file_ids
);
1322 tmp_user_access
.bitmap
= USER_ADDR_NULL
;
1323 tmp_user_access
.access
= CAST_USER_ADDR_T(accessp
->access
);
1324 tmp_user_access
.num_parents
= 0;
1325 user_access_structp
= &tmp_user_access
;
1327 } else if (arg_size
== sizeof(struct user32_ext_access_t
)) {
1328 struct user32_ext_access_t
*accessp
= (struct user32_ext_access_t
*)ap
->a_data
;
1330 // up-cast from a 32-bit version of the struct
1331 tmp_user_access
.flags
= accessp
->flags
;
1332 tmp_user_access
.num_files
= accessp
->num_files
;
1333 tmp_user_access
.map_size
= accessp
->map_size
;
1334 tmp_user_access
.num_parents
= accessp
->num_parents
;
1336 tmp_user_access
.file_ids
= CAST_USER_ADDR_T(accessp
->file_ids
);
1337 tmp_user_access
.bitmap
= CAST_USER_ADDR_T(accessp
->bitmap
);
1338 tmp_user_access
.access
= CAST_USER_ADDR_T(accessp
->access
);
1339 tmp_user_access
.parents
= CAST_USER_ADDR_T(accessp
->parents
);
1341 user_access_structp
= &tmp_user_access
;
1344 goto err_exit_bulk_access
;
1347 map_size
= user_access_structp
->map_size
;
1349 num_files
= user_access_structp
->num_files
;
1351 num_parents
= user_access_structp
->num_parents
;
1353 if (num_files
< 1) {
1354 goto err_exit_bulk_access
;
1356 if (num_files
> 1024) {
1358 goto err_exit_bulk_access
;
1361 if (num_parents
> 1024) {
1363 goto err_exit_bulk_access
;
1366 file_ids
= (int *) kalloc(sizeof(int) * num_files
);
1367 access
= (short *) kalloc(sizeof(short) * num_files
);
1369 bitmap
= (char *) kalloc(sizeof(char) * map_size
);
1373 parents
= (cnid_t
*) kalloc(sizeof(cnid_t
) * num_parents
);
1376 cache
.acache
= (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES
);
1377 cache
.haveaccess
= (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1379 if (file_ids
== NULL
|| access
== NULL
|| (map_size
!= 0 && bitmap
== NULL
) || cache
.acache
== NULL
|| cache
.haveaccess
== NULL
) {
1381 kfree(file_ids
, sizeof(int) * num_files
);
1384 kfree(bitmap
, sizeof(char) * map_size
);
1387 kfree(access
, sizeof(short) * num_files
);
1390 kfree(cache
.acache
, sizeof(int) * NUM_CACHE_ENTRIES
);
1392 if (cache
.haveaccess
) {
1393 kfree(cache
.haveaccess
, sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1396 kfree(parents
, sizeof(cnid_t
) * num_parents
);
1401 // make sure the bitmap is zero'ed out...
1403 bzero(bitmap
, (sizeof(char) * map_size
));
1406 if ((error
= copyin(user_access_structp
->file_ids
, (caddr_t
)file_ids
,
1407 num_files
* sizeof(int)))) {
1408 goto err_exit_bulk_access
;
1412 if ((error
= copyin(user_access_structp
->parents
, (caddr_t
)parents
,
1413 num_parents
* sizeof(cnid_t
)))) {
1414 goto err_exit_bulk_access
;
1418 flags
= user_access_structp
->flags
;
1419 if ((flags
& (F_OK
| R_OK
| W_OK
| X_OK
)) == 0) {
1423 /* check if we've been passed leaf node ids or parent ids */
1424 if (flags
& PARENT_IDS_FLAG
) {
1428 /* Check access to each file_id passed in */
1429 for (i
= 0; i
< num_files
; i
++) {
1431 cnid
= (cnid_t
) file_ids
[i
];
1433 /* root always has access */
1434 if ((!parents
) && (!suser(cred
, NULL
))) {
1440 /* do the lookup (checks the cnode hash, then the catalog) */
1441 error
= do_attr_lookup(hfsmp
, &cache
, cnid
, skip_cp
, &catkey
, &cnattr
);
1443 access
[i
] = (short) error
;
1448 // Check if the leaf matches one of the parent scopes
1449 leaf_index
= cache_binSearch(parents
, num_parents
-1, cnid
, NULL
);
1450 if (leaf_index
>= 0 && parents
[leaf_index
] == cnid
)
1451 prev_parent_check_ok
= 0;
1452 else if (leaf_index
>= 0)
1453 prev_parent_check_ok
= 1;
1456 // if the thing has acl's, do the full permission check
1457 if ((cnattr
.ca_recflags
& kHFSHasSecurityMask
) != 0) {
1460 /* get the vnode for this cnid */
1461 myErr
= hfs_vget(hfsmp
, cnid
, &cvp
, 0, 0);
1467 hfs_unlock(VTOC(cvp
));
1469 if (vnode_vtype(cvp
) == VDIR
) {
1470 myErr
= vnode_authorize(cvp
, NULL
, (KAUTH_VNODE_SEARCH
| KAUTH_VNODE_LIST_DIRECTORY
), context
);
1472 myErr
= vnode_authorize(cvp
, NULL
, KAUTH_VNODE_READ_DATA
, context
);
1481 /* before calling CheckAccess(), check the target file for read access */
1482 myPerms
= DerivePermissionSummary(cnattr
.ca_uid
, cnattr
.ca_gid
,
1483 cnattr
.ca_mode
, hfsmp
->hfs_mp
, cred
, p
);
1485 /* fail fast if no access */
1486 if ((myPerms
& flags
) == 0) {
1492 /* we were passed an array of parent ids */
1493 catkey
.hfsPlus
.parentID
= cnid
;
1496 /* if the last guy had the same parent and had access, we're done */
1497 if (i
> 0 && catkey
.hfsPlus
.parentID
== prevParent_cnid
&& access
[i
-1] == 0 && prev_parent_check_ok
) {
1503 myaccess
= do_access_check(hfsmp
, &error
, &cache
, catkey
.hfsPlus
.parentID
,
1504 skip_cp
, p
, cred
, context
,bitmap
, map_size
, parents
, num_parents
);
1506 if (myaccess
|| (error
== ESRCH
&& leaf_index
!= -1)) {
1507 access
[i
] = 0; // have access.. no errors to report
1509 access
[i
] = (error
!= 0 ? (short) error
: EACCES
);
1512 prevParent_cnid
= catkey
.hfsPlus
.parentID
;
1515 /* copyout the access array */
1516 if ((error
= copyout((caddr_t
)access
, user_access_structp
->access
,
1517 num_files
* sizeof (short)))) {
1518 goto err_exit_bulk_access
;
1520 if (map_size
&& bitmap
) {
1521 if ((error
= copyout((caddr_t
)bitmap
, user_access_structp
->bitmap
,
1522 map_size
* sizeof (char)))) {
1523 goto err_exit_bulk_access
;
1528 err_exit_bulk_access
:
1531 kfree(file_ids
, sizeof(int) * num_files
);
1533 kfree(parents
, sizeof(cnid_t
) * num_parents
);
1535 kfree(bitmap
, sizeof(char) * map_size
);
1537 kfree(access
, sizeof(short) * num_files
);
1539 kfree(cache
.acache
, sizeof(int) * NUM_CACHE_ENTRIES
);
1540 if (cache
.haveaccess
)
1541 kfree(cache
.haveaccess
, sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1547 /* end "bulk-access" support */
1551 * Callback for use with freeze ioctl.
1554 hfs_freezewrite_callback(struct vnode
*vp
, __unused
void *cargs
)
1556 vnode_waitforwrites(vp
, 0, 0, 0, "hfs freeze");
1562 * Control filesystem operating characteristics.
1565 hfs_vnop_ioctl( struct vnop_ioctl_args
/* {
1570 vfs_context_t a_context;
1573 struct vnode
* vp
= ap
->a_vp
;
1574 struct hfsmount
*hfsmp
= VTOHFS(vp
);
1575 vfs_context_t context
= ap
->a_context
;
1576 kauth_cred_t cred
= vfs_context_ucred(context
);
1577 proc_t p
= vfs_context_proc(context
);
1578 struct vfsstatfs
*vfsp
;
1580 off_t jnl_start
, jnl_size
;
1581 struct hfs_journal_info
*jip
;
1584 off_t uncompressed_size
= -1;
1585 int decmpfs_error
= 0;
1587 if (ap
->a_command
== F_RDADVISE
) {
1588 /* we need to inspect the decmpfs state of the file as early as possible */
1589 compressed
= hfs_file_is_compressed(VTOC(vp
), 0);
1591 if (VNODE_IS_RSRC(vp
)) {
1592 /* if this is the resource fork, treat it as if it were empty */
1593 uncompressed_size
= 0;
1595 decmpfs_error
= hfs_uncompressed_size_of_compressed_file(NULL
, vp
, 0, &uncompressed_size
, 0);
1596 if (decmpfs_error
!= 0) {
1597 /* failed to get the uncompressed size, we'll check for this later */
1598 uncompressed_size
= -1;
1603 #endif /* HFS_COMPRESSION */
1605 is64bit
= proc_is64bit(p
);
1610 if ((error
= cp_handle_vnop(vp
, CP_WRITE_ACCESS
, 0)) != 0) {
1614 #endif /* CONFIG_PROTECT */
1616 switch (ap
->a_command
) {
1620 struct vnode
*file_vp
;
1627 /* Caller must be owner of file system. */
1628 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1629 if (suser(cred
, NULL
) &&
1630 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1633 /* Target vnode must be file system's root. */
1634 if (!vnode_isvroot(vp
)) {
1637 bufptr
= (char *)ap
->a_data
;
1638 cnid
= strtoul(bufptr
, NULL
, 10);
1639 if (ap
->a_fflag
& HFS_GETPATH_VOLUME_RELATIVE
) {
1640 flags
|= BUILDPATH_VOLUME_RELATIVE
;
1643 /* We need to call hfs_vfs_vget to leverage the code that will
1644 * fix the origin list for us if needed, as opposed to calling
1645 * hfs_vget, since we will need the parent for build_path call.
1648 if ((error
= hfs_vfs_vget(HFSTOVFS(hfsmp
), cnid
, &file_vp
, context
))) {
1651 error
= build_path(file_vp
, bufptr
, sizeof(pathname_t
), &outlen
, flags
, context
);
1657 case HFS_GET_WRITE_GEN_COUNTER
:
1659 struct cnode
*cp
= NULL
;
1661 u_int32_t
*counter
= (u_int32_t
*)ap
->a_data
;
1665 if (vnode_isdir (vp
)) {
1671 error
= hfs_lock (cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
1673 struct ubc_info
*uip
;
1676 if (UBCINFOEXISTS(vp
)) {
1677 uip
= vp
->v_ubcinfo
;
1678 if (uip
->ui_flags
& UI_ISMAPPED
) {
1684 if (S_ISREG(cp
->c_attr
.ca_mode
) || S_ISLNK(cp
->c_attr
.ca_mode
)) {
1685 uint32_t gcount
= hfs_get_gencount(cp
);
1687 // Even though we return EBUSY for files that are mmap'ed
1688 // we also want to bump the value so that the write-gen
1689 // counter will always be different once the file is unmapped
1690 // (since the file may be unmapped but the pageouts have not
1694 hfs_incr_gencount (cp
);
1695 gcount
= hfs_get_gencount(cp
);
1702 /* not a file or dir? silently return */
1723 /* Caller must be owner of file system. */
1724 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1725 if (suser(cred
, NULL
) &&
1726 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1729 /* Target vnode must be file system's root. */
1730 if (!vnode_isvroot(vp
)) {
1733 linkfileid
= *(cnid_t
*)ap
->a_data
;
1734 if (linkfileid
< kHFSFirstUserCatalogNodeID
) {
1737 if ((error
= hfs_lookup_siblinglinks(hfsmp
, linkfileid
, &prevlinkid
, &nextlinkid
))) {
1740 if (ap
->a_command
== HFS_NEXT_LINK
) {
1741 *(cnid_t
*)ap
->a_data
= nextlinkid
;
1743 *(cnid_t
*)ap
->a_data
= prevlinkid
;
1748 case HFS_RESIZE_PROGRESS
: {
1750 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1751 if (suser(cred
, NULL
) &&
1752 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1753 return (EACCES
); /* must be owner of file system */
1755 if (!vnode_isvroot(vp
)) {
1758 /* file system must not be mounted read-only */
1759 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
1763 return hfs_resize_progress(hfsmp
, (u_int32_t
*)ap
->a_data
);
1766 case HFS_RESIZE_VOLUME
: {
1770 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1771 if (suser(cred
, NULL
) &&
1772 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1773 return (EACCES
); /* must be owner of file system */
1775 if (!vnode_isvroot(vp
)) {
1779 /* filesystem must not be mounted read only */
1780 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
1783 newsize
= *(u_int64_t
*)ap
->a_data
;
1784 cursize
= (u_int64_t
)hfsmp
->totalBlocks
* (u_int64_t
)hfsmp
->blockSize
;
1786 if (newsize
> cursize
) {
1787 return hfs_extendfs(hfsmp
, *(u_int64_t
*)ap
->a_data
, context
);
1788 } else if (newsize
< cursize
) {
1789 return hfs_truncatefs(hfsmp
, *(u_int64_t
*)ap
->a_data
, context
);
1794 case HFS_CHANGE_NEXT_ALLOCATION
: {
1795 int error
= 0; /* Assume success */
1798 if (vnode_vfsisrdonly(vp
)) {
1801 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1802 if (suser(cred
, NULL
) &&
1803 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1804 return (EACCES
); /* must be owner of file system */
1806 if (!vnode_isvroot(vp
)) {
1809 hfs_lock_mount(hfsmp
);
1810 location
= *(u_int32_t
*)ap
->a_data
;
1811 if ((location
>= hfsmp
->allocLimit
) &&
1812 (location
!= HFS_NO_UPDATE_NEXT_ALLOCATION
)) {
1814 goto fail_change_next_allocation
;
1816 /* Return previous value. */
1817 *(u_int32_t
*)ap
->a_data
= hfsmp
->nextAllocation
;
1818 if (location
== HFS_NO_UPDATE_NEXT_ALLOCATION
) {
1819 /* On magic value for location, set nextAllocation to next block
1820 * after metadata zone and set flag in mount structure to indicate
1821 * that nextAllocation should not be updated again.
1823 if (hfsmp
->hfs_metazone_end
!= 0) {
1824 HFS_UPDATE_NEXT_ALLOCATION(hfsmp
, hfsmp
->hfs_metazone_end
+ 1);
1826 hfsmp
->hfs_flags
|= HFS_SKIP_UPDATE_NEXT_ALLOCATION
;
1828 hfsmp
->hfs_flags
&= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION
;
1829 HFS_UPDATE_NEXT_ALLOCATION(hfsmp
, location
);
1831 MarkVCBDirty(hfsmp
);
1832 fail_change_next_allocation
:
1833 hfs_unlock_mount(hfsmp
);
1838 case HFS_SETBACKINGSTOREINFO
: {
1839 struct vnode
* bsfs_rootvp
;
1840 struct vnode
* di_vp
;
1841 struct hfs_backingstoreinfo
*bsdata
;
1844 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
1847 if (hfsmp
->hfs_flags
& HFS_HAS_SPARSE_DEVICE
) {
1850 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1851 if (suser(cred
, NULL
) &&
1852 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1853 return (EACCES
); /* must be owner of file system */
1855 bsdata
= (struct hfs_backingstoreinfo
*)ap
->a_data
;
1856 if (bsdata
== NULL
) {
1859 if ((error
= file_vnode(bsdata
->backingfd
, &di_vp
))) {
1862 if ((error
= vnode_getwithref(di_vp
))) {
1863 file_drop(bsdata
->backingfd
);
1867 if (vnode_mount(vp
) == vnode_mount(di_vp
)) {
1868 (void)vnode_put(di_vp
);
1869 file_drop(bsdata
->backingfd
);
1874 * Obtain the backing fs root vnode and keep a reference
1875 * on it. This reference will be dropped in hfs_unmount.
1877 error
= VFS_ROOT(vnode_mount(di_vp
), &bsfs_rootvp
, NULL
); /* XXX use context! */
1879 (void)vnode_put(di_vp
);
1880 file_drop(bsdata
->backingfd
);
1883 vnode_ref(bsfs_rootvp
);
1884 vnode_put(bsfs_rootvp
);
1886 hfsmp
->hfs_backingfs_rootvp
= bsfs_rootvp
;
1888 hfsmp
->hfs_flags
|= HFS_HAS_SPARSE_DEVICE
;
1889 /* The free extent cache is managed differently for sparse devices.
1890 * There is a window between which the volume is mounted and the
1891 * device is marked as sparse, so the free extent cache for this
1892 * volume is currently initialized as normal volume (sorted by block
1893 * count). Reset the cache so that it will be rebuilt again
1894 * for sparse device (sorted by start block).
1896 ResetVCBFreeExtCache(hfsmp
);
1898 hfsmp
->hfs_sparsebandblks
= bsdata
->bandsize
/ HFSTOVCB(hfsmp
)->blockSize
;
1899 hfsmp
->hfs_sparsebandblks
*= 4;
1901 /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
1904 * If the sparse image is on a sparse image file (as opposed to a sparse
1905 * bundle), then we may need to limit the free space to the maximum size
1906 * of a file on that volume. So we query (using pathconf), and if we get
1907 * a meaningful result, we cache the number of blocks for later use in
1910 hfsmp
->hfs_backingfs_maxblocks
= 0;
1911 if (vnode_vtype(di_vp
) == VREG
) {
1914 terr
= vn_pathconf(di_vp
, _PC_FILESIZEBITS
, &hostbits
, context
);
1915 if (terr
== 0 && hostbits
!= 0 && hostbits
< 64) {
1916 u_int64_t hostfilesizemax
= ((u_int64_t
)1) << hostbits
;
1918 hfsmp
->hfs_backingfs_maxblocks
= hostfilesizemax
/ hfsmp
->blockSize
;
1922 (void)vnode_put(di_vp
);
1923 file_drop(bsdata
->backingfd
);
1926 case HFS_CLRBACKINGSTOREINFO
: {
1927 struct vnode
* tmpvp
;
1929 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1930 if (suser(cred
, NULL
) &&
1931 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1932 return (EACCES
); /* must be owner of file system */
1934 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
1938 if ((hfsmp
->hfs_flags
& HFS_HAS_SPARSE_DEVICE
) &&
1939 hfsmp
->hfs_backingfs_rootvp
) {
1941 hfsmp
->hfs_flags
&= ~HFS_HAS_SPARSE_DEVICE
;
1942 tmpvp
= hfsmp
->hfs_backingfs_rootvp
;
1943 hfsmp
->hfs_backingfs_rootvp
= NULLVP
;
1944 hfsmp
->hfs_sparsebandblks
= 0;
1949 #endif /* HFS_SPARSE_DEV */
1951 /* Change the next CNID stored in the VH */
1952 case HFS_CHANGE_NEXTCNID
: {
1953 int error
= 0; /* Assume success */
1958 if (vnode_vfsisrdonly(vp
)) {
1961 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1962 if (suser(cred
, NULL
) &&
1963 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1964 return (EACCES
); /* must be owner of file system */
1967 fileid
= *(u_int32_t
*)ap
->a_data
;
1969 /* Must have catalog lock excl. to advance the CNID pointer */
1970 lockflags
= hfs_systemfile_lock (hfsmp
, SFL_CATALOG
, HFS_EXCLUSIVE_LOCK
);
1972 hfs_lock_mount(hfsmp
);
1974 /* If it is less than the current next CNID, force the wraparound bit to be set */
1975 if (fileid
< hfsmp
->vcbNxtCNID
) {
1979 /* Return previous value. */
1980 *(u_int32_t
*)ap
->a_data
= hfsmp
->vcbNxtCNID
;
1982 hfsmp
->vcbNxtCNID
= fileid
;
1985 hfsmp
->vcbAtrb
|= kHFSCatalogNodeIDsReusedMask
;
1988 MarkVCBDirty(hfsmp
);
1989 hfs_unlock_mount(hfsmp
);
1990 hfs_systemfile_unlock (hfsmp
, lockflags
);
1998 mp
= vnode_mount(vp
);
1999 hfsmp
= VFSTOHFS(mp
);
2004 vfsp
= vfs_statfs(mp
);
2006 if (kauth_cred_getuid(cred
) != vfsp
->f_owner
&&
2007 !kauth_cred_issuser(cred
))
2010 lck_rw_lock_exclusive(&hfsmp
->hfs_insync
);
2012 // flush things before we get started to try and prevent
2013 // dirty data from being paged out while we're frozen.
2014 // note: can't do this after taking the lock as it will
2015 // deadlock against ourselves.
2016 vnode_iterate(mp
, 0, hfs_freezewrite_callback
, NULL
);
2017 hfs_lock_global (hfsmp
, HFS_EXCLUSIVE_LOCK
);
2019 // DO NOT call hfs_journal_flush() because that takes a
2020 // shared lock on the global exclusive lock!
2021 journal_flush(hfsmp
->jnl
, TRUE
);
2023 // don't need to iterate on all vnodes, we just need to
2024 // wait for writes to the system files and the device vnode
2026 // Now that journal flush waits for all metadata blocks to
2027 // be written out, waiting for btree writes is probably no
2029 if (HFSTOVCB(hfsmp
)->extentsRefNum
)
2030 vnode_waitforwrites(HFSTOVCB(hfsmp
)->extentsRefNum
, 0, 0, 0, "hfs freeze");
2031 if (HFSTOVCB(hfsmp
)->catalogRefNum
)
2032 vnode_waitforwrites(HFSTOVCB(hfsmp
)->catalogRefNum
, 0, 0, 0, "hfs freeze");
2033 if (HFSTOVCB(hfsmp
)->allocationsRefNum
)
2034 vnode_waitforwrites(HFSTOVCB(hfsmp
)->allocationsRefNum
, 0, 0, 0, "hfs freeze");
2035 if (hfsmp
->hfs_attribute_vp
)
2036 vnode_waitforwrites(hfsmp
->hfs_attribute_vp
, 0, 0, 0, "hfs freeze");
2037 vnode_waitforwrites(hfsmp
->hfs_devvp
, 0, 0, 0, "hfs freeze");
2039 hfsmp
->hfs_freezing_proc
= current_proc();
2045 vfsp
= vfs_statfs(vnode_mount(vp
));
2046 if (kauth_cred_getuid(cred
) != vfsp
->f_owner
&&
2047 !kauth_cred_issuser(cred
))
2050 // if we're not the one who froze the fs then we
2052 if (hfsmp
->hfs_freezing_proc
!= current_proc()) {
2056 // NOTE: if you add code here, also go check the
2057 // code that "thaws" the fs in hfs_vnop_close()
2059 hfsmp
->hfs_freezing_proc
= NULL
;
2060 hfs_unlock_global (hfsmp
);
2061 lck_rw_unlock_exclusive(&hfsmp
->hfs_insync
);
2066 case HFS_BULKACCESS_FSCTL
: {
2069 if (hfsmp
->hfs_flags
& HFS_STANDARD
) {
2074 size
= sizeof(struct user64_access_t
);
2076 size
= sizeof(struct user32_access_t
);
2079 return do_bulk_access_check(hfsmp
, vp
, ap
, size
, context
);
2082 case HFS_EXT_BULKACCESS_FSCTL
: {
2085 if (hfsmp
->hfs_flags
& HFS_STANDARD
) {
2090 size
= sizeof(struct user64_ext_access_t
);
2092 size
= sizeof(struct user32_ext_access_t
);
2095 return do_bulk_access_check(hfsmp
, vp
, ap
, size
, context
);
2098 case HFS_SET_XATTREXTENTS_STATE
: {
2101 if (ap
->a_data
== NULL
) {
2105 state
= *(int *)ap
->a_data
;
2107 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
2111 /* Super-user can enable or disable extent-based extended
2112 * attribute support on a volume
2113 * Note: Starting Mac OS X 10.7, extent-based extended attributes
2114 * are enabled by default, so any change will be transient only
2115 * till the volume is remounted.
2117 if (!kauth_cred_issuser(kauth_cred_get())) {
2120 if (state
== 0 || state
== 1)
2121 return hfs_set_volxattr(hfsmp
, HFS_SET_XATTREXTENTS_STATE
, state
);
2126 case F_SETSTATICCONTENT
: {
2128 int enable_static
= 0;
2129 struct cnode
*cp
= NULL
;
2131 * lock the cnode, decorate the cnode flag, and bail out.
2132 * VFS should have already authenticated the caller for us.
2137 * Note that even though ap->a_data is of type caddr_t,
2138 * the fcntl layer at the syscall handler will pass in NULL
2139 * or 1 depending on what the argument supplied to the fcntl
2140 * was. So it is in fact correct to check the ap->a_data
2141 * argument for zero or non-zero value when deciding whether or not
2142 * to enable the static bit in the cnode.
2146 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
2151 error
= hfs_lock (cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
2153 if (enable_static
) {
2154 cp
->c_flag
|= C_SSD_STATIC
;
2157 cp
->c_flag
&= ~C_SSD_STATIC
;
2164 case F_SET_GREEDY_MODE
: {
2166 int enable_greedy_mode
= 0;
2167 struct cnode
*cp
= NULL
;
2169 * lock the cnode, decorate the cnode flag, and bail out.
2170 * VFS should have already authenticated the caller for us.
2175 * Note that even though ap->a_data is of type caddr_t,
2176 * the fcntl layer at the syscall handler will pass in NULL
2177 * or 1 depending on what the argument supplied to the fcntl
2178 * was. So it is in fact correct to check the ap->a_data
2179 * argument for zero or non-zero value when deciding whether or not
2180 * to enable the greedy mode bit in the cnode.
2182 enable_greedy_mode
= 1;
2184 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
2189 error
= hfs_lock (cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
2191 if (enable_greedy_mode
) {
2192 cp
->c_flag
|= C_SSD_GREEDY_MODE
;
2195 cp
->c_flag
&= ~C_SSD_GREEDY_MODE
;
2202 case F_MAKECOMPRESSED
: {
2204 uint32_t gen_counter
;
2205 struct cnode
*cp
= NULL
;
2206 int reset_decmp
= 0;
2208 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
2213 * acquire & lock the cnode.
2214 * VFS should have already authenticated the caller for us.
2219 * Cast the pointer into a uint32_t so we can extract the
2220 * supplied generation counter.
2222 gen_counter
= *((uint32_t*)ap
->a_data
);
2230 /* Grab truncate lock first; we may truncate the file */
2231 hfs_lock_truncate (cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
2233 error
= hfs_lock (cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
2235 hfs_unlock_truncate(cp
, HFS_LOCK_DEFAULT
);
2239 /* Are there any other usecounts/FDs? */
2240 if (vnode_isinuse(vp
, 1)) {
2242 hfs_unlock_truncate(cp
, HFS_LOCK_DEFAULT
);
2247 /* now we have the cnode locked down; Validate arguments */
2248 if (cp
->c_attr
.ca_flags
& (UF_IMMUTABLE
| UF_COMPRESSED
)) {
2249 /* EINVAL if you are trying to manipulate an IMMUTABLE file */
2251 hfs_unlock_truncate (cp
, HFS_LOCK_DEFAULT
);
2255 if ((hfs_get_gencount (cp
)) == gen_counter
) {
2257 * OK, the gen_counter matched. Go for it:
2258 * Toggle state bits, truncate file, and suppress mtime update
2261 cp
->c_bsdflags
|= UF_COMPRESSED
;
2263 error
= hfs_truncate(vp
, 0, IO_NDELAY
, 0, (HFS_TRUNCATE_SKIPTIMES
), ap
->a_context
);
2269 /* Unlock cnode before executing decmpfs ; they may need to get an EA */
2273 * Reset the decmp state while still holding the truncate lock. We need to
2274 * serialize here against a listxattr on this node which may occur at any
2277 * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed,
2278 * that will still potentially require getting the com.apple.decmpfs EA. If the
2279 * EA is required, then we can't hold the cnode lock, because the getxattr call is
2280 * generic(through VFS), and can't pass along any info telling it that we're already
2281 * holding it (the lock). If we don't serialize, then we risk listxattr stopping
2282 * and trying to fill in the hfs_file_is_compressed info during the callback
2283 * operation, which will result in deadlock against the b-tree node.
2285 * So, to serialize against listxattr (which will grab buf_t meta references on
2286 * the b-tree blocks), we hold the truncate lock as we're manipulating the
2289 if ((reset_decmp
) && (error
== 0)) {
2290 decmpfs_cnode
*dp
= VTOCMP (vp
);
2292 decmpfs_cnode_set_vnode_state(dp
, FILE_TYPE_UNKNOWN
, 0);
2295 /* Initialize the decmpfs node as needed */
2296 (void) hfs_file_is_compressed (cp
, 0); /* ok to take lock */
2299 hfs_unlock_truncate (cp
, HFS_LOCK_DEFAULT
);
	case F_SETBACKINGSTORE: {
		int error = 0;

		/*
		 * See comment in F_SETSTATICCONTENT re: using
		 * a null check for a_data
		 */
		if (ap->a_data) {
			error = hfs_set_backingstore (vp, 1);
		} else {
			error = hfs_set_backingstore (vp, 0);
		}

		return error;
	}

	case F_GETPATH_MTMINFO: {
		int error = 0;

		int *data = (int*) ap->a_data;

		/* Ask if this is a backingstore vnode */
		error = hfs_is_backingstore (vp, data);

		return error;
	}
2337 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
2340 error
= hfs_lock(VTOC(vp
), HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
2342 error
= hfs_fsync(vp
, MNT_WAIT
, TRUE
, p
);
2343 hfs_unlock(VTOC(vp
));
2350 register struct cnode
*cp
;
2353 if (!vnode_isreg(vp
))
2356 error
= hfs_lock(VTOC(vp
), HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
2360 * used by regression test to determine if
2361 * all the dirty pages (via write) have been cleaned
2362 * after a call to 'fsysnc'.
2364 error
= is_file_clean(vp
, VTOF(vp
)->ff_size
);
	case F_RDADVISE: {
		register struct radvisory *ra;
		struct filefork *fp;
		int error;

		if (!vnode_isreg(vp))
			return (EINVAL);

		ra = (struct radvisory *)(ap->a_data);
		fp = VTOF(vp);

		/* Protect against a size change. */
		hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);

#if HFS_COMPRESSION
		if (compressed && (uncompressed_size == -1)) {
			/* fetching the uncompressed size failed above, so return the error */
			error = decmpfs_error;
		} else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
				(!compressed && (ra->ra_offset >= fp->ff_size))) {
			error = EFBIG;
		}
#else /* HFS_COMPRESSION */
		if (ra->ra_offset >= fp->ff_size) {
			error = EFBIG;
		}
#endif /* HFS_COMPRESSION */
		else {
			error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
		}

		hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
		return (error);
	}
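	/*
	 * Illustrative sketch (not part of this driver): F_RDADVISE is issued
	 * from userland with a struct radvisory describing the byte range to
	 * pre-read, assuming a regular file descriptor on an HFS volume.
	 *
	 *	#include <fcntl.h>
	 *
	 *	// Hint that [offset, offset+count) will be read soon.
	 *	int readahead_hint(int fd, off_t offset, int count)
	 *	{
	 *		struct radvisory ra = { .ra_offset = offset, .ra_count = count };
	 *		return fcntl(fd, F_RDADVISE, &ra);
	 *	}
	 */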
2405 case _IOC(IOC_OUT
,'h', 4, 0): /* Create date in local time */
2408 *(user_time_t
*)(ap
->a_data
) = (user_time_t
) (to_bsd_time(VTOVCB(vp
)->localCreateDate
));
2411 *(user32_time_t
*)(ap
->a_data
) = (user32_time_t
) (to_bsd_time(VTOVCB(vp
)->localCreateDate
));
	case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
		*(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
		break;

	case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
		*(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
		break;

	case HFS_FSCTL_GET_VERY_LOW_DISK:
		*(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
		break;

	case HFS_FSCTL_SET_VERY_LOW_DISK:
		if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
			return (EINVAL);
		}

		hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
		break;

	case HFS_FSCTL_GET_LOW_DISK:
		*(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
		break;

	case HFS_FSCTL_SET_LOW_DISK:
		if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
		    || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
			return (EINVAL);
		}

		hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
		break;

	case HFS_FSCTL_GET_DESIRED_DISK:
		*(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
		break;

	case HFS_FSCTL_SET_DESIRED_DISK:
		if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
			return (EINVAL);
		}

		hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
		break;
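	/*
	 * The three notification levels above are kept strictly ordered,
	 * dangerlimit < warninglimit < desiredlevel, which is why each setter
	 * rejects a value that would cross a neighbouring threshold.
	 * Illustrative sketch (assumption: these selectors are reachable from
	 * userland via fsctl(2) on a path inside the volume):
	 *
	 *	#include <sys/fsctl.h>
	 *
	 *	// Read back the current low-disk warning threshold for a volume.
	 *	int get_low_disk_limit(const char *volpath, uint32_t *value)
	 *	{
	 *		return fsctl(volpath, HFS_FSCTL_GET_LOW_DISK, value, 0);
	 *	}
	 */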
	case HFS_VOLUME_STATUS:
		*(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
		break;

	case HFS_SET_BOOT_INFO:
		if (!vnode_isvroot(vp))
			return (EINVAL);
		if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
			return (EACCES);	/* must be superuser or owner of filesystem */
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}
		hfs_lock_mount (hfsmp);
		bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
		hfs_unlock_mount (hfsmp);
		(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
		break;
	case HFS_GET_BOOT_INFO:
		if (!vnode_isvroot(vp))
			return (EINVAL);
		hfs_lock_mount (hfsmp);
		bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
		hfs_unlock_mount(hfsmp);
		break;

	case HFS_MARK_BOOT_CORRUPT:
		/* Mark the boot volume corrupt by setting
		 * kHFSVolumeInconsistentBit in the volume header.  This will
		 * force fsck_hfs on next mount.
		 */
		if (!kauth_cred_issuser(kauth_cred_get())) {
			return (EACCES);
		}

		/* Allowed only on the root vnode of the boot volume */
		if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
		    !vnode_isvroot(vp)) {
			return (EINVAL);
		}
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}
		printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
		hfs_mark_volume_inconsistent(hfsmp);
		break;
	case HFS_FSCTL_GET_JOURNAL_INFO:
		jip = (struct hfs_journal_info*)ap->a_data;

		if (hfsmp->jnl == NULL) {
			jnl_start = 0;
			jnl_size  = 0;
		} else {
			jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
			jnl_size  = (off_t)hfsmp->jnl_size;
		}

		jip->jstart = jnl_start;
		jip->jsize = jnl_size;
		break;
	case HFS_SET_ALWAYS_ZEROFILL: {
		struct cnode *cp = VTOC(vp);

		if (*(int *)ap->a_data) {
			cp->c_flag |= C_ALWAYS_ZEROFILL;
		} else {
			cp->c_flag &= ~C_ALWAYS_ZEROFILL;
		}
		break;
	}

	case HFS_DISABLE_METAZONE: {
		/* Only root can disable metadata zone */
		if (!kauth_cred_issuser(kauth_cred_get())) {
			return (EACCES);
		}
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}

		/* Disable metadata zone now */
		(void) hfs_metadatazone_init(hfsmp, true);
		printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
		break;
	}

	default:
		return (ENOTTY);
	}

	return 0;
}
int
hfs_vnop_select(__unused struct vnop_select_args *ap)
/*
	struct vnop_select_args {
		vnode_t a_vp;
		int a_which;
		int a_fflags;
		void *a_wql;
		vfs_context_t a_context;
	};
*/
{
	/*
	 * We should really check to see if I/O is possible.
	 */
	return (1);
}

/*
 * Converts a logical block number to a physical block, and optionally returns
 * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
 * The physical block number is based on the device block size, currently it's 512.
 * The block run is returned in logical blocks, and is the REMAINING amount of blocks.
 */
int
hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
{
	struct filefork *fp = VTOF(vp);
	struct hfsmount *hfsmp = VTOHFS(vp);
	int retval = E_NONE;
	u_int32_t logBlockSize;
	size_t bytesContAvail = 0;
	off_t blockposition;
	int lockExtBtree;
	int lockflags;

	/*
	 * Check for underlying vnode requests and ensure that logical
	 * to physical mapping is requested.
	 */
	if (vpp != NULL)
		*vpp = hfsmp->hfs_devvp;
	if (bnp == NULL)
		return (0);

	logBlockSize = GetLogicalBlockSize(vp);
	blockposition = (off_t)bn * logBlockSize;

	lockExtBtree = overflow_extents(fp);

	if (lockExtBtree)
		lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);

	retval = MacToVFSError(
			MapFileBlockC(HFSTOVCB(hfsmp), (FCB *)fp, MAXPHYSIO,
				blockposition, bnp, &bytesContAvail));

	if (lockExtBtree)
		hfs_systemfile_unlock(hfsmp, lockflags);

	if (retval == E_NONE) {
		/* Figure out how many read ahead blocks there are */
		if (runp != NULL) {
			if (can_cluster(logBlockSize)) {
				/* Make sure this result never goes negative: */
				*runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
			} else {
				*runp = 0;
			}
		}
	}
	return (retval);
}
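/*
 * Worked example of the read-ahead calculation above: with a 4096-byte
 * logical block size and MapFileBlockC reporting bytesContAvail = 16384,
 * the run is (16384 / 4096) - 1 = 3, i.e. three more contiguous logical
 * blocks follow the one just mapped.  If bytesContAvail were smaller than
 * one logical block, the clamp yields 0 rather than wrapping negative.
 */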
/*
 * Convert logical block number to file offset.
 */
int
hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
/*
	struct vnop_blktooff_args {
		vnode_t a_vp;
		daddr64_t a_lblkno;
		off_t *a_offset;
	};
*/
{
	if (ap->a_vp == NULL)
		return (EINVAL);

	*ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);

	return (0);
}
/*
 * Convert file offset to logical block number.
 */
int
hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
/*
	struct vnop_offtoblk_args {
		vnode_t a_vp;
		off_t a_offset;
		daddr64_t *a_lblkno;
	};
*/
{
	if (ap->a_vp == NULL)
		return (EINVAL);

	*ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));

	return (0);
}
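/*
 * Worked example for the two conversions above, assuming a 4096-byte
 * logical block size: blktooff maps logical block 10 to byte offset
 * 10 * 4096 = 40960, and offtoblk maps any offset in [40960, 45055]
 * back to logical block 40960 / 4096 = 10 (integer division).
 */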
2682 * Map file offset to physical block number.
2684 * If this function is called for write operation, and if the file
2685 * had virtual blocks allocated (delayed allocation), real blocks
2686 * are allocated by calling ExtendFileC().
2688 * If this function is called for read operation, and if the file
2689 * had virtual blocks allocated (delayed allocation), no change
2690 * to the size of file is done, and if required, rangelist is
2691 * searched for mapping.
2693 * System file cnodes are expected to be locked (shared or exclusive).
2696 hfs_vnop_blockmap(struct vnop_blockmap_args
*ap
)
2698 struct vnop_blockmap_args {
2706 vfs_context_t a_context;
2710 struct vnode
*vp
= ap
->a_vp
;
2712 struct filefork
*fp
;
2713 struct hfsmount
*hfsmp
;
2714 size_t bytesContAvail
= 0;
2715 int retval
= E_NONE
;
2718 struct rl_entry
*invalid_range
;
2719 enum rl_overlaptype overlaptype
;
2724 if (VNODE_IS_RSRC(vp
)) {
2725 /* allow blockmaps to the resource fork */
2727 if ( hfs_file_is_compressed(VTOC(vp
), 1) ) { /* 1 == don't take the cnode lock */
2728 int state
= decmpfs_cnode_get_vnode_state(VTOCMP(vp
));
2730 case FILE_IS_COMPRESSED
:
2732 case FILE_IS_CONVERTING
:
2733 /* if FILE_IS_CONVERTING, we allow blockmap */
2736 printf("invalid state %d for compressed file\n", state
);
2741 #endif /* HFS_COMPRESSION */
2743 /* Do not allow blockmap operation on a directory */
2744 if (vnode_isdir(vp
)) {
2749 * Check for underlying vnode requests and ensure that logical
2750 * to physical mapping is requested.
2752 if (ap
->a_bpn
== NULL
)
2755 if ( !vnode_issystem(vp
) && !vnode_islnk(vp
) && !vnode_isswap(vp
)) {
2756 if (VTOC(vp
)->c_lockowner
!= current_thread()) {
2757 hfs_lock(VTOC(vp
), HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
);
2766 /* Check virtual blocks only when performing write operation */
2767 if ((ap
->a_flags
& VNODE_WRITE
) && (fp
->ff_unallocblocks
!= 0)) {
2768 if (hfs_start_transaction(hfsmp
) != 0) {
2774 syslocks
= SFL_EXTENTS
| SFL_BITMAP
;
2776 } else if (overflow_extents(fp
)) {
2777 syslocks
= SFL_EXTENTS
;
2781 lockflags
= hfs_systemfile_lock(hfsmp
, syslocks
, HFS_EXCLUSIVE_LOCK
);
2784 * Check for any delayed allocations.
2786 if ((ap
->a_flags
& VNODE_WRITE
) && (fp
->ff_unallocblocks
!= 0)) {
2788 u_int32_t loanedBlocks
;
2791 // Make sure we have a transaction. It's possible
2792 // that we came in and fp->ff_unallocblocks was zero
2793 // but during the time we blocked acquiring the extents
2794 // btree, ff_unallocblocks became non-zero and so we
2795 // will need to start a transaction.
2797 if (started_tr
== 0) {
2799 hfs_systemfile_unlock(hfsmp
, lockflags
);
2806 * Note: ExtendFileC will Release any blocks on loan and
2807 * aquire real blocks. So we ask to extend by zero bytes
2808 * since ExtendFileC will account for the virtual blocks.
2811 loanedBlocks
= fp
->ff_unallocblocks
;
2812 retval
= ExtendFileC(hfsmp
, (FCB
*)fp
, 0, 0,
2813 kEFAllMask
| kEFNoClumpMask
, &actbytes
);
2816 fp
->ff_unallocblocks
= loanedBlocks
;
2817 cp
->c_blocks
+= loanedBlocks
;
2818 fp
->ff_blocks
+= loanedBlocks
;
2820 hfs_lock_mount (hfsmp
);
2821 hfsmp
->loanedBlocks
+= loanedBlocks
;
2822 hfs_unlock_mount (hfsmp
);
2824 hfs_systemfile_unlock(hfsmp
, lockflags
);
2825 cp
->c_flag
|= C_MODIFIED
;
2827 (void) hfs_update(vp
, TRUE
);
2828 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
2830 hfs_end_transaction(hfsmp
);
2837 retval
= MapFileBlockC(hfsmp
, (FCB
*)fp
, ap
->a_size
, ap
->a_foffset
,
2838 ap
->a_bpn
, &bytesContAvail
);
2840 hfs_systemfile_unlock(hfsmp
, lockflags
);
2845 (void) hfs_update(vp
, TRUE
);
2846 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
2847 hfs_end_transaction(hfsmp
);
2851 /* On write, always return error because virtual blocks, if any,
2852 * should have been allocated in ExtendFileC(). We do not
2853 * allocate virtual blocks on read, therefore return error
2854 * only if no virtual blocks are allocated. Otherwise we search
2855 * rangelist for zero-fills
2857 if ((MacToVFSError(retval
) != ERANGE
) ||
2858 (ap
->a_flags
& VNODE_WRITE
) ||
2859 ((ap
->a_flags
& VNODE_READ
) && (fp
->ff_unallocblocks
== 0))) {
2863 /* Validate if the start offset is within logical file size */
2864 if (ap
->a_foffset
>= fp
->ff_size
) {
2869 * At this point, we have encountered a failure during
2870 * MapFileBlockC that resulted in ERANGE, and we are not servicing
2871 * a write, and there are borrowed blocks.
2873 * However, the cluster layer will not call blockmap for
2874 * blocks that are borrowed and in-cache. We have to assume that
2875 * because we observed ERANGE being emitted from MapFileBlockC, this
2876 * extent range is not valid on-disk. So we treat this as a
2877 * mapping that needs to be zero-filled prior to reading.
2879 * Note that under certain circumstances (such as non-contiguous
2880 * userland VM mappings in the calling process), cluster_io
2881 * may be forced to split a large I/O driven by hfs_vnop_write
2882 * into multiple sub-I/Os that necessitate a RMW cycle. If this is
2883 * the case here, then we have already removed the invalid range list
2884 * mapping prior to getting to this blockmap call, so we should not
2885 * search the invalid rangelist for this byte range.
2888 bytesContAvail
= fp
->ff_size
- ap
->a_foffset
;
2890 * Clip the contiguous available bytes to, at most, the allowable
2891 * maximum or the amount requested.
2894 if (bytesContAvail
> ap
->a_size
) {
2895 bytesContAvail
= ap
->a_size
;
2898 *ap
->a_bpn
= (daddr64_t
) -1;
2904 /* MapFileC() found a valid extent in the filefork. Search the
2905 * mapping information further for invalid file ranges
2907 overlaptype
= rl_scan(&fp
->ff_invalidranges
, ap
->a_foffset
,
2908 ap
->a_foffset
+ (off_t
)bytesContAvail
- 1,
2910 if (overlaptype
!= RL_NOOVERLAP
) {
2911 switch(overlaptype
) {
2912 case RL_MATCHINGOVERLAP
:
2913 case RL_OVERLAPCONTAINSRANGE
:
2914 case RL_OVERLAPSTARTSBEFORE
:
2915 /* There's no valid block for this byte offset */
2916 *ap
->a_bpn
= (daddr64_t
)-1;
2917 /* There's no point limiting the amount to be returned
2918 * if the invalid range that was hit extends all the way
2919 * to the EOF (i.e. there's no valid bytes between the
2920 * end of this range and the file's EOF):
2922 if (((off_t
)fp
->ff_size
> (invalid_range
->rl_end
+ 1)) &&
2923 ((size_t)(invalid_range
->rl_end
+ 1 - ap
->a_foffset
) < bytesContAvail
)) {
2924 bytesContAvail
= invalid_range
->rl_end
+ 1 - ap
->a_foffset
;
2928 case RL_OVERLAPISCONTAINED
:
2929 case RL_OVERLAPENDSAFTER
:
2930 /* The range of interest hits an invalid block before the end: */
2931 if (invalid_range
->rl_start
== ap
->a_foffset
) {
2932 /* There's actually no valid information to be had starting here: */
2933 *ap
->a_bpn
= (daddr64_t
)-1;
2934 if (((off_t
)fp
->ff_size
> (invalid_range
->rl_end
+ 1)) &&
2935 ((size_t)(invalid_range
->rl_end
+ 1 - ap
->a_foffset
) < bytesContAvail
)) {
2936 bytesContAvail
= invalid_range
->rl_end
+ 1 - ap
->a_foffset
;
2939 bytesContAvail
= invalid_range
->rl_start
- ap
->a_foffset
;
2946 if (bytesContAvail
> ap
->a_size
)
2947 bytesContAvail
= ap
->a_size
;
2953 *ap
->a_run
= bytesContAvail
;
2956 *(int *)ap
->a_poff
= 0;
2962 return (MacToVFSError(retval
));
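/*
 * Summary of the contract implemented above, for callers of VNOP_BLOCKMAP:
 * on success *a_bpn holds the device block backing a_foffset and *a_run the
 * number of contiguous bytes mapped (clipped to a_size); a *a_bpn of -1
 * means "no valid on-disk data here yet", so the caller must zero-fill that
 * range instead of issuing a read.  A rough sketch of how a hypothetical
 * in-kernel caller might consume it (the scenario is illustrative only):
 *
 *	daddr64_t blkno;
 *	size_t run;
 *	if (VNOP_BLOCKMAP(vp, foffset, size, &blkno, &run, NULL,
 *	                  VNODE_READ, ctx) == 0) {
 *		if (blkno == -1) {
 *			// zero-fill 'run' bytes at 'foffset'
 *		} else {
 *			// read 'run' bytes starting at device block 'blkno'
 *		}
 *	}
 */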
2966 * prepare and issue the I/O
2967 * buf_strategy knows how to deal
2968 * with requests that require
2972 hfs_vnop_strategy(struct vnop_strategy_args
*ap
)
2974 buf_t bp
= ap
->a_bp
;
2975 vnode_t vp
= buf_vnode(bp
);
2978 /* Mark buffer as containing static data if cnode flag set */
2979 if (VTOC(vp
)->c_flag
& C_SSD_STATIC
) {
2983 /* Mark buffer as containing static data if cnode flag set */
2984 if (VTOC(vp
)->c_flag
& C_SSD_GREEDY_MODE
) {
2985 bufattr_markgreedymode((bufattr_t
)(&bp
->b_attr
));
2991 if ((cp
= cp_get_protected_cnode(vp
)) != NULL
) {
2993 * We rely upon the truncate lock to protect the
2994 * CP cache key from getting tossed prior to our IO finishing here.
2995 * Nearly all cluster io calls to manipulate file payload from HFS
2996 * take the truncate lock before calling into the cluster
2997 * layer to ensure the file size does not change, or that they
2998 * have exclusive right to change the EOF of the file.
2999 * That same guarantee protects us here since the code that
3000 * deals with CP lock events must now take the truncate lock
3001 * before doing anything.
3003 * There is 1 exception here:
3004 * 1) One exception should be the VM swapfile IO, because HFS will
3005 * funnel the VNOP_PAGEOUT directly into a cluster_pageout call for the
3006 * swapfile code only without holding the truncate lock. This is because
3007 * individual swapfiles are maintained at fixed-length sizes by the VM code.
3008 * In non-swapfile IO we use PAGEOUT_V2 semantics which allow us to
3009 * create our own UPL and thus take the truncate lock before calling
3010 * into the cluster layer. In that case, however, we are not concerned
3011 * with the CP blob being wiped out in the middle of the IO
3012 * because there isn't anything to toss; the VM swapfile key stays
3013 * in-core as long as the file is open.
3016 * For filesystem resize, we may not have access to the underlying
3017 * file's cache key for whatever reason (device may be locked). However,
3018 * we do not need it since we are going to use the temporary HFS-wide resize key
3019 * which is generated once we start relocating file content. If this file's I/O
3020 * should be done using the resize key, it will have been supplied already, so
3021 * do not attach the file's cp blob to the buffer.
3023 if ((cp
->c_cpentry
->cp_flags
& CP_RELOCATION_INFLIGHT
) == 0) {
3024 buf_setcpaddr(bp
, cp
->c_cpentry
);
3027 #endif /* CONFIG_PROTECT */
3029 error
= buf_strategy(VTOHFS(vp
)->hfs_devvp
, ap
);
static int
hfs_minorupdate(struct vnode *vp) {
	struct cnode *cp = VTOC(vp);
	cp->c_flag &= ~C_MODIFIED;
	cp->c_touch_acctime = 0;
	cp->c_touch_chgtime = 0;
	cp->c_touch_modtime = 0;

	return 0;
}
3046 do_hfs_truncate(struct vnode
*vp
, off_t length
, int flags
, int truncateflags
, vfs_context_t context
)
3048 register struct cnode
*cp
= VTOC(vp
);
3049 struct filefork
*fp
= VTOF(vp
);
3050 struct proc
*p
= vfs_context_proc(context
);;
3051 kauth_cred_t cred
= vfs_context_ucred(context
);
3054 off_t actualBytesAdded
;
3056 u_int32_t fileblocks
;
3058 struct hfsmount
*hfsmp
;
3060 int skipupdate
= (truncateflags
& HFS_TRUNCATE_SKIPUPDATE
);
3061 int suppress_times
= (truncateflags
& HFS_TRUNCATE_SKIPTIMES
);
3063 blksize
= VTOVCB(vp
)->blockSize
;
3064 fileblocks
= fp
->ff_blocks
;
3065 filebytes
= (off_t
)fileblocks
* (off_t
)blksize
;
3067 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 7)) | DBG_FUNC_START
,
3068 (int)length
, (int)fp
->ff_size
, (int)filebytes
, 0, 0);
3073 /* This should only happen with a corrupt filesystem */
3074 if ((off_t
)fp
->ff_size
< 0)
3077 if ((!ISHFSPLUS(VTOVCB(vp
))) && (length
> (off_t
)MAXHFSFILESIZE
))
3084 /* Files that are changing size are not hot file candidates. */
3085 if (hfsmp
->hfc_stage
== HFC_RECORDING
) {
3086 fp
->ff_bytesread
= 0;
3090 * We cannot just check if fp->ff_size == length (as an optimization)
3091 * since there may be extra physical blocks that also need truncation.
3094 if ((retval
= hfs_getinoquota(cp
)))
3099 * Lengthen the size of the file. We must ensure that the
3100 * last byte of the file is allocated. Since the smallest
3101 * value of ff_size is 0, length will be at least 1.
3103 if (length
> (off_t
)fp
->ff_size
) {
3105 retval
= hfs_chkdq(cp
, (int64_t)(roundup(length
- filebytes
, blksize
)),
3111 * If we don't have enough physical space then
3112 * we need to extend the physical size.
3114 if (length
> filebytes
) {
3116 u_int32_t blockHint
= 0;
3118 /* All or nothing and don't round up to clumpsize. */
3119 eflags
= kEFAllMask
| kEFNoClumpMask
;
3121 if (cred
&& suser(cred
, NULL
) != 0)
3122 eflags
|= kEFReserveMask
; /* keep a reserve */
3125 * Allocate Journal and Quota files in metadata zone.
3127 if (filebytes
== 0 &&
3128 hfsmp
->hfs_flags
& HFS_METADATA_ZONE
&&
3129 hfs_virtualmetafile(cp
)) {
3130 eflags
|= kEFMetadataMask
;
3131 blockHint
= hfsmp
->hfs_metazone_start
;
3133 if (hfs_start_transaction(hfsmp
) != 0) {
3138 /* Protect extents b-tree and allocation bitmap */
3139 lockflags
= SFL_BITMAP
;
3140 if (overflow_extents(fp
))
3141 lockflags
|= SFL_EXTENTS
;
3142 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
3144 while ((length
> filebytes
) && (retval
== E_NONE
)) {
3145 bytesToAdd
= length
- filebytes
;
3146 retval
= MacToVFSError(ExtendFileC(VTOVCB(vp
),
3151 &actualBytesAdded
));
3153 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)blksize
;
3154 if (actualBytesAdded
== 0 && retval
== E_NONE
) {
3155 if (length
> filebytes
)
3161 hfs_systemfile_unlock(hfsmp
, lockflags
);
3165 (void) hfs_minorupdate(vp
);
3168 (void) hfs_update(vp
, TRUE
);
3169 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3173 hfs_end_transaction(hfsmp
);
3178 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 7)) | DBG_FUNC_NONE
,
3179 (int)length
, (int)fp
->ff_size
, (int)filebytes
, 0, 0);
3182 if (!(flags
& IO_NOZEROFILL
)) {
3183 if (UBCINFOEXISTS(vp
) && (vnode_issystem(vp
) == 0) && retval
== E_NONE
) {
3184 struct rl_entry
*invalid_range
;
3187 zero_limit
= (fp
->ff_size
+ (PAGE_SIZE_64
- 1)) & ~PAGE_MASK_64
;
3188 if (length
< zero_limit
) zero_limit
= length
;
3190 if (length
> (off_t
)fp
->ff_size
) {
3193 /* Extending the file: time to fill out the current last page w. zeroes? */
3194 if ((fp
->ff_size
& PAGE_MASK_64
) &&
3195 (rl_scan(&fp
->ff_invalidranges
, fp
->ff_size
& ~PAGE_MASK_64
,
3196 fp
->ff_size
- 1, &invalid_range
) == RL_NOOVERLAP
)) {
3198 /* There's some valid data at the start of the (current) last page
3199 of the file, so zero out the remainder of that page to ensure the
3200 entire page contains valid data. Since there is no invalid range
3201 possible past the (current) eof, there's no need to remove anything
3202 from the invalid range list before calling cluster_write(): */
3204 retval
= cluster_write(vp
, (struct uio
*) 0, fp
->ff_size
, zero_limit
,
3205 fp
->ff_size
, (off_t
)0,
3206 (flags
& IO_SYNC
) | IO_HEADZEROFILL
| IO_NOZERODIRTY
);
3207 hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
);
3208 if (retval
) goto Err_Exit
;
3210 /* Merely invalidate the remaining area, if necessary: */
3211 if (length
> zero_limit
) {
3213 rl_add(zero_limit
, length
- 1, &fp
->ff_invalidranges
);
3214 cp
->c_zftimeout
= tv
.tv_sec
+ ZFTIMELIMIT
;
3217 /* The page containing the (current) eof is invalid: just add the
3218 remainder of the page to the invalid list, along with the area
3219 being newly allocated:
3222 rl_add(fp
->ff_size
, length
- 1, &fp
->ff_invalidranges
);
3223 cp
->c_zftimeout
= tv
.tv_sec
+ ZFTIMELIMIT
;
3227 panic("hfs_truncate: invoked on non-UBC object?!");
3230 if (suppress_times
== 0) {
3231 cp
->c_touch_modtime
= TRUE
;
3233 fp
->ff_size
= length
;
3235 } else { /* Shorten the size of the file */
3237 if ((off_t
)fp
->ff_size
> length
) {
3238 /* Any space previously marked as invalid is now irrelevant: */
3239 rl_remove(length
, fp
->ff_size
- 1, &fp
->ff_invalidranges
);
3243 * Account for any unmapped blocks. Note that the new
3244 * file length can still end up with unmapped blocks.
3246 if (fp
->ff_unallocblocks
> 0) {
3247 u_int32_t finalblks
;
3248 u_int32_t loanedBlocks
;
3250 hfs_lock_mount(hfsmp
);
3251 loanedBlocks
= fp
->ff_unallocblocks
;
3252 cp
->c_blocks
-= loanedBlocks
;
3253 fp
->ff_blocks
-= loanedBlocks
;
3254 fp
->ff_unallocblocks
= 0;
3256 hfsmp
->loanedBlocks
-= loanedBlocks
;
3258 finalblks
= (length
+ blksize
- 1) / blksize
;
3259 if (finalblks
> fp
->ff_blocks
) {
3260 /* calculate required unmapped blocks */
3261 loanedBlocks
= finalblks
- fp
->ff_blocks
;
3262 hfsmp
->loanedBlocks
+= loanedBlocks
;
3264 fp
->ff_unallocblocks
= loanedBlocks
;
3265 cp
->c_blocks
+= loanedBlocks
;
3266 fp
->ff_blocks
+= loanedBlocks
;
3268 hfs_unlock_mount (hfsmp
);
3272 * For a TBE process the deallocation of the file blocks is
3273 * delayed until the file is closed. And hfs_close calls
3274 * truncate with the IO_NDELAY flag set. So when IO_NDELAY
3275 * isn't set, we make sure this isn't a TBE process.
3277 if ((flags
& IO_NDELAY
) || (proc_tbe(p
) == 0)) {
3279 off_t savedbytes
= ((off_t
)fp
->ff_blocks
* (off_t
)blksize
);
3281 if (hfs_start_transaction(hfsmp
) != 0) {
3286 if (fp
->ff_unallocblocks
== 0) {
3287 /* Protect extents b-tree and allocation bitmap */
3288 lockflags
= SFL_BITMAP
;
3289 if (overflow_extents(fp
))
3290 lockflags
|= SFL_EXTENTS
;
3291 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
3293 retval
= MacToVFSError(TruncateFileC(VTOVCB(vp
), (FCB
*)fp
, length
, 0,
3294 FORK_IS_RSRC (fp
), FTOC(fp
)->c_fileid
, false));
3296 hfs_systemfile_unlock(hfsmp
, lockflags
);
3300 fp
->ff_size
= length
;
3303 (void) hfs_minorupdate(vp
);
3306 (void) hfs_update(vp
, TRUE
);
3307 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3310 hfs_end_transaction(hfsmp
);
3312 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)blksize
;
3316 /* These are bytesreleased */
3317 (void) hfs_chkdq(cp
, (int64_t)-(savedbytes
- filebytes
), NOCRED
, 0);
3321 * Only set update flag if the logical length changes & we aren't
3322 * suppressing modtime updates.
3324 if (((off_t
)fp
->ff_size
!= length
) && (suppress_times
== 0)) {
3325 cp
->c_touch_modtime
= TRUE
;
3327 fp
->ff_size
= length
;
3329 if (cp
->c_mode
& (S_ISUID
| S_ISGID
)) {
3330 if (!vfs_context_issuser(context
)) {
3331 cp
->c_mode
&= ~(S_ISUID
| S_ISGID
);
3336 retval
= hfs_minorupdate(vp
);
3339 cp
->c_touch_chgtime
= TRUE
; /* status changed */
3340 if (suppress_times
== 0) {
3341 cp
->c_touch_modtime
= TRUE
; /* file data was modified */
3344 * If we are not suppressing the modtime update, then
3345 * update the gen count as well.
3347 if (S_ISREG(cp
->c_attr
.ca_mode
) || S_ISLNK (cp
->c_attr
.ca_mode
)) {
3348 hfs_incr_gencount(cp
);
3352 retval
= hfs_update(vp
, MNT_WAIT
);
3355 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 7)) | DBG_FUNC_NONE
,
3356 -1, -1, -1, retval
, 0);
3361 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 7)) | DBG_FUNC_END
,
3362 (int)length
, (int)fp
->ff_size
, (int)filebytes
, retval
, 0);
3368 * Preparation which must be done prior to deleting the catalog record
3369 * of a file or directory. In order to make the on-disk as safe as possible,
3370 * we remove the catalog entry before releasing the bitmap blocks and the
3371 * overflow extent records. However, some work must be done prior to deleting
3372 * the catalog record.
3374 * When calling this function, the cnode must exist both in memory and on-disk.
3375 * If there are both resource fork and data fork vnodes, this function should
3376 * be called on both.
3380 hfs_prepare_release_storage (struct hfsmount
*hfsmp
, struct vnode
*vp
) {
3382 struct filefork
*fp
= VTOF(vp
);
3383 struct cnode
*cp
= VTOC(vp
);
3388 /* Cannot truncate an HFS directory! */
3389 if (vnode_isdir(vp
)) {
3394 * See the comment below in hfs_truncate for why we need to call
3395 * setsize here. Essentially we want to avoid pending IO if we
3396 * already know that the blocks are going to be released here.
3397 * This function is only called when totally removing all storage for a file, so
3398 * we can take a shortcut and immediately setsize (0);
3402 /* This should only happen with a corrupt filesystem */
3403 if ((off_t
)fp
->ff_size
< 0)
3407 * We cannot just check if fp->ff_size == length (as an optimization)
3408 * since there may be extra physical blocks that also need truncation.
3411 if ((retval
= hfs_getinoquota(cp
))) {
3416 /* Wipe out any invalid ranges which have yet to be backed by disk */
3417 rl_remove(0, fp
->ff_size
- 1, &fp
->ff_invalidranges
);
3420 * Account for any unmapped blocks. Since we're deleting the
3421 * entire file, we don't have to worry about just shrinking
3422 * to a smaller number of borrowed blocks.
3424 if (fp
->ff_unallocblocks
> 0) {
3425 u_int32_t loanedBlocks
;
3427 hfs_lock_mount (hfsmp
);
3428 loanedBlocks
= fp
->ff_unallocblocks
;
3429 cp
->c_blocks
-= loanedBlocks
;
3430 fp
->ff_blocks
-= loanedBlocks
;
3431 fp
->ff_unallocblocks
= 0;
3433 hfsmp
->loanedBlocks
-= loanedBlocks
;
3435 hfs_unlock_mount (hfsmp
);
3443 * Special wrapper around calling TruncateFileC. This function is useable
3444 * even when the catalog record does not exist any longer, making it ideal
3445 * for use when deleting a file. The simplification here is that we know
3446 * that we are releasing all blocks.
3448 * Note that this function may be called when there is no vnode backing
3449 * the file fork in question. We may call this from hfs_vnop_inactive
3450 * to clear out resource fork data (and may not want to clear out the data
3451 * fork yet). As a result, we pointer-check both sets of inputs before
3452 * doing anything with them.
3454 * The caller is responsible for saving off a copy of the filefork(s)
3455 * embedded within the cnode prior to calling this function. The pointers
3456 * supplied as arguments must be valid even if the cnode is no longer valid.
3460 hfs_release_storage (struct hfsmount
*hfsmp
, struct filefork
*datafork
,
3461 struct filefork
*rsrcfork
, u_int32_t fileid
) {
3464 u_int32_t fileblocks
;
3469 blksize
= hfsmp
->blockSize
;
3472 if ((datafork
!= NULL
) && (datafork
->ff_blocks
> 0)) {
3473 fileblocks
= datafork
->ff_blocks
;
3474 filebytes
= (off_t
)fileblocks
* (off_t
)blksize
;
3476 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3478 while (filebytes
> 0) {
3479 if (filebytes
> HFS_BIGFILE_SIZE
&& overflow_extents(datafork
)) {
3480 filebytes
-= HFS_BIGFILE_SIZE
;
3485 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3486 if (hfs_start_transaction(hfsmp
) != 0) {
3491 if (datafork
->ff_unallocblocks
== 0) {
3492 /* Protect extents b-tree and allocation bitmap */
3493 lockflags
= SFL_BITMAP
;
3494 if (overflow_extents(datafork
))
3495 lockflags
|= SFL_EXTENTS
;
3496 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
3498 error
= MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp
), datafork
, filebytes
, 1, 0, fileid
, false));
3500 hfs_systemfile_unlock(hfsmp
, lockflags
);
3503 datafork
->ff_size
= filebytes
;
3505 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3507 /* Finish the transaction and start over if necessary */
3508 hfs_end_transaction(hfsmp
);
3517 if (error
== 0 && (rsrcfork
!= NULL
) && rsrcfork
->ff_blocks
> 0) {
3518 fileblocks
= rsrcfork
->ff_blocks
;
3519 filebytes
= (off_t
)fileblocks
* (off_t
)blksize
;
3521 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3523 while (filebytes
> 0) {
3524 if (filebytes
> HFS_BIGFILE_SIZE
&& overflow_extents(rsrcfork
)) {
3525 filebytes
-= HFS_BIGFILE_SIZE
;
3530 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3531 if (hfs_start_transaction(hfsmp
) != 0) {
3536 if (rsrcfork
->ff_unallocblocks
== 0) {
3537 /* Protect extents b-tree and allocation bitmap */
3538 lockflags
= SFL_BITMAP
;
3539 if (overflow_extents(rsrcfork
))
3540 lockflags
|= SFL_EXTENTS
;
3541 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
3543 error
= MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp
), rsrcfork
, filebytes
, 1, 1, fileid
, false));
3545 hfs_systemfile_unlock(hfsmp
, lockflags
);
3548 rsrcfork
->ff_size
= filebytes
;
3550 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3552 /* Finish the transaction and start over if necessary */
3553 hfs_end_transaction(hfsmp
);
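		/*
		 * The loops above intentionally release space in HFS_BIGFILE_SIZE
		 * chunks so that no single journal transaction has to cover the
		 * whole fork.  As a rough illustration, if HFS_BIGFILE_SIZE is
		 * 2 GB (an assumption about the constant's value), releasing a
		 * 20 GB fork proceeds as ten TruncateFileC calls, each bracketed
		 * by its own hfs_start_transaction/hfs_end_transaction pair.
		 */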
3566 * Truncate a cnode to at most length size, freeing (or adding) the
3570 hfs_truncate(struct vnode
*vp
, off_t length
, int flags
, int skipsetsize
,
3571 int truncateflags
, vfs_context_t context
)
3573 struct filefork
*fp
= VTOF(vp
);
3575 u_int32_t fileblocks
;
3576 int blksize
, error
= 0;
3577 struct cnode
*cp
= VTOC(vp
);
3579 /* Cannot truncate an HFS directory! */
3580 if (vnode_isdir(vp
)) {
3583 /* A swap file cannot change size. */
3584 if (vnode_isswap(vp
) && (length
!= 0)) {
3588 blksize
= VTOVCB(vp
)->blockSize
;
3589 fileblocks
= fp
->ff_blocks
;
3590 filebytes
= (off_t
)fileblocks
* (off_t
)blksize
;
3593 // Have to do this here so that we don't wind up with
3594 // i/o pending for blocks that are about to be released
3595 // if we truncate the file.
3597 // If skipsetsize is set, then the caller is responsible
3598 // for the ubc_setsize.
3600 // Even if skipsetsize is set, if the length is zero we
3601 // want to call ubc_setsize() because as of SnowLeopard
3602 // it will no longer cause any page-ins and it will drop
3603 // any dirty pages so that we don't do any i/o that we
3604 // don't have to. This also prevents a race where i/o
3605 // for truncated blocks may overwrite later data if the
3606 // blocks get reallocated to a different file.
3608 if (!skipsetsize
|| length
== 0)
3609 ubc_setsize(vp
, length
);
3611 // have to loop truncating or growing files that are
3612 // really big because otherwise transactions can get
3613 // enormous and consume too many kernel resources.
3615 if (length
< filebytes
) {
3616 while (filebytes
> length
) {
3617 if ((filebytes
- length
) > HFS_BIGFILE_SIZE
&& overflow_extents(fp
)) {
3618 filebytes
-= HFS_BIGFILE_SIZE
;
3622 cp
->c_flag
|= C_FORCEUPDATE
;
3623 error
= do_hfs_truncate(vp
, filebytes
, flags
, truncateflags
, context
);
3627 } else if (length
> filebytes
) {
3628 while (filebytes
< length
) {
3629 if ((length
- filebytes
) > HFS_BIGFILE_SIZE
&& overflow_extents(fp
)) {
3630 filebytes
+= HFS_BIGFILE_SIZE
;
3634 cp
->c_flag
|= C_FORCEUPDATE
;
3635 error
= do_hfs_truncate(vp
, filebytes
, flags
, truncateflags
, context
);
3639 } else /* Same logical size */ {
3641 error
= do_hfs_truncate(vp
, length
, flags
, truncateflags
, context
);
3643 /* Files that are changing size are not hot file candidates. */
3644 if (VTOHFS(vp
)->hfc_stage
== HFC_RECORDING
) {
3645 fp
->ff_bytesread
= 0;
3654 * Preallocate file storage space.
3657 hfs_vnop_allocate(struct vnop_allocate_args
/* {
3661 off_t *a_bytesallocated;
3663 vfs_context_t a_context;
3666 struct vnode
*vp
= ap
->a_vp
;
3668 struct filefork
*fp
;
3670 off_t length
= ap
->a_length
;
3672 off_t moreBytesRequested
;
3673 off_t actualBytesAdded
;
3675 u_int32_t fileblocks
;
3676 int retval
, retval2
;
3677 u_int32_t blockHint
;
3678 u_int32_t extendFlags
; /* For call to ExtendFileC */
3679 struct hfsmount
*hfsmp
;
3680 kauth_cred_t cred
= vfs_context_ucred(ap
->a_context
);
3684 *(ap
->a_bytesallocated
) = 0;
3686 if (!vnode_isreg(vp
))
3688 if (length
< (off_t
)0)
3693 orig_ctime
= VTOC(vp
)->c_ctime
;
3695 check_for_tracked_file(vp
, orig_ctime
, ap
->a_length
== 0 ? NAMESPACE_HANDLER_TRUNCATE_OP
|NAMESPACE_HANDLER_DELETE_OP
: NAMESPACE_HANDLER_TRUNCATE_OP
, NULL
);
3697 hfs_lock_truncate(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
3699 if ((retval
= hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
))) {
3707 fileblocks
= fp
->ff_blocks
;
3708 filebytes
= (off_t
)fileblocks
* (off_t
)vcb
->blockSize
;
3710 if ((ap
->a_flags
& ALLOCATEFROMVOL
) && (length
< filebytes
)) {
3715 /* Fill in the flags word for the call to Extend the file */
3717 extendFlags
= kEFNoClumpMask
;
3718 if (ap
->a_flags
& ALLOCATECONTIG
)
3719 extendFlags
|= kEFContigMask
;
3720 if (ap
->a_flags
& ALLOCATEALL
)
3721 extendFlags
|= kEFAllMask
;
3722 if (cred
&& suser(cred
, NULL
) != 0)
3723 extendFlags
|= kEFReserveMask
;
3724 if (hfs_virtualmetafile(cp
))
3725 extendFlags
|= kEFMetadataMask
;
3729 startingPEOF
= filebytes
;
3731 if (ap
->a_flags
& ALLOCATEFROMPEOF
)
3732 length
+= filebytes
;
3733 else if (ap
->a_flags
& ALLOCATEFROMVOL
)
3734 blockHint
= ap
->a_offset
/ VTOVCB(vp
)->blockSize
;
3736 /* If no changes are necesary, then we're done */
3737 if (filebytes
== length
)
3741 * Lengthen the size of the file. We must ensure that the
3742 * last byte of the file is allocated. Since the smallest
3743 * value of filebytes is 0, length will be at least 1.
3745 if (length
> filebytes
) {
3746 off_t total_bytes_added
= 0, orig_request_size
;
3748 orig_request_size
= moreBytesRequested
= length
- filebytes
;
3751 retval
= hfs_chkdq(cp
,
3752 (int64_t)(roundup(moreBytesRequested
, vcb
->blockSize
)),
3759 * Metadata zone checks.
3761 if (hfsmp
->hfs_flags
& HFS_METADATA_ZONE
) {
3763 * Allocate Journal and Quota files in metadata zone.
3765 if (hfs_virtualmetafile(cp
)) {
3766 blockHint
= hfsmp
->hfs_metazone_start
;
3767 } else if ((blockHint
>= hfsmp
->hfs_metazone_start
) &&
3768 (blockHint
<= hfsmp
->hfs_metazone_end
)) {
3770 * Move blockHint outside metadata zone.
3772 blockHint
= hfsmp
->hfs_metazone_end
+ 1;
3777 while ((length
> filebytes
) && (retval
== E_NONE
)) {
3778 off_t bytesRequested
;
3780 if (hfs_start_transaction(hfsmp
) != 0) {
3785 /* Protect extents b-tree and allocation bitmap */
3786 lockflags
= SFL_BITMAP
;
3787 if (overflow_extents(fp
))
3788 lockflags
|= SFL_EXTENTS
;
3789 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
3791 if (moreBytesRequested
>= HFS_BIGFILE_SIZE
) {
3792 bytesRequested
= HFS_BIGFILE_SIZE
;
3794 bytesRequested
= moreBytesRequested
;
3797 if (extendFlags
& kEFContigMask
) {
3798 // if we're on a sparse device, this will force it to do a
3799 // full scan to find the space needed.
3800 hfsmp
->hfs_flags
&= ~HFS_DID_CONTIG_SCAN
;
3803 retval
= MacToVFSError(ExtendFileC(vcb
,
3808 &actualBytesAdded
));
3810 if (retval
== E_NONE
) {
3811 *(ap
->a_bytesallocated
) += actualBytesAdded
;
3812 total_bytes_added
+= actualBytesAdded
;
3813 moreBytesRequested
-= actualBytesAdded
;
3814 if (blockHint
!= 0) {
3815 blockHint
+= actualBytesAdded
/ vcb
->blockSize
;
3818 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)vcb
->blockSize
;
3820 hfs_systemfile_unlock(hfsmp
, lockflags
);
3823 (void) hfs_update(vp
, TRUE
);
3824 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3827 hfs_end_transaction(hfsmp
);
3832 * if we get an error and no changes were made then exit
3833 * otherwise we must do the hfs_update to reflect the changes
3835 if (retval
&& (startingPEOF
== filebytes
))
3839 * Adjust actualBytesAdded to be allocation block aligned, not
3840 * clump size aligned.
3841 * NOTE: So what we are reporting does not affect reality
3842 * until the file is closed, when we truncate the file to allocation
3845 if (total_bytes_added
!= 0 && orig_request_size
< total_bytes_added
)
3846 *(ap
->a_bytesallocated
) =
3847 roundup(orig_request_size
, (off_t
)vcb
->blockSize
);
3849 } else { /* Shorten the size of the file */
3851 if (fp
->ff_size
> length
) {
3853 * Any buffers that are past the truncation point need to be
3854 * invalidated (to maintain buffer cache consistency).
3858 retval
= hfs_truncate(vp
, length
, 0, 0, 0, ap
->a_context
);
3859 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)vcb
->blockSize
;
3862 * if we get an error and no changes were made then exit
3863 * otherwise we must do the hfs_update to reflect the changes
3865 if (retval
&& (startingPEOF
== filebytes
)) goto Err_Exit
;
3867 /* These are bytesreleased */
3868 (void) hfs_chkdq(cp
, (int64_t)-((startingPEOF
- filebytes
)), NOCRED
,0);
3871 if (fp
->ff_size
> filebytes
) {
3872 fp
->ff_size
= filebytes
;
3875 ubc_setsize(vp
, fp
->ff_size
);
3876 hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
);
3881 cp
->c_touch_chgtime
= TRUE
;
3882 cp
->c_touch_modtime
= TRUE
;
3883 retval2
= hfs_update(vp
, MNT_WAIT
);
3888 hfs_unlock_truncate(cp
, HFS_LOCK_DEFAULT
);
3895 * Pagein for HFS filesystem
3898 hfs_vnop_pagein(struct vnop_pagein_args
*ap
)
3900 struct vnop_pagein_args {
3903 vm_offset_t a_pl_offset,
3907 vfs_context_t a_context;
3913 struct filefork
*fp
;
3916 upl_page_info_t
*pl
;
3921 boolean_t truncate_lock_held
= FALSE
;
3922 boolean_t file_converted
= FALSE
;
3930 if ((error
= cp_handle_vnop(vp
, CP_READ_ACCESS
| CP_WRITE_ACCESS
, 0)) != 0) {
3932 * If we errored here, then this means that one of two things occurred:
3933 * 1. there was a problem with the decryption of the key.
3934 * 2. the device is locked and we are not allowed to access this particular file.
3936 * Either way, this means that we need to shut down this upl now. As long as
3937 * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves)
3938 * then we create a upl and immediately abort it.
3940 if (ap
->a_pl
== NULL
) {
3941 /* create the upl */
3942 ubc_create_upl (vp
, ap
->a_f_offset
, ap
->a_size
, &upl
, &pl
,
3943 UPL_UBC_PAGEIN
| UPL_RET_ONLY_ABSENT
);
3944 /* mark the range as needed so it doesn't immediately get discarded upon abort */
3945 ubc_upl_range_needed (upl
, ap
->a_pl_offset
/ PAGE_SIZE
, 1);
3947 /* Abort the range */
3948 ubc_upl_abort_range (upl
, 0, ap
->a_size
, UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_ERROR
);
3954 #endif /* CONFIG_PROTECT */
3956 if (ap
->a_pl
!= NULL
) {
3958 * this can only happen for swap files now that
3959 * we're asking for V2 paging behavior...
3960 * so don't need to worry about decompression, or
3961 * keeping track of blocks read or taking the truncate lock
3963 error
= cluster_pagein(vp
, ap
->a_pl
, ap
->a_pl_offset
, ap
->a_f_offset
,
3964 ap
->a_size
, (off_t
)fp
->ff_size
, ap
->a_flags
);
3970 * take truncate lock (shared/recursive) to guard against
3971 * zero-fill thru fsync interfering, but only for v2
3973 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
3974 * lock shared and we are allowed to recurse 1 level if this thread already
3975 * owns the lock exclusively... this can legally occur
3976 * if we are doing a shrinking ftruncate against a file
3977 * that is mapped private, and the pages being truncated
3978 * do not currently exist in the cache... in that case
3979 * we will have to page-in the missing pages in order
3980 * to provide them to the private mapping... we must
3981 * also call hfs_unlock_truncate with a postive been_recursed
3982 * arg to indicate that if we have recursed, there is no need to drop
3983 * the lock. Allowing this simple recursion is necessary
3984 * in order to avoid a certain deadlock... since the ftruncate
3985 * already holds the truncate lock exclusively, if we try
3986 * to acquire it shared to protect the pagein path, we will
3989 * NOTE: The if () block below is a workaround in order to prevent a
3990 * VM deadlock. See rdar://7853471.
3992 * If we are in a forced unmount, then launchd will still have the
3993 * dyld_shared_cache file mapped as it is trying to reboot. If we
3994 * take the truncate lock here to service a page fault, then our
3995 * thread could deadlock with the forced-unmount. The forced unmount
3996 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
3997 * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount
3998 * thread will think it needs to copy all of the data out of the file
3999 * and into a VM copy object. If we hold the cnode lock here, then that
4000 * VM operation will not be able to proceed, because we'll set a busy page
4001 * before attempting to grab the lock. Note that this isn't as simple as "don't
4002 * call ubc_setsize" because doing that would just shift the problem to the
4003 * ubc_msync done before the vnode is reclaimed.
4005 * So, if a forced unmount on this volume is in flight AND the cnode is
4006 * marked C_DELETED, then just go ahead and do the page in without taking
4007 * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file
4008 * that is not going to be available on the next mount, this seems like a
4009 * OK solution from a correctness point of view, even though it is hacky.
4011 if (vfs_isforce(vp
->v_mount
)) {
4012 if (cp
->c_flag
& C_DELETED
) {
4013 /* If we don't get it, then just go ahead and operate without the lock */
4014 truncate_lock_held
= hfs_try_trunclock(cp
, HFS_SHARED_LOCK
, HFS_LOCK_SKIP_IF_EXCLUSIVE
);
4018 hfs_lock_truncate(cp
, HFS_SHARED_LOCK
, HFS_LOCK_SKIP_IF_EXCLUSIVE
);
4019 truncate_lock_held
= TRUE
;
4022 kret
= ubc_create_upl(vp
, ap
->a_f_offset
, ap
->a_size
, &upl
, &pl
, UPL_UBC_PAGEIN
| UPL_RET_ONLY_ABSENT
);
4024 if ((kret
!= KERN_SUCCESS
) || (upl
== (upl_t
) NULL
)) {
4028 ubc_upl_range_needed(upl
, ap
->a_pl_offset
/ PAGE_SIZE
, 1);
4033 * Scan from the back to find the last page in the UPL, so that we
4034 * aren't looking at a UPL that may have already been freed by the
4035 * preceding aborts/completions.
4037 for (pg_index
= ((isize
) / PAGE_SIZE
); pg_index
> 0;) {
4038 if (upl_page_present(pl
, --pg_index
))
4040 if (pg_index
== 0) {
4042 * no absent pages were found in the range specified
4043 * just abort the UPL to get rid of it and then we're done
4045 ubc_upl_abort_range(upl
, 0, isize
, UPL_ABORT_FREE_ON_EMPTY
);
4050 * initialize the offset variables before we touch the UPL.
4051 * f_offset is the position into the file, in bytes
4052 * offset is the position into the UPL, in bytes
4053 * pg_index is the pg# of the UPL we're operating on
4054 * isize is the offset into the UPL of the last page that is present.
4056 isize
= ((pg_index
+ 1) * PAGE_SIZE
);
4059 f_offset
= ap
->a_f_offset
;
4065 if ( !upl_page_present(pl
, pg_index
)) {
4067 * we asked for RET_ONLY_ABSENT, so it's possible
4068 * to get back empty slots in the UPL.
4069 * just skip over them
4071 f_offset
+= PAGE_SIZE
;
4072 offset
+= PAGE_SIZE
;
4079 * We know that we have at least one absent page.
4080 * Now checking to see how many in a row we have
4083 xsize
= isize
- PAGE_SIZE
;
4086 if ( !upl_page_present(pl
, pg_index
+ num_of_pages
))
4091 xsize
= num_of_pages
* PAGE_SIZE
;
4094 if (VNODE_IS_RSRC(vp
)) {
4095 /* allow pageins of the resource fork */
4097 int compressed
= hfs_file_is_compressed(VTOC(vp
), 1); /* 1 == don't take the cnode lock */
4100 if (truncate_lock_held
) {
4102 * can't hold the truncate lock when calling into the decmpfs layer
4103 * since it calls back into this layer... even though we're only
4104 * holding the lock in shared mode, and the re-entrant path only
4105 * takes the lock shared, we can deadlock if some other thread
4106 * tries to grab the lock exclusively in between.
4108 hfs_unlock_truncate(cp
, HFS_LOCK_SKIP_IF_EXCLUSIVE
);
4109 truncate_lock_held
= FALSE
;
4112 ap
->a_pl_offset
= offset
;
4113 ap
->a_f_offset
= f_offset
;
4116 error
= decmpfs_pagein_compressed(ap
, &compressed
, VTOCMP(vp
));
4118 * note that decpfs_pagein_compressed can change the state of
4119 * 'compressed'... it will set it to 0 if the file is no longer
4120 * compressed once the compression lock is successfully taken
4121 * i.e. we would block on that lock while the file is being inflated
4125 /* successful page-in, update the access time */
4126 VTOC(vp
)->c_touch_acctime
= TRUE
;
4128 /* compressed files are not hot file candidates */
4129 if (VTOHFS(vp
)->hfc_stage
== HFC_RECORDING
) {
4130 fp
->ff_bytesread
= 0;
4132 } else if (error
== EAGAIN
) {
4134 * EAGAIN indicates someone else already holds the compression lock...
4135 * to avoid deadlocking, we'll abort this range of pages with an
4136 * indication that the pagein needs to be redriven
4138 ubc_upl_abort_range(upl
, (upl_offset_t
) offset
, xsize
, UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_RESTART
);
4140 goto pagein_next_range
;
4144 * Set file_converted only if the file became decompressed while we were
4145 * paging in. If it were still compressed, we would re-start the loop using the goto
4146 * in the above block. This avoid us overloading truncate_lock_held as our retry_pagein
4147 * condition below, since we could have avoided taking the truncate lock to prevent
4148 * a deadlock in the force unmount case.
4150 file_converted
= TRUE
;
4153 if (file_converted
== TRUE
) {
4155 * the file was converted back to a regular file after we first saw it as compressed
4156 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
4157 * reset a_size so that we consider what remains of the original request
4158 * and null out a_upl and a_pl_offset.
4160 * We should only be able to get into this block if the decmpfs_pagein_compressed
4161 * successfully decompressed the range in question for this file.
4163 ubc_upl_abort_range(upl
, (upl_offset_t
) offset
, isize
, UPL_ABORT_FREE_ON_EMPTY
);
4167 ap
->a_pl_offset
= 0;
4169 /* Reset file_converted back to false so that we don't infinite-loop. */
4170 file_converted
= FALSE
;
4175 error
= cluster_pagein(vp
, upl
, offset
, f_offset
, xsize
, (off_t
)fp
->ff_size
, ap
->a_flags
);
4178 * Keep track of blocks read.
4180 if ( !vnode_isswap(vp
) && VTOHFS(vp
)->hfc_stage
== HFC_RECORDING
&& error
== 0) {
4182 int took_cnode_lock
= 0;
4184 if (ap
->a_f_offset
== 0 && fp
->ff_size
< PAGE_SIZE
)
4185 bytesread
= fp
->ff_size
;
4189 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
4190 if ((fp
->ff_bytesread
+ bytesread
) > 0x00000000ffffffff && cp
->c_lockowner
!= current_thread()) {
4191 hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
);
4192 took_cnode_lock
= 1;
4195 * If this file hasn't been seen since the start of
4196 * the current sampling period then start over.
4198 if (cp
->c_atime
< VTOHFS(vp
)->hfc_timebase
) {
4201 fp
->ff_bytesread
= bytesread
;
4203 cp
->c_atime
= tv
.tv_sec
;
4205 fp
->ff_bytesread
+= bytesread
;
4207 cp
->c_touch_acctime
= TRUE
;
4208 if (took_cnode_lock
)
4215 pg_index
+= num_of_pages
;
4221 if (truncate_lock_held
== TRUE
) {
4222 /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
4223 hfs_unlock_truncate(cp
, HFS_LOCK_SKIP_IF_EXCLUSIVE
);
4230 * Pageout for HFS filesystem.
4233 hfs_vnop_pageout(struct vnop_pageout_args
*ap
)
4235 struct vnop_pageout_args {
4238 vm_offset_t a_pl_offset,
4242 vfs_context_t a_context;
4246 vnode_t vp
= ap
->a_vp
;
4248 struct filefork
*fp
;
4252 upl_page_info_t
* pl
;
4253 vm_offset_t a_pl_offset
;
4255 int is_pageoutv2
= 0;
4262 * Figure out where the file ends, for pageout purposes. If
4263 * ff_new_size > ff_size, then we're in the middle of extending the
4264 * file via a write, so it is safe (and necessary) that we be able
4265 * to pageout up to that point.
4267 filesize
= fp
->ff_size
;
4268 if (fp
->ff_new_size
> filesize
)
4269 filesize
= fp
->ff_new_size
;
4271 a_flags
= ap
->a_flags
;
4272 a_pl_offset
= ap
->a_pl_offset
;
4274 if (S_ISREG(cp
->c_attr
.ca_mode
) || S_ISLNK(cp
->c_attr
.ca_mode
)) {
4275 hfs_incr_gencount (cp
);
4279 * we can tell if we're getting the new or old behavior from the UPL
4281 if ((upl
= ap
->a_pl
) == NULL
) {
4286 * we're in control of any UPL we commit
4287 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
4289 a_flags
&= ~UPL_NOCOMMIT
;
4293 * For V2 semantics, we want to take the cnode truncate lock
4294 * shared to guard against the file size changing via zero-filling.
4296 * However, we have to be careful because we may be invoked
4297 * via the ubc_msync path to write out dirty mmap'd pages
4298 * in response to a lock event on a content-protected
4299 * filesystem (e.g. to write out class A files).
4300 * As a result, we want to take the truncate lock 'SHARED' with
4301 * the mini-recursion locktype so that we don't deadlock/panic
4302 * because we may be already holding the truncate lock exclusive to force any other
4303 * IOs to have blocked behind us.
4305 hfs_lock_truncate(cp
, HFS_SHARED_LOCK
, HFS_LOCK_SKIP_IF_EXCLUSIVE
);
4307 if (a_flags
& UPL_MSYNC
) {
4308 request_flags
= UPL_UBC_MSYNC
| UPL_RET_ONLY_DIRTY
;
4311 request_flags
= UPL_UBC_PAGEOUT
| UPL_RET_ONLY_DIRTY
;
4314 kret
= ubc_create_upl(vp
, ap
->a_f_offset
, ap
->a_size
, &upl
, &pl
, request_flags
);
4316 if ((kret
!= KERN_SUCCESS
) || (upl
== (upl_t
) NULL
)) {
4322 * from this point forward upl points at the UPL we're working with
4323 * it was either passed in or we succesfully created it
4327 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
4328 * UPL instead of relying on the UPL passed into us. We go ahead and do that here,
4329 * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for
4330 * N dirty ranges in the UPL. Note that this is almost a direct copy of the
4331 * logic in vnode_pageout except that we need to do it after grabbing the truncate
4332 * lock in HFS so that we don't lock invert ourselves.
4334 * Note that we can still get into this function on behalf of the default pager with
4335 * non-V2 behavior (swapfiles). However in that case, we did not grab locks above
4336 * since fsync and other writing threads will grab the locks, then mark the
4337 * relevant pages as busy. But the pageout codepath marks the pages as busy,
4338 * and THEN would attempt to grab the truncate lock, which would result in deadlock. So
4339 * we do not try to grab anything for the pre-V2 case, which should only be accessed
4340 * by the paging/VM system.
4352 f_offset
= ap
->a_f_offset
;
4355 * Scan from the back to find the last page in the UPL, so that we
4356 * aren't looking at a UPL that may have already been freed by the
4357 * preceding aborts/completions.
4359 for (pg_index
= ((isize
) / PAGE_SIZE
); pg_index
> 0;) {
4360 if (upl_page_present(pl
, --pg_index
))
4362 if (pg_index
== 0) {
4363 ubc_upl_abort_range(upl
, 0, isize
, UPL_ABORT_FREE_ON_EMPTY
);
4369 * initialize the offset variables before we touch the UPL.
4370 * a_f_offset is the position into the file, in bytes
4371 * offset is the position into the UPL, in bytes
4372 * pg_index is the pg# of the UPL we're operating on.
4373 * isize is the offset into the UPL of the last non-clean page.
4375 isize
= ((pg_index
+ 1) * PAGE_SIZE
);
4384 if ( !upl_page_present(pl
, pg_index
)) {
4386 * we asked for RET_ONLY_DIRTY, so it's possible
4387 * to get back empty slots in the UPL.
4388 * just skip over them
4390 f_offset
+= PAGE_SIZE
;
4391 offset
+= PAGE_SIZE
;
4397 if ( !upl_dirty_page(pl
, pg_index
)) {
4398 panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index
, upl
);
4402 * We know that we have at least one dirty page.
4403 * Now checking to see how many in a row we have
4406 xsize
= isize
- PAGE_SIZE
;
4409 if ( !upl_dirty_page(pl
, pg_index
+ num_of_pages
))
4414 xsize
= num_of_pages
* PAGE_SIZE
;
4416 if (!vnode_isswap(vp
)) {
4422 if (cp
->c_lockowner
!= current_thread()) {
4423 if ((retval
= hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
))) {
4425 * we're in the v2 path, so we are the
4426 * owner of the UPL... we may have already
4427 * processed some of the UPL, so abort it
4428 * from the current working offset to the
4431 ubc_upl_abort_range(upl
,
4433 ap
->a_size
- offset
,
4434 UPL_ABORT_FREE_ON_EMPTY
);
4439 end_of_range
= f_offset
+ xsize
- 1;
4441 if (end_of_range
>= filesize
) {
4442 end_of_range
= (off_t
)(filesize
- 1);
4444 if (f_offset
< filesize
) {
4445 rl_remove(f_offset
, end_of_range
, &fp
->ff_invalidranges
);
4446 cp
->c_flag
|= C_MODIFIED
; /* leof is dirty */
4452 if ((error
= cluster_pageout(vp
, upl
, offset
, f_offset
,
4453 xsize
, filesize
, a_flags
))) {
4460 pg_index
+= num_of_pages
;
4462 /* capture errnos bubbled out of cluster_pageout if they occurred */
4463 if (error_ret
!= 0) {
4466 } /* end block for v2 pageout behavior */
4468 if (!vnode_isswap(vp
)) {
4472 if (cp
->c_lockowner
!= current_thread()) {
4473 if ((retval
= hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
))) {
4474 if (!(a_flags
& UPL_NOCOMMIT
)) {
4475 ubc_upl_abort_range(upl
,
4478 UPL_ABORT_FREE_ON_EMPTY
);
4484 end_of_range
= ap
->a_f_offset
+ ap
->a_size
- 1;
4486 if (end_of_range
>= filesize
) {
4487 end_of_range
= (off_t
)(filesize
- 1);
4489 if (ap
->a_f_offset
< filesize
) {
4490 rl_remove(ap
->a_f_offset
, end_of_range
, &fp
->ff_invalidranges
);
4491 cp
->c_flag
|= C_MODIFIED
; /* leof is dirty */
4499 * just call cluster_pageout for old pre-v2 behavior
4501 retval
= cluster_pageout(vp
, upl
, a_pl_offset
, ap
->a_f_offset
,
4502 ap
->a_size
, filesize
, a_flags
);
	/*
	 * If data was written, update the modification time of the file.
	 * If setuid or setgid bits are set and this process is not the
	 * superuser then clear the setuid and setgid bits as a precaution
	 * against tampering.
	 */
	cp->c_touch_modtime = TRUE;
	cp->c_touch_chgtime = TRUE;
	if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
	    (vfs_context_suser(ap->a_context) != 0)) {
		hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
		cp->c_mode &= ~(S_ISUID | S_ISGID);
		hfs_unlock(cp);
	}
	/*
	 * Release the truncate lock.  Note that because
	 * we may have taken the lock recursively by
	 * being invoked via ubc_msync due to lockdown,
	 * we should release it recursively, too.
	 */
	hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
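
/*
 * Illustrative sketch (not part of the driver, never compiled): the pageout
 * loop above batches page-outs by locating runs of consecutive dirty pages in
 * the UPL's page list before handing each run to cluster_pageout().  The
 * standalone helper below shows the same run-detection idea in isolation over
 * a plain array of flags; sketch_find_dirty_run() and the dirty[] array are
 * hypothetical names, not part of HFS or the UPL API.
 */
#if 0	/* illustrative example only */
#include <stddef.h>

/*
 * Find the next run of consecutive dirty pages at or after 'start'.
 * Returns the run length and stores the run's first index in *run_start;
 * returns 0 when no dirty page remains.
 */
static size_t
sketch_find_dirty_run(const unsigned char *dirty, size_t npages,
                      size_t start, size_t *run_start)
{
	size_t i = start;
	size_t len = 0;

	while (i < npages && !dirty[i])		/* skip slots with nothing to write */
		i++;
	if (i == npages)
		return 0;

	*run_start = i;
	while (i + len < npages && dirty[i + len])	/* count the dirty run */
		len++;
	return len;
}
#endif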
/*
 * Intercept B-Tree node writes to unswap them if necessary.
 */
int
hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
{
	int retval = 0;
	register struct buf *bp = ap->a_bp;
	register struct vnode *vp = buf_vnode(bp);
	BlockDescriptor block;
	/* Trap B-Tree writes */
	if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
	    (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
	    (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
	    (vp == VTOHFS(vp)->hfc_filevp)) {
		/*
		 * Swap and validate the node if it is in native byte order.
		 * This is always true on big endian, so we always validate
		 * before writing here.  On little endian, the node typically has
		 * been swapped and validated when it was written to the journal,
		 * so we won't do anything here.
		 */
		if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
			/* Prepare the block pointer */
			block.blockHeader = bp;
			block.buffer = (char *)buf_dataptr(bp);
			block.blockNum = buf_lblkno(bp);
			/* not found in cache ==> came from disk */
			block.blockReadFromDisk = (buf_fromcache(bp) == 0);
			block.blockSize = buf_count(bp);
			/* Endian un-swap B-Tree node */
			retval = hfs_swap_BTNode(&block, vp, kSwapBTNodeHostToBig, false);
			if (retval)
				panic("hfs_vnop_bwrite: about to write corrupt node!\n");
		}
	}
	/* This buffer shouldn't be locked anymore, but if it is, clear it */
	if ((buf_flags(bp) & B_LOCKED)) {
		if (VTOHFS(vp)->jnl) {
			panic("hfs: CLEARING the lock bit on bp %p\n", bp);
		}
		buf_clearflags(bp, B_LOCKED);
	}
	retval = vn_bwrite(ap);

	return (retval);
}
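
/*
 * Illustrative sketch (not part of the driver, never compiled): the 0x000e
 * test above relies on the layout of a B-tree node, whose last two bytes hold
 * the offset of record 0.  That offset is always 14 (the size of the node
 * descriptor), so reading it as a u_int16_t yields 0x000e while the node is
 * still in host byte order, and 0x0e00 once the node has been swapped to
 * big-endian on a little-endian machine.  The helper below shows that check
 * in isolation; sketch_node_needs_swap() is a hypothetical name, not an HFS
 * routine.
 */
#if 0	/* illustrative example only */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static int
sketch_node_needs_swap(const void *node, size_t node_size)
{
	uint16_t last_offset;

	/* The record-offset table grows backward from the end of the node;
	 * its final entry is the offset of record 0. */
	memcpy(&last_offset, (const char *)node + node_size - sizeof(last_offset),
	       sizeof(last_offset));

	/* 0x000e read in host order means the node has not been swapped yet. */
	return (last_offset == 0x000e);
}
#endif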
/*
 * Relocate a file to a new location on disk
 *  cnode must be locked on entry
 *
 * Relocation occurs by cloning the file's data from its
 * current set of blocks to a new set of blocks. During
 * the relocation all of the blocks (old and new) are
 * owned by the file.
 *
 * -----------------     -----------------
 * |///////////////|     |               |     STEP 1 (acquire new blocks)
 * -----------------     -----------------
 *
 * -----------------     -----------------
 * |///////////////|     |///////////////|     STEP 2 (clone data)
 * -----------------     -----------------
 *
 *                       -----------------
 *                       |///////////////|     STEP 3 (head truncate blocks)
 *                       -----------------
 *
 * During steps 2 and 3 page-outs to file offsets less
 * than or equal to N are suspended.
 *
 * During step 3 page-ins to the file get suspended.
 */
int
hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
	struct proc *p)
{
	struct filefork *fp;
	struct hfsmount *hfsmp;
	u_int32_t nextallocsave;
	daddr64_t sector_a, sector_b;
	int took_trunc_lock = 0;
	enum vtype vnodetype;
	vnodetype = vnode_vtype(vp);
	if (vnodetype != VREG) {
		/* Not allowed to move symlinks. */
		return (EPERM);
	}
	if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
		return (ENOSPC);
	}
	if (fp->ff_unallocblocks)
		return (EINVAL);
	/*
	 * <rdar://problem/9118426>
	 * Disable HFS file relocation on content-protected filesystems
	 */
	if (cp_fs_protected (hfsmp->hfs_mp)) {
		return EINVAL;
	}

	/* If it's an SSD, also disable HFS relocation */
	if (hfsmp->hfs_flags & HFS_SSD) {
		return EINVAL;
	}
	blksize = hfsmp->blockSize;
	if (blockHint == 0)
		blockHint = hfsmp->nextAllocation;

	if (fp->ff_size > 0x7fffffff) {
		return (EFBIG);
	}
	//
	// We do not believe that this call to hfs_fsync() is
	// necessary and it causes a journal transaction
	// deadlock so we are removing it.
	//
	//if (vnodetype == VREG && !vnode_issystem(vp)) {
	//	retval = hfs_fsync(vp, MNT_WAIT, 0, p);
	//}

	if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		/* Force lock since callers expect lock to be held. */
		if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
			return (retval);
		}
		/* No need to continue if file was removed. */
		if (cp->c_flag & C_NOEXISTS) {
			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
			return (ENOENT);
		}
		took_trunc_lock = 1;
	}
	headblks = fp->ff_blocks;
	datablks = howmany(fp->ff_size, blksize);
	growsize = datablks * blksize;
	eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
	if (blockHint >= hfsmp->hfs_metazone_start &&
	    blockHint <= hfsmp->hfs_metazone_end)
		eflags |= kEFMetadataMask;
	if (hfs_start_transaction(hfsmp) != 0) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
		return (EINVAL);
	}
	/*
	 * Protect the extents b-tree and the allocation bitmap
	 * during MapFileBlockC and ExtendFileC operations.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
	retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
	retval = MacToVFSError(retval);
	/*
	 * STEP 1 - acquire new allocation blocks.
	 */
	nextallocsave = hfsmp->nextAllocation;
	retval = ExtendFileC(hfsmp, (FCB *)fp, growsize, blockHint, eflags, &newbytes);
	if (eflags & kEFMetadataMask) {
		hfs_lock_mount(hfsmp);
		HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
		MarkVCBDirty(hfsmp);
		hfs_unlock_mount(hfsmp);
	}
	retval = MacToVFSError(retval);

	cp->c_flag |= C_MODIFIED;
	if (newbytes < growsize) {
		retval = ENOSPC;
	} else if (fp->ff_blocks < (headblks + datablks)) {
		printf("hfs_relocate: allocation failed id=%u, vol=%s\n",
		       cp->c_cnid, hfsmp->vcbVN);
		retval = ENOSPC;
	}
	retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
	if (retval) {
		retval = MacToVFSError(retval);
	} else if ((sector_a + 1) == sector_b) {
		retval = ENOSPC;
	} else if ((eflags & kEFMetadataMask) &&
	           ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
	            hfsmp->hfs_metazone_end)) {
		const char * filestr;
		char emptystr = '\0';

		if (cp->c_desc.cd_nameptr != NULL) {
			filestr = (const char *)&cp->c_desc.cd_nameptr[0];
		} else if (vnode_name(vp) != NULL) {
			filestr = vnode_name(vp);
		} else {
			filestr = &emptystr;
		}
	/* Done with system locks and journal for now. */
	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;

	hfs_end_transaction(hfsmp);
	/*
	 * Check to see if failure is due to excessive fragmentation.
	 */
	if ((retval == ENOSPC) &&
	    (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
		hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
	}
	/*
	 * STEP 2 - clone file data into the new allocation blocks.
	 */
	if (vnodetype == VLNK)
		retval = EPERM;
	else if (vnode_issystem(vp))
		retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
	else
		retval = hfs_clonefile(vp, headblks, datablks, blksize);
	/* Start transaction for step 3 or for a restore. */
	if (hfs_start_transaction(hfsmp) != 0) {
	/*
	 * STEP 3 - switch to cloned data and remove old blocks.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
	retval = HeadTruncateFile(hfsmp, (FCB *)fp, headblks);

	hfs_systemfile_unlock(hfsmp, lockflags);
	if (took_trunc_lock)
		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);

	hfs_systemfile_unlock(hfsmp, lockflags);
	/* Push cnode's new extent data to disk. */
	(void) hfs_update(vp, MNT_WAIT);

	if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
		(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
	else
		(void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
	hfs_end_transaction(hfsmp);
	if (fp->ff_blocks == headblks) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
	}
	/*
	 * Give back any newly allocated space.
	 */
	if (lockflags == 0) {
		lockflags = SFL_BITMAP;
		if (overflow_extents(fp))
			lockflags |= SFL_EXTENTS;
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
	}
	(void) TruncateFileC(hfsmp, (FCB *)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
	                     FTOC(fp)->c_fileid, false);

	hfs_systemfile_unlock(hfsmp, lockflags);
	if (took_trunc_lock)
		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
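
/*
 * Illustrative sketch (not part of the driver, never compiled): hfs_relocate()
 * above moves a file by (1) extending the fork so it temporarily owns a second
 * run of blocks, (2) cloning the data into that new run, and (3) head-
 * truncating the original run so only the clone remains.  The toy routine
 * below applies the same three steps to a heap array standing in for a fork's
 * block list; sketch_relocate_blocks() and its parameters are hypothetical.
 */
#if 0	/* illustrative example only */
#include <stdlib.h>
#include <string.h>

/* Returns a buffer holding the "relocated" blocks, or NULL on failure. */
static int *
sketch_relocate_blocks(const int *blocks, size_t nblocks)
{
	/* STEP 1: acquire new space -- the "fork" briefly owns 2 * nblocks. */
	int *fork = malloc(2 * nblocks * sizeof(*fork));
	if (fork == NULL)
		return NULL;
	memcpy(fork, blocks, nblocks * sizeof(*fork));

	/* STEP 2: clone the data from the old run into the new run. */
	memcpy(fork + nblocks, fork, nblocks * sizeof(*fork));

	/* STEP 3: "head truncate" -- drop the old run, keep only the clone. */
	memmove(fork, fork + nblocks, nblocks * sizeof(*fork));
	int *shrunk = realloc(fork, nblocks * sizeof(*fork));
	return (shrunk != NULL) ? shrunk : fork;
}
#endif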
/*
 * Clone a file's data within the file.
 *
 */
static int
hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
{
	writebase = blkstart * blksize;
	copysize = blkcnt * blksize;
	iosize = bufsize = MIN(copysize, 128 * 1024);
	hfs_unlock(VTOC(vp));
#if CONFIG_PROTECT
	if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
		return (error);
	}
#endif /* CONFIG_PROTECT */
	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
		return (ENOMEM);
	}
	auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
	while (offset < copysize) {
		iosize = MIN(copysize - offset, iosize);
		uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, (uintptr_t)bufp, iosize);
		error = cluster_read(vp, auio, copysize, IO_NOCACHE);
		if (error) {
			printf("hfs_clonefile: cluster_read failed - %d\n", error);
			break;
		}
		if (uio_resid(auio) != 0) {
			printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
			error = EIO;
			break;
		}
		uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
		uio_addiov(auio, (uintptr_t)bufp, iosize);
		error = cluster_write(vp, auio, writebase + offset,
		                      writebase + offset + iosize,
		                      uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
		if (error) {
			printf("hfs_clonefile: cluster_write failed - %d\n", error);
			break;
		}
		if (uio_resid(auio) != 0) {
			printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
			error = EIO;
			break;
		}
		offset += iosize;
	}
	if ((blksize & PAGE_MASK)) {
		/*
		 * since the copy may not have started on a PAGE
		 * boundary (or may not have ended on one), we
		 * may have pages left in the cache since NOCACHE
		 * will let partially written pages linger...
		 * lets just flush the entire range to make sure
		 * we don't have any pages left that are beyond
		 * (or intersect) the real LEOF of this file
		 */
		ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
	} else {
		/*
		 * No need to call ubc_sync_range or hfs_invalbuf
		 * since the file was copied using IO_NOCACHE and
		 * the copy was done starting and ending on a page
		 * boundary in the file.
		 */
	}
	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
	hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);

	return (error);
}
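
/*
 * Illustrative sketch (not part of the driver, never compiled): hfs_clonefile()
 * above copies blkcnt * blksize bytes from offset 0 to offset writebase using
 * a single bounce buffer and uncached, synchronous cluster I/O.  The
 * standalone routine below shows the same chunked read-then-write pattern with
 * POSIX pread()/pwrite(); sketch_clone_range() and its parameters are
 * hypothetical.
 */
#if 0	/* illustrative example only */
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

static int
sketch_clone_range(int fd, off_t writebase, off_t copysize)
{
	size_t  bufsize = (copysize < 128 * 1024) ? (size_t)copysize : (128 * 1024);
	char   *bufp = malloc(bufsize);
	off_t   offset = 0;
	int     error = 0;

	if (bufp == NULL)
		return -1;

	while (offset < copysize && error == 0) {
		off_t   remaining = copysize - offset;
		size_t  iosize = (remaining < (off_t)bufsize) ? (size_t)remaining : bufsize;

		/* read the next chunk of the source range... */
		if (pread(fd, bufp, iosize, offset) != (ssize_t)iosize)
			error = -1;
		/* ...and write it at the same relative position in the new range */
		else if (pwrite(fd, bufp, iosize, writebase + offset) != (ssize_t)iosize)
			error = -1;

		offset += iosize;
	}
	free(bufp);
	return error;
}
#endif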
/*
 * Clone a system (metadata) file.
 *
 */
static int
hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
                 kauth_cred_t cred, struct proc *p)
{
	struct buf *bp = NULL;
	daddr64_t start_blk;
	iosize = GetLogicalBlockSize(vp);
	bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
	breadcnt = bufsize / iosize;
	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
		return (ENOMEM);
	}
	start_blk = ((daddr64_t)blkstart * blksize) / iosize;
	last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;
	while (blkno < last_blk) {
		/*
		 * Read up to a megabyte
		 */
		offset = bufp;
		for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
			error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
			if (error) {
				printf("hfs_clonesysfile: meta_bread error %d\n", error);
				goto out;
			}
			if (buf_count(bp) != iosize) {
				printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
				error = EIO;
				goto out;
			}
			bcopy((char *)buf_dataptr(bp), offset, iosize);

			buf_markinvalid(bp);
			buf_brelse(bp);
			bp = NULL;

			offset += iosize;
		}
		/*
		 * Write up to a megabyte
		 */
		offset = bufp;
		for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
			bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
			if (bp == NULL) {
				printf("hfs_clonesysfile: getblk failed on blk %qd\n",
				       start_blk + blkno);
				error = EIO;
				goto out;
			}
			bcopy(offset, (char *)buf_dataptr(bp), iosize);
			error = (int)buf_bwrite(bp);
			bp = NULL;
			if (error)
				goto out;
			offset += iosize;
		}
	}
out:
	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

	error = hfs_fsync(vp, MNT_WAIT, 0, p);