1 /*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* @(#)hfs_readwrite.c 1.0
29 *
30 * (c) 1998-2001 Apple Computer, Inc. All Rights Reserved
31 *
32 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
33 *
34 */
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/resourcevar.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/filedesc.h>
42 #include <sys/stat.h>
43 #include <sys/buf.h>
44 #include <sys/buf_internal.h>
45 #include <sys/proc.h>
46 #include <sys/kauth.h>
47 #include <sys/vnode.h>
48 #include <sys/vnode_internal.h>
49 #include <sys/uio.h>
50 #include <sys/vfs_context.h>
51 #include <sys/fsevents.h>
52 #include <kern/kalloc.h>
53 #include <sys/disk.h>
54 #include <sys/sysctl.h>
55 #include <sys/fsctl.h>
56 #include <sys/mount_internal.h>
57 #include <sys/file_internal.h>
58
59 #include <miscfs/specfs/specdev.h>
60
61 #include <sys/ubc.h>
62 #include <sys/ubc_internal.h>
63
64 #include <vm/vm_pageout.h>
65 #include <vm/vm_kern.h>
66
67 #include <sys/kdebug.h>
68
69 #include "hfs.h"
70 #include "hfs_attrlist.h"
71 #include "hfs_endian.h"
72 #include "hfs_fsctl.h"
73 #include "hfs_quota.h"
74 #include "hfscommon/headers/FileMgrInternal.h"
75 #include "hfscommon/headers/BTreesInternal.h"
76 #include "hfs_cnode.h"
77 #include "hfs_dbg.h"
78
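/*
 * Descriptive note on the macro below: can_cluster(size) is true when
 * `size` is a whole multiple of 4096 bytes and no larger than MAXPHYSIO/2,
 * i.e. the transfer is page-aligned and small enough to be handled by the
 * cluster I/O layer.
 */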
79 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
80
81 enum {
82 MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */
83 };
84
85 /* from bsd/hfs/hfs_vfsops.c */
86 extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
87
88 static int hfs_clonefile(struct vnode *, int, int, int);
89 static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
90 static int hfs_minorupdate(struct vnode *vp);
91 static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
92
93 /* from bsd/hfs/hfs_vnops.c */
94 extern decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp);
95
96
97
98 int flush_cache_on_write = 0;
99 SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
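/*
 * The CTLFLAG_RW sysctl above is published as kern.flush_cache_on_write,
 * so the behavior can be toggled at run time, e.g.
 * "sysctl -w kern.flush_cache_on_write=1".  It is consulted in
 * hfs_vnop_write() after uncached writes.
 */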
100
101 /*
102 * Read data from a file.
103 */
104 int
105 hfs_vnop_read(struct vnop_read_args *ap)
106 {
107 /*
108 struct vnop_read_args {
109 struct vnodeop_desc *a_desc;
110 vnode_t a_vp;
111 struct uio *a_uio;
112 int a_ioflag;
113 vfs_context_t a_context;
114 };
115 */
116
117 uio_t uio = ap->a_uio;
118 struct vnode *vp = ap->a_vp;
119 struct cnode *cp;
120 struct filefork *fp;
121 struct hfsmount *hfsmp;
122 off_t filesize;
123 off_t filebytes;
124 off_t start_resid = uio_resid(uio);
125 off_t offset = uio_offset(uio);
126 int retval = 0;
127 int took_truncate_lock = 0;
128 int io_throttle = 0;
129 int throttled_count = 0;
130
131 /* Preflight checks */
132 if (!vnode_isreg(vp)) {
133 /* can only read regular files */
134 if (vnode_isdir(vp))
135 return (EISDIR);
136 else
137 return (EPERM);
138 }
139 if (start_resid == 0)
140 return (0); /* Nothing left to do */
141 if (offset < 0)
142 return (EINVAL); /* can't read from a negative offset */
143
144 if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
145 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
146 /* Don't allow unencrypted I/O requests from user space */
147 return EPERM;
148 }
149
150
151
152 #if HFS_COMPRESSION
153 if (VNODE_IS_RSRC(vp)) {
154 if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
155 return 0;
156 }
157 /* otherwise read the resource fork normally */
158 } else {
159 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
160 if (compressed) {
161 retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
162 if (compressed) {
163 if (retval == 0) {
164 /* successful read, update the access time */
165 VTOC(vp)->c_touch_acctime = TRUE;
166
167 /* compressed files are not hot file candidates */
168 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
169 VTOF(vp)->ff_bytesread = 0;
170 }
171 }
172 return retval;
173 }
174 /* otherwise the file was converted back to a regular file while we were reading it */
175 retval = 0;
176 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
177 int error;
178
179 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
180 if (error) {
181 return error;
182 }
183
184 }
185 }
186 #endif /* HFS_COMPRESSION */
187
188 cp = VTOC(vp);
189 fp = VTOF(vp);
190 hfsmp = VTOHFS(vp);
191
192 #if CONFIG_PROTECT
193 if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
194 goto exit;
195 }
196 #endif
197
198 /*
199 * If this read request originated from a syscall (as opposed to
200 * an in-kernel page fault or something), then set it up for
201 * throttle checks
202 */
203 if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
204 io_throttle = IO_RETURN_ON_THROTTLE;
205 }
206
207 read_again:
208
209 /* Protect against a size change. */
210 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
211 took_truncate_lock = 1;
212
213 filesize = fp->ff_size;
214 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
215
216 /*
217 * Check the file size. Note that per POSIX spec, we return 0 at
218 * file EOF, so attempting a read at an offset that is too big
219 * should just return 0 on HFS+. Since the return value was initialized
220 * to 0 above, we just jump to exit. HFS Standard has its own behavior.
221 */
222 if (offset > filesize) {
223 if ((hfsmp->hfs_flags & HFS_STANDARD) &&
224 (offset > (off_t)MAXHFSFILESIZE)) {
225 retval = EFBIG;
226 }
227 goto exit;
228 }
229
230 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START,
231 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
232
233 retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle);
234
235 cp->c_touch_acctime = TRUE;
236
237 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END,
238 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
239
240 /*
241 * Keep track of blocks read
242 */
243 if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
244 int took_cnode_lock = 0;
245 off_t bytesread;
246
247 bytesread = start_resid - uio_resid(uio);
248
249 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
250 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
251 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
252 took_cnode_lock = 1;
253 }
254 /*
255 * If this file hasn't been seen since the start of
256 * the current sampling period then start over.
257 */
258 if (cp->c_atime < hfsmp->hfc_timebase) {
259 struct timeval tv;
260
261 fp->ff_bytesread = bytesread;
262 microtime(&tv);
263 cp->c_atime = tv.tv_sec;
264 } else {
265 fp->ff_bytesread += bytesread;
266 }
267 if (took_cnode_lock)
268 hfs_unlock(cp);
269 }
270 exit:
271 if (took_truncate_lock) {
272 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
273 }
274 if (retval == EAGAIN) {
275 throttle_lowpri_io(1);
276 throttled_count++;
277
278 retval = 0;
279 goto read_again;
280 }
281 if (throttled_count) {
282 throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
283 }
284 return (retval);
285 }
286
287 /*
288 * Write data to a file.
289 */
290 int
291 hfs_vnop_write(struct vnop_write_args *ap)
292 {
293 uio_t uio = ap->a_uio;
294 struct vnode *vp = ap->a_vp;
295 struct cnode *cp;
296 struct filefork *fp;
297 struct hfsmount *hfsmp;
298 kauth_cred_t cred = NULL;
299 off_t origFileSize;
300 off_t writelimit;
301 off_t bytesToAdd = 0;
302 off_t actualBytesAdded;
303 off_t filebytes;
304 off_t offset;
305 ssize_t resid;
306 int eflags;
307 int ioflag = ap->a_ioflag;
308 int retval = 0;
309 int lockflags;
310 int cnode_locked = 0;
311 int partialwrite = 0;
312 int do_snapshot = 1;
313 time_t orig_ctime=VTOC(vp)->c_ctime;
314 int took_truncate_lock = 0;
315 int io_return_on_throttle = 0;
316 int throttled_count = 0;
317 struct rl_entry *invalid_range;
318
319 #if HFS_COMPRESSION
320 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
321 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
322 switch(state) {
323 case FILE_IS_COMPRESSED:
324 return EACCES;
325 case FILE_IS_CONVERTING:
326 /* if FILE_IS_CONVERTING, we allow writes but do not
327 bother with snapshots or else we will deadlock.
328 */
329 do_snapshot = 0;
330 break;
331 default:
332 printf("invalid state %d for compressed file\n", state);
333 /* fall through */
334 }
335 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
336 int error;
337
338 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
339 if (error != 0) {
340 return error;
341 }
342 }
343
344 if (do_snapshot) {
345 check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
346 }
347
348 #endif
349
350 if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
351 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
352 /* Don't allow unencrypted I/O requests from user space */
353 return EPERM;
354 }
355
356
357 resid = uio_resid(uio);
358 offset = uio_offset(uio);
359
360 if (offset < 0)
361 return (EINVAL);
362 if (resid == 0)
363 return (E_NONE);
364 if (!vnode_isreg(vp))
365 return (EPERM); /* Can only write regular files */
366
367 cp = VTOC(vp);
368 fp = VTOF(vp);
369 hfsmp = VTOHFS(vp);
370
371 #if CONFIG_PROTECT
372 if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
373 goto exit;
374 }
375 #endif
376
377 eflags = kEFDeferMask; /* defer file block allocations */
378 #if HFS_SPARSE_DEV
379 /*
380 * When the underlying device is sparse and space
381 * is low (< 8MB), stop doing delayed allocations
382 * and begin doing synchronous I/O.
383 */
384 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
385 (hfs_freeblks(hfsmp, 0) < 2048)) {
386 eflags &= ~kEFDeferMask;
387 ioflag |= IO_SYNC;
388 }
389 #endif /* HFS_SPARSE_DEV */
390
391 if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) ==
392 (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
393 io_return_on_throttle = IO_RETURN_ON_THROTTLE;
394 }
395
396 again:
397 /*
398 * Protect against a size change.
399 *
400 * Note: If took_truncate_lock is true, then we previously got the lock shared
401 * but needed to upgrade to exclusive. So try getting it exclusive from the
402 * start.
403 */
404 if (ioflag & IO_APPEND || took_truncate_lock) {
405 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
406 }
407 else {
408 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
409 }
410 took_truncate_lock = 1;
411
412 /* Update UIO */
413 if (ioflag & IO_APPEND) {
414 uio_setoffset(uio, fp->ff_size);
415 offset = fp->ff_size;
416 }
417 if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
418 retval = EPERM;
419 goto exit;
420 }
421
422 origFileSize = fp->ff_size;
423 writelimit = offset + resid;
424 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
425
426 /*
427 * We may need an exclusive truncate lock for several reasons, all
428 * of which are because we may be writing to a (portion of a) block
429 * for the first time, and we need to make sure no readers see the
430 * prior, uninitialized contents of the block. The cases are:
431 *
432 * 1. We have unallocated (delayed allocation) blocks. We may be
433 * allocating new blocks to the file and writing to them.
434 * (A more precise check would be whether the range we're writing
435 * to contains delayed allocation blocks.)
436 * 2. We need to extend the file. The bytes between the old EOF
437 * and the new EOF are not yet initialized. This is important
438 * even if we're not allocating new blocks to the file. If the
439 * old EOF and new EOF are in the same block, we still need to
440 * protect that range of bytes until they are written for the
441 * first time.
442 * 3. The write overlaps some invalid ranges (delayed zero fill; that
443 * part of the file has been allocated, but not yet written).
444 *
445 * If we had a shared lock with the above cases, we need to try to upgrade
446 * to an exclusive lock. If the upgrade fails, we will lose the shared
447 * lock, and will need to take the truncate lock again; the took_truncate_lock
448 * flag will still be set, causing us to try for an exclusive lock next time.
449 *
450 * NOTE: Testing for #3 (delayed zero fill) needs to be done while the cnode
451 * lock is held, since it protects the range lists.
452 */
453 if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
454 ((fp->ff_unallocblocks != 0) ||
455 (writelimit > origFileSize))) {
456 if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
457 /*
458 * Lock upgrade failed and we lost our shared lock, try again.
459 * Note: we do not set took_truncate_lock=0 here. Leaving it
460 * set to 1 will cause us to try to get the lock exclusive.
461 */
462 goto again;
463 }
464 else {
465 /* Store the owner in the c_truncatelockowner field if we successfully upgrade */
466 cp->c_truncatelockowner = current_thread();
467 }
468 }
469
470 if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
471 goto exit;
472 }
473 cnode_locked = 1;
474
475 /*
476 * Now that we have the cnode lock, see if there are delayed zero fill ranges
477 * overlapping our write. If so, we need the truncate lock exclusive (see above).
478 */
479 if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
480 (rl_scan(&fp->ff_invalidranges, offset, writelimit-1, &invalid_range) != RL_NOOVERLAP)) {
481 /*
482 * When testing, it appeared that calling lck_rw_lock_shared_to_exclusive() causes
483 * a deadlock, rather than simply returning failure. (That is, it apparently does
484 * not behave like a "try_lock"). Since this condition is rare, just drop the
485 * cnode lock and try again. Since took_truncate_lock is set, we will
486 * automatically take the truncate lock exclusive.
487 */
488 hfs_unlock(cp);
489 cnode_locked = 0;
490 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
491 goto again;
492 }
493
494 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START,
495 (int)offset, uio_resid(uio), (int)fp->ff_size,
496 (int)filebytes, 0);
497
498 /* Check if we do not need to extend the file */
499 if (writelimit <= filebytes) {
500 goto sizeok;
501 }
502
503 cred = vfs_context_ucred(ap->a_context);
504 bytesToAdd = writelimit - filebytes;
505
506 #if QUOTA
507 retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
508 cred, 0);
509 if (retval)
510 goto exit;
511 #endif /* QUOTA */
512
513 if (hfs_start_transaction(hfsmp) != 0) {
514 retval = EINVAL;
515 goto exit;
516 }
517
518 while (writelimit > filebytes) {
519 bytesToAdd = writelimit - filebytes;
520 if (cred && suser(cred, NULL) != 0)
521 eflags |= kEFReserveMask;
522
523 /* Protect extents b-tree and allocation bitmap */
524 lockflags = SFL_BITMAP;
525 if (overflow_extents(fp))
526 lockflags |= SFL_EXTENTS;
527 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
528
529 /* Files that are changing size are not hot file candidates. */
530 if (hfsmp->hfc_stage == HFC_RECORDING) {
531 fp->ff_bytesread = 0;
532 }
533 retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
534 0, eflags, &actualBytesAdded));
535
536 hfs_systemfile_unlock(hfsmp, lockflags);
537
538 if ((actualBytesAdded == 0) && (retval == E_NONE))
539 retval = ENOSPC;
540 if (retval != E_NONE)
541 break;
542 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
543 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE,
544 (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
545 }
546 (void) hfs_update(vp, TRUE);
547 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
548 (void) hfs_end_transaction(hfsmp);
549
550 /*
551 * If we didn't grow the file enough, try a partial write.
552 * POSIX expects this behavior.
553 */
554 if ((retval == ENOSPC) && (filebytes > offset)) {
555 retval = 0;
556 partialwrite = 1;
557 uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
558 resid -= bytesToAdd;
559 writelimit = filebytes;
560 }
561 sizeok:
562 if (retval == E_NONE) {
563 off_t filesize;
564 off_t zero_off;
565 off_t tail_off;
566 off_t inval_start;
567 off_t inval_end;
568 off_t io_start;
569 int lflag;
570
571 if (writelimit > fp->ff_size)
572 filesize = writelimit;
573 else
574 filesize = fp->ff_size;
575
576 lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
577
578 if (offset <= fp->ff_size) {
579 zero_off = offset & ~PAGE_MASK_64;
580
581 /* Check whether the area between zero_off and the start of the
582 transfer is invalid and should be zero-filled as part of the
583 transfer:
584 */
585 if (offset > zero_off) {
586 if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
587 lflag |= IO_HEADZEROFILL;
588 }
589 } else {
590 off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;
591
592 /* The bytes between fp->ff_size and uio->uio_offset must never be
593 read without being zeroed. The current last block is filled with zeroes
594 if it holds valid data; in all cases we merely do a little bookkeeping
595 to track the area from the end of the current last page to the start of
596 the area actually written. For the same reason only the bytes up to the
597 start of the page where this write will start are invalidated; any remainder
598 before uio->uio_offset is explicitly zeroed as part of the cluster_write.
599
600 Note that inval_start, the start of the page after the current EOF,
601 may be past the start of the write, in which case the zeroing
602 will be handled by the cluster_write of the actual data.
603 */
604 inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
605 inval_end = offset & ~PAGE_MASK_64;
606 zero_off = fp->ff_size;
607
608 if ((fp->ff_size & PAGE_MASK_64) &&
609 (rl_scan(&fp->ff_invalidranges,
610 eof_page_base,
611 fp->ff_size - 1,
612 &invalid_range) != RL_NOOVERLAP)) {
613 /* The page containing the EOF is not valid, so the
614 entire page must be made inaccessible now. If the write
615 starts on a page beyond the page containing the eof
616 (inval_end > eof_page_base), add the
617 whole page to the range to be invalidated. Otherwise
618 (i.e. if the write starts on the same page), zero-fill
619 the entire page explicitly now:
620 */
621 if (inval_end > eof_page_base) {
622 inval_start = eof_page_base;
623 } else {
624 zero_off = eof_page_base;
625 };
626 };
627
628 if (inval_start < inval_end) {
629 struct timeval tv;
630 /* There's some range of data that's going to be marked invalid */
631
632 if (zero_off < inval_start) {
633 /* The pages between inval_start and inval_end are going to be invalidated,
634 and the actual write will start on a page past inval_end. Now's the last
635 chance to zero-fill the page containing the EOF:
636 */
637 hfs_unlock(cp);
638 cnode_locked = 0;
639 retval = cluster_write(vp, (uio_t) 0,
640 fp->ff_size, inval_start,
641 zero_off, (off_t)0,
642 lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
643 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
644 cnode_locked = 1;
645 if (retval) goto ioerr_exit;
646 offset = uio_offset(uio);
647 };
648
649 /* Mark the remaining area of the newly allocated space as invalid: */
650 rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
651 microuptime(&tv);
652 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
653 zero_off = fp->ff_size = inval_end;
654 };
655
656 if (offset > zero_off) lflag |= IO_HEADZEROFILL;
657 };
658
659 /* Check to see whether the area between the end of the write and the end of
660 the page it falls in is invalid and should be zero-filled as part of the transfer:
661 */
662 tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
663 if (tail_off > filesize) tail_off = filesize;
664 if (tail_off > writelimit) {
665 if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
666 lflag |= IO_TAILZEROFILL;
667 };
668 };
669
670 /*
671 * if the write starts beyond the current EOF (possibly advanced in the
672 * zeroing of the last block, above), then we'll zero fill from the current EOF
673 * to where the write begins:
674 *
675 * NOTE: If (and ONLY if) the portion of the file about to be written is
676 * before the current EOF it might be marked as invalid now and must be
677 * made readable (removed from the invalid ranges) before cluster_write
678 * tries to write it:
679 */
680 io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
681 if (io_start < fp->ff_size) {
682 off_t io_end;
683
684 io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
685 rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
686 };
687
688 hfs_unlock(cp);
689 cnode_locked = 0;
690
691 /*
692 * We need to tell UBC the fork's new size BEFORE calling
693 * cluster_write, in case any of the new pages need to be
694 * paged out before cluster_write completes (which does happen
695 * in embedded systems due to extreme memory pressure).
696 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
697 * will be, so that it can pass that on to cluster_pageout, and
698 * allow those pageouts.
699 *
700 * We don't update ff_size yet since we don't want pageins to
701 * be able to see uninitialized data between the old and new
702 * EOF, until cluster_write has completed and initialized that
703 * part of the file.
704 *
705 * The vnode pager relies on the file size last given to UBC via
706 * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or
707 * ff_size (whichever is larger). NOTE: ff_new_size is always
708 * zero, unless we are extending the file via write.
709 */
710 if (filesize > fp->ff_size) {
711 fp->ff_new_size = filesize;
712 ubc_setsize(vp, filesize);
713 }
714 retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
715 tail_off, lflag | IO_NOZERODIRTY | io_return_on_throttle);
716 if (retval) {
717 fp->ff_new_size = 0; /* no longer extending; use ff_size */
718
719 if (retval == EAGAIN) {
720 /*
721 * EAGAIN indicates that we still have I/O to do, but
722 * that we now need to be throttled
723 */
724 if (resid != uio_resid(uio)) {
725 /*
726 * did manage to do some I/O before returning EAGAIN
727 */
728 resid = uio_resid(uio);
729 offset = uio_offset(uio);
730
731 cp->c_touch_chgtime = TRUE;
732 cp->c_touch_modtime = TRUE;
733 hfs_incr_gencount(cp);
734 }
735 if (filesize > fp->ff_size) {
736 /*
737 * we called ubc_setsize before the call to
738 * cluster_write... since we only partially
739 * completed the I/O, we need to
740 * re-adjust our idea of the filesize based
741 * on our interim EOF
742 */
743 ubc_setsize(vp, offset);
744
745 fp->ff_size = offset;
746 }
747 goto exit;
748 }
749 if (filesize > origFileSize) {
750 ubc_setsize(vp, origFileSize);
751 }
752 goto ioerr_exit;
753 }
754
755 if (filesize > origFileSize) {
756 fp->ff_size = filesize;
757
758 /* Files that are changing size are not hot file candidates. */
759 if (hfsmp->hfc_stage == HFC_RECORDING) {
760 fp->ff_bytesread = 0;
761 }
762 }
763 fp->ff_new_size = 0; /* ff_size now has the correct size */
764 }
765 if (partialwrite) {
766 uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
767 resid += bytesToAdd;
768 }
769
770 // XXXdbg - see radar 4871353 for more info
771 {
772 if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
773 VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
774 }
775 }
776
777 ioerr_exit:
778 if (resid > uio_resid(uio)) {
779 if (!cnode_locked) {
780 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
781 cnode_locked = 1;
782 }
783
784 cp->c_touch_chgtime = TRUE;
785 cp->c_touch_modtime = TRUE;
786 hfs_incr_gencount(cp);
787
788 /*
789 * If we successfully wrote any data, and we are not the superuser
790 * we clear the setuid and setgid bits as a precaution against
791 * tampering.
792 */
793 if (cp->c_mode & (S_ISUID | S_ISGID)) {
794 cred = vfs_context_ucred(ap->a_context);
795 if (cred && suser(cred, NULL)) {
796 cp->c_mode &= ~(S_ISUID | S_ISGID);
797 }
798 }
799 }
800 if (retval) {
801 if (ioflag & IO_UNIT) {
802 (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
803 0, ap->a_context);
804 uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
805 uio_setresid(uio, resid);
806 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
807 }
808 } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio)))
809 retval = hfs_update(vp, TRUE);
810
811 /* Updating vcbWrCnt doesn't need to be atomic. */
812 hfsmp->vcbWrCnt++;
813
814 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END,
815 (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
816 exit:
817 if (cnode_locked)
818 hfs_unlock(cp);
819
820 if (took_truncate_lock) {
821 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
822 }
823 if (retval == EAGAIN) {
824 throttle_lowpri_io(1);
825 throttled_count++;
826
827 retval = 0;
828 goto again;
829 }
830 if (throttled_count) {
831 throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
832 }
833 return (retval);
834 }
835
836 /* support for the "bulk-access" fcntl */
837
838 #define CACHE_LEVELS 16
839 #define NUM_CACHE_ENTRIES (64*16)
840 #define PARENT_IDS_FLAG 0x100
841
842 struct access_cache {
843 int numcached;
844 int cachehits; /* these two for statistics gathering */
845 int lookups;
846 unsigned int *acache;
847 unsigned char *haveaccess;
848 };
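/*
 * acache[] holds the cached directory cnids in ascending order (see
 * cache_binSearch/add_node below); haveaccess[] is the parallel array of
 * results, 0 meaning access was granted and a non-zero value holding the
 * errno from the earlier check.
 */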
849
850 struct access_t {
851 uid_t uid; /* IN: effective user id */
852 short flags; /* IN: access requested (i.e. R_OK) */
853 short num_groups; /* IN: number of groups user belongs to */
854 int num_files; /* IN: number of files to process */
855 int *file_ids; /* IN: array of file ids */
856 gid_t *groups; /* IN: array of groups */
857 short *access; /* OUT: access info for each file (0 for 'has access') */
858 } __attribute__((unavailable)); // this structure is for reference purposes only
859
860 struct user32_access_t {
861 uid_t uid; /* IN: effective user id */
862 short flags; /* IN: access requested (i.e. R_OK) */
863 short num_groups; /* IN: number of groups user belongs to */
864 int num_files; /* IN: number of files to process */
865 user32_addr_t file_ids; /* IN: array of file ids */
866 user32_addr_t groups; /* IN: array of groups */
867 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
868 };
869
870 struct user64_access_t {
871 uid_t uid; /* IN: effective user id */
872 short flags; /* IN: access requested (i.e. R_OK) */
873 short num_groups; /* IN: number of groups user belongs to */
874 int num_files; /* IN: number of files to process */
875 user64_addr_t file_ids; /* IN: array of file ids */
876 user64_addr_t groups; /* IN: array of groups */
877 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
878 };
879
880
881 // these are the "extended" versions of the above structures
882 // note that it is crucial that they be sized differently from
883 // the regular versions
884 struct ext_access_t {
885 uint32_t flags; /* IN: access requested (i.e. R_OK) */
886 uint32_t num_files; /* IN: number of files to process */
887 uint32_t map_size; /* IN: size of the bit map */
888 uint32_t *file_ids; /* IN: Array of file ids */
889 char *bitmap; /* OUT: hash-bitmap of interesting directory ids */
890 short *access; /* OUT: access info for each file (0 for 'has access') */
891 uint32_t num_parents; /* future use */
892 cnid_t *parents; /* future use */
893 } __attribute__((unavailable)); // this structure is for reference purposes only
894
895 struct user32_ext_access_t {
896 uint32_t flags; /* IN: access requested (i.e. R_OK) */
897 uint32_t num_files; /* IN: number of files to process */
898 uint32_t map_size; /* IN: size of the bit map */
899 user32_addr_t file_ids; /* IN: Array of file ids */
900 user32_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */
901 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
902 uint32_t num_parents; /* future use */
903 user32_addr_t parents; /* future use */
904 };
905
906 struct user64_ext_access_t {
907 uint32_t flags; /* IN: access requested (i.e. R_OK) */
908 uint32_t num_files; /* IN: number of files to process */
909 uint32_t map_size; /* IN: size of the bit map */
910 user64_addr_t file_ids; /* IN: array of file ids */
911 user64_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */
912 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
913 uint32_t num_parents;/* future use */
914 user64_addr_t parents;/* future use */
915 };
916
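/*
 * A minimal user-space sketch of calling the bulk-access fsctl that the
 * structures above describe.  It assumes the HFS_EXT_BULKACCESS_FSCTL
 * request and a user-visible ext_access_t-style struct as published in
 * hfs_fsctl.h; treat those names as illustrative, not authoritative.
 *
 *	#include <stdint.h>
 *	#include <unistd.h>
 *	#include <sys/fsctl.h>
 *	#include <hfs/hfs_fsctl.h>
 *
 *	// Ask the volume mounted at 'volpath' whether the caller may read
 *	// each of the 'n' catalog node ids in 'ids'.  On success, each
 *	// out[i] is 0 (access granted) or an errno such as EACCES.
 *	static int bulk_check_read_access(const char *volpath,
 *	                                  uint32_t *ids, uint32_t n, short *out)
 *	{
 *		struct ext_access_t args = {
 *			.flags       = R_OK,   // access being requested
 *			.num_files   = n,      // the kernel caps this at 1024
 *			.map_size    = 0,      // no parent-dir bitmap wanted
 *			.file_ids    = ids,
 *			.bitmap      = NULL,
 *			.access      = out,
 *			.num_parents = 0,
 *			.parents     = NULL,
 *		};
 *		// fsctl() on the volume root dispatches into hfs_vnop_ioctl()
 *		return fsctl(volpath, HFS_EXT_BULKACCESS_FSCTL, &args, 0);
 *	}
 */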
917
918 /*
919 * Perform a binary search for the given parent_id. Return value is
920 * the index if there is a match. If no_match_indexp is non-NULL it
921 * will be assigned with the index to insert the item (even if it was
922 * not found).
923 */
924 static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
925 {
926 int index=-1;
927 unsigned int lo=0;
928
929 do {
930 unsigned int mid = ((hi - lo)/2) + lo;
931 unsigned int this_id = array[mid];
932
933 if (parent_id == this_id) {
934 hi = mid;
935 break;
936 }
937
938 if (parent_id < this_id) {
939 hi = mid;
940 continue;
941 }
942
943 if (parent_id > this_id) {
944 lo = mid + 1;
945 continue;
946 }
947 } while(lo < hi);
948
949 /* check if lo and hi converged on the match */
950 if (parent_id == array[hi]) {
951 index = hi;
952 }
953
954 if (no_match_indexp) {
955 *no_match_indexp = hi;
956 }
957
958 return index;
959 }
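/*
 * Worked example of the search above: with the array holding {10, 25, 40}
 * and hi == 2, looking up 25 returns index 1; looking up 30 returns -1 with
 * *no_match_indexp set to 2, i.e. the slot where 30 would be inserted to
 * keep the array sorted.
 */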
960
961
962 static int
963 lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
964 {
965 unsigned int hi;
966 int matches = 0;
967 int index, no_match_index;
968
969 if (cache->numcached == 0) {
970 *indexp = 0;
971 return 0; // table is empty, so insert at index=0 and report no match
972 }
973
974 if (cache->numcached > NUM_CACHE_ENTRIES) {
975 cache->numcached = NUM_CACHE_ENTRIES;
976 }
977
978 hi = cache->numcached - 1;
979
980 index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
981
982 /* if no existing entry found, find index for new one */
983 if (index == -1) {
984 index = no_match_index;
985 matches = 0;
986 } else {
987 matches = 1;
988 }
989
990 *indexp = index;
991 return matches;
992 }
993
994 /*
995 * Add a node to the access_cache at the given index (or do a lookup first
996 * to find the index if -1 is passed in). We currently do a replace rather
997 * than an insert if the cache is full.
998 */
999 static void
1000 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
1001 {
1002 int lookup_index = -1;
1003
1004 /* need to do a lookup first if -1 passed for index */
1005 if (index == -1) {
1006 if (lookup_bucket(cache, &lookup_index, nodeID)) {
1007 if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
1008 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
1009 cache->haveaccess[lookup_index] = access;
1010 }
1011
1012 /* mission accomplished */
1013 return;
1014 } else {
1015 index = lookup_index;
1016 }
1017
1018 }
1019
1020 /* if the cache is full, do a replace rather than an insert */
1021 if (cache->numcached >= NUM_CACHE_ENTRIES) {
1022 cache->numcached = NUM_CACHE_ENTRIES-1;
1023
1024 if (index > cache->numcached) {
1025 index = cache->numcached;
1026 }
1027 }
1028
1029 if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
1030 index++;
1031 }
1032
1033 if (index >= 0 && index < cache->numcached) {
1034 /* only do bcopy if we're inserting */
1035 bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
1036 bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
1037 }
1038
1039 cache->acache[index] = nodeID;
1040 cache->haveaccess[index] = access;
1041 cache->numcached++;
1042 }
1043
1044
1045 struct cinfo {
1046 uid_t uid;
1047 gid_t gid;
1048 mode_t mode;
1049 cnid_t parentcnid;
1050 u_int16_t recflags;
1051 };
1052
1053 static int
1054 snoop_callback(const cnode_t *cp, void *arg)
1055 {
1056 struct cinfo *cip = arg;
1057
1058 cip->uid = cp->c_uid;
1059 cip->gid = cp->c_gid;
1060 cip->mode = cp->c_mode;
1061 cip->parentcnid = cp->c_parentcnid;
1062 cip->recflags = cp->c_attr.ca_recflags;
1063
1064 return (0);
1065 }
1066
1067 /*
1068 * Look up the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
1069 * isn't incore, then go to the catalog.
1070 */
1071 static int
1072 do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
1073 struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
1074 {
1075 int error = 0;
1076
1077 /* if this id matches the one the fsctl was called with, skip the lookup */
1078 if (cnid == skip_cp->c_cnid) {
1079 cnattrp->ca_uid = skip_cp->c_uid;
1080 cnattrp->ca_gid = skip_cp->c_gid;
1081 cnattrp->ca_mode = skip_cp->c_mode;
1082 cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
1083 keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
1084 } else {
1085 struct cinfo c_info;
1086
1087 /* otherwise, check the cnode hash in case the file/dir is incore */
1088 error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info);
1089
1090 if (error == EACCES) {
1091 // File is deleted
1092 return ENOENT;
1093 } else if (!error) {
1094 cnattrp->ca_uid = c_info.uid;
1095 cnattrp->ca_gid = c_info.gid;
1096 cnattrp->ca_mode = c_info.mode;
1097 cnattrp->ca_recflags = c_info.recflags;
1098 keyp->hfsPlus.parentID = c_info.parentcnid;
1099 } else {
1100 int lockflags;
1101
1102 if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
1103 throttle_lowpri_io(1);
1104
1105 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
1106
1107 /* lookup this cnid in the catalog */
1108 error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
1109
1110 hfs_systemfile_unlock(hfsmp, lockflags);
1111
1112 cache->lookups++;
1113 }
1114 }
1115
1116 return (error);
1117 }
1118
1119
1120 /*
1121 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
1122 * up to CACHE_LEVELS as we progress towards the root.
1123 */
1124 static int
1125 do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
1126 struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
1127 struct vfs_context *my_context,
1128 char *bitmap,
1129 uint32_t map_size,
1130 cnid_t* parents,
1131 uint32_t num_parents)
1132 {
1133 int myErr = 0;
1134 int myResult;
1135 HFSCatalogNodeID thisNodeID;
1136 unsigned int myPerms;
1137 struct cat_attr cnattr;
1138 int cache_index = -1, scope_index = -1, scope_idx_start = -1;
1139 CatalogKey catkey;
1140
1141 int i = 0, ids_to_cache = 0;
1142 int parent_ids[CACHE_LEVELS];
1143
1144 thisNodeID = nodeID;
1145 while (thisNodeID >= kRootDirID) {
1146 myResult = 0; /* default to "no access" */
1147
1148 /* check the cache before resorting to hitting the catalog */
1149
1150 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
1151 * to look any further after hitting cached dir */
1152
1153 if (lookup_bucket(cache, &cache_index, thisNodeID)) {
1154 cache->cachehits++;
1155 myErr = cache->haveaccess[cache_index];
1156 if (scope_index != -1) {
1157 if (myErr == ESRCH) {
1158 myErr = 0;
1159 }
1160 } else {
1161 scope_index = 0; // so we'll just use the cache result
1162 scope_idx_start = ids_to_cache;
1163 }
1164 myResult = (myErr == 0) ? 1 : 0;
1165 goto ExitThisRoutine;
1166 }
1167
1168
1169 if (parents) {
1170 int tmp;
1171 tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
1172 if (scope_index == -1)
1173 scope_index = tmp;
1174 if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
1175 scope_idx_start = ids_to_cache;
1176 }
1177 }
1178
1179 /* remember which parents we want to cache */
1180 if (ids_to_cache < CACHE_LEVELS) {
1181 parent_ids[ids_to_cache] = thisNodeID;
1182 ids_to_cache++;
1183 }
1184 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
1185 if (bitmap && map_size) {
1186 bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
1187 }
1188
1189
1190 /* do the lookup (checks the cnode hash, then the catalog) */
1191 myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
1192 if (myErr) {
1193 goto ExitThisRoutine; /* no access */
1194 }
1195
1196 /* Root always gets access. */
1197 if (suser(myp_ucred, NULL) == 0) {
1198 thisNodeID = catkey.hfsPlus.parentID;
1199 myResult = 1;
1200 continue;
1201 }
1202
1203 // if the thing has acl's, do the full permission check
1204 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1205 struct vnode *vp;
1206
1207 /* get the vnode for this cnid */
1208 myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
1209 if ( myErr ) {
1210 myResult = 0;
1211 goto ExitThisRoutine;
1212 }
1213
1214 thisNodeID = VTOC(vp)->c_parentcnid;
1215
1216 hfs_unlock(VTOC(vp));
1217
1218 if (vnode_vtype(vp) == VDIR) {
1219 myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
1220 } else {
1221 myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
1222 }
1223
1224 vnode_put(vp);
1225 if (myErr) {
1226 myResult = 0;
1227 goto ExitThisRoutine;
1228 }
1229 } else {
1230 unsigned int flags;
1231 int mode = cnattr.ca_mode & S_IFMT;
1232 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);
1233
1234 if (mode == S_IFDIR) {
1235 flags = R_OK | X_OK;
1236 } else {
1237 flags = R_OK;
1238 }
1239 if ( (myPerms & flags) != flags) {
1240 myResult = 0;
1241 myErr = EACCES;
1242 goto ExitThisRoutine; /* no access */
1243 }
1244
1245 /* up the hierarchy we go */
1246 thisNodeID = catkey.hfsPlus.parentID;
1247 }
1248 }
1249
1250 /* if here, we have access to this node */
1251 myResult = 1;
1252
1253 ExitThisRoutine:
1254 if (parents && myErr == 0 && scope_index == -1) {
1255 myErr = ESRCH;
1256 }
1257
1258 if (myErr) {
1259 myResult = 0;
1260 }
1261 *err = myErr;
1262
1263 /* cache the parent directory(ies) */
1264 for (i = 0; i < ids_to_cache; i++) {
1265 if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
1266 add_node(cache, -1, parent_ids[i], ESRCH);
1267 } else {
1268 add_node(cache, -1, parent_ids[i], myErr);
1269 }
1270 }
1271
1272 return (myResult);
1273 }
1274
1275 static int
1276 do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
1277 struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
1278 {
1279 boolean_t is64bit;
1280
1281 /*
1282 * NOTE: on entry, the vnode has an io_ref. In case this vnode
1283 * happens to be in our list of file_ids, we'll note it and
1284 * avoid calling hfs_chashget_nowait() on that id, as that
1285 * will cause a "locking against myself" panic.
1286 */
1287 Boolean check_leaf = true;
1288
1289 struct user64_ext_access_t *user_access_structp;
1290 struct user64_ext_access_t tmp_user_access;
1291 struct access_cache cache;
1292
1293 int error = 0, prev_parent_check_ok=1;
1294 unsigned int i;
1295
1296 short flags;
1297 unsigned int num_files = 0;
1298 int map_size = 0;
1299 int num_parents = 0;
1300 int *file_ids=NULL;
1301 short *access=NULL;
1302 char *bitmap=NULL;
1303 cnid_t *parents=NULL;
1304 int leaf_index;
1305
1306 cnid_t cnid;
1307 cnid_t prevParent_cnid = 0;
1308 unsigned int myPerms;
1309 short myaccess = 0;
1310 struct cat_attr cnattr;
1311 CatalogKey catkey;
1312 struct cnode *skip_cp = VTOC(vp);
1313 kauth_cred_t cred = vfs_context_ucred(context);
1314 proc_t p = vfs_context_proc(context);
1315
1316 is64bit = proc_is64bit(p);
1317
1318 /* initialize the local cache and buffers */
1319 cache.numcached = 0;
1320 cache.cachehits = 0;
1321 cache.lookups = 0;
1322 cache.acache = NULL;
1323 cache.haveaccess = NULL;
1324
1325 /* struct copyin done during dispatch... need to copy file_id array separately */
1326 if (ap->a_data == NULL) {
1327 error = EINVAL;
1328 goto err_exit_bulk_access;
1329 }
1330
1331 if (is64bit) {
1332 if (arg_size != sizeof(struct user64_ext_access_t)) {
1333 error = EINVAL;
1334 goto err_exit_bulk_access;
1335 }
1336
1337 user_access_structp = (struct user64_ext_access_t *)ap->a_data;
1338
1339 } else if (arg_size == sizeof(struct user32_access_t)) {
1340 struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
1341
1342 // convert an old style bulk-access struct to the new style
1343 tmp_user_access.flags = accessp->flags;
1344 tmp_user_access.num_files = accessp->num_files;
1345 tmp_user_access.map_size = 0;
1346 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1347 tmp_user_access.bitmap = USER_ADDR_NULL;
1348 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1349 tmp_user_access.num_parents = 0;
1350 user_access_structp = &tmp_user_access;
1351
1352 } else if (arg_size == sizeof(struct user32_ext_access_t)) {
1353 struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
1354
1355 // up-cast from a 32-bit version of the struct
1356 tmp_user_access.flags = accessp->flags;
1357 tmp_user_access.num_files = accessp->num_files;
1358 tmp_user_access.map_size = accessp->map_size;
1359 tmp_user_access.num_parents = accessp->num_parents;
1360
1361 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1362 tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
1363 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1364 tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);
1365
1366 user_access_structp = &tmp_user_access;
1367 } else {
1368 error = EINVAL;
1369 goto err_exit_bulk_access;
1370 }
1371
1372 map_size = user_access_structp->map_size;
1373
1374 num_files = user_access_structp->num_files;
1375
1376 num_parents= user_access_structp->num_parents;
1377
1378 if (num_files < 1) {
1379 goto err_exit_bulk_access;
1380 }
1381 if (num_files > 1024) {
1382 error = EINVAL;
1383 goto err_exit_bulk_access;
1384 }
1385
1386 if (num_parents > 1024) {
1387 error = EINVAL;
1388 goto err_exit_bulk_access;
1389 }
1390
1391 file_ids = (int *) kalloc(sizeof(int) * num_files);
1392 access = (short *) kalloc(sizeof(short) * num_files);
1393 if (map_size) {
1394 bitmap = (char *) kalloc(sizeof(char) * map_size);
1395 }
1396
1397 if (num_parents) {
1398 parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
1399 }
1400
1401 cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
1402 cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1403
1404 if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
1405 if (file_ids) {
1406 kfree(file_ids, sizeof(int) * num_files);
1407 }
1408 if (bitmap) {
1409 kfree(bitmap, sizeof(char) * map_size);
1410 }
1411 if (access) {
1412 kfree(access, sizeof(short) * num_files);
1413 }
1414 if (cache.acache) {
1415 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1416 }
1417 if (cache.haveaccess) {
1418 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1419 }
1420 if (parents) {
1421 kfree(parents, sizeof(cnid_t) * num_parents);
1422 }
1423 return ENOMEM;
1424 }
1425
1426 // make sure the bitmap is zero'ed out...
1427 if (bitmap) {
1428 bzero(bitmap, (sizeof(char) * map_size));
1429 }
1430
1431 if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1432 num_files * sizeof(int)))) {
1433 goto err_exit_bulk_access;
1434 }
1435
1436 if (num_parents) {
1437 if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1438 num_parents * sizeof(cnid_t)))) {
1439 goto err_exit_bulk_access;
1440 }
1441 }
1442
1443 flags = user_access_structp->flags;
1444 if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1445 flags = R_OK;
1446 }
1447
1448 /* check if we've been passed leaf node ids or parent ids */
1449 if (flags & PARENT_IDS_FLAG) {
1450 check_leaf = false;
1451 }
1452
1453 /* Check access to each file_id passed in */
1454 for (i = 0; i < num_files; i++) {
1455 leaf_index=-1;
1456 cnid = (cnid_t) file_ids[i];
1457
1458 /* root always has access */
1459 if ((!parents) && (!suser(cred, NULL))) {
1460 access[i] = 0;
1461 continue;
1462 }
1463
1464 if (check_leaf) {
1465 /* do the lookup (checks the cnode hash, then the catalog) */
1466 error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
1467 if (error) {
1468 access[i] = (short) error;
1469 continue;
1470 }
1471
1472 if (parents) {
1473 // Check if the leaf matches one of the parent scopes
1474 leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1475 if (leaf_index >= 0 && parents[leaf_index] == cnid)
1476 prev_parent_check_ok = 0;
1477 else if (leaf_index >= 0)
1478 prev_parent_check_ok = 1;
1479 }
1480
1481 // if the thing has acl's, do the full permission check
1482 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1483 struct vnode *cvp;
1484 int myErr = 0;
1485 /* get the vnode for this cnid */
1486 myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
1487 if ( myErr ) {
1488 access[i] = myErr;
1489 continue;
1490 }
1491
1492 hfs_unlock(VTOC(cvp));
1493
1494 if (vnode_vtype(cvp) == VDIR) {
1495 myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1496 } else {
1497 myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1498 }
1499
1500 vnode_put(cvp);
1501 if (myErr) {
1502 access[i] = myErr;
1503 continue;
1504 }
1505 } else {
1506 /* before calling CheckAccess(), check the target file for read access */
1507 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1508 cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1509
1510 /* fail fast if no access */
1511 if ((myPerms & flags) == 0) {
1512 access[i] = EACCES;
1513 continue;
1514 }
1515 }
1516 } else {
1517 /* we were passed an array of parent ids */
1518 catkey.hfsPlus.parentID = cnid;
1519 }
1520
1521 /* if the last guy had the same parent and had access, we're done */
1522 if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
1523 cache.cachehits++;
1524 access[i] = 0;
1525 continue;
1526 }
1527
1528 myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1529 skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
1530
1531 if (myaccess || (error == ESRCH && leaf_index != -1)) {
1532 access[i] = 0; // have access.. no errors to report
1533 } else {
1534 access[i] = (error != 0 ? (short) error : EACCES);
1535 }
1536
1537 prevParent_cnid = catkey.hfsPlus.parentID;
1538 }
1539
1540 /* copyout the access array */
1541 if ((error = copyout((caddr_t)access, user_access_structp->access,
1542 num_files * sizeof (short)))) {
1543 goto err_exit_bulk_access;
1544 }
1545 if (map_size && bitmap) {
1546 if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1547 map_size * sizeof (char)))) {
1548 goto err_exit_bulk_access;
1549 }
1550 }
1551
1552
1553 err_exit_bulk_access:
1554
1555 if (file_ids)
1556 kfree(file_ids, sizeof(int) * num_files);
1557 if (parents)
1558 kfree(parents, sizeof(cnid_t) * num_parents);
1559 if (bitmap)
1560 kfree(bitmap, sizeof(char) * map_size);
1561 if (access)
1562 kfree(access, sizeof(short) * num_files);
1563 if (cache.acache)
1564 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1565 if (cache.haveaccess)
1566 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1567
1568 return (error);
1569 }
1570
1571
1572 /* end "bulk-access" support */
1573
1574
1575 /*
1576 * Control filesystem operating characteristics.
1577 */
1578 int
1579 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1580 vnode_t a_vp;
1581 int a_command;
1582 caddr_t a_data;
1583 int a_fflag;
1584 vfs_context_t a_context;
1585 } */ *ap)
1586 {
1587 struct vnode * vp = ap->a_vp;
1588 struct hfsmount *hfsmp = VTOHFS(vp);
1589 vfs_context_t context = ap->a_context;
1590 kauth_cred_t cred = vfs_context_ucred(context);
1591 proc_t p = vfs_context_proc(context);
1592 struct vfsstatfs *vfsp;
1593 boolean_t is64bit;
1594 off_t jnl_start, jnl_size;
1595 struct hfs_journal_info *jip;
1596 #if HFS_COMPRESSION
1597 int compressed = 0;
1598 off_t uncompressed_size = -1;
1599 int decmpfs_error = 0;
1600
1601 if (ap->a_command == F_RDADVISE) {
1602 /* we need to inspect the decmpfs state of the file as early as possible */
1603 compressed = hfs_file_is_compressed(VTOC(vp), 0);
1604 if (compressed) {
1605 if (VNODE_IS_RSRC(vp)) {
1606 /* if this is the resource fork, treat it as if it were empty */
1607 uncompressed_size = 0;
1608 } else {
1609 decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
1610 if (decmpfs_error != 0) {
1611 /* failed to get the uncompressed size, we'll check for this later */
1612 uncompressed_size = -1;
1613 }
1614 }
1615 }
1616 }
1617 #endif /* HFS_COMPRESSION */
1618
1619 is64bit = proc_is64bit(p);
1620
1621 #if CONFIG_PROTECT
1622 {
1623 int error = 0;
1624 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
1625 return error;
1626 }
1627 }
1628 #endif /* CONFIG_PROTECT */
1629
1630 switch (ap->a_command) {
1631
1632 case HFS_GETPATH:
1633 {
1634 struct vnode *file_vp;
1635 cnid_t cnid;
1636 int outlen;
1637 char *bufptr;
1638 int error;
1639 int flags = 0;
1640
1641 /* Caller must be owner of file system. */
1642 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1643 if (suser(cred, NULL) &&
1644 kauth_cred_getuid(cred) != vfsp->f_owner) {
1645 return (EACCES);
1646 }
1647 /* Target vnode must be file system's root. */
1648 if (!vnode_isvroot(vp)) {
1649 return (EINVAL);
1650 }
1651 bufptr = (char *)ap->a_data;
1652 cnid = strtoul(bufptr, NULL, 10);
1653 if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
1654 flags |= BUILDPATH_VOLUME_RELATIVE;
1655 }
1656
1657 /* We need to call hfs_vfs_vget to leverage the code that will
1658 * fix the origin list for us if needed, as opposed to calling
1659 * hfs_vget, since we will need the parent for build_path call.
1660 */
1661
1662 if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1663 return (error);
1664 }
1665 error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context);
1666 vnode_put(file_vp);
1667
1668 return (error);
1669 }
1670
1671 case HFS_TRANSFER_DOCUMENT_ID:
1672 {
1673 struct cnode *cp = NULL;
1674 int error;
1675 u_int32_t to_fd = *(u_int32_t *)ap->a_data;
1676 struct fileproc *to_fp;
1677 struct vnode *to_vp;
1678 struct cnode *to_cp;
1679
1680 cp = VTOC(vp);
1681
1682 if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
1683 //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
1684 return error;
1685 }
1686 if ( (error = vnode_getwithref(to_vp)) ) {
1687 file_drop(to_fd);
1688 return error;
1689 }
1690
1691 if (VTOHFS(to_vp) != hfsmp) {
1692 error = EXDEV;
1693 goto transfer_cleanup;
1694 }
1695
1696 int need_unlock = 1;
1697 to_cp = VTOC(to_vp);
1698 error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1699 if (error != 0) {
1700 //printf("could not lock the pair of cnodes (error %d)\n", error);
1701 goto transfer_cleanup;
1702 }
1703
1704 if (!(cp->c_bsdflags & UF_TRACKED)) {
1705 error = EINVAL;
1706 } else if (to_cp->c_bsdflags & UF_TRACKED) {
1707 //
1708 // if the destination is already tracked, return an error
1709 // as otherwise it's a silent deletion of the target's
1710 // document-id
1711 //
1712 error = EEXIST;
1713 } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
1714 //
1715 // we can use the FndrExtendedFileInfo because the doc-id is the first
1716 // thing in both it and the ExtendedDirInfo struct which is fixed in
1717 // format and can not change layout
1718 //
1719 struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
1720 struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16);
1721
1722 if (f_extinfo->document_id == 0) {
1723 uint32_t new_id;
1724
1725 hfs_unlockpair(cp, to_cp); // have to unlock to be able to get a new-id
1726
1727 if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
1728 //
1729 // re-lock the pair now that we have the document-id
1730 //
1731 hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1732 f_extinfo->document_id = new_id;
1733 } else {
1734 goto transfer_cleanup;
1735 }
1736 }
1737
1738 to_extinfo->document_id = f_extinfo->document_id;
1739 f_extinfo->document_id = 0;
1740 //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
1741
1742 // make sure the destination is also UF_TRACKED
1743 to_cp->c_bsdflags |= UF_TRACKED;
1744 cp->c_bsdflags &= ~UF_TRACKED;
1745
1746 // mark the cnodes dirty
1747 cp->c_flag |= C_MODIFIED | C_FORCEUPDATE;
1748 to_cp->c_flag |= C_MODIFIED | C_FORCEUPDATE;
1749
1750 int lockflags;
1751 if ((error = hfs_start_transaction(hfsmp)) == 0) {
1752
1753 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
1754
1755 (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
1756 (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);
1757
1758 hfs_systemfile_unlock (hfsmp, lockflags);
1759 (void) hfs_end_transaction(hfsmp);
1760 }
1761
1762 #if CONFIG_FSE
1763 add_fsevent(FSE_DOCID_CHANGED, context,
1764 FSE_ARG_DEV, hfsmp->hfs_raw_dev,
1765 FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode #
1766 FSE_ARG_INO, (ino64_t)to_cp->c_fileid, // dst inode #
1767 FSE_ARG_INT32, to_extinfo->document_id,
1768 FSE_ARG_DONE);
1769
1770 hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents
1771 need_unlock = 0;
1772
1773 if (need_fsevent(FSE_STAT_CHANGED, vp)) {
1774 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
1775 }
1776 if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
1777 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
1778 }
1779 #else
1780 hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents
1781 need_unlock = 0;
1782 #endif
1783 }
1784
1785 if (need_unlock) {
1786 hfs_unlockpair(cp, to_cp);
1787 }
1788
1789 transfer_cleanup:
1790 vnode_put(to_vp);
1791 file_drop(to_fd);
1792
1793 return error;
1794 }
1795
1796
1797
1798 case HFS_PREV_LINK:
1799 case HFS_NEXT_LINK:
1800 {
1801 cnid_t linkfileid;
1802 cnid_t nextlinkid;
1803 cnid_t prevlinkid;
1804 int error;
1805
1806 /* Caller must be owner of file system. */
1807 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1808 if (suser(cred, NULL) &&
1809 kauth_cred_getuid(cred) != vfsp->f_owner) {
1810 return (EACCES);
1811 }
1812 /* Target vnode must be file system's root. */
1813 if (!vnode_isvroot(vp)) {
1814 return (EINVAL);
1815 }
1816 linkfileid = *(cnid_t *)ap->a_data;
1817 if (linkfileid < kHFSFirstUserCatalogNodeID) {
1818 return (EINVAL);
1819 }
1820 if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1821 return (error);
1822 }
1823 if (ap->a_command == HFS_NEXT_LINK) {
1824 *(cnid_t *)ap->a_data = nextlinkid;
1825 } else {
1826 *(cnid_t *)ap->a_data = prevlinkid;
1827 }
1828 return (0);
1829 }
1830
1831 case HFS_RESIZE_PROGRESS: {
1832
1833 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1834 if (suser(cred, NULL) &&
1835 kauth_cred_getuid(cred) != vfsp->f_owner) {
1836 return (EACCES); /* must be owner of file system */
1837 }
1838 if (!vnode_isvroot(vp)) {
1839 return (EINVAL);
1840 }
1841 /* file system must not be mounted read-only */
1842 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1843 return (EROFS);
1844 }
1845
1846 return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1847 }
1848
1849 case HFS_RESIZE_VOLUME: {
1850 u_int64_t newsize;
1851 u_int64_t cursize;
1852
1853 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1854 if (suser(cred, NULL) &&
1855 kauth_cred_getuid(cred) != vfsp->f_owner) {
1856 return (EACCES); /* must be owner of file system */
1857 }
1858 if (!vnode_isvroot(vp)) {
1859 return (EINVAL);
1860 }
1861
1862 /* file system must not be mounted read-only */
1863 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1864 return (EROFS);
1865 }
1866 newsize = *(u_int64_t *)ap->a_data;
1867 cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1868
1869 if (newsize > cursize) {
1870 return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1871 } else if (newsize < cursize) {
1872 return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1873 } else {
1874 return (0);
1875 }
1876 }
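/*
 * Illustrative sketch (not compiled here): driving HFS_RESIZE_VOLUME from
 * user space. The request code is assumed visible from hfs_fsctl.h, and the
 * mount path and size are hypothetical; fsctl(2) comes from <sys/fsctl.h>.
 *
 *	u_int64_t newsize = 250ULL * 1024 * 1024 * 1024;	// desired volume size in bytes
 *	// Issued against the volume root by the owner or super-user. The kernel
 *	// grows the volume when newsize exceeds totalBlocks * blockSize, shrinks
 *	// it when smaller, and is a no-op when the size is unchanged.
 *	int err = fsctl("/Volumes/MyVol", HFS_RESIZE_VOLUME, &newsize, 0);
 */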
1877 case HFS_CHANGE_NEXT_ALLOCATION: {
1878 int error = 0; /* Assume success */
1879 u_int32_t location;
1880
1881 if (vnode_vfsisrdonly(vp)) {
1882 return (EROFS);
1883 }
1884 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1885 if (suser(cred, NULL) &&
1886 kauth_cred_getuid(cred) != vfsp->f_owner) {
1887 return (EACCES); /* must be owner of file system */
1888 }
1889 if (!vnode_isvroot(vp)) {
1890 return (EINVAL);
1891 }
1892 hfs_lock_mount(hfsmp);
1893 location = *(u_int32_t *)ap->a_data;
1894 if ((location >= hfsmp->allocLimit) &&
1895 (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1896 error = EINVAL;
1897 goto fail_change_next_allocation;
1898 }
1899 /* Return previous value. */
1900 *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1901 if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1902 /* On magic value for location, set nextAllocation to next block
1903 * after metadata zone and set flag in mount structure to indicate
1904 * that nextAllocation should not be updated again.
1905 */
1906 if (hfsmp->hfs_metazone_end != 0) {
1907 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1908 }
1909 hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1910 } else {
1911 hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1912 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1913 }
1914 MarkVCBDirty(hfsmp);
1915 fail_change_next_allocation:
1916 hfs_unlock_mount(hfsmp);
1917 return (error);
1918 }
1919
1920 #if HFS_SPARSE_DEV
1921 case HFS_SETBACKINGSTOREINFO: {
1922 struct vnode * bsfs_rootvp;
1923 struct vnode * di_vp;
1924 struct hfs_backingstoreinfo *bsdata;
1925 int error = 0;
1926
1927 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1928 return (EROFS);
1929 }
1930 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1931 return (EALREADY);
1932 }
1933 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1934 if (suser(cred, NULL) &&
1935 kauth_cred_getuid(cred) != vfsp->f_owner) {
1936 return (EACCES); /* must be owner of file system */
1937 }
1938 bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1939 if (bsdata == NULL) {
1940 return (EINVAL);
1941 }
1942 if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1943 return (error);
1944 }
1945 if ((error = vnode_getwithref(di_vp))) {
1946 file_drop(bsdata->backingfd);
1947 return(error);
1948 }
1949
1950 if (vnode_mount(vp) == vnode_mount(di_vp)) {
1951 (void)vnode_put(di_vp);
1952 file_drop(bsdata->backingfd);
1953 return (EINVAL);
1954 }
1955
1956 /*
1957 * Obtain the backing fs root vnode and keep a reference
1958 * on it. This reference will be dropped in hfs_unmount.
1959 */
1960 error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
1961 if (error) {
1962 (void)vnode_put(di_vp);
1963 file_drop(bsdata->backingfd);
1964 return (error);
1965 }
1966 vnode_ref(bsfs_rootvp);
1967 vnode_put(bsfs_rootvp);
1968
1969 hfs_lock_mount(hfsmp);
1970 hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
1971 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1972 hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4;
1973 hfs_unlock_mount(hfsmp);
1974
1975 /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
1976
1977 /*
1978 * If the sparse image is on a sparse image file (as opposed to a sparse
1979 * bundle), then we may need to limit the free space to the maximum size
1980 * of a file on that volume. So we query (using pathconf), and if we get
1981 * a meaningful result, we cache the number of blocks for later use in
1982 * hfs_freeblks().
1983 */
1984 hfsmp->hfs_backingfs_maxblocks = 0;
1985 if (vnode_vtype(di_vp) == VREG) {
1986 int terr;
1987 int hostbits;
1988 terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
1989 if (terr == 0 && hostbits != 0 && hostbits < 64) {
1990 u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
1991
1992 hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
1993 }
1994 }
1995
1996 /* The free extent cache is managed differently for sparse devices.
1997 * There is a window between when the volume is mounted and when the
1998 * device is marked as sparse, so the free extent cache for this
1999 * volume is currently initialized as for a normal volume (sorted by block
2000 * count). Reset the cache so that it will be rebuilt
2001 * for a sparse device (sorted by start block).
2002 */
2003 ResetVCBFreeExtCache(hfsmp);
2004
2005 (void)vnode_put(di_vp);
2006 file_drop(bsdata->backingfd);
2007 return (0);
2008 }
2009 case HFS_CLRBACKINGSTOREINFO: {
2010 struct vnode * tmpvp;
2011
2012 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2013 if (suser(cred, NULL) &&
2014 kauth_cred_getuid(cred) != vfsp->f_owner) {
2015 return (EACCES); /* must be owner of file system */
2016 }
2017 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2018 return (EROFS);
2019 }
2020
2021 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
2022 hfsmp->hfs_backingfs_rootvp) {
2023
2024 hfs_lock_mount(hfsmp);
2025 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
2026 tmpvp = hfsmp->hfs_backingfs_rootvp;
2027 hfsmp->hfs_backingfs_rootvp = NULLVP;
2028 hfsmp->hfs_sparsebandblks = 0;
2029 hfs_unlock_mount(hfsmp);
2030
2031 vnode_rele(tmpvp);
2032 }
2033 return (0);
2034 }
2035 #endif /* HFS_SPARSE_DEV */
2036
2037 /* Change the next CNID stored in the VH */
2038 case HFS_CHANGE_NEXTCNID: {
2039 int error = 0; /* Assume success */
2040 u_int32_t fileid;
2041 int wraparound = 0;
2042 int lockflags = 0;
2043
2044 if (vnode_vfsisrdonly(vp)) {
2045 return (EROFS);
2046 }
2047 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2048 if (suser(cred, NULL) &&
2049 kauth_cred_getuid(cred) != vfsp->f_owner) {
2050 return (EACCES); /* must be owner of file system */
2051 }
2052
2053 fileid = *(u_int32_t *)ap->a_data;
2054
2055 /* Must have catalog lock excl. to advance the CNID pointer */
2056 lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK);
2057
2058 hfs_lock_mount(hfsmp);
2059
2060 /* If it is less than the current next CNID, force the wraparound bit to be set */
2061 if (fileid < hfsmp->vcbNxtCNID) {
2062 wraparound=1;
2063 }
2064
2065 /* Return previous value. */
2066 *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
2067
2068 hfsmp->vcbNxtCNID = fileid;
2069
2070 if (wraparound) {
2071 hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
2072 }
2073
2074 MarkVCBDirty(hfsmp);
2075 hfs_unlock_mount(hfsmp);
2076 hfs_systemfile_unlock (hfsmp, lockflags);
2077
2078 return (error);
2079 }
2080
2081 case F_FREEZE_FS: {
2082 struct mount *mp;
2083
2084 mp = vnode_mount(vp);
2085 hfsmp = VFSTOHFS(mp);
2086
2087 if (!(hfsmp->jnl))
2088 return (ENOTSUP);
2089
2090 vfsp = vfs_statfs(mp);
2091
2092 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2093 !kauth_cred_issuser(cred))
2094 return (EACCES);
2095
2096 return hfs_freeze(hfsmp);
2097 }
2098
2099 case F_THAW_FS: {
2100 vfsp = vfs_statfs(vnode_mount(vp));
2101 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2102 !kauth_cred_issuser(cred))
2103 return (EACCES);
2104
2105 return hfs_thaw(hfsmp, current_proc());
2106 }
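/*
 * Illustrative sketch (not compiled here): freezing and thawing the volume
 * from user space with the fcntls handled above. F_FREEZE_FS and F_THAW_FS
 * are private fcntl commands, so this assumes they are visible to the caller;
 * the path is hypothetical. Non-journaled volumes fail with ENOTSUP, and the
 * caller must own the file system or be the super-user.
 *
 *	int fd = open("/Volumes/MyVol", O_RDONLY);
 *	if (fd >= 0 && fcntl(fd, F_FREEZE_FS, 0) == 0) {
 *		// ... take a block-level snapshot of the frozen device ...
 *		(void) fcntl(fd, F_THAW_FS, 0);
 *	}
 *	if (fd >= 0)
 *		close(fd);
 */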
2107
2108 case HFS_BULKACCESS_FSCTL: {
2109 int size;
2110
2111 if (hfsmp->hfs_flags & HFS_STANDARD) {
2112 return EINVAL;
2113 }
2114
2115 if (is64bit) {
2116 size = sizeof(struct user64_access_t);
2117 } else {
2118 size = sizeof(struct user32_access_t);
2119 }
2120
2121 return do_bulk_access_check(hfsmp, vp, ap, size, context);
2122 }
2123
2124 case HFS_EXT_BULKACCESS_FSCTL: {
2125 int size;
2126
2127 if (hfsmp->hfs_flags & HFS_STANDARD) {
2128 return EINVAL;
2129 }
2130
2131 if (is64bit) {
2132 size = sizeof(struct user64_ext_access_t);
2133 } else {
2134 size = sizeof(struct user32_ext_access_t);
2135 }
2136
2137 return do_bulk_access_check(hfsmp, vp, ap, size, context);
2138 }
2139
2140 case HFS_SET_XATTREXTENTS_STATE: {
2141 int state;
2142
2143 if (ap->a_data == NULL) {
2144 return (EINVAL);
2145 }
2146
2147 state = *(int *)ap->a_data;
2148
2149 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2150 return (EROFS);
2151 }
2152
2153 /* The super-user can enable or disable extent-based extended
2154 * attribute support on a volume.
2155 * Note: Starting with Mac OS X 10.7, extent-based extended attributes
2156 * are enabled by default, so any change is transient and only
2157 * lasts until the volume is remounted.
2158 */
2159 if (!kauth_cred_issuser(kauth_cred_get())) {
2160 return (EPERM);
2161 }
2162 if (state == 0 || state == 1)
2163 return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
2164 else
2165 return (EINVAL);
2166 }
2167
2168 case F_SETSTATICCONTENT: {
2169 int error;
2170 int enable_static = 0;
2171 struct cnode *cp = NULL;
2172 /*
2173 * lock the cnode, decorate the cnode flag, and bail out.
2174 * VFS should have already authenticated the caller for us.
2175 */
2176
2177 if (ap->a_data) {
2178 /*
2179 * Note that even though ap->a_data is of type caddr_t,
2180 * the fcntl layer at the syscall handler will pass in NULL
2181 * or 1 depending on what the argument supplied to the fcntl
2182 * was. So it is in fact correct to check the ap->a_data
2183 * argument for zero or non-zero value when deciding whether or not
2184 * to enable the static bit in the cnode.
2185 */
2186 enable_static = 1;
2187 }
2188 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2189 return EROFS;
2190 }
2191 cp = VTOC(vp);
2192
2193 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2194 if (error == 0) {
2195 if (enable_static) {
2196 cp->c_flag |= C_SSD_STATIC;
2197 }
2198 else {
2199 cp->c_flag &= ~C_SSD_STATIC;
2200 }
2201 hfs_unlock (cp);
2202 }
2203 return error;
2204 }
2205
2206 case F_SET_GREEDY_MODE: {
2207 int error;
2208 int enable_greedy_mode = 0;
2209 struct cnode *cp = NULL;
2210 /*
2211 * lock the cnode, decorate the cnode flag, and bail out.
2212 * VFS should have already authenticated the caller for us.
2213 */
2214
2215 if (ap->a_data) {
2216 /*
2217 * Note that even though ap->a_data is of type caddr_t,
2218 * the fcntl layer at the syscall handler will pass in NULL
2219 * or 1 depending on what the argument supplied to the fcntl
2220 * was. So it is in fact correct to check the ap->a_data
2221 * argument for zero or non-zero value when deciding whether or not
2222 * to enable the greedy mode bit in the cnode.
2223 */
2224 enable_greedy_mode = 1;
2225 }
2226 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2227 return EROFS;
2228 }
2229 cp = VTOC(vp);
2230
2231 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2232 if (error == 0) {
2233 if (enable_greedy_mode) {
2234 cp->c_flag |= C_SSD_GREEDY_MODE;
2235 }
2236 else {
2237 cp->c_flag &= ~C_SSD_GREEDY_MODE;
2238 }
2239 hfs_unlock (cp);
2240 }
2241 return error;
2242 }
2243
2244 case F_SETIOTYPE: {
2245 int error;
2246 uint32_t iotypeflag = 0;
2247
2248 struct cnode *cp = NULL;
2249 /*
2250 * lock the cnode, decorate the cnode flag, and bail out.
2251 * VFS should have already authenticated the caller for us.
2252 */
2253
2254 if (ap->a_data == NULL) {
2255 return EINVAL;
2256 }
2257
2258 /*
2259 * Note that even though ap->a_data is of type caddr_t, we
2260 * can only use 32 bits of flag values.
2261 */
2262 iotypeflag = (uint32_t) ap->a_data;
2263 switch (iotypeflag) {
2264 case F_IOTYPE_ISOCHRONOUS:
2265 break;
2266 default:
2267 return EINVAL;
2268 }
2269
2270
2271 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2272 return EROFS;
2273 }
2274 cp = VTOC(vp);
2275
2276 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2277 if (error == 0) {
2278 switch (iotypeflag) {
2279 case F_IOTYPE_ISOCHRONOUS:
2280 cp->c_flag |= C_IO_ISOCHRONOUS;
2281 break;
2282 default:
2283 break;
2284 }
2285 hfs_unlock (cp);
2286 }
2287 return error;
2288 }
2289
2290 case F_MAKECOMPRESSED: {
2291 int error = 0;
2292 uint32_t gen_counter;
2293 struct cnode *cp = NULL;
2294 int reset_decmp = 0;
2295
2296 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2297 return EROFS;
2298 }
2299
2300 /*
2301 * acquire & lock the cnode.
2302 * VFS should have already authenticated the caller for us.
2303 */
2304
2305 if (ap->a_data) {
2306 /*
2307 * Cast the pointer into a uint32_t so we can extract the
2308 * supplied generation counter.
2309 */
2310 gen_counter = *((uint32_t*)ap->a_data);
2311 }
2312 else {
2313 return EINVAL;
2314 }
2315
2316 #if HFS_COMPRESSION
2317 cp = VTOC(vp);
2318 /* Grab truncate lock first; we may truncate the file */
2319 hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2320
2321 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2322 if (error) {
2323 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2324 return error;
2325 }
2326
2327 /* Are there any other usecounts/FDs? */
2328 if (vnode_isinuse(vp, 1)) {
2329 hfs_unlock(cp);
2330 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2331 return EBUSY;
2332 }
2333
2334 /* now we have the cnode locked down; Validate arguments */
2335 if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
2336 /* EINVAL if you are trying to manipulate an IMMUTABLE file */
2337 hfs_unlock(cp);
2338 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2339 return EINVAL;
2340 }
2341
2342 if ((hfs_get_gencount (cp)) == gen_counter) {
2343 /*
2344 * OK, the gen_counter matched. Go for it:
2345 * Toggle state bits, truncate file, and suppress mtime update
2346 */
2347 reset_decmp = 1;
2348 cp->c_bsdflags |= UF_COMPRESSED;
2349
2350 error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES,
2351 ap->a_context);
2352 }
2353 else {
2354 error = ESTALE;
2355 }
2356
2357 /* Unlock the cnode before calling into decmpfs; it may need to get an EA */
2358 hfs_unlock(cp);
2359
2360 /*
2361 * Reset the decmp state while still holding the truncate lock. We need to
2362 * serialize here against a listxattr on this node which may occur at any
2363 * time.
2364 *
2365 * Even if '0/skiplock' is passed as the 2nd argument to hfs_file_is_compressed,
2366 * that will still potentially require getting the com.apple.decmpfs EA. If the
2367 * EA is required, then we can't hold the cnode lock, because the getxattr call is
2368 * generic (through VFS), and can't pass along any info telling it that we're already
2369 * holding it (the lock). If we don't serialize, then we risk listxattr stopping
2370 * and trying to fill in the hfs_file_is_compressed info during the callback
2371 * operation, which will result in deadlock against the b-tree node.
2372 *
2373 * So, to serialize against listxattr (which will grab buf_t meta references on
2374 * the b-tree blocks), we hold the truncate lock as we're manipulating the
2375 * decmpfs payload.
2376 */
2377 if ((reset_decmp) && (error == 0)) {
2378 decmpfs_cnode *dp = VTOCMP (vp);
2379 if (dp != NULL) {
2380 decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
2381 }
2382
2383 /* Initialize the decmpfs node as needed */
2384 (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */
2385 }
2386
2387 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2388
2389 #endif
2390 return error;
2391 }
2392
2393 case F_SETBACKINGSTORE: {
2394
2395 int error = 0;
2396
2397 /*
2398 * See comment in F_SETSTATICCONTENT re: using
2399 * a null check for a_data
2400 */
2401 if (ap->a_data) {
2402 error = hfs_set_backingstore (vp, 1);
2403 }
2404 else {
2405 error = hfs_set_backingstore (vp, 0);
2406 }
2407
2408 return error;
2409 }
2410
2411 case F_GETPATH_MTMINFO: {
2412 int error = 0;
2413
2414 int *data = (int*) ap->a_data;
2415
2416 /* Ask if this is a backingstore vnode */
2417 error = hfs_is_backingstore (vp, data);
2418
2419 return error;
2420 }
2421
2422 case F_FULLFSYNC: {
2423 int error;
2424
2425 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2426 return (EROFS);
2427 }
2428 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2429 if (error == 0) {
2430 error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
2431 hfs_unlock(VTOC(vp));
2432 }
2433
2434 return error;
2435 }
2436
2437 case F_CHKCLEAN: {
2438 register struct cnode *cp;
2439 int error;
2440
2441 if (!vnode_isreg(vp))
2442 return EINVAL;
2443
2444 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2445 if (error == 0) {
2446 cp = VTOC(vp);
2447 /*
2448 * Used by a regression test to determine whether
2449 * all the dirty pages (written via write) have been cleaned
2450 * after a call to 'fsync'.
2451 */
2452 error = is_file_clean(vp, VTOF(vp)->ff_size);
2453 hfs_unlock(cp);
2454 }
2455 return (error);
2456 }
2457
2458 case F_RDADVISE: {
2459 register struct radvisory *ra;
2460 struct filefork *fp;
2461 int error;
2462
2463 if (!vnode_isreg(vp))
2464 return EINVAL;
2465
2466 ra = (struct radvisory *)(ap->a_data);
2467 fp = VTOF(vp);
2468
2469 /* Protect against a size change. */
2470 hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2471
2472 #if HFS_COMPRESSION
2473 if (compressed && (uncompressed_size == -1)) {
2474 /* fetching the uncompressed size failed above, so return the error */
2475 error = decmpfs_error;
2476 } else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
2477 (!compressed && (ra->ra_offset >= fp->ff_size))) {
2478 error = EFBIG;
2479 }
2480 #else /* HFS_COMPRESSION */
2481 if (ra->ra_offset >= fp->ff_size) {
2482 error = EFBIG;
2483 }
2484 #endif /* HFS_COMPRESSION */
2485 else {
2486 error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
2487 }
2488
2489 hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
2490 return (error);
2491 }
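/*
 * Illustrative sketch (not compiled here): issuing a read advisory from user
 * space with the F_RDADVISE fcntl handled above. struct radvisory and
 * F_RDADVISE come from <fcntl.h>; the descriptor and range are hypothetical.
 * The handler above returns EFBIG when ra_offset is at or beyond EOF (the
 * uncompressed size is used for compressed files).
 *
 *	struct radvisory ra;
 *	ra.ra_offset = 0;			// start of the region to pre-read
 *	ra.ra_count  = 1024 * 1024;		// number of bytes to read ahead
 *	(void) fcntl(fd, F_RDADVISE, &ra);
 */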
2492
2493 case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */
2494 {
2495 if (is64bit) {
2496 *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2497 }
2498 else {
2499 *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2500 }
2501 return 0;
2502 }
2503
2504 case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
2505 *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
2506 break;
2507
2508 case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
2509 *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
2510 break;
2511
2512 case HFS_FSCTL_GET_VERY_LOW_DISK:
2513 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
2514 break;
2515
2516 case HFS_FSCTL_SET_VERY_LOW_DISK:
2517 if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
2518 return EINVAL;
2519 }
2520
2521 hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
2522 break;
2523
2524 case HFS_FSCTL_GET_LOW_DISK:
2525 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
2526 break;
2527
2528 case HFS_FSCTL_SET_LOW_DISK:
2529 if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
2530 || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
2531
2532 return EINVAL;
2533 }
2534
2535 hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
2536 break;
2537
2538 case HFS_FSCTL_GET_DESIRED_DISK:
2539 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
2540 break;
2541
2542 case HFS_FSCTL_SET_DESIRED_DISK:
2543 if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
2544 return EINVAL;
2545 }
2546
2547 hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
2548 break;
2549
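/*
 * Taken together, the three setters above keep the free-space notification
 * thresholds strictly ordered: dangerlimit < warninglimit < desiredlevel.
 * A new value that would violate that ordering is rejected with EINVAL, so a
 * caller lowering all three must update them from the bottom up (danger,
 * then warning, then desired) or the intermediate calls fail.
 */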
2550 case HFS_VOLUME_STATUS:
2551 *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
2552 break;
2553
2554 case HFS_SET_BOOT_INFO:
2555 if (!vnode_isvroot(vp))
2556 return(EINVAL);
2557 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
2558 return(EACCES); /* must be superuser or owner of filesystem */
2559 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2560 return (EROFS);
2561 }
2562 hfs_lock_mount (hfsmp);
2563 bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
2564 hfs_unlock_mount (hfsmp);
2565 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
2566 break;
2567
2568 case HFS_GET_BOOT_INFO:
2569 if (!vnode_isvroot(vp))
2570 return(EINVAL);
2571 hfs_lock_mount (hfsmp);
2572 bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
2573 hfs_unlock_mount(hfsmp);
2574 break;
2575
2576 case HFS_MARK_BOOT_CORRUPT:
2577 /* Mark the boot volume corrupt by setting
2578 * kHFSVolumeInconsistentBit in the volume header. This will
2579 * force fsck_hfs on next mount.
2580 */
2581 if (!kauth_cred_issuser(kauth_cred_get())) {
2582 return EACCES;
2583 }
2584
2585 /* Allowed only on the root vnode of the boot volume */
2586 if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
2587 !vnode_isvroot(vp)) {
2588 return EINVAL;
2589 }
2590 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2591 return (EROFS);
2592 }
2593 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
2594 hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED);
2595 break;
2596
2597 case HFS_FSCTL_GET_JOURNAL_INFO:
2598 jip = (struct hfs_journal_info*)ap->a_data;
2599
2600 if (vp == NULLVP)
2601 return EINVAL;
2602
2603 if (hfsmp->jnl == NULL) {
2604 jnl_start = 0;
2605 jnl_size = 0;
2606 } else {
2607 jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
2608 jnl_size = (off_t)hfsmp->jnl_size;
2609 }
2610
2611 jip->jstart = jnl_start;
2612 jip->jsize = jnl_size;
2613 break;
2614
2615 case HFS_SET_ALWAYS_ZEROFILL: {
2616 struct cnode *cp = VTOC(vp);
2617
2618 if (*(int *)ap->a_data) {
2619 cp->c_flag |= C_ALWAYS_ZEROFILL;
2620 } else {
2621 cp->c_flag &= ~C_ALWAYS_ZEROFILL;
2622 }
2623 break;
2624 }
2625
2626 case HFS_DISABLE_METAZONE: {
2627 /* Only root can disable metadata zone */
2628 if (!kauth_cred_issuser(kauth_cred_get())) {
2629 return EACCES;
2630 }
2631 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2632 return (EROFS);
2633 }
2634
2635 /* Disable metadata zone now */
2636 (void) hfs_metadatazone_init(hfsmp, true);
2637 printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
2638 break;
2639 }
2640
2641
2642 case HFS_FSINFO_METADATA_BLOCKS: {
2643 int error;
2644 struct hfsinfo_metadata *hinfo;
2645
2646 hinfo = (struct hfsinfo_metadata *)ap->a_data;
2647
2648 /* Get information about number of metadata blocks */
2649 error = hfs_getinfo_metadata_blocks(hfsmp, hinfo);
2650 if (error) {
2651 return error;
2652 }
2653
2654 break;
2655 }
2656
2657 case HFS_CS_FREESPACE_TRIM: {
2658 int error = 0;
2659 int lockflags = 0;
2660
2661 /* Only root allowed */
2662 if (!kauth_cred_issuser(kauth_cred_get())) {
2663 return EACCES;
2664 }
2665
2666 /*
2667 * This core functionality is similar to hfs_scan_blocks().
2668 * The main difference is that hfs_scan_blocks() is called
2669 * as part of mount where we are assured that the journal is
2670 * empty to start with. This fcntl() can be called on a
2671 * mounted volume, therefore it has to flush the content of
2672 * the journal as well as ensure the state of the summary table.
2673 *
2674 * This fcntl scans over the entire allocation bitmap,
2675 * creates a list of all the free blocks, and issues TRIM
2676 * down to the underlying device. This can take a long time
2677 * as it can generate up to 512MB of read I/O.
2678 */
2679
2680 if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
2681 error = hfs_init_summary(hfsmp);
2682 if (error) {
2683 printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN);
2684 return error;
2685 }
2686 }
2687
2688 /*
2689 * The journal maintains a list of recently deallocated blocks to
2690 * issue DKIOCUNMAPs when the corresponding journal transaction is
2691 * flushed to the disk. To avoid any race conditions, we only
2692 * want one active trim list and only one thread issuing DKIOCUNMAPs.
2693 * Therefore we make sure that the journal trim list is sync'ed,
2694 * empty, and not modifiable for the duration of our scan.
2695 *
2696 * Take the journal lock before flushing the journal to the disk.
2697 * We keep holding the journal lock until we acquire the
2698 * bitmap lock, to make sure that no new journal transactions can
2699 * start. This ensures that the journal trim list is not
2700 * modified after the journal flush and before we take the bitmap lock.
2701 * We can release the journal lock after we acquire the bitmap
2702 * lock as it will prevent any further block deallocations.
2703 */
2704 hfs_journal_lock(hfsmp);
2705
2706 /* Flush the journal and wait for all I/Os to finish up */
2707 error = hfs_journal_flush(hfsmp, TRUE);
2708 if (error) {
2709 hfs_journal_unlock(hfsmp);
2710 return error;
2711 }
2712
2713 /* Take bitmap lock to ensure it is not being modified */
2714 lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
2715
2716 /* Release the journal lock */
2717 hfs_journal_unlock(hfsmp);
2718
2719 /*
2720 * ScanUnmapBlocks reads the bitmap in large blocks
2721 * (up to 1MB), unlike the runtime, which reads the bitmap
2722 * in 4K blocks. This can cause buf_t collisions
2723 * and potential data corruption. To avoid this, we
2724 * invalidate all the existing buffers associated with
2725 * the bitmap vnode before scanning it.
2726 *
2727 * Note: ScanUnmapBlocks() cleans up all the buffers
2728 * after itself, so there won't be any large buffers left
2729 * for us to clean up after it returns.
2730 */
2731 error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
2732 if (error) {
2733 hfs_systemfile_unlock(hfsmp, lockflags);
2734 return error;
2735 }
2736
2737 /* Traverse bitmap and issue DKIOCUNMAPs */
2738 error = ScanUnmapBlocks(hfsmp);
2739 hfs_systemfile_unlock(hfsmp, lockflags);
2740 if (error) {
2741 return error;
2742 }
2743
2744 break;
2745 }
2746
2747 default:
2748 return (ENOTTY);
2749 }
2750
2751 return 0;
2752 }
2753
2754 /*
2755 * select
2756 */
2757 int
2758 hfs_vnop_select(__unused struct vnop_select_args *ap)
2759 /*
2760 struct vnop_select_args {
2761 vnode_t a_vp;
2762 int a_which;
2763 int a_fflags;
2764 void *a_wql;
2765 vfs_context_t a_context;
2766 };
2767 */
2768 {
2769 /*
2770 * We should really check to see if I/O is possible.
2771 */
2772 return (1);
2773 }
2774
2775 /*
2776 * Converts a logical block number to a physical block number, and optionally returns
2777 * the number of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
2778 * The physical block number is based on the device block size, currently it's 512.
2779 * The block run is returned in logical blocks, and is the REMAINING number of blocks.
2780 */
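/*
 * Worked example with illustrative numbers: with a 4K logical block size and
 * 512-byte device blocks, bn 10 corresponds to byte offset 10 * 4096 = 40960,
 * which MapFileBlockC translates to a device block number. If MapFileBlockC
 * reports 32768 contiguous bytes available, the run returned below is
 * 32768 / 4096 - 1 = 7 remaining logical blocks.
 */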
2781 int
2782 hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
2783 {
2784 struct filefork *fp = VTOF(vp);
2785 struct hfsmount *hfsmp = VTOHFS(vp);
2786 int retval = E_NONE;
2787 u_int32_t logBlockSize;
2788 size_t bytesContAvail = 0;
2789 off_t blockposition;
2790 int lockExtBtree;
2791 int lockflags = 0;
2792
2793 /*
2794 * Check for underlying vnode requests and ensure that logical
2795 * to physical mapping is requested.
2796 */
2797 if (vpp != NULL)
2798 *vpp = hfsmp->hfs_devvp;
2799 if (bnp == NULL)
2800 return (0);
2801
2802 logBlockSize = GetLogicalBlockSize(vp);
2803 blockposition = (off_t)bn * logBlockSize;
2804
2805 lockExtBtree = overflow_extents(fp);
2806
2807 if (lockExtBtree)
2808 lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2809
2810 retval = MacToVFSError(
2811 MapFileBlockC (HFSTOVCB(hfsmp),
2812 (FCB*)fp,
2813 MAXPHYSIO,
2814 blockposition,
2815 bnp,
2816 &bytesContAvail));
2817
2818 if (lockExtBtree)
2819 hfs_systemfile_unlock(hfsmp, lockflags);
2820
2821 if (retval == E_NONE) {
2822 /* Figure out how many read ahead blocks there are */
2823 if (runp != NULL) {
2824 if (can_cluster(logBlockSize)) {
2825 /* Make sure this result never goes negative: */
2826 *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
2827 } else {
2828 *runp = 0;
2829 }
2830 }
2831 }
2832 return (retval);
2833 }
2834
2835 /*
2836 * Convert logical block number to file offset.
2837 */
2838 int
2839 hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
2840 /*
2841 struct vnop_blktooff_args {
2842 vnode_t a_vp;
2843 daddr64_t a_lblkno;
2844 off_t *a_offset;
2845 };
2846 */
2847 {
2848 if (ap->a_vp == NULL)
2849 return (EINVAL);
2850 *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
2851
2852 return(0);
2853 }
2854
2855 /*
2856 * Convert file offset to logical block number.
2857 */
2858 int
2859 hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
2860 /*
2861 struct vnop_offtoblk_args {
2862 vnode_t a_vp;
2863 off_t a_offset;
2864 daddr64_t *a_lblkno;
2865 };
2866 */
2867 {
2868 if (ap->a_vp == NULL)
2869 return (EINVAL);
2870 *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
2871
2872 return(0);
2873 }
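/*
 * Worked example with illustrative numbers: with a 4K logical block size,
 * hfs_vnop_offtoblk maps offset 12345 to logical block 12345 / 4096 = 3,
 * and hfs_vnop_blktooff maps block 3 back to offset 3 * 4096 = 12288, the
 * start of the block containing that offset.
 */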
2874
2875 /*
2876 * Map file offset to physical block number.
2877 *
2878 * If this function is called for a write operation, and the file
2879 * has virtual blocks allocated (delayed allocation), real blocks
2880 * are allocated by calling ExtendFileC().
2881 *
2882 * If this function is called for a read operation, and the file
2883 * has virtual blocks allocated (delayed allocation), the size of
2884 * the file is not changed, and if required, the rangelist is
2885 * searched for the mapping.
2886 *
2887 * System file cnodes are expected to be locked (shared or exclusive).
2888 */
2889 int
2890 hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
2891 /*
2892 struct vnop_blockmap_args {
2893 vnode_t a_vp;
2894 off_t a_foffset;
2895 size_t a_size;
2896 daddr64_t *a_bpn;
2897 size_t *a_run;
2898 void *a_poff;
2899 int a_flags;
2900 vfs_context_t a_context;
2901 };
2902 */
2903 {
2904 struct vnode *vp = ap->a_vp;
2905 struct cnode *cp;
2906 struct filefork *fp;
2907 struct hfsmount *hfsmp;
2908 size_t bytesContAvail = 0;
2909 int retval = E_NONE;
2910 int syslocks = 0;
2911 int lockflags = 0;
2912 struct rl_entry *invalid_range;
2913 enum rl_overlaptype overlaptype;
2914 int started_tr = 0;
2915 int tooklock = 0;
2916
2917 #if HFS_COMPRESSION
2918 if (VNODE_IS_RSRC(vp)) {
2919 /* allow blockmaps to the resource fork */
2920 } else {
2921 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
2922 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
2923 switch(state) {
2924 case FILE_IS_COMPRESSED:
2925 return ENOTSUP;
2926 case FILE_IS_CONVERTING:
2927 /* if FILE_IS_CONVERTING, we allow blockmap */
2928 break;
2929 default:
2930 printf("invalid state %d for compressed file\n", state);
2931 /* fall through */
2932 }
2933 }
2934 }
2935 #endif /* HFS_COMPRESSION */
2936
2937 /* Do not allow blockmap operation on a directory */
2938 if (vnode_isdir(vp)) {
2939 return (ENOTSUP);
2940 }
2941
2942 /*
2943 * Check for underlying vnode requests and ensure that logical
2944 * to physical mapping is requested.
2945 */
2946 if (ap->a_bpn == NULL)
2947 return (0);
2948
2949 if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
2950 if (VTOC(vp)->c_lockowner != current_thread()) {
2951 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
2952 tooklock = 1;
2953 }
2954 }
2955 hfsmp = VTOHFS(vp);
2956 cp = VTOC(vp);
2957 fp = VTOF(vp);
2958
2959 retry:
2960 /* Check virtual blocks only when performing write operation */
2961 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2962 if (hfs_start_transaction(hfsmp) != 0) {
2963 retval = EINVAL;
2964 goto exit;
2965 } else {
2966 started_tr = 1;
2967 }
2968 syslocks = SFL_EXTENTS | SFL_BITMAP;
2969
2970 } else if (overflow_extents(fp)) {
2971 syslocks = SFL_EXTENTS;
2972 }
2973
2974 if (syslocks)
2975 lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
2976
2977 /*
2978 * Check for any delayed allocations.
2979 */
2980 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2981 int64_t actbytes;
2982 u_int32_t loanedBlocks;
2983
2984 //
2985 // Make sure we have a transaction. It's possible
2986 // that we came in and fp->ff_unallocblocks was zero
2987 // but during the time we blocked acquiring the extents
2988 // btree, ff_unallocblocks became non-zero and so we
2989 // will need to start a transaction.
2990 //
2991 if (started_tr == 0) {
2992 if (syslocks) {
2993 hfs_systemfile_unlock(hfsmp, lockflags);
2994 syslocks = 0;
2995 }
2996 goto retry;
2997 }
2998
2999 /*
3000 * Note: ExtendFileC will release any blocks on loan and
3001 * acquire real blocks. So we ask to extend by zero bytes
3002 * since ExtendFileC will account for the virtual blocks.
3003 */
3004
3005 loanedBlocks = fp->ff_unallocblocks;
3006 retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
3007 kEFAllMask | kEFNoClumpMask, &actbytes);
3008
3009 if (retval) {
3010 fp->ff_unallocblocks = loanedBlocks;
3011 cp->c_blocks += loanedBlocks;
3012 fp->ff_blocks += loanedBlocks;
3013
3014 hfs_lock_mount (hfsmp);
3015 hfsmp->loanedBlocks += loanedBlocks;
3016 hfs_unlock_mount (hfsmp);
3017
3018 hfs_systemfile_unlock(hfsmp, lockflags);
3019 cp->c_flag |= C_MODIFIED;
3020 if (started_tr) {
3021 (void) hfs_update(vp, TRUE);
3022 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3023
3024 hfs_end_transaction(hfsmp);
3025 started_tr = 0;
3026 }
3027 goto exit;
3028 }
3029 }
3030
3031 retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
3032 ap->a_bpn, &bytesContAvail);
3033 if (syslocks) {
3034 hfs_systemfile_unlock(hfsmp, lockflags);
3035 syslocks = 0;
3036 }
3037
3038 if (started_tr) {
3039 (void) hfs_update(vp, TRUE);
3040 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3041 hfs_end_transaction(hfsmp);
3042 started_tr = 0;
3043 }
3044 if (retval) {
3045 /* On write, always return the error because virtual blocks, if any,
3046 * should have been allocated in ExtendFileC(). We do not
3047 * allocate virtual blocks on read, so return the error
3048 * only if no virtual blocks are allocated. Otherwise we search
3049 * the rangelist for zero-fills.
3050 */
3051 if ((MacToVFSError(retval) != ERANGE) ||
3052 (ap->a_flags & VNODE_WRITE) ||
3053 ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
3054 goto exit;
3055 }
3056
3057 /* Validate if the start offset is within logical file size */
3058 if (ap->a_foffset >= fp->ff_size) {
3059 goto exit;
3060 }
3061
3062 /*
3063 * At this point, we have encountered a failure during
3064 * MapFileBlockC that resulted in ERANGE, and we are not servicing
3065 * a write, and there are borrowed blocks.
3066 *
3067 * However, the cluster layer will not call blockmap for
3068 * blocks that are borrowed and in-cache. We have to assume that
3069 * because we observed ERANGE being emitted from MapFileBlockC, this
3070 * extent range is not valid on-disk. So we treat this as a
3071 * mapping that needs to be zero-filled prior to reading.
3072 *
3073 * Note that under certain circumstances (such as non-contiguous
3074 * userland VM mappings in the calling process), cluster_io
3075 * may be forced to split a large I/O driven by hfs_vnop_write
3076 * into multiple sub-I/Os that necessitate a RMW cycle. If this is
3077 * the case here, then we have already removed the invalid range list
3078 * mapping prior to getting to this blockmap call, so we should not
3079 * search the invalid rangelist for this byte range.
3080 */
3081
3082 bytesContAvail = fp->ff_size - ap->a_foffset;
3083 /*
3084 * Clip the contiguous available bytes to, at most, the allowable
3085 * maximum or the amount requested.
3086 */
3087
3088 if (bytesContAvail > ap->a_size) {
3089 bytesContAvail = ap->a_size;
3090 }
3091
3092 *ap->a_bpn = (daddr64_t) -1;
3093 retval = 0;
3094
3095 goto exit;
3096 }
3097
3098 /* MapFileBlockC() found a valid extent in the filefork. Search the
3099 * mapping information further for invalid file ranges.
3100 */
3101 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
3102 ap->a_foffset + (off_t)bytesContAvail - 1,
3103 &invalid_range);
3104 if (overlaptype != RL_NOOVERLAP) {
3105 switch(overlaptype) {
3106 case RL_MATCHINGOVERLAP:
3107 case RL_OVERLAPCONTAINSRANGE:
3108 case RL_OVERLAPSTARTSBEFORE:
3109 /* There's no valid block for this byte offset */
3110 *ap->a_bpn = (daddr64_t)-1;
3111 /* There's no point limiting the amount to be returned
3112 * if the invalid range that was hit extends all the way
3113 * to the EOF (i.e. there are no valid bytes between the
3114 * end of this range and the file's EOF):
3115 */
3116 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3117 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3118 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3119 }
3120 break;
3121
3122 case RL_OVERLAPISCONTAINED:
3123 case RL_OVERLAPENDSAFTER:
3124 /* The range of interest hits an invalid block before the end: */
3125 if (invalid_range->rl_start == ap->a_foffset) {
3126 /* There's actually no valid information to be had starting here: */
3127 *ap->a_bpn = (daddr64_t)-1;
3128 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3129 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3130 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3131 }
3132 } else {
3133 bytesContAvail = invalid_range->rl_start - ap->a_foffset;
3134 }
3135 break;
3136
3137 case RL_NOOVERLAP:
3138 break;
3139 } /* end switch */
3140 if (bytesContAvail > ap->a_size)
3141 bytesContAvail = ap->a_size;
3142 }
3143
3144 exit:
3145 if (retval == 0) {
3146 if (ap->a_run)
3147 *ap->a_run = bytesContAvail;
3148
3149 if (ap->a_poff)
3150 *(int *)ap->a_poff = 0;
3151 }
3152
3153 if (tooklock)
3154 hfs_unlock(cp);
3155
3156 return (MacToVFSError(retval));
3157 }
3158
3159 /*
3160 * prepare and issue the I/O
3161 * buf_strategy knows how to deal
3162 * with requests that require
3163 * fragmented I/Os
3164 */
3165 int
3166 hfs_vnop_strategy(struct vnop_strategy_args *ap)
3167 {
3168 buf_t bp = ap->a_bp;
3169 vnode_t vp = buf_vnode(bp);
3170 int error = 0;
3171
3172 /* Mark buffer as containing static data if cnode flag set */
3173 if (VTOC(vp)->c_flag & C_SSD_STATIC) {
3174 buf_markstatic(bp);
3175 }
3176
3177 /* Mark buffer as containing greedy-mode data if cnode flag set */
3178 if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
3179 bufattr_markgreedymode(&bp->b_attr);
3180 }
3181
3182 /* mark buffer as containing burst mode data if cnode flag set */
3183 if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) {
3184 bufattr_markisochronous(&bp->b_attr);
3185 }
3186
3187 #if CONFIG_PROTECT
3188 cnode_t *cp = NULL;
3189
3190 if ((!bufattr_rawencrypted(&bp->b_attr)) &&
3191 ((cp = cp_get_protected_cnode(vp)) != NULL)) {
3192 /*
3193 * We rely upon the truncate lock to protect the
3194 * CP cache key from getting tossed prior to our IO finishing here.
3195 * Nearly all cluster io calls to manipulate file payload from HFS
3196 * take the truncate lock before calling into the cluster
3197 * layer to ensure the file size does not change, or that they
3198 * have exclusive right to change the EOF of the file.
3199 * That same guarantee protects us here since the code that
3200 * deals with CP lock events must now take the truncate lock
3201 * before doing anything.
3202 *
3203 * There is one exception here:
3204 * 1) VM swapfile I/O, because HFS will
3205 * funnel the VNOP_PAGEOUT directly into a cluster_pageout call for the
3206 * swapfile code only without holding the truncate lock. This is because
3207 * individual swapfiles are maintained at fixed-length sizes by the VM code.
3208 * In non-swapfile IO we use PAGEOUT_V2 semantics which allow us to
3209 * create our own UPL and thus take the truncate lock before calling
3210 * into the cluster layer. In that case, however, we are not concerned
3211 * with the CP blob being wiped out in the middle of the IO
3212 * because there isn't anything to toss; the VM swapfile key stays
3213 * in-core as long as the file is open.
3214 */
3215
3216
3217 /*
3218 * Last chance: If this data protected I/O does not have unwrapped keys
3219 * present, then try to get them. We already know that it should, by this point.
3220 */
3221 if (cp->c_cpentry->cp_flags & (CP_KEY_FLUSHED | CP_NEEDS_KEYS)) {
3222 int io_op = ( (buf_flags(bp) & B_READ) ? CP_READ_ACCESS : CP_WRITE_ACCESS);
3223 if ((error = cp_handle_vnop(vp, io_op, 0)) != 0) {
3224 /*
3225 * We have to be careful here. By this point in the I/O path, VM or the cluster
3226 * engine has prepared a buf_t with the proper file offsets and all the rest,
3227 * so simply erroring out will result in us leaking this particular buf_t.
3228 * We need to properly decorate the buf_t just as buf_strategy would so as
3229 * to make it appear that the I/O errored out with the particular error code.
3230 */
3231 buf_seterror (bp, error);
3232 buf_biodone(bp);
3233 return error;
3234 }
3235 }
3236
3237 /*
3238 * NB:
3239 * For filesystem resize, we may not have access to the underlying
3240 * file's cache key for whatever reason (device may be locked). However,
3241 * we do not need it since we are going to use the temporary HFS-wide resize key
3242 * which is generated once we start relocating file content. If this file's I/O
3243 * should be done using the resize key, it will have been supplied already, so
3244 * do not attach the file's cp blob to the buffer.
3245 */
3246 if ((cp->c_cpentry->cp_flags & CP_RELOCATION_INFLIGHT) == 0) {
3247 buf_setcpaddr(bp, cp->c_cpentry);
3248 }
3249 }
3250 #endif /* CONFIG_PROTECT */
3251
3252 error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
3253
3254 return error;
3255 }
3256
3257 static int
3258 hfs_minorupdate(struct vnode *vp) {
3259 struct cnode *cp = VTOC(vp);
3260 cp->c_flag &= ~C_MODIFIED;
3261 cp->c_touch_acctime = 0;
3262 cp->c_touch_chgtime = 0;
3263 cp->c_touch_modtime = 0;
3264
3265 return 0;
3266 }
3267
3268 int
3269 do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
3270 {
3271 register struct cnode *cp = VTOC(vp);
3272 struct filefork *fp = VTOF(vp);
3273 kauth_cred_t cred = vfs_context_ucred(context);
3274 int retval;
3275 off_t bytesToAdd;
3276 off_t actualBytesAdded;
3277 off_t filebytes;
3278 u_int32_t fileblocks;
3279 int blksize;
3280 struct hfsmount *hfsmp;
3281 int lockflags;
3282 int skipupdate = (truncateflags & HFS_TRUNCATE_SKIPUPDATE);
3283 int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);
3284
3285 blksize = VTOVCB(vp)->blockSize;
3286 fileblocks = fp->ff_blocks;
3287 filebytes = (off_t)fileblocks * (off_t)blksize;
3288
3289 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START,
3290 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3291
3292 if (length < 0)
3293 return (EINVAL);
3294
3295 /* This should only happen with a corrupt filesystem */
3296 if ((off_t)fp->ff_size < 0)
3297 return (EINVAL);
3298
3299 if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
3300 return (EFBIG);
3301
3302 hfsmp = VTOHFS(vp);
3303
3304 retval = E_NONE;
3305
3306 /* Files that are changing size are not hot file candidates. */
3307 if (hfsmp->hfc_stage == HFC_RECORDING) {
3308 fp->ff_bytesread = 0;
3309 }
3310
3311 /*
3312 * We cannot just check if fp->ff_size == length (as an optimization)
3313 * since there may be extra physical blocks that also need truncation.
3314 */
3315 #if QUOTA
3316 if ((retval = hfs_getinoquota(cp)))
3317 return(retval);
3318 #endif /* QUOTA */
3319
3320 /*
3321 * Lengthen the size of the file. We must ensure that the
3322 * last byte of the file is allocated. Since the smallest
3323 * value of ff_size is 0, length will be at least 1.
3324 */
3325 if (length > (off_t)fp->ff_size) {
3326 #if QUOTA
3327 retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
3328 cred, 0);
3329 if (retval)
3330 goto Err_Exit;
3331 #endif /* QUOTA */
3332 /*
3333 * If we don't have enough physical space then
3334 * we need to extend the physical size.
3335 */
3336 if (length > filebytes) {
3337 int eflags;
3338 u_int32_t blockHint = 0;
3339
3340 /* All or nothing and don't round up to clumpsize. */
3341 eflags = kEFAllMask | kEFNoClumpMask;
3342
3343 if (cred && (suser(cred, NULL) != 0)) {
3344 eflags |= kEFReserveMask; /* keep a reserve */
3345 }
3346
3347 /*
3348 * Allocate Journal and Quota files in metadata zone.
3349 */
3350 if (filebytes == 0 &&
3351 hfsmp->hfs_flags & HFS_METADATA_ZONE &&
3352 hfs_virtualmetafile(cp)) {
3353 eflags |= kEFMetadataMask;
3354 blockHint = hfsmp->hfs_metazone_start;
3355 }
3356 if (hfs_start_transaction(hfsmp) != 0) {
3357 retval = EINVAL;
3358 goto Err_Exit;
3359 }
3360
3361 /* Protect extents b-tree and allocation bitmap */
3362 lockflags = SFL_BITMAP;
3363 if (overflow_extents(fp))
3364 lockflags |= SFL_EXTENTS;
3365 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3366
3367 /*
3368 * Keep growing the file as long as the current EOF is
3369 * less than the desired value.
3370 */
3371 while ((length > filebytes) && (retval == E_NONE)) {
3372 bytesToAdd = length - filebytes;
3373 retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
3374 (FCB*)fp,
3375 bytesToAdd,
3376 blockHint,
3377 eflags,
3378 &actualBytesAdded));
3379
3380 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3381 if (actualBytesAdded == 0 && retval == E_NONE) {
3382 if (length > filebytes)
3383 length = filebytes;
3384 break;
3385 }
3386 } /* endwhile */
3387
3388 hfs_systemfile_unlock(hfsmp, lockflags);
3389
3390 if (hfsmp->jnl) {
3391 if (skipupdate) {
3392 (void) hfs_minorupdate(vp);
3393 }
3394 else {
3395 (void) hfs_update(vp, TRUE);
3396 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3397 }
3398 }
3399
3400 hfs_end_transaction(hfsmp);
3401
3402 if (retval)
3403 goto Err_Exit;
3404
3405 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3406 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3407 }
3408
3409 if (ISSET(flags, IO_NOZEROFILL)) {
3410 // An optimisation for the hibernation file
3411 if (vnode_isswap(vp))
3412 rl_remove_all(&fp->ff_invalidranges);
3413 } else {
3414 if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) {
3415 struct rl_entry *invalid_range;
3416 off_t zero_limit;
3417
3418 zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
3419 if (length < zero_limit) zero_limit = length;
3420
3421 if (length > (off_t)fp->ff_size) {
3422 struct timeval tv;
3423
3424 /* Extending the file: time to fill out the current last page w. zeroes? */
3425 if ((fp->ff_size & PAGE_MASK_64) &&
3426 (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
3427 fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {
3428
3429 /* There's some valid data at the start of the (current) last page
3430 of the file, so zero out the remainder of that page to ensure the
3431 entire page contains valid data. Since there is no invalid range
3432 possible past the (current) eof, there's no need to remove anything
3433 from the invalid range list before calling cluster_write(): */
3434 hfs_unlock(cp);
3435 retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
3436 fp->ff_size, (off_t)0,
3437 (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
3438 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3439 if (retval) goto Err_Exit;
3440
3441 /* Merely invalidate the remaining area, if necessary: */
3442 if (length > zero_limit) {
3443 microuptime(&tv);
3444 rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
3445 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3446 }
3447 } else {
3448 /* The page containing the (current) eof is invalid: just add the
3449 remainder of the page to the invalid list, along with the area
3450 being newly allocated:
3451 */
3452 microuptime(&tv);
3453 rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
3454 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3455 };
3456 }
3457 } else {
3458 panic("hfs_truncate: invoked on non-UBC object?!");
3459 };
3460 }
3461 if (suppress_times == 0) {
3462 cp->c_touch_modtime = TRUE;
3463 }
3464 fp->ff_size = length;
3465
3466 } else { /* Shorten the size of the file */
3467
3468 // An optimisation for the hibernation file
3469 if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) {
3470 rl_remove_all(&fp->ff_invalidranges);
3471 } else if ((off_t)fp->ff_size > length) {
3472 /* Any space previously marked as invalid is now irrelevant: */
3473 rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
3474 }
3475
3476 /*
3477 * Account for any unmapped blocks. Note that the new
3478 * file length can still end up with unmapped blocks.
3479 */
3480 if (fp->ff_unallocblocks > 0) {
3481 u_int32_t finalblks;
3482 u_int32_t loanedBlocks;
3483
3484 hfs_lock_mount(hfsmp);
3485 loanedBlocks = fp->ff_unallocblocks;
3486 cp->c_blocks -= loanedBlocks;
3487 fp->ff_blocks -= loanedBlocks;
3488 fp->ff_unallocblocks = 0;
3489
3490 hfsmp->loanedBlocks -= loanedBlocks;
3491
3492 finalblks = (length + blksize - 1) / blksize;
3493 if (finalblks > fp->ff_blocks) {
3494 /* calculate required unmapped blocks */
3495 loanedBlocks = finalblks - fp->ff_blocks;
3496 hfsmp->loanedBlocks += loanedBlocks;
3497
3498 fp->ff_unallocblocks = loanedBlocks;
3499 cp->c_blocks += loanedBlocks;
3500 fp->ff_blocks += loanedBlocks;
3501 }
3502 hfs_unlock_mount (hfsmp);
3503 }
3504
3505 #if QUOTA
3506 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
3507 #endif /* QUOTA */
3508 if (hfs_start_transaction(hfsmp) != 0) {
3509 retval = EINVAL;
3510 goto Err_Exit;
3511 }
3512
3513 if (fp->ff_unallocblocks == 0) {
3514 /* Protect extents b-tree and allocation bitmap */
3515 lockflags = SFL_BITMAP;
3516 if (overflow_extents(fp))
3517 lockflags |= SFL_EXTENTS;
3518 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3519
3520 retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0,
3521 FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
3522
3523 hfs_systemfile_unlock(hfsmp, lockflags);
3524 }
3525 if (hfsmp->jnl) {
3526 if (retval == 0) {
3527 fp->ff_size = length;
3528 }
3529 if (skipupdate) {
3530 (void) hfs_minorupdate(vp);
3531 }
3532 else {
3533 (void) hfs_update(vp, TRUE);
3534 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3535 }
3536 }
3537 hfs_end_transaction(hfsmp);
3538
3539 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3540 if (retval)
3541 goto Err_Exit;
3542 #if QUOTA
3543 /* These are bytesreleased */
3544 (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
3545 #endif /* QUOTA */
3546
3547 /*
3548 * Only set update flag if the logical length changes & we aren't
3549 * suppressing modtime updates.
3550 */
3551 if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
3552 cp->c_touch_modtime = TRUE;
3553 }
3554 fp->ff_size = length;
3555 }
3556 if (cp->c_mode & (S_ISUID | S_ISGID)) {
3557 if (!vfs_context_issuser(context)) {
3558 cp->c_mode &= ~(S_ISUID | S_ISGID);
3559 skipupdate = 0;
3560 }
3561 }
3562 if (skipupdate) {
3563 retval = hfs_minorupdate(vp);
3564 }
3565 else {
3566 cp->c_touch_chgtime = TRUE; /* status changed */
3567 if (suppress_times == 0) {
3568 cp->c_touch_modtime = TRUE; /* file data was modified */
3569
3570 /*
3571 * If we are not suppressing the modtime update, then
3572 * update the gen count as well.
3573 */
3574 if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
3575 hfs_incr_gencount(cp);
3576 }
3577 }
3578
3579 retval = hfs_update(vp, MNT_WAIT);
3580 }
3581 if (retval) {
3582 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3583 -1, -1, -1, retval, 0);
3584 }
3585
3586 Err_Exit:
3587
3588 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END,
3589 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
3590
3591 return (retval);
3592 }
3593
3594 /*
3595 * Preparation which must be done prior to deleting the catalog record
3596 * of a file or directory. In order to make the on-disk state as safe as possible,
3597 * we remove the catalog entry before releasing the bitmap blocks and the
3598 * overflow extent records. However, some work must be done prior to deleting
3599 * the catalog record.
3600 *
3601 * When calling this function, the cnode must exist both in memory and on-disk.
3602 * If there are both resource fork and data fork vnodes, this function should
3603 * be called on both.
3604 */
3605
3606 int
3607 hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
3608
3609 struct filefork *fp = VTOF(vp);
3610 struct cnode *cp = VTOC(vp);
3611 #if QUOTA
3612 int retval = 0;
3613 #endif /* QUOTA */
3614
3615 /* Cannot truncate an HFS directory! */
3616 if (vnode_isdir(vp)) {
3617 return (EISDIR);
3618 }
3619
3620 /*
3621 * See the comment below in hfs_truncate for why we need to call
3622 * setsize here. Essentially we want to avoid pending IO if we
3623 * already know that the blocks are going to be released here.
3624 * This function is only called when totally removing all storage for a file, so
3625 * we can take a shortcut and immediately call ubc_setsize(vp, 0).
3626 */
3627 ubc_setsize(vp, 0);
3628
3629 /* This should only happen with a corrupt filesystem */
3630 if ((off_t)fp->ff_size < 0)
3631 return (EINVAL);
3632
3633 /*
3634 * We cannot just check if fp->ff_size == length (as an optimization)
3635 * since there may be extra physical blocks that also need truncation.
3636 */
3637 #if QUOTA
3638 if ((retval = hfs_getinoquota(cp))) {
3639 return(retval);
3640 }
3641 #endif /* QUOTA */
3642
3643 /* Wipe out any invalid ranges which have yet to be backed by disk */
3644 rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);
3645
3646 /*
3647 * Account for any unmapped blocks. Since we're deleting the
3648 * entire file, we don't have to worry about just shrinking
3649 * to a smaller number of borrowed blocks.
3650 */
3651 if (fp->ff_unallocblocks > 0) {
3652 u_int32_t loanedBlocks;
3653
3654 hfs_lock_mount (hfsmp);
3655 loanedBlocks = fp->ff_unallocblocks;
3656 cp->c_blocks -= loanedBlocks;
3657 fp->ff_blocks -= loanedBlocks;
3658 fp->ff_unallocblocks = 0;
3659
3660 hfsmp->loanedBlocks -= loanedBlocks;
3661
3662 hfs_unlock_mount (hfsmp);
3663 }
3664
3665 return 0;
3666 }
3667
3668
3669 /*
3670 * Special wrapper around calling TruncateFileC. This function is usable
3671 * even when the catalog record does not exist any longer, making it ideal
3672 * for use when deleting a file. The simplification here is that we know
3673 * that we are releasing all blocks.
3674 *
3675 * Note that this function may be called when there is no vnode backing
3676 * the file fork in question. We may call this from hfs_vnop_inactive
3677 * to clear out resource fork data (and may not want to clear out the data
3678 * fork yet). As a result, we pointer-check both sets of inputs before
3679 * doing anything with them.
3680 *
3681 * The caller is responsible for saving off a copy of the filefork(s)
3682 * embedded within the cnode prior to calling this function. The pointers
3683 * supplied as arguments must be valid even if the cnode is no longer valid.
3684 */
3685
3686 int
3687 hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
3688 struct filefork *rsrcfork, u_int32_t fileid) {
3689
3690 off_t filebytes;
3691 u_int32_t fileblocks;
3692 int blksize = 0;
3693 int error = 0;
3694 int lockflags;
3695
3696 blksize = hfsmp->blockSize;
3697
3698 /* Data Fork */
3699 if (datafork) {
3700 datafork->ff_size = 0;
3701
3702 fileblocks = datafork->ff_blocks;
3703 filebytes = (off_t)fileblocks * (off_t)blksize;
3704
3705 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3706
3707 while (filebytes > 0) {
3708 if (filebytes > HFS_BIGFILE_SIZE) {
3709 filebytes -= HFS_BIGFILE_SIZE;
3710 } else {
3711 filebytes = 0;
3712 }
3713
3714 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3715 if (hfs_start_transaction(hfsmp) != 0) {
3716 error = EINVAL;
3717 break;
3718 }
3719
3720 if (datafork->ff_unallocblocks == 0) {
3721 /* Protect extents b-tree and allocation bitmap */
3722 lockflags = SFL_BITMAP;
3723 if (overflow_extents(datafork))
3724 lockflags |= SFL_EXTENTS;
3725 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3726
3727 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));
3728
3729 hfs_systemfile_unlock(hfsmp, lockflags);
3730 }
3731 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3732
3733 /* Finish the transaction and start over if necessary */
3734 hfs_end_transaction(hfsmp);
3735
3736 if (error) {
3737 break;
3738 }
3739 }
3740 }
3741
3742 /* Resource fork */
3743 if (error == 0 && rsrcfork) {
3744 rsrcfork->ff_size = 0;
3745
3746 fileblocks = rsrcfork->ff_blocks;
3747 filebytes = (off_t)fileblocks * (off_t)blksize;
3748
3749 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3750
3751 while (filebytes > 0) {
3752 if (filebytes > HFS_BIGFILE_SIZE) {
3753 filebytes -= HFS_BIGFILE_SIZE;
3754 } else {
3755 filebytes = 0;
3756 }
3757
3758 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3759 if (hfs_start_transaction(hfsmp) != 0) {
3760 error = EINVAL;
3761 break;
3762 }
3763
3764 if (rsrcfork->ff_unallocblocks == 0) {
3765 /* Protect extents b-tree and allocation bitmap */
3766 lockflags = SFL_BITMAP;
3767 if (overflow_extents(rsrcfork))
3768 lockflags |= SFL_EXTENTS;
3769 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3770
3771 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));
3772
3773 hfs_systemfile_unlock(hfsmp, lockflags);
3774 }
3775 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3776
3777 /* Finish the transaction and start over if necessary */
3778 hfs_end_transaction(hfsmp);
3779
3780 if (error) {
3781 break;
3782 }
3783 }
3784 }
3785
3786 return error;
3787 }
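/*
 * A minimal sketch of the calling convention described in the block comment
 * above hfs_release_storage (hypothetical caller shown for illustration only;
 * real callers such as hfs_vnop_inactive keep their own copies of the forks):
 *
 *	struct filefork dfork, rfork;
 *	struct filefork *dforkp = NULL, *rforkp = NULL;
 *
 *	if (cp->c_datafork) {
 *		dfork = *cp->c_datafork;	// stack copies outlive the cnode
 *		dforkp = &dfork;
 *	}
 *	if (cp->c_rsrcfork) {
 *		rfork = *cp->c_rsrcfork;
 *		rforkp = &rfork;
 *	}
 *	error = hfs_release_storage(hfsmp, dforkp, rforkp, cp->c_fileid);
 */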
3788
3789 errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock)
3790 {
3791 errno_t error;
3792
3793 /*
3794 * Call ubc_setsize to give the VM subsystem a chance to do
3795 * whatever it needs to with existing pages before we delete
3796 * blocks. Note that symlinks don't use the UBC so we'll
3797 * get back ENOENT in that case.
3798 */
3799 if (have_cnode_lock) {
3800 error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY);
3801 if (error == EAGAIN) {
3802 cnode_t *cp = VTOC(vp);
3803
3804 if (cp->c_truncatelockowner != current_thread()) {
3805 #if DEVELOPMENT || DEBUG
3806 panic("hfs: hfs_ubc_setsize called without exclusive truncate lock!");
3807 #else
3808 printf("hfs: hfs_ubc_setsize called without exclusive truncate lock!\n");
3809 #endif
3810 }
3811
3812 hfs_unlock(cp);
3813 error = ubc_setsize_ex(vp, len, 0);
3814 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
3815 }
3816 } else
3817 error = ubc_setsize_ex(vp, len, 0);
3818
3819 return error == ENOENT ? 0 : error;
3820 }
3821
3822 /*
3823 * Truncate a cnode to at most length size, freeing (or adding) the
3824 * disk blocks.
3825 */
3826 int
3827 hfs_truncate(struct vnode *vp, off_t length, int flags,
3828 int truncateflags, vfs_context_t context)
3829 {
3830 struct filefork *fp = VTOF(vp);
3831 off_t filebytes;
3832 u_int32_t fileblocks;
3833 int blksize;
3834 errno_t error = 0;
3835 struct cnode *cp = VTOC(vp);
3836
3837 /* Cannot truncate an HFS directory! */
3838 if (vnode_isdir(vp)) {
3839 return (EISDIR);
3840 }
3841 /* A swap file cannot change size. */
3842 if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) {
3843 return (EPERM);
3844 }
3845
3846 blksize = VTOVCB(vp)->blockSize;
3847 fileblocks = fp->ff_blocks;
3848 filebytes = (off_t)fileblocks * (off_t)blksize;
3849
3850 bool caller_has_cnode_lock = (cp->c_lockowner == current_thread());
3851
3852 error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock);
3853 if (error)
3854 return error;
3855
3856 if (!caller_has_cnode_lock) {
3857 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
3858 if (error)
3859 return error;
3860 }
3861
3862 // have to loop truncating or growing files that are
3863 // really big because otherwise transactions can get
3864 // enormous and consume too many kernel resources.
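// For example (figures illustrative only): with HFS_BIGFILE_SIZE at 2 GB,
// shrinking a 5 GB fork to 0 makes three do_hfs_truncate passes
// (5 GB -> 3 GB -> 1 GB -> 0), each in its own journal transaction,
// instead of freeing every extent in one oversized transaction.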
3865
3866 if (length < filebytes) {
3867 while (filebytes > length) {
3868 if ((filebytes - length) > HFS_BIGFILE_SIZE) {
3869 filebytes -= HFS_BIGFILE_SIZE;
3870 } else {
3871 filebytes = length;
3872 }
3873 cp->c_flag |= C_FORCEUPDATE;
3874 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3875 if (error)
3876 break;
3877 }
3878 } else if (length > filebytes) {
3879 while (filebytes < length) {
3880 if ((length - filebytes) > HFS_BIGFILE_SIZE) {
3881 filebytes += HFS_BIGFILE_SIZE;
3882 } else {
3883 filebytes = length;
3884 }
3885 cp->c_flag |= C_FORCEUPDATE;
3886 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3887 if (error)
3888 break;
3889 }
3890 } else /* Same logical size */ {
3891
3892 error = do_hfs_truncate(vp, length, flags, truncateflags, context);
3893 }
3894 /* Files that are changing size are not hot file candidates. */
3895 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
3896 fp->ff_bytesread = 0;
3897 }
3898
3899 if (!caller_has_cnode_lock)
3900 hfs_unlock(cp);
3901
3902 // Make sure UBC's size matches up (in case we didn't completely succeed)
3903 errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock);
3904 if (!error)
3905 error = err2;
3906
3907 return error;
3908 }
3909
3910
3911 /*
3912 * Preallocate file storage space.
3913 */
3914 int
3915 hfs_vnop_allocate(struct vnop_allocate_args /* {
3916 vnode_t a_vp;
3917 off_t a_length;
3918 u_int32_t a_flags;
3919 off_t *a_bytesallocated;
3920 off_t a_offset;
3921 vfs_context_t a_context;
3922 } */ *ap)
3923 {
3924 struct vnode *vp = ap->a_vp;
3925 struct cnode *cp;
3926 struct filefork *fp;
3927 ExtendedVCB *vcb;
3928 off_t length = ap->a_length;
3929 off_t startingPEOF;
3930 off_t moreBytesRequested;
3931 off_t actualBytesAdded;
3932 off_t filebytes;
3933 u_int32_t fileblocks;
3934 int retval, retval2;
3935 u_int32_t blockHint;
3936 u_int32_t extendFlags; /* For call to ExtendFileC */
3937 struct hfsmount *hfsmp;
3938 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
3939 int lockflags;
3940 time_t orig_ctime;
3941
3942 *(ap->a_bytesallocated) = 0;
3943
3944 if (!vnode_isreg(vp))
3945 return (EISDIR);
3946 if (length < (off_t)0)
3947 return (EINVAL);
3948
3949 cp = VTOC(vp);
3950
3951 orig_ctime = VTOC(vp)->c_ctime;
3952
3953 check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
3954
3955 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
3956
3957 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
3958 goto Err_Exit;
3959 }
3960
3961 fp = VTOF(vp);
3962 hfsmp = VTOHFS(vp);
3963 vcb = VTOVCB(vp);
3964
3965 fileblocks = fp->ff_blocks;
3966 filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
3967
3968 if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
3969 retval = EINVAL;
3970 goto Err_Exit;
3971 }
3972
3973 /* Fill in the flags word for the call to Extend the file */
3974
3975 extendFlags = kEFNoClumpMask;
3976 if (ap->a_flags & ALLOCATECONTIG)
3977 extendFlags |= kEFContigMask;
3978 if (ap->a_flags & ALLOCATEALL)
3979 extendFlags |= kEFAllMask;
3980 if (cred && suser(cred, NULL) != 0)
3981 extendFlags |= kEFReserveMask;
3982 if (hfs_virtualmetafile(cp))
3983 extendFlags |= kEFMetadataMask;
3984
3985 retval = E_NONE;
3986 blockHint = 0;
3987 startingPEOF = filebytes;
3988
3989 if (ap->a_flags & ALLOCATEFROMPEOF)
3990 length += filebytes;
3991 else if (ap->a_flags & ALLOCATEFROMVOL)
3992 blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
3993
3994 /* If no changes are necessary, then we're done */
3995 if (filebytes == length)
3996 goto Std_Exit;
3997
3998 /*
3999 * Lengthen the size of the file. We must ensure that the
4000 * last byte of the file is allocated. Since the smallest
4001 * value of filebytes is 0, length will be at least 1.
4002 */
4003 if (length > filebytes) {
4004 off_t total_bytes_added = 0, orig_request_size;
4005
4006 orig_request_size = moreBytesRequested = length - filebytes;
4007
4008 #if QUOTA
4009 retval = hfs_chkdq(cp,
4010 (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
4011 cred, 0);
4012 if (retval)
4013 goto Err_Exit;
4014
4015 #endif /* QUOTA */
4016 /*
4017 * Metadata zone checks.
4018 */
4019 if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
4020 /*
4021 * Allocate Journal and Quota files in metadata zone.
4022 */
4023 if (hfs_virtualmetafile(cp)) {
4024 blockHint = hfsmp->hfs_metazone_start;
4025 } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
4026 (blockHint <= hfsmp->hfs_metazone_end)) {
4027 /*
4028 * Move blockHint outside metadata zone.
4029 */
4030 blockHint = hfsmp->hfs_metazone_end + 1;
4031 }
4032 }
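/*
 * For example (block numbers illustrative only): if the metadata zone
 * spans allocation blocks 64-4159 and an ALLOCATEFROMVOL caller hinted
 * block 1000 for an ordinary file, blockHint is bumped to 4160 so the
 * preallocation lands outside the zone reserved for metadata.
 */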
4033
4034
4035 while ((length > filebytes) && (retval == E_NONE)) {
4036 off_t bytesRequested;
4037
4038 if (hfs_start_transaction(hfsmp) != 0) {
4039 retval = EINVAL;
4040 goto Err_Exit;
4041 }
4042
4043 /* Protect extents b-tree and allocation bitmap */
4044 lockflags = SFL_BITMAP;
4045 if (overflow_extents(fp))
4046 lockflags |= SFL_EXTENTS;
4047 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4048
4049 if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
4050 bytesRequested = HFS_BIGFILE_SIZE;
4051 } else {
4052 bytesRequested = moreBytesRequested;
4053 }
4054
4055 if (extendFlags & kEFContigMask) {
4056 // if we're on a sparse device, this will force it to do a
4057 // full scan to find the space needed.
4058 hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
4059 }
4060
4061 retval = MacToVFSError(ExtendFileC(vcb,
4062 (FCB*)fp,
4063 bytesRequested,
4064 blockHint,
4065 extendFlags,
4066 &actualBytesAdded));
4067
4068 if (retval == E_NONE) {
4069 *(ap->a_bytesallocated) += actualBytesAdded;
4070 total_bytes_added += actualBytesAdded;
4071 moreBytesRequested -= actualBytesAdded;
4072 if (blockHint != 0) {
4073 blockHint += actualBytesAdded / vcb->blockSize;
4074 }
4075 }
4076 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4077
4078 hfs_systemfile_unlock(hfsmp, lockflags);
4079
4080 if (hfsmp->jnl) {
4081 (void) hfs_update(vp, TRUE);
4082 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
4083 }
4084
4085 hfs_end_transaction(hfsmp);
4086 }
4087
4088
4089 /*
4090 * if we get an error and no changes were made then exit
4091 * otherwise we must do the hfs_update to reflect the changes
4092 */
4093 if (retval && (startingPEOF == filebytes))
4094 goto Err_Exit;
4095
4096 /*
4097 * Adjust actualBytesAdded to be allocation block aligned, not
4098 * clump size aligned.
4099 * NOTE: So what we are reporting does not affect reality
4100 * until the file is closed, when we truncate the file to allocation
4101 * block size.
4102 */
4103 if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
4104 *(ap->a_bytesallocated) =
4105 roundup(orig_request_size, (off_t)vcb->blockSize);
4106
4107 } else { /* Shorten the size of the file */
4108
4109 /*
4110 * N.B. At present, this code is never called. If and when we
4111 * do start using it, it looks like there might be slightly
4112 * strange semantics with the file size: it's possible for the
4113 * file size to *increase* e.g. if current file size is 5,
4114 * length is 1024 and filebytes is 4096, the file size will
4115 * end up being 1024 bytes. This isn't necessarily a problem
4116 * but it's not consistent with the code above which doesn't
4117 * change the file size.
4118 */
4119
4120 retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
4121 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4122
4123 /*
4124 * if we get an error and no changes were made then exit
4125 * otherwise we must do the hfs_update to reflect the changes
4126 */
4127 if (retval && (startingPEOF == filebytes)) goto Err_Exit;
4128 #if QUOTA
4129 /* These are bytesreleased */
4130 (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
4131 #endif /* QUOTA */
4132
4133 if (fp->ff_size > filebytes) {
4134 fp->ff_size = filebytes;
4135
4136 hfs_ubc_setsize(vp, fp->ff_size, true);
4137 }
4138 }
4139
4140 Std_Exit:
4141 cp->c_touch_chgtime = TRUE;
4142 cp->c_touch_modtime = TRUE;
4143 retval2 = hfs_update(vp, MNT_WAIT);
4144
4145 if (retval == 0)
4146 retval = retval2;
4147 Err_Exit:
4148 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4149 hfs_unlock(cp);
4150 return (retval);
4151 }
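/*
 * For reference, this vnop is normally reached from user space via the
 * F_PREALLOCATE fcntl; a hedged sketch (translation of the fstore flags to
 * the ALLOCATE* bits above happens in the generic fcntl/VNOP_ALLOCATE path):
 *
 *	#include <fcntl.h>
 *
 *	fstore_t fst = {
 *		.fst_flags      = F_ALLOCATECONTIG | F_ALLOCATEALL,
 *		.fst_posmode    = F_PEOFPOSMODE,	// allocate from the physical EOF
 *		.fst_offset     = 0,
 *		.fst_length     = 16 * 1024 * 1024,	// try to reserve 16 MB
 *		.fst_bytesalloc = 0,
 *	};
 *	if (fcntl(fd, F_PREALLOCATE, &fst) == -1)
 *		perror("F_PREALLOCATE");
 *	// fst.fst_bytesalloc reports how much was actually reserved
 */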
4152
4153
4154 /*
4155 * Pagein for HFS filesystem
4156 */
4157 int
4158 hfs_vnop_pagein(struct vnop_pagein_args *ap)
4159 /*
4160 struct vnop_pagein_args {
4161 vnode_t a_vp,
4162 upl_t a_pl,
4163 vm_offset_t a_pl_offset,
4164 off_t a_f_offset,
4165 size_t a_size,
4166 int a_flags
4167 vfs_context_t a_context;
4168 };
4169 */
4170 {
4171 vnode_t vp;
4172 struct cnode *cp;
4173 struct filefork *fp;
4174 int error = 0;
4175 upl_t upl;
4176 upl_page_info_t *pl;
4177 off_t f_offset;
4178 off_t page_needed_f_offset;
4179 int offset;
4180 int isize;
4181 int upl_size;
4182 int pg_index;
4183 boolean_t truncate_lock_held = FALSE;
4184 boolean_t file_converted = FALSE;
4185 kern_return_t kret;
4186
4187 vp = ap->a_vp;
4188 cp = VTOC(vp);
4189 fp = VTOF(vp);
4190
4191 #if CONFIG_PROTECT
4192 if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
4193 /*
4194 * If we errored here, then this means that one of two things occurred:
4195 * 1. there was a problem with the decryption of the key.
4196 * 2. the device is locked and we are not allowed to access this particular file.
4197 *
4198 * Either way, this means that we need to shut down this upl now. As long as
4199 * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves)
4200 * we create a upl and immediately abort it.
4201 */
4202 if (ap->a_pl == NULL) {
4203 /* create the upl */
4204 ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl,
4205 UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4206 /* mark the range as needed so it doesn't immediately get discarded upon abort */
4207 ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);
4208
4209 /* Abort the range */
4210 ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4211 }
4212
4213
4214 return error;
4215 }
4216 #endif /* CONFIG_PROTECT */
4217
4218 if (ap->a_pl != NULL) {
4219 /*
4220 * this can only happen for swap files now that
4221 * we're asking for V2 paging behavior...
4222 * so don't need to worry about decompression, or
4223 * keeping track of blocks read or taking the truncate lock
4224 */
4225 error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
4226 ap->a_size, (off_t)fp->ff_size, ap->a_flags);
4227 goto pagein_done;
4228 }
4229
4230 page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset;
4231
4232 retry_pagein:
4233 /*
4234 * take truncate lock (shared/recursive) to guard against
4235 * zero-fill thru fsync interfering, but only for v2
4236 *
4237 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
4238 * lock shared and we are allowed to recurse 1 level if this thread already
4239 * owns the lock exclusively... this can legally occur
4240 * if we are doing a shrinking ftruncate against a file
4241 * that is mapped private, and the pages being truncated
4242 * do not currently exist in the cache... in that case
4243 * we will have to page-in the missing pages in order
4244 * to provide them to the private mapping... we must
4245 * also call hfs_unlock_truncate with a positive been_recursed
4246 * arg to indicate that if we have recursed, there is no need to drop
4247 * the lock. Allowing this simple recursion is necessary
4248 * in order to avoid a certain deadlock... since the ftruncate
4249 * already holds the truncate lock exclusively, if we try
4250 * to acquire it shared to protect the pagein path, we will
4251 * hang this thread
4252 *
4253 * NOTE: The if () block below is a workaround in order to prevent a
4254 * VM deadlock. See rdar://7853471.
4255 *
4256 * If we are in a forced unmount, then launchd will still have the
4257 * dyld_shared_cache file mapped as it is trying to reboot. If we
4258 * take the truncate lock here to service a page fault, then our
4259 * thread could deadlock with the forced-unmount. The forced unmount
4260 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
4261 * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount
4262 * thread will think it needs to copy all of the data out of the file
4263 * and into a VM copy object. If we hold the cnode lock here, then that
4264 * VM operation will not be able to proceed, because we'll set a busy page
4265 * before attempting to grab the lock. Note that this isn't as simple as "don't
4266 * call ubc_setsize" because doing that would just shift the problem to the
4267 * ubc_msync done before the vnode is reclaimed.
4268 *
4269 * So, if a forced unmount on this volume is in flight AND the cnode is
4270 * marked C_DELETED, then just go ahead and do the page in without taking
4271 * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file
4272 * that is not going to be available on the next mount, this seems like a
4273 * OK solution from a correctness point of view, even though it is hacky.
4274 */
4275 if (vfs_isforce(vp->v_mount)) {
4276 if (cp->c_flag & C_DELETED) {
4277 /* If we don't get it, then just go ahead and operate without the lock */
4278 truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4279 }
4280 }
4281 else {
4282 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4283 truncate_lock_held = TRUE;
4284 }
4285
4286 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4287
4288 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4289 error = EINVAL;
4290 goto pagein_done;
4291 }
4292 ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
4293
4294 upl_size = isize = ap->a_size;
4295
4296 /*
4297 * Scan from the back to find the last page in the UPL, so that we
4298 * aren't looking at a UPL that may have already been freed by the
4299 * preceding aborts/completions.
4300 */
4301 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4302 if (upl_page_present(pl, --pg_index))
4303 break;
4304 if (pg_index == 0) {
4305 /*
4306 * no absent pages were found in the range specified
4307 * just abort the UPL to get rid of it and then we're done
4308 */
4309 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4310 goto pagein_done;
4311 }
4312 }
4313 /*
4314 * initialize the offset variables before we touch the UPL.
4315 * f_offset is the position into the file, in bytes
4316 * offset is the position into the UPL, in bytes
4317 * pg_index is the pg# of the UPL we're operating on
4318 * isize is the offset into the UPL of the last page that is present.
4319 */
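/*
 * (Worked example: in an 8-page UPL where only pages 2-4 are present,
 *  the backward scan above leaves pg_index at 4, so isize becomes
 *  5 * PAGE_SIZE; the walk below then skips the two absent slots and
 *  issues a single 3-page cluster_pagein.)
 */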
4320 isize = ((pg_index + 1) * PAGE_SIZE);
4321 pg_index = 0;
4322 offset = 0;
4323 f_offset = ap->a_f_offset;
4324
4325 while (isize) {
4326 int xsize;
4327 int num_of_pages;
4328
4329 if ( !upl_page_present(pl, pg_index)) {
4330 /*
4331 * we asked for RET_ONLY_ABSENT, so it's possible
4332 * to get back empty slots in the UPL.
4333 * just skip over them
4334 */
4335 f_offset += PAGE_SIZE;
4336 offset += PAGE_SIZE;
4337 isize -= PAGE_SIZE;
4338 pg_index++;
4339
4340 continue;
4341 }
4342 /*
4343 * We know that we have at least one absent page.
4344 * Now checking to see how many in a row we have
4345 */
4346 num_of_pages = 1;
4347 xsize = isize - PAGE_SIZE;
4348
4349 while (xsize) {
4350 if ( !upl_page_present(pl, pg_index + num_of_pages))
4351 break;
4352 num_of_pages++;
4353 xsize -= PAGE_SIZE;
4354 }
4355 xsize = num_of_pages * PAGE_SIZE;
4356
4357 #if HFS_COMPRESSION
4358 if (VNODE_IS_RSRC(vp)) {
4359 /* allow pageins of the resource fork */
4360 } else {
4361 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
4362
4363 if (compressed) {
4364
4365 if (truncate_lock_held) {
4366 /*
4367 * can't hold the truncate lock when calling into the decmpfs layer
4368 * since it calls back into this layer... even though we're only
4369 * holding the lock in shared mode, and the re-entrant path only
4370 * takes the lock shared, we can deadlock if some other thread
4371 * tries to grab the lock exclusively in between.
4372 */
4373 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4374 truncate_lock_held = FALSE;
4375 }
4376 ap->a_pl = upl;
4377 ap->a_pl_offset = offset;
4378 ap->a_f_offset = f_offset;
4379 ap->a_size = xsize;
4380
4381 error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
4382 /*
4383 * note that decmpfs_pagein_compressed can change the state of
4384 * 'compressed'... it will set it to 0 if the file is no longer
4385 * compressed once the compression lock is successfully taken
4386 * i.e. we would block on that lock while the file is being inflated
4387 */
4388 if (compressed) {
4389 if (error == 0) {
4390 /* successful page-in, update the access time */
4391 VTOC(vp)->c_touch_acctime = TRUE;
4392
4393 /* compressed files are not hot file candidates */
4394 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4395 fp->ff_bytesread = 0;
4396 }
4397 } else if (error == EAGAIN) {
4398 /*
4399 * EAGAIN indicates someone else already holds the compression lock...
4400 * to avoid deadlocking, we'll abort this range of pages with an
4401 * indication that the pagein needs to be redriven
4402 */
4403 ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
4404 } else if (error == ENOSPC) {
4405
4406 if (upl_size == PAGE_SIZE)
4407 panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n");
4408
4409 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4410
4411 ap->a_size = PAGE_SIZE;
4412 ap->a_pl = NULL;
4413 ap->a_pl_offset = 0;
4414 ap->a_f_offset = page_needed_f_offset;
4415
4416 goto retry_pagein;
4417 }
4418 goto pagein_next_range;
4419 }
4420 else {
4421 /*
4422 * Set file_converted only if the file became decompressed while we were
4423 * paging in. If it were still compressed, we would re-start the loop using the goto
4424 * in the above block. This avoids overloading truncate_lock_held as our retry_pagein
4425 * condition below, since we could have avoided taking the truncate lock to prevent
4426 * a deadlock in the force unmount case.
4427 */
4428 file_converted = TRUE;
4429 }
4430 }
4431 if (file_converted == TRUE) {
4432 /*
4433 * the file was converted back to a regular file after we first saw it as compressed;
4434 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over,
4435 * resetting a_size so that we consider what remains of the original request
4436 * and nulling out a_pl and a_pl_offset.
4437 *
4438 * We should only be able to get into this block if the decmpfs_pagein_compressed
4439 * successfully decompressed the range in question for this file.
4440 */
4441 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4442
4443 ap->a_size = isize;
4444 ap->a_pl = NULL;
4445 ap->a_pl_offset = 0;
4446
4447 /* Reset file_converted back to false so that we don't infinite-loop. */
4448 file_converted = FALSE;
4449 goto retry_pagein;
4450 }
4451 }
4452 #endif
4453 error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);
4454
4455 /*
4456 * Keep track of blocks read.
4457 */
4458 if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
4459 int bytesread;
4460 int took_cnode_lock = 0;
4461
4462 if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
4463 bytesread = fp->ff_size;
4464 else
4465 bytesread = xsize;
4466
4467 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
4468 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
4469 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4470 took_cnode_lock = 1;
4471 }
4472 /*
4473 * If this file hasn't been seen since the start of
4474 * the current sampling period then start over.
4475 */
4476 if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
4477 struct timeval tv;
4478
4479 fp->ff_bytesread = bytesread;
4480 microtime(&tv);
4481 cp->c_atime = tv.tv_sec;
4482 } else {
4483 fp->ff_bytesread += bytesread;
4484 }
4485 cp->c_touch_acctime = TRUE;
4486 if (took_cnode_lock)
4487 hfs_unlock(cp);
4488 }
4489 pagein_next_range:
4490 f_offset += xsize;
4491 offset += xsize;
4492 isize -= xsize;
4493 pg_index += num_of_pages;
4494
4495 error = 0;
4496 }
4497
4498 pagein_done:
4499 if (truncate_lock_held == TRUE) {
4500 /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
4501 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4502 }
4503
4504 return (error);
4505 }
4506
4507 /*
4508 * Pageout for HFS filesystem.
4509 */
4510 int
4511 hfs_vnop_pageout(struct vnop_pageout_args *ap)
4512 /*
4513 struct vnop_pageout_args {
4514 vnode_t a_vp,
4515 upl_t a_pl,
4516 vm_offset_t a_pl_offset,
4517 off_t a_f_offset,
4518 size_t a_size,
4519 int a_flags
4520 vfs_context_t a_context;
4521 };
4522 */
4523 {
4524 vnode_t vp = ap->a_vp;
4525 struct cnode *cp;
4526 struct filefork *fp;
4527 int retval = 0;
4528 off_t filesize;
4529 upl_t upl;
4530 upl_page_info_t* pl;
4531 vm_offset_t a_pl_offset;
4532 int a_flags;
4533 int is_pageoutv2 = 0;
4534 kern_return_t kret;
4535
4536 cp = VTOC(vp);
4537 fp = VTOF(vp);
4538
4539 /*
4540 * Figure out where the file ends, for pageout purposes. If
4541 * ff_new_size > ff_size, then we're in the middle of extending the
4542 * file via a write, so it is safe (and necessary) that we be able
4543 * to pageout up to that point.
4544 */
4545 filesize = fp->ff_size;
4546 if (fp->ff_new_size > filesize)
4547 filesize = fp->ff_new_size;
4548
4549 a_flags = ap->a_flags;
4550 a_pl_offset = ap->a_pl_offset;
4551
4552 /*
4553 * we can tell if we're getting the new or old behavior from the UPL
4554 */
4555 if ((upl = ap->a_pl) == NULL) {
4556 int request_flags;
4557
4558 is_pageoutv2 = 1;
4559 /*
4560 * we're in control of any UPL we commit
4561 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
4562 */
4563 a_flags &= ~UPL_NOCOMMIT;
4564 a_pl_offset = 0;
4565
4566 /*
4567 * For V2 semantics, we want to take the cnode truncate lock
4568 * shared to guard against the file size changing via zero-filling.
4569 *
4570 * However, we have to be careful because we may be invoked
4571 * via the ubc_msync path to write out dirty mmap'd pages
4572 * in response to a lock event on a content-protected
4573 * filesystem (e.g. to write out class A files).
4574 * As a result, we want to take the truncate lock 'SHARED' with
4575 * the mini-recursion locktype so that we don't deadlock/panic
4576 * because we may be already holding the truncate lock exclusive to force any other
4577 * IOs to have blocked behind us.
4578 */
4579 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4580
4581 if (a_flags & UPL_MSYNC) {
4582 request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
4583 }
4584 else {
4585 request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
4586 }
4587
4588 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
4589
4590 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4591 retval = EINVAL;
4592 goto pageout_done;
4593 }
4594 }
4595 /*
4596 * from this point forward upl points at the UPL we're working with
4597 * it was either passed in or we successfully created it
4598 */
4599
4600 /*
4601 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
4602 * UPL instead of relying on the UPL passed into us. We go ahead and do that here,
4603 * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for
4604 * N dirty ranges in the UPL. Note that this is almost a direct copy of the
4605 * logic in vnode_pageout except that we need to do it after grabbing the truncate
4606 * lock in HFS so that we don't lock invert ourselves.
4607 *
4608 * Note that we can still get into this function on behalf of the default pager with
4609 * non-V2 behavior (swapfiles). However in that case, we did not grab locks above
4610 * since fsync and other writing threads will grab the locks, then mark the
4611 * relevant pages as busy. But the pageout codepath marks the pages as busy,
4612 * and THEN would attempt to grab the truncate lock, which would result in deadlock. So
4613 * we do not try to grab anything for the pre-V2 case, which should only be accessed
4614 * by the paging/VM system.
4615 */
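/*
 * Worked example: a 4-page UPL whose dirty pages are 0, 2 and 3 yields
 * two dirty runs below, so we issue one 1-page and one 2-page
 * cluster_pageout rather than a single call spanning the clean slot.
 */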
4616
4617 if (is_pageoutv2) {
4618 off_t f_offset;
4619 int offset;
4620 int isize;
4621 int pg_index;
4622 int error;
4623 int error_ret = 0;
4624
4625 isize = ap->a_size;
4626 f_offset = ap->a_f_offset;
4627
4628 /*
4629 * Scan from the back to find the last page in the UPL, so that we
4630 * aren't looking at a UPL that may have already been freed by the
4631 * preceding aborts/completions.
4632 */
4633 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4634 if (upl_page_present(pl, --pg_index))
4635 break;
4636 if (pg_index == 0) {
4637 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4638 goto pageout_done;
4639 }
4640 }
4641
4642 /*
4643 * initialize the offset variables before we touch the UPL.
4644 * a_f_offset is the position into the file, in bytes
4645 * offset is the position into the UPL, in bytes
4646 * pg_index is the pg# of the UPL we're operating on.
4647 * isize is the offset into the UPL of the last non-clean page.
4648 */
4649 isize = ((pg_index + 1) * PAGE_SIZE);
4650
4651 offset = 0;
4652 pg_index = 0;
4653
4654 while (isize) {
4655 int xsize;
4656 int num_of_pages;
4657
4658 if ( !upl_page_present(pl, pg_index)) {
4659 /*
4660 * we asked for RET_ONLY_DIRTY, so it's possible
4661 * to get back empty slots in the UPL.
4662 * just skip over them
4663 */
4664 f_offset += PAGE_SIZE;
4665 offset += PAGE_SIZE;
4666 isize -= PAGE_SIZE;
4667 pg_index++;
4668
4669 continue;
4670 }
4671 if ( !upl_dirty_page(pl, pg_index)) {
4672 panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
4673 }
4674
4675 /*
4676 * We know that we have at least one dirty page.
4677 * Now checking to see how many in a row we have
4678 */
4679 num_of_pages = 1;
4680 xsize = isize - PAGE_SIZE;
4681
4682 while (xsize) {
4683 if ( !upl_dirty_page(pl, pg_index + num_of_pages))
4684 break;
4685 num_of_pages++;
4686 xsize -= PAGE_SIZE;
4687 }
4688 xsize = num_of_pages * PAGE_SIZE;
4689
4690 if (!vnode_isswap(vp)) {
4691 off_t end_of_range;
4692 int tooklock;
4693
4694 tooklock = 0;
4695
4696 if (cp->c_lockowner != current_thread()) {
4697 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4698 /*
4699 * we're in the v2 path, so we are the
4700 * owner of the UPL... we may have already
4701 * processed some of the UPL, so abort it
4702 * from the current working offset to the
4703 * end of the UPL
4704 */
4705 ubc_upl_abort_range(upl,
4706 offset,
4707 ap->a_size - offset,
4708 UPL_ABORT_FREE_ON_EMPTY);
4709 goto pageout_done;
4710 }
4711 tooklock = 1;
4712 }
4713 end_of_range = f_offset + xsize - 1;
4714
4715 if (end_of_range >= filesize) {
4716 end_of_range = (off_t)(filesize - 1);
4717 }
4718 if (f_offset < filesize) {
4719 rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
4720 cp->c_flag |= C_MODIFIED; /* leof is dirty */
4721 }
4722 if (tooklock) {
4723 hfs_unlock(cp);
4724 }
4725 }
4726 if ((error = cluster_pageout(vp, upl, offset, f_offset,
4727 xsize, filesize, a_flags))) {
4728 if (error_ret == 0)
4729 error_ret = error;
4730 }
4731 f_offset += xsize;
4732 offset += xsize;
4733 isize -= xsize;
4734 pg_index += num_of_pages;
4735 }
4736 /* capture errnos bubbled out of cluster_pageout if they occurred */
4737 if (error_ret != 0) {
4738 retval = error_ret;
4739 }
4740 } /* end block for v2 pageout behavior */
4741 else {
4742 if (!vnode_isswap(vp)) {
4743 off_t end_of_range;
4744 int tooklock = 0;
4745
4746 if (cp->c_lockowner != current_thread()) {
4747 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4748 if (!(a_flags & UPL_NOCOMMIT)) {
4749 ubc_upl_abort_range(upl,
4750 a_pl_offset,
4751 ap->a_size,
4752 UPL_ABORT_FREE_ON_EMPTY);
4753 }
4754 goto pageout_done;
4755 }
4756 tooklock = 1;
4757 }
4758 end_of_range = ap->a_f_offset + ap->a_size - 1;
4759
4760 if (end_of_range >= filesize) {
4761 end_of_range = (off_t)(filesize - 1);
4762 }
4763 if (ap->a_f_offset < filesize) {
4764 rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
4765 cp->c_flag |= C_MODIFIED; /* leof is dirty */
4766 }
4767
4768 if (tooklock) {
4769 hfs_unlock(cp);
4770 }
4771 }
4772 /*
4773 * just call cluster_pageout for old pre-v2 behavior
4774 */
4775 retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
4776 ap->a_size, filesize, a_flags);
4777 }
4778
4779 /*
4780 * If data was written, update the modification time of the file
4781 * but only if it's mapped writable; we will have touched the
4782 * modifcation time for direct writes.
4783 */
4784 if (retval == 0 && (ubc_is_mapped_writable(vp)
4785 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) {
4786 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4787
4788 // Check again with lock
4789 bool mapped_writable = ubc_is_mapped_writable(vp);
4790 if (mapped_writable
4791 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) {
4792 cp->c_touch_modtime = TRUE;
4793 cp->c_touch_chgtime = TRUE;
4794
4795 /*
4796 * We only need to increment the generation counter if
4797 * it's currently mapped writable because we incremented
4798 * the counter in hfs_vnop_mnomap.
4799 */
4800 if (mapped_writable)
4801 hfs_incr_gencount(VTOC(vp));
4802
4803 /*
4804 * If setuid or setgid bits are set and this process is
4805 * not the superuser then clear the setuid and setgid bits
4806 * as a precaution against tampering.
4807 */
4808 if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
4809 (vfs_context_suser(ap->a_context) != 0)) {
4810 cp->c_mode &= ~(S_ISUID | S_ISGID);
4811 }
4812 }
4813
4814 hfs_unlock(cp);
4815 }
4816
4817 pageout_done:
4818 if (is_pageoutv2) {
4819 /*
4820 * Release the truncate lock. Note that because
4821 * we may have taken the lock recursively by
4822 * being invoked via ubc_msync due to lockdown,
4823 * we should release it recursively, too.
4824 */
4825 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4826 }
4827 return (retval);
4828 }
4829
4830 /*
4831 * Intercept B-Tree node writes to unswap them if necessary.
4832 */
4833 int
4834 hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
4835 {
4836 int retval = 0;
4837 register struct buf *bp = ap->a_bp;
4838 register struct vnode *vp = buf_vnode(bp);
4839 BlockDescriptor block;
4840
4841 /* Trap B-Tree writes */
4842 if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
4843 (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
4844 (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
4845 (vp == VTOHFS(vp)->hfc_filevp)) {
4846
4847 /*
4848 * Swap and validate the node if it is in native byte order.
4849 * This is always true on big endian, so we always validate
4850 * before writing here. On little endian, the node typically has
4851 * been swapped and validated when it was written to the journal,
4852 * so we won't do anything here.
4853 */
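/*
 * (The last two bytes of a B-tree node hold the offset of record 0,
 * which is always 14, i.e. sizeof(BTNodeDescriptor), so reading 0x000e
 * there in host order means the node is still in native byte order.)
 */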
4854 if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
4855 /* Prepare the block pointer */
4856 block.blockHeader = bp;
4857 block.buffer = (char *)buf_dataptr(bp);
4858 block.blockNum = buf_lblkno(bp);
4859 /* not found in cache ==> came from disk */
4860 block.blockReadFromDisk = (buf_fromcache(bp) == 0);
4861 block.blockSize = buf_count(bp);
4862
4863 /* Endian un-swap B-Tree node */
4864 retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
4865 if (retval)
4866 panic("hfs_vnop_bwrite: about to write corrupt node!\n");
4867 }
4868 }
4869
4870 /* This buffer shouldn't be locked anymore but if it is clear it */
4871 if ((buf_flags(bp) & B_LOCKED)) {
4872 // XXXdbg
4873 if (VTOHFS(vp)->jnl) {
4874 panic("hfs: CLEARING the lock bit on bp %p\n", bp);
4875 }
4876 buf_clearflags(bp, B_LOCKED);
4877 }
4878 retval = vn_bwrite (ap);
4879
4880 return (retval);
4881 }
4882
4883 /*
4884 * Relocate a file to a new location on disk
4885 * cnode must be locked on entry
4886 *
4887 * Relocation occurs by cloning the file's data from its
4888 * current set of blocks to a new set of blocks. During
4889 * the relocation all of the blocks (old and new) are
4890 * owned by the file.
4891 *
4892 * -----------------
4893 * |///////////////|
4894 * -----------------
4895 * 0 N (file offset)
4896 *
4897 * ----------------- -----------------
4898 * |///////////////| | | STEP 1 (acquire new blocks)
4899 * ----------------- -----------------
4900 * 0 N N+1 2N
4901 *
4902 * ----------------- -----------------
4903 * |///////////////| |///////////////| STEP 2 (clone data)
4904 * ----------------- -----------------
4905 * 0 N N+1 2N
4906 *
4907 * -----------------
4908 * |///////////////| STEP 3 (head truncate blocks)
4909 * -----------------
4910 * 0 N
4911 *
4912 * During steps 2 and 3 page-outs to file offsets less
4913 * than or equal to N are suspended.
4914 *
4915 * During step 3 page-ins to the file get suspended.
4916 */
4917 int
4918 hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
4919 struct proc *p)
4920 {
4921 struct cnode *cp;
4922 struct filefork *fp;
4923 struct hfsmount *hfsmp;
4924 u_int32_t headblks;
4925 u_int32_t datablks;
4926 u_int32_t blksize;
4927 u_int32_t growsize;
4928 u_int32_t nextallocsave;
4929 daddr64_t sector_a, sector_b;
4930 int eflags;
4931 off_t newbytes;
4932 int retval;
4933 int lockflags = 0;
4934 int took_trunc_lock = 0;
4935 int started_tr = 0;
4936 enum vtype vnodetype;
4937
4938 vnodetype = vnode_vtype(vp);
4939 if (vnodetype != VREG) {
4940 /* Not allowed to move symlinks. */
4941 return (EPERM);
4942 }
4943
4944 hfsmp = VTOHFS(vp);
4945 if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
4946 return (ENOSPC);
4947 }
4948
4949 cp = VTOC(vp);
4950 fp = VTOF(vp);
4951 if (fp->ff_unallocblocks)
4952 return (EINVAL);
4953
4954 #if CONFIG_PROTECT
4955 /*
4956 * <rdar://problem/9118426>
4957 * Disable HFS file relocation on content-protected filesystems
4958 */
4959 if (cp_fs_protected (hfsmp->hfs_mp)) {
4960 return EINVAL;
4961 }
4962 #endif
4963 /* If it's an SSD, also disable HFS relocation */
4964 if (hfsmp->hfs_flags & HFS_SSD) {
4965 return EINVAL;
4966 }
4967
4968
4969 blksize = hfsmp->blockSize;
4970 if (blockHint == 0)
4971 blockHint = hfsmp->nextAllocation;
4972
4973 if (fp->ff_size > 0x7fffffff) {
4974 return (EFBIG);
4975 }
4976
4977 //
4978 // We do not believe that this call to hfs_fsync() is
4979 // necessary and it causes a journal transaction
4980 // deadlock so we are removing it.
4981 //
4982 //if (vnodetype == VREG && !vnode_issystem(vp)) {
4983 // retval = hfs_fsync(vp, MNT_WAIT, 0, p);
4984 // if (retval)
4985 // return (retval);
4986 //}
4987
4988 if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
4989 hfs_unlock(cp);
4990 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
4991 /* Force lock since callers expects lock to be held. */
4992 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
4993 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4994 return (retval);
4995 }
4996 /* No need to continue if file was removed. */
4997 if (cp->c_flag & C_NOEXISTS) {
4998 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4999 return (ENOENT);
5000 }
5001 took_trunc_lock = 1;
5002 }
5003 headblks = fp->ff_blocks;
5004 datablks = howmany(fp->ff_size, blksize);
5005 growsize = datablks * blksize;
5006 eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
5007 if (blockHint >= hfsmp->hfs_metazone_start &&
5008 blockHint <= hfsmp->hfs_metazone_end)
5009 eflags |= kEFMetadataMask;
5010
5011 if (hfs_start_transaction(hfsmp) != 0) {
5012 if (took_trunc_lock)
5013 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5014 return (EINVAL);
5015 }
5016 started_tr = 1;
5017 /*
5018 * Protect the extents b-tree and the allocation bitmap
5019 * during MapFileBlockC and ExtendFileC operations.
5020 */
5021 lockflags = SFL_BITMAP;
5022 if (overflow_extents(fp))
5023 lockflags |= SFL_EXTENTS;
5024 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5025
5026 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
5027 if (retval) {
5028 retval = MacToVFSError(retval);
5029 goto out;
5030 }
5031
5032 /*
5033 * STEP 1 - acquire new allocation blocks.
5034 */
5035 nextallocsave = hfsmp->nextAllocation;
5036 retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
5037 if (eflags & kEFMetadataMask) {
5038 hfs_lock_mount(hfsmp);
5039 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
5040 MarkVCBDirty(hfsmp);
5041 hfs_unlock_mount(hfsmp);
5042 }
5043
5044 retval = MacToVFSError(retval);
5045 if (retval == 0) {
5046 cp->c_flag |= C_MODIFIED;
5047 if (newbytes < growsize) {
5048 retval = ENOSPC;
5049 goto restore;
5050 } else if (fp->ff_blocks < (headblks + datablks)) {
5051 printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
5052 retval = ENOSPC;
5053 goto restore;
5054 }
5055
5056 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
5057 if (retval) {
5058 retval = MacToVFSError(retval);
5059 } else if ((sector_a + 1) == sector_b) {
5060 retval = ENOSPC;
5061 goto restore;
5062 } else if ((eflags & kEFMetadataMask) &&
5063 ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
5064 hfsmp->hfs_metazone_end)) {
5065 #if 0
5066 const char * filestr;
5067 char emptystr = '\0';
5068
5069 if (cp->c_desc.cd_nameptr != NULL) {
5070 filestr = (const char *)&cp->c_desc.cd_nameptr[0];
5071 } else if (vnode_name(vp) != NULL) {
5072 filestr = vnode_name(vp);
5073 } else {
5074 filestr = &emptystr;
5075 }
5076 #endif
5077 retval = ENOSPC;
5078 goto restore;
5079 }
5080 }
5081 /* Done with system locks and journal for now. */
5082 hfs_systemfile_unlock(hfsmp, lockflags);
5083 lockflags = 0;
5084 hfs_end_transaction(hfsmp);
5085 started_tr = 0;
5086
5087 if (retval) {
5088 /*
5089 * Check to see if failure is due to excessive fragmentation.
5090 */
5091 if ((retval == ENOSPC) &&
5092 (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
5093 hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
5094 }
5095 goto out;
5096 }
5097 /*
5098 * STEP 2 - clone file data into the new allocation blocks.
5099 */
5100
5101 if (vnodetype == VLNK)
5102 retval = EPERM;
5103 else if (vnode_issystem(vp))
5104 retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
5105 else
5106 retval = hfs_clonefile(vp, headblks, datablks, blksize);
5107
5108 /* Start transaction for step 3 or for a restore. */
5109 if (hfs_start_transaction(hfsmp) != 0) {
5110 retval = EINVAL;
5111 goto out;
5112 }
5113 started_tr = 1;
5114 if (retval)
5115 goto restore;
5116
5117 /*
5118 * STEP 3 - switch to cloned data and remove old blocks.
5119 */
5120 lockflags = SFL_BITMAP;
5121 if (overflow_extents(fp))
5122 lockflags |= SFL_EXTENTS;
5123 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5124
5125 retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
5126
5127 hfs_systemfile_unlock(hfsmp, lockflags);
5128 lockflags = 0;
5129 if (retval)
5130 goto restore;
5131 out:
5132 if (took_trunc_lock)
5133 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5134
5135 if (lockflags) {
5136 hfs_systemfile_unlock(hfsmp, lockflags);
5137 lockflags = 0;
5138 }
5139
5140 /* Push cnode's new extent data to disk. */
5141 if (retval == 0) {
5142 (void) hfs_update(vp, MNT_WAIT);
5143 }
5144 if (hfsmp->jnl) {
5145 if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
5146 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
5147 else
5148 (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
5149 }
5150 exit:
5151 if (started_tr)
5152 hfs_end_transaction(hfsmp);
5153
5154 return (retval);
5155
5156 restore:
5157 if (fp->ff_blocks == headblks) {
5158 if (took_trunc_lock)
5159 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5160 goto exit;
5161 }
5162 /*
5163 * Give back any newly allocated space.
5164 */
5165 if (lockflags == 0) {
5166 lockflags = SFL_BITMAP;
5167 if (overflow_extents(fp))
5168 lockflags |= SFL_EXTENTS;
5169 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5170 }
5171
5172 (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
5173 FTOC(fp)->c_fileid, false);
5174
5175 hfs_systemfile_unlock(hfsmp, lockflags);
5176 lockflags = 0;
5177
5178 if (took_trunc_lock)
5179 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5180 goto exit;
5181 }
5182
5183
5184 /*
5185 * Clone a file's data within the file.
5186 *
5187 */
5188 static int
5189 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
5190 {
5191 caddr_t bufp;
5192 size_t bufsize;
5193 size_t copysize;
5194 size_t iosize;
5195 size_t offset;
5196 off_t writebase;
5197 uio_t auio;
5198 int error = 0;
5199
5200 writebase = blkstart * blksize;
5201 copysize = blkcnt * blksize;
5202 iosize = bufsize = MIN(copysize, 128 * 1024);
5203 offset = 0;
5204
5205 hfs_unlock(VTOC(vp));
5206
5207 #if CONFIG_PROTECT
5208 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
5209 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5210 return (error);
5211 }
5212 #endif /* CONFIG_PROTECT */
5213
5214 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
5215 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5216 return (ENOMEM);
5217 }
5218
5219 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
5220
5221 while (offset < copysize) {
5222 iosize = MIN(copysize - offset, iosize);
5223
5224 uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
5225 uio_addiov(auio, (uintptr_t)bufp, iosize);
5226
5227 error = cluster_read(vp, auio, copysize, IO_NOCACHE);
5228 if (error) {
5229 printf("hfs_clonefile: cluster_read failed - %d\n", error);
5230 break;
5231 }
5232 if (uio_resid(auio) != 0) {
5233 printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
5234 error = EIO;
5235 break;
5236 }
5237
5238 uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
5239 uio_addiov(auio, (uintptr_t)bufp, iosize);
5240
5241 error = cluster_write(vp, auio, writebase + offset,
5242 writebase + offset + iosize,
5243 uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
5244 if (error) {
5245 printf("hfs_clonefile: cluster_write failed - %d\n", error);
5246 break;
5247 }
5248 if (uio_resid(auio) != 0) {
5249 printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
5250 error = EIO;
5251 break;
5252 }
5253 offset += iosize;
5254 }
5255 uio_free(auio);
5256
5257 if ((blksize & PAGE_MASK)) {
5258 /*
5259 * since the copy may not have started on a PAGE
5260 * boundary (or may not have ended on one), we
5261 * may have pages left in the cache since NOCACHE
5262 * will let partially written pages linger...
5263 * let's just flush the entire range to make sure
5264 * we don't have any pages left that are beyond
5265 * (or intersect) the real LEOF of this file
5266 */
5267 ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
5268 } else {
5269 /*
5270 * No need to call ubc_msync or hfs_invalbuf
5271 * since the file was copied using IO_NOCACHE and
5272 * the copy was done starting and ending on a page
5273 * boundary in the file.
5274 */
5275 }
5276 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5277
5278 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5279 return (error);
5280 }
5281
5282 /*
5283 * Clone a system (metadata) file.
5284 *
5285 */
5286 static int
5287 hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
5288 kauth_cred_t cred, struct proc *p)
5289 {
5290 caddr_t bufp;
5291 char * offset;
5292 size_t bufsize;
5293 size_t iosize;
5294 struct buf *bp = NULL;
5295 daddr64_t blkno;
5296 daddr64_t blk;
5297 daddr64_t start_blk;
5298 daddr64_t last_blk;
5299 int breadcnt;
5300 int i;
5301 int error = 0;
5302
5303
5304 iosize = GetLogicalBlockSize(vp);
5305 bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
5306 breadcnt = bufsize / iosize;
5307
5308 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
5309 return (ENOMEM);
5310 }
5311 start_blk = ((daddr64_t)blkstart * blksize) / iosize;
5312 last_blk = ((daddr64_t)blkcnt * blksize) / iosize;
5313 blkno = 0;
5314
5315 while (blkno < last_blk) {
5316 /*
5317 * Read up to a megabyte
5318 */
5319 offset = bufp;
5320 for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
5321 error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
5322 if (error) {
5323 printf("hfs_clonesysfile: meta_bread error %d\n", error);
5324 goto out;
5325 }
5326 if (buf_count(bp) != iosize) {
5327 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
5328 goto out;
5329 }
5330 bcopy((char *)buf_dataptr(bp), offset, iosize);
5331
5332 buf_markinvalid(bp);
5333 buf_brelse(bp);
5334 bp = NULL;
5335
5336 offset += iosize;
5337 }
5338
5339 /*
5340 * Write up to a megabyte
5341 */
5342 offset = bufp;
5343 for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
5344 bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
5345 if (bp == NULL) {
5346 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
5347 error = EIO;
5348 goto out;
5349 }
5350 bcopy(offset, (char *)buf_dataptr(bp), iosize);
5351 error = (int)buf_bwrite(bp);
5352 bp = NULL;
5353 if (error)
5354 goto out;
5355 offset += iosize;
5356 }
5357 }
5358 out:
5359 if (bp) {
5360 buf_brelse(bp);
5361 }
5362
5363 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5364
5365 error = hfs_fsync(vp, MNT_WAIT, 0, p);
5366
5367 return (error);
5368 }