/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*	@(#)hfs_readwrite.c	1.0
 *
 *	(c) 1998-2001 Apple Computer, Inc.  All Rights Reserved
 *
 *	hfs_readwrite.c -- vnode operations to deal with reading and writing files.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/vfs_context.h>
#include <sys/fsevents.h>
#include <kern/kalloc.h>
#include <sys/sysctl.h>
#include <sys/fsctl.h>

#include <miscfs/specfs/specdev.h>

#include <sys/ubc_internal.h>

#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>

#include <sys/kdebug.h>

#include "hfs_attrlist.h"
#include "hfs_endian.h"
#include "hfs_fsctl.h"
#include "hfs_quota.h"
#include "hfscommon/headers/FileMgrInternal.h"
#include "hfscommon/headers/BTreesInternal.h"
#include "hfs_cnode.h"
#define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
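/*
 * can_cluster(size) is true when the transfer size is an exact multiple of
 * 4 KB and is no larger than half of MAXPHYSIO, i.e. when the request is a
 * good candidate for the cluster I/O layer.
 */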
enum {
	MAXHFSFILESIZE = 0x7FFFFFFF		/* this needs to go in the mount structure */
};

/* from bsd/hfs/hfs_vfsops.c */
extern int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);

static int hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
static int hfs_clonefile(struct vnode *, int, int, int);
static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
static int hfs_minorupdate(struct vnode *vp);
static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);

int flush_cache_on_write = 0;
SYSCTL_INT(_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
/*
 * Read data from a file.
 */
int
hfs_vnop_read(struct vnop_read_args *ap)
{
	uio_t uio = ap->a_uio;
	struct vnode *vp = ap->a_vp;
	struct hfsmount *hfsmp;
	off_t start_resid = uio_resid(uio);
	off_t offset = uio_offset(uio);
	int took_truncate_lock = 0;

	/* Preflight checks */
	if (!vnode_isreg(vp)) {
		/* can only read regular files */
	if (start_resid == 0)
		return (0);		/* Nothing left to do */
		return (EINVAL);	/* can't read from a negative offset */
127 if (VNODE_IS_RSRC(vp
)) {
128 if (hfs_hides_rsrc(ap
->a_context
, VTOC(vp
), 1)) { /* 1 == don't take the cnode lock */
131 /* otherwise read the resource fork normally */
133 int compressed
= hfs_file_is_compressed(VTOC(vp
), 1); /* 1 == don't take the cnode lock */
135 retval
= decmpfs_read_compressed(ap
, &compressed
, VTOCMP(vp
));
138 /* successful read, update the access time */
139 VTOC(vp
)->c_touch_acctime
= TRUE
;
141 /* compressed files are not hot file candidates */
142 if (VTOHFS(vp
)->hfc_stage
== HFC_RECORDING
) {
143 VTOF(vp
)->ff_bytesread
= 0;
148 /* otherwise the file was converted back to a regular file while we were reading it */
150 } else if ((VTOC(vp
)->c_flags
& UF_COMPRESSED
)) {
153 error
= check_for_dataless_file(vp
, NAMESPACE_HANDLER_READ_OP
);
160 #endif /* HFS_COMPRESSION */
167 if ((retval
= cp_handle_vnop (cp
, CP_READ_ACCESS
)) != 0) {
172 /* Protect against a size change. */
173 hfs_lock_truncate(cp
, HFS_SHARED_LOCK
);
174 took_truncate_lock
= 1;
176 filesize
= fp
->ff_size
;
177 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
178 if (offset
> filesize
) {
179 if ((hfsmp
->hfs_flags
& HFS_STANDARD
) &&
180 (offset
> (off_t
)MAXHFSFILESIZE
)) {
186 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 12)) | DBG_FUNC_START
,
187 (int)uio_offset(uio
), uio_resid(uio
), (int)filesize
, (int)filebytes
, 0);
189 retval
= cluster_read(vp
, uio
, filesize
, ap
->a_ioflag
);
191 cp
->c_touch_acctime
= TRUE
;
193 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 12)) | DBG_FUNC_END
,
194 (int)uio_offset(uio
), uio_resid(uio
), (int)filesize
, (int)filebytes
, 0);
	/* Keep track of blocks read. */
199 if (hfsmp
->hfc_stage
== HFC_RECORDING
&& retval
== 0) {
200 int took_cnode_lock
= 0;
203 bytesread
= start_resid
- uio_resid(uio
);
205 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
206 if ((fp
->ff_bytesread
+ bytesread
) > 0x00000000ffffffff) {
207 hfs_lock(cp
, HFS_FORCE_LOCK
);
211 * If this file hasn't been seen since the start of
212 * the current sampling period then start over.
214 if (cp
->c_atime
< hfsmp
->hfc_timebase
) {
217 fp
->ff_bytesread
= bytesread
;
219 cp
->c_atime
= tv
.tv_sec
;
221 fp
->ff_bytesread
+= bytesread
;
227 if (took_truncate_lock
) {
228 hfs_unlock_truncate(cp
, 0);
/*
 * Write data to a file.
 */
int
hfs_vnop_write(struct vnop_write_args *ap)
{
	uio_t uio = ap->a_uio;
	struct vnode *vp = ap->a_vp;
	struct hfsmount *hfsmp;
	kauth_cred_t cred = NULL;
	off_t bytesToAdd = 0;
	off_t actualBytesAdded;
	int ioflag = ap->a_ioflag;
	int cnode_locked = 0;
	int partialwrite = 0;
	time_t orig_ctime = VTOC(vp)->c_ctime;
	int took_truncate_lock = 0;
	struct rl_entry *invalid_range;
265 if ( hfs_file_is_compressed(VTOC(vp
), 1) ) { /* 1 == don't take the cnode lock */
266 int state
= decmpfs_cnode_get_vnode_state(VTOCMP(vp
));
268 case FILE_IS_COMPRESSED
:
270 case FILE_IS_CONVERTING
:
271 /* if FILE_IS_CONVERTING, we allow writes but do not
272 bother with snapshots or else we will deadlock.
277 printf("invalid state %d for compressed file\n", state
);
280 } else if ((VTOC(vp
)->c_flags
& UF_COMPRESSED
)) {
283 error
= check_for_dataless_file(vp
, NAMESPACE_HANDLER_WRITE_OP
);
290 check_for_tracked_file(vp
, orig_ctime
, NAMESPACE_HANDLER_WRITE_OP
, uio
);
295 // LP64todo - fix this! uio_resid may be 64-bit value
296 resid
= uio_resid(uio
);
297 offset
= uio_offset(uio
);
303 if (!vnode_isreg(vp
))
304 return (EPERM
); /* Can only write regular files */
311 if ((retval
= cp_handle_vnop (cp
, CP_WRITE_ACCESS
)) != 0) {
316 eflags
= kEFDeferMask
; /* defer file block allocations */
	/*
	 * When the underlying device is sparse and space
	 * is low (< 8MB), stop doing delayed allocations
	 * and begin doing synchronous I/O.
	 */
323 if ((hfsmp
->hfs_flags
& HFS_HAS_SPARSE_DEVICE
) &&
324 (hfs_freeblks(hfsmp
, 0) < 2048)) {
325 eflags
&= ~kEFDeferMask
;
328 #endif /* HFS_SPARSE_DEV */
	/*
	 * Protect against a size change.
	 *
	 * Note: If took_truncate_lock is true, then we previously got the lock shared
	 * but needed to upgrade to exclusive.  So try getting it exclusive from the
	 * start.
	 */
339 if (ioflag
& IO_APPEND
|| took_truncate_lock
) {
340 hfs_lock_truncate(cp
, HFS_EXCLUSIVE_LOCK
);
343 hfs_lock_truncate(cp
, HFS_SHARED_LOCK
);
345 took_truncate_lock
= 1;
348 if (ioflag
& IO_APPEND
) {
349 uio_setoffset(uio
, fp
->ff_size
);
350 offset
= fp
->ff_size
;
352 if ((cp
->c_flags
& APPEND
) && offset
!= fp
->ff_size
) {
357 origFileSize
= fp
->ff_size
;
358 writelimit
= offset
+ resid
;
359 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
	/*
	 * We may need an exclusive truncate lock for several reasons, all
	 * of which are because we may be writing to a (portion of a) block
	 * for the first time, and we need to make sure no readers see the
	 * prior, uninitialized contents of the block.  The cases are:
	 *
	 * 1. We have unallocated (delayed allocation) blocks.  We may be
	 *    allocating new blocks to the file and writing to them.
	 *    (A more precise check would be whether the range we're writing
	 *    to contains delayed allocation blocks.)
	 * 2. We need to extend the file.  The bytes between the old EOF
	 *    and the new EOF are not yet initialized.  This is important
	 *    even if we're not allocating new blocks to the file.  If the
	 *    old EOF and new EOF are in the same block, we still need to
	 *    protect that range of bytes until they are written for the
	 *    first time.
	 * 3. The write overlaps some invalid ranges (delayed zero fill; that
	 *    part of the file has been allocated, but not yet written).
	 *
	 * If we had a shared lock with the above cases, we need to try to upgrade
	 * to an exclusive lock.  If the upgrade fails, we will lose the shared
	 * lock, and will need to take the truncate lock again; the took_truncate_lock
	 * flag will still be set, causing us to try for an exclusive lock next time.
	 *
	 * NOTE: Testing for #3 (delayed zero fill) needs to be done while the cnode
	 * lock is held, since it protects the range lists.
	 */
388 if ((cp
->c_truncatelockowner
== HFS_SHARED_OWNER
) &&
389 ((fp
->ff_unallocblocks
!= 0) ||
390 (writelimit
> origFileSize
))) {
391 if (lck_rw_lock_shared_to_exclusive(&cp
->c_truncatelock
) == FALSE
) {
			/*
			 * Lock upgrade failed and we lost our shared lock, try again.
			 * Note: we do not set took_truncate_lock=0 here.  Leaving it
			 * set to 1 will cause us to try to get the lock exclusive.
			 */
		/* Store the owner in the c_truncatelockowner field if we successfully upgrade */
401 cp
->c_truncatelockowner
= current_thread();
405 if ( (retval
= hfs_lock(VTOC(vp
), HFS_EXCLUSIVE_LOCK
))) {
	/*
	 * Now that we have the cnode lock, see if there are delayed zero fill ranges
	 * overlapping our write.  If so, we need the truncate lock exclusive (see above).
	 */
414 if ((cp
->c_truncatelockowner
== HFS_SHARED_OWNER
) &&
415 (rl_scan(&fp
->ff_invalidranges
, offset
, writelimit
-1, &invalid_range
) != RL_NOOVERLAP
)) {
		/*
		 * When testing, it appeared that calling lck_rw_lock_shared_to_exclusive() causes
		 * a deadlock, rather than simply returning failure.  (That is, it apparently does
		 * not behave like a "try_lock").  Since this condition is rare, just drop the
		 * cnode lock and try again.  Since took_truncate_lock is set, we will
		 * automatically take the truncate lock exclusive.
		 */
425 hfs_unlock_truncate(cp
, 0);
429 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 0)) | DBG_FUNC_START
,
430 (int)offset
, uio_resid(uio
), (int)fp
->ff_size
,
433 /* Check if we do not need to extend the file */
434 if (writelimit
<= filebytes
) {
438 cred
= vfs_context_ucred(ap
->a_context
);
439 bytesToAdd
= writelimit
- filebytes
;
442 retval
= hfs_chkdq(cp
, (int64_t)(roundup(bytesToAdd
, hfsmp
->blockSize
)),
448 if (hfs_start_transaction(hfsmp
) != 0) {
453 while (writelimit
> filebytes
) {
454 bytesToAdd
= writelimit
- filebytes
;
455 if (cred
&& suser(cred
, NULL
) != 0)
456 eflags
|= kEFReserveMask
;
458 /* Protect extents b-tree and allocation bitmap */
459 lockflags
= SFL_BITMAP
;
460 if (overflow_extents(fp
))
461 lockflags
|= SFL_EXTENTS
;
462 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
464 /* Files that are changing size are not hot file candidates. */
465 if (hfsmp
->hfc_stage
== HFC_RECORDING
) {
466 fp
->ff_bytesread
= 0;
468 retval
= MacToVFSError(ExtendFileC (hfsmp
, (FCB
*)fp
, bytesToAdd
,
469 0, eflags
, &actualBytesAdded
));
471 hfs_systemfile_unlock(hfsmp
, lockflags
);
473 if ((actualBytesAdded
== 0) && (retval
== E_NONE
))
475 if (retval
!= E_NONE
)
477 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
478 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 0)) | DBG_FUNC_NONE
,
479 (int)offset
, uio_resid(uio
), (int)fp
->ff_size
, (int)filebytes
, 0);
481 (void) hfs_update(vp
, TRUE
);
482 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
483 (void) hfs_end_transaction(hfsmp
);
	/*
	 * If we didn't grow the file enough, try a partial write.
	 * POSIX expects this behavior.
	 */
489 if ((retval
== ENOSPC
) && (filebytes
> offset
)) {
492 uio_setresid(uio
, (uio_resid(uio
) - bytesToAdd
));
494 writelimit
= filebytes
;
497 if (retval
== E_NONE
) {
506 if (writelimit
> fp
->ff_size
)
507 filesize
= writelimit
;
509 filesize
= fp
->ff_size
;
511 lflag
= ioflag
& ~(IO_TAILZEROFILL
| IO_HEADZEROFILL
| IO_NOZEROVALID
| IO_NOZERODIRTY
);
513 if (offset
<= fp
->ff_size
) {
514 zero_off
= offset
& ~PAGE_MASK_64
;
			/*
			 * Check to see whether the area between the zero_offset and the start
			 * of the transfer is invalid and should be zero-filled
			 * as part of the transfer:
			 */
520 if (offset
> zero_off
) {
521 if (rl_scan(&fp
->ff_invalidranges
, zero_off
, offset
- 1, &invalid_range
) != RL_NOOVERLAP
)
522 lflag
|= IO_HEADZEROFILL
;
525 off_t eof_page_base
= fp
->ff_size
& ~PAGE_MASK_64
;
			/*
			 * The bytes between fp->ff_size and uio->uio_offset must never be
			 * read without being zeroed.  The current last block is filled with zeroes
			 * if it holds valid data, but in all cases merely do a little bookkeeping
			 * to track the area from the end of the current last page to the start of
			 * the area actually written.  For the same reason only the bytes up to the
			 * start of the page where this write will start is invalidated; any remainder
			 * before uio->uio_offset is explicitly zeroed as part of the cluster_write.
			 *
			 * Note that inval_start, the start of the page after the current EOF,
			 * may be past the start of the write, in which case the zeroing
			 * will be handled by the cluster_write of the actual data.
			 */
539 inval_start
= (fp
->ff_size
+ (PAGE_SIZE_64
- 1)) & ~PAGE_MASK_64
;
540 inval_end
= offset
& ~PAGE_MASK_64
;
541 zero_off
= fp
->ff_size
;
543 if ((fp
->ff_size
& PAGE_MASK_64
) &&
544 (rl_scan(&fp
->ff_invalidranges
,
547 &invalid_range
) != RL_NOOVERLAP
)) {
				/*
				 * The page containing the EOF is not valid, so the
				 * entire page must be made inaccessible now.  If the write
				 * starts on a page beyond the page containing the eof
				 * (inval_end > eof_page_base), add the
				 * whole page to the range to be invalidated.  Otherwise
				 * (i.e. if the write starts on the same page), zero-fill
				 * the entire page explicitly now:
				 */
556 if (inval_end
> eof_page_base
) {
557 inval_start
= eof_page_base
;
559 zero_off
= eof_page_base
;
563 if (inval_start
< inval_end
) {
565 /* There's some range of data that's going to be marked invalid */
567 if (zero_off
< inval_start
) {
					/*
					 * The pages between inval_start and inval_end are going to be invalidated,
					 * and the actual write will start on a page past inval_end.  Now's the last
					 * chance to zero-fill the page containing the EOF:
					 */
574 retval
= cluster_write(vp
, (uio_t
) 0,
575 fp
->ff_size
, inval_start
,
577 lflag
| IO_HEADZEROFILL
| IO_NOZERODIRTY
);
578 hfs_lock(cp
, HFS_FORCE_LOCK
);
580 if (retval
) goto ioerr_exit
;
581 offset
= uio_offset(uio
);
584 /* Mark the remaining area of the newly allocated space as invalid: */
585 rl_add(inval_start
, inval_end
- 1 , &fp
->ff_invalidranges
);
587 cp
->c_zftimeout
= tv
.tv_sec
+ ZFTIMELIMIT
;
588 zero_off
= fp
->ff_size
= inval_end
;
591 if (offset
> zero_off
) lflag
|= IO_HEADZEROFILL
;
		/*
		 * Check to see whether the area between the end of the write and the end of
		 * the page it falls in is invalid and should be zero-filled as part of the transfer:
		 */
597 tail_off
= (writelimit
+ (PAGE_SIZE_64
- 1)) & ~PAGE_MASK_64
;
598 if (tail_off
> filesize
) tail_off
= filesize
;
599 if (tail_off
> writelimit
) {
600 if (rl_scan(&fp
->ff_invalidranges
, writelimit
, tail_off
- 1, &invalid_range
) != RL_NOOVERLAP
) {
601 lflag
|= IO_TAILZEROFILL
;
	/*
	 * if the write starts beyond the current EOF (possibly advanced in the
	 * zeroing of the last block, above), then we'll zero fill from the current EOF
	 * to where the write begins:
	 *
	 * NOTE: If (and ONLY if) the portion of the file about to be written is
	 * before the current EOF it might be marked as invalid now and must be
	 * made readable (removed from the invalid ranges) before cluster_write is called:
	 */
615 io_start
= (lflag
& IO_HEADZEROFILL
) ? zero_off
: offset
;
616 if (io_start
< fp
->ff_size
) {
619 io_end
= (lflag
& IO_TAILZEROFILL
) ? tail_off
: writelimit
;
620 rl_remove(io_start
, io_end
- 1, &fp
->ff_invalidranges
);
	/*
	 * We need to tell UBC the fork's new size BEFORE calling
	 * cluster_write, in case any of the new pages need to be
	 * paged out before cluster_write completes (which does happen
	 * in embedded systems due to extreme memory pressure).
	 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
	 * will be, so that it can pass that on to cluster_pageout, and
	 * allow those pageouts.
	 *
	 * We don't update ff_size yet since we don't want pageins to
	 * be able to see uninitialized data between the old and new
	 * EOF, until cluster_write has completed and initialized that
	 * part of the file.
	 *
	 * The vnode pager relies on the file size last given to UBC via
	 * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
	 * ff_size (whichever is larger).  NOTE: ff_new_size is always
	 * zero, unless we are extending the file via write.
	 */
645 if (filesize
> fp
->ff_size
) {
646 fp
->ff_new_size
= filesize
;
647 ubc_setsize(vp
, filesize
);
649 retval
= cluster_write(vp
, uio
, fp
->ff_size
, filesize
, zero_off
,
650 tail_off
, lflag
| IO_NOZERODIRTY
);
652 fp
->ff_new_size
= 0; /* no longer extending; use ff_size */
653 if (filesize
> origFileSize
) {
654 ubc_setsize(vp
, origFileSize
);
659 if (filesize
> origFileSize
) {
660 fp
->ff_size
= filesize
;
662 /* Files that are changing size are not hot file candidates. */
663 if (hfsmp
->hfc_stage
== HFC_RECORDING
) {
664 fp
->ff_bytesread
= 0;
667 fp
->ff_new_size
= 0; /* ff_size now has the correct size */
669 /* If we wrote some bytes, then touch the change and mod times */
670 if (resid
> uio_resid(uio
)) {
671 cp
->c_touch_chgtime
= TRUE
;
672 cp
->c_touch_modtime
= TRUE
;
676 uio_setresid(uio
, (uio_resid(uio
) + bytesToAdd
));
680 // XXXdbg - see radar 4871353 for more info
682 if (flush_cache_on_write
&& ((ioflag
& IO_NOCACHE
) || vnode_isnocache(vp
))) {
683 VNOP_IOCTL(hfsmp
->hfs_devvp
, DKIOCSYNCHRONIZECACHE
, NULL
, FWRITE
, NULL
);
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
693 if (cp
->c_mode
& (S_ISUID
| S_ISGID
)) {
694 cred
= vfs_context_ucred(ap
->a_context
);
695 if (resid
> uio_resid(uio
) && cred
&& suser(cred
, NULL
)) {
697 hfs_lock(cp
, HFS_FORCE_LOCK
);
700 cp
->c_mode
&= ~(S_ISUID
| S_ISGID
);
704 if (ioflag
& IO_UNIT
) {
706 hfs_lock(cp
, HFS_FORCE_LOCK
);
709 (void)hfs_truncate(vp
, origFileSize
, ioflag
& IO_SYNC
,
710 0, 0, ap
->a_context
);
711 // LP64todo - fix this! resid needs to by user_ssize_t
712 uio_setoffset(uio
, (uio_offset(uio
) - (resid
- uio_resid(uio
))));
713 uio_setresid(uio
, resid
);
714 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
716 } else if ((ioflag
& IO_SYNC
) && (resid
> uio_resid(uio
))) {
718 hfs_lock(cp
, HFS_FORCE_LOCK
);
721 retval
= hfs_update(vp
, TRUE
);
723 /* Updating vcbWrCnt doesn't need to be atomic. */
726 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 0)) | DBG_FUNC_END
,
727 (int)uio_offset(uio
), uio_resid(uio
), (int)fp
->ff_size
, (int)filebytes
, 0);
732 if (took_truncate_lock
) {
733 hfs_unlock_truncate(cp
, 0);
/* support for the "bulk-access" fcntl */

#define CACHE_LEVELS 16
#define NUM_CACHE_ENTRIES (64*16)
#define PARENT_IDS_FLAG 0x100
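/*
 * Note on the sizes above: NUM_CACHE_ENTRIES works out to 1024 directory
 * ids, which matches the 1024-file and 1024-parent limits enforced in
 * do_bulk_access_check() below, while CACHE_LEVELS bounds how many ancestors
 * of a single item are remembered while walking towards the root.
 */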
struct access_cache {
	int numcached;
	int cachehits;		/* these two for statistics gathering */
	int lookups;
	unsigned int *acache;
	unsigned char *haveaccess;
};

struct access_t {
	uid_t		uid;		/* IN: effective user id */
	short		flags;		/* IN: access requested (i.e. R_OK) */
	short		num_groups;	/* IN: number of groups user belongs to */
	int		num_files;	/* IN: number of files to process */
	int		*file_ids;	/* IN: array of file ids */
	gid_t		*groups;	/* IN: array of groups */
	short		*access;	/* OUT: access info for each file (0 for 'has access') */
} __attribute__((unavailable)); // this structure is for reference purposes only

struct user32_access_t {
	uid_t		uid;		/* IN: effective user id */
	short		flags;		/* IN: access requested (i.e. R_OK) */
	short		num_groups;	/* IN: number of groups user belongs to */
	int		num_files;	/* IN: number of files to process */
	user32_addr_t	file_ids;	/* IN: array of file ids */
	user32_addr_t	groups;		/* IN: array of groups */
	user32_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
};

struct user64_access_t {
	uid_t		uid;		/* IN: effective user id */
	short		flags;		/* IN: access requested (i.e. R_OK) */
	short		num_groups;	/* IN: number of groups user belongs to */
	int		num_files;	/* IN: number of files to process */
	user64_addr_t	file_ids;	/* IN: array of file ids */
	user64_addr_t	groups;		/* IN: array of groups */
	user64_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
};

// these are the "extended" versions of the above structures
// note that it is crucial that they be different sized than
// the regular version
struct ext_access_t {
	uint32_t	flags;		/* IN: access requested (i.e. R_OK) */
	uint32_t	num_files;	/* IN: number of files to process */
	uint32_t	map_size;	/* IN: size of the bit map */
	uint32_t	*file_ids;	/* IN: Array of file ids */
	char		*bitmap;	/* OUT: hash-bitmap of interesting directory ids */
	short		*access;	/* OUT: access info for each file (0 for 'has access') */
	uint32_t	num_parents;	/* future use */
	cnid_t		*parents;	/* future use */
} __attribute__((unavailable)); // this structure is for reference purposes only

struct user32_ext_access_t {
	uint32_t	flags;		/* IN: access requested (i.e. R_OK) */
	uint32_t	num_files;	/* IN: number of files to process */
	uint32_t	map_size;	/* IN: size of the bit map */
	user32_addr_t	file_ids;	/* IN: Array of file ids */
	user32_addr_t	bitmap;		/* OUT: hash-bitmap of interesting directory ids */
	user32_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
	uint32_t	num_parents;	/* future use */
	user32_addr_t	parents;	/* future use */
};
struct user64_ext_access_t {
	uint32_t	flags;		/* IN: access requested (i.e. R_OK) */
	uint32_t	num_files;	/* IN: number of files to process */
	uint32_t	map_size;	/* IN: size of the bit map */
	user64_addr_t	file_ids;	/* IN: array of file ids */
	user64_addr_t	bitmap;		/* OUT: hash-bitmap of interesting directory ids */
	user64_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
	uint32_t	num_parents;	/* future use */
	user64_addr_t	parents;	/* future use */
};
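/*
 * Illustrative sketch (not part of this file): a 32-bit user-space caller
 * would fill in a user32_ext_access_t-shaped request and hand it to the
 * volume via the bulk-access fsctl.  The user-visible constant and header
 * are assumptions here; only the field layout comes from the structures
 * above.
 *
 *	struct user32_ext_access_t req = { 0 };
 *	int ids[2] = { 1234, 5678 };            // example catalog node ids to test
 *	short results[2];
 *	req.flags     = R_OK;                   // access being requested
 *	req.num_files = 2;
 *	req.file_ids  = (user32_addr_t)(uintptr_t)ids;
 *	req.access    = (user32_addr_t)(uintptr_t)results;
 *	// issue the HFS_EXT_BULKACCESS_FSCTL-equivalent fsctl on a file of the
 *	// volume; on success, results[i] == 0 means "has access".
 */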
/*
 * Perform a binary search for the given parent_id.  Return value is
 * the index if there is a match.  If no_match_indexp is non-NULL it
 * will be assigned with the index to insert the item (even if it was
 * not found).
 */
static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
{
832 unsigned int mid
= ((hi
- lo
)/2) + lo
;
833 unsigned int this_id
= array
[mid
];
835 if (parent_id
== this_id
) {
840 if (parent_id
< this_id
) {
845 if (parent_id
> this_id
) {
851 /* check if lo and hi converged on the match */
852 if (parent_id
== array
[hi
]) {
856 if (no_match_indexp
) {
857 *no_match_indexp
= hi
;
865 lookup_bucket(struct access_cache
*cache
, int *indexp
, cnid_t parent_id
)
869 int index
, no_match_index
;
871 if (cache
->numcached
== 0) {
873 return 0; // table is empty, so insert at index=0 and report no match
876 if (cache
->numcached
> NUM_CACHE_ENTRIES
) {
877 /*printf("hfs: EGAD! numcached is %d... cut our losses and trim to %d\n",
878 cache->numcached, NUM_CACHE_ENTRIES);*/
879 cache
->numcached
= NUM_CACHE_ENTRIES
;
882 hi
= cache
->numcached
- 1;
884 index
= cache_binSearch(cache
->acache
, hi
, parent_id
, &no_match_index
);
886 /* if no existing entry found, find index for new one */
888 index
= no_match_index
;
/*
 * Add a node to the access_cache at the given index (or do a lookup first
 * to find the index if -1 is passed in).  We currently do a replace rather
 * than an insert if the cache is full.
 */
static void
add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
{
	int lookup_index = -1;
908 /* need to do a lookup first if -1 passed for index */
910 if (lookup_bucket(cache
, &lookup_index
, nodeID
)) {
911 if (cache
->haveaccess
[lookup_index
] != access
&& cache
->haveaccess
[lookup_index
] == ESRCH
) {
912 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
913 cache
->haveaccess
[lookup_index
] = access
;
916 /* mission accomplished */
919 index
= lookup_index
;
924 /* if the cache is full, do a replace rather than an insert */
925 if (cache
->numcached
>= NUM_CACHE_ENTRIES
) {
926 //printf("hfs: cache is full (%d). replace at index %d\n", cache->numcached, index);
927 cache
->numcached
= NUM_CACHE_ENTRIES
-1;
929 if (index
> cache
->numcached
) {
930 // printf("hfs: index %d pinned to %d\n", index, cache->numcached);
931 index
= cache
->numcached
;
935 if (index
< cache
->numcached
&& index
< NUM_CACHE_ENTRIES
&& nodeID
> cache
->acache
[index
]) {
939 if (index
>= 0 && index
< cache
->numcached
) {
940 /* only do bcopy if we're inserting */
941 bcopy( cache
->acache
+index
, cache
->acache
+(index
+1), (cache
->numcached
- index
)*sizeof(int) );
942 bcopy( cache
->haveaccess
+index
, cache
->haveaccess
+(index
+1), (cache
->numcached
- index
)*sizeof(unsigned char) );
945 cache
->acache
[index
] = nodeID
;
946 cache
->haveaccess
[index
] = access
;
960 snoop_callback(const struct cat_desc
*descp
, const struct cat_attr
*attrp
, void * arg
)
962 struct cinfo
*cip
= (struct cinfo
*)arg
;
964 cip
->uid
= attrp
->ca_uid
;
965 cip
->gid
= attrp
->ca_gid
;
966 cip
->mode
= attrp
->ca_mode
;
967 cip
->parentcnid
= descp
->cd_parentcnid
;
968 cip
->recflags
= attrp
->ca_recflags
;
/*
 * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id.  If the item
 * isn't incore, then go to the catalog.
 */
static int
do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
    struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
{
983 /* if this id matches the one the fsctl was called with, skip the lookup */
984 if (cnid
== skip_cp
->c_cnid
) {
985 cnattrp
->ca_uid
= skip_cp
->c_uid
;
986 cnattrp
->ca_gid
= skip_cp
->c_gid
;
987 cnattrp
->ca_mode
= skip_cp
->c_mode
;
988 cnattrp
->ca_recflags
= skip_cp
->c_attr
.ca_recflags
;
989 keyp
->hfsPlus
.parentID
= skip_cp
->c_parentcnid
;
	/* otherwise, check the cnode hash in case the file/dir is incore */
994 if (hfs_chash_snoop(hfsmp
, cnid
, 0, snoop_callback
, &c_info
) == 0) {
995 cnattrp
->ca_uid
= c_info
.uid
;
996 cnattrp
->ca_gid
= c_info
.gid
;
997 cnattrp
->ca_mode
= c_info
.mode
;
998 cnattrp
->ca_recflags
= c_info
.recflags
;
999 keyp
->hfsPlus
.parentID
= c_info
.parentcnid
;
1003 lockflags
= hfs_systemfile_lock(hfsmp
, SFL_CATALOG
, HFS_SHARED_LOCK
);
1005 /* lookup this cnid in the catalog */
1006 error
= cat_getkeyplusattr(hfsmp
, cnid
, keyp
, cnattrp
);
1008 hfs_systemfile_unlock(hfsmp
, lockflags
);
/*
 * Compute whether we have access to the given directory (nodeID) and all its parents.  Cache
 * up to CACHE_LEVELS as we progress towards the root.
 */
static int
do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
    struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
    struct vfs_context *my_context,
    char *bitmap,
    uint32_t map_size,
    cnid_t *parents,
    uint32_t num_parents)
{
1033 HFSCatalogNodeID thisNodeID
;
1034 unsigned int myPerms
;
1035 struct cat_attr cnattr
;
1036 int cache_index
= -1, scope_index
= -1, scope_idx_start
= -1;
1039 int i
= 0, ids_to_cache
= 0;
1040 int parent_ids
[CACHE_LEVELS
];
1042 thisNodeID
= nodeID
;
1043 while (thisNodeID
>= kRootDirID
) {
1044 myResult
= 0; /* default to "no access" */
1046 /* check the cache before resorting to hitting the catalog */
1048 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
1049 * to look any further after hitting cached dir */
1051 if (lookup_bucket(cache
, &cache_index
, thisNodeID
)) {
1053 myErr
= cache
->haveaccess
[cache_index
];
1054 if (scope_index
!= -1) {
1055 if (myErr
== ESRCH
) {
1059 scope_index
= 0; // so we'll just use the cache result
1060 scope_idx_start
= ids_to_cache
;
1062 myResult
= (myErr
== 0) ? 1 : 0;
1063 goto ExitThisRoutine
;
1069 tmp
= cache_binSearch(parents
, num_parents
-1, thisNodeID
, NULL
);
1070 if (scope_index
== -1)
1072 if (tmp
!= -1 && scope_idx_start
== -1 && ids_to_cache
< CACHE_LEVELS
) {
1073 scope_idx_start
= ids_to_cache
;
1077 /* remember which parents we want to cache */
1078 if (ids_to_cache
< CACHE_LEVELS
) {
1079 parent_ids
[ids_to_cache
] = thisNodeID
;
1082 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
1083 if (bitmap
&& map_size
) {
1084 bitmap
[(thisNodeID
/8)%(map_size
)]|=(1<<(thisNodeID
&7));
1088 /* do the lookup (checks the cnode hash, then the catalog) */
1089 myErr
= do_attr_lookup(hfsmp
, cache
, thisNodeID
, skip_cp
, &catkey
, &cnattr
);
1091 goto ExitThisRoutine
; /* no access */
1094 /* Root always gets access. */
1095 if (suser(myp_ucred
, NULL
) == 0) {
1096 thisNodeID
= catkey
.hfsPlus
.parentID
;
1101 // if the thing has acl's, do the full permission check
1102 if ((cnattr
.ca_recflags
& kHFSHasSecurityMask
) != 0) {
1105 /* get the vnode for this cnid */
1106 myErr
= hfs_vget(hfsmp
, thisNodeID
, &vp
, 0, 0);
1109 goto ExitThisRoutine
;
1112 thisNodeID
= VTOC(vp
)->c_parentcnid
;
1114 hfs_unlock(VTOC(vp
));
1116 if (vnode_vtype(vp
) == VDIR
) {
1117 myErr
= vnode_authorize(vp
, NULL
, (KAUTH_VNODE_SEARCH
| KAUTH_VNODE_LIST_DIRECTORY
), my_context
);
1119 myErr
= vnode_authorize(vp
, NULL
, KAUTH_VNODE_READ_DATA
, my_context
);
1125 goto ExitThisRoutine
;
1129 int mode
= cnattr
.ca_mode
& S_IFMT
;
1130 myPerms
= DerivePermissionSummary(cnattr
.ca_uid
, cnattr
.ca_gid
, cnattr
.ca_mode
, hfsmp
->hfs_mp
,myp_ucred
, theProcPtr
);
1132 if (mode
== S_IFDIR
) {
1133 flags
= R_OK
| X_OK
;
1137 if ( (myPerms
& flags
) != flags
) {
1140 goto ExitThisRoutine
; /* no access */
1143 /* up the hierarchy we go */
1144 thisNodeID
= catkey
.hfsPlus
.parentID
;
1148 /* if here, we have access to this node */
1152 if (parents
&& myErr
== 0 && scope_index
== -1) {
1161 /* cache the parent directory(ies) */
1162 for (i
= 0; i
< ids_to_cache
; i
++) {
1163 if (myErr
== 0 && parents
&& (scope_idx_start
== -1 || i
> scope_idx_start
)) {
1164 add_node(cache
, -1, parent_ids
[i
], ESRCH
);
1166 add_node(cache
, -1, parent_ids
[i
], myErr
);
1174 do_bulk_access_check(struct hfsmount
*hfsmp
, struct vnode
*vp
,
1175 struct vnop_ioctl_args
*ap
, int arg_size
, vfs_context_t context
)
	/*
	 * NOTE: on entry, the vnode is locked.  In case this vnode
	 * happens to be in our list of file_ids, we'll note it so we can
	 * avoid calling hfs_chashget_nowait() on that id, as that
	 * will cause a "locking against myself" panic.
	 */
1185 Boolean check_leaf
= true;
1187 struct user64_ext_access_t
*user_access_structp
;
1188 struct user64_ext_access_t tmp_user_access
;
1189 struct access_cache cache
;
1191 int error
= 0, prev_parent_check_ok
=1;
1195 unsigned int num_files
= 0;
1197 int num_parents
= 0;
1201 cnid_t
*parents
=NULL
;
1205 cnid_t prevParent_cnid
= 0;
1206 unsigned int myPerms
;
1208 struct cat_attr cnattr
;
1210 struct cnode
*skip_cp
= VTOC(vp
);
1211 kauth_cred_t cred
= vfs_context_ucred(context
);
1212 proc_t p
= vfs_context_proc(context
);
1214 is64bit
= proc_is64bit(p
);
1216 /* initialize the local cache and buffers */
1217 cache
.numcached
= 0;
1218 cache
.cachehits
= 0;
1220 cache
.acache
= NULL
;
1221 cache
.haveaccess
= NULL
;
1223 /* struct copyin done during dispatch... need to copy file_id array separately */
1224 if (ap
->a_data
== NULL
) {
1226 goto err_exit_bulk_access
;
1230 if (arg_size
!= sizeof(struct user64_ext_access_t
)) {
1232 goto err_exit_bulk_access
;
1235 user_access_structp
= (struct user64_ext_access_t
*)ap
->a_data
;
1237 } else if (arg_size
== sizeof(struct user32_access_t
)) {
1238 struct user32_access_t
*accessp
= (struct user32_access_t
*)ap
->a_data
;
1240 // convert an old style bulk-access struct to the new style
1241 tmp_user_access
.flags
= accessp
->flags
;
1242 tmp_user_access
.num_files
= accessp
->num_files
;
1243 tmp_user_access
.map_size
= 0;
1244 tmp_user_access
.file_ids
= CAST_USER_ADDR_T(accessp
->file_ids
);
1245 tmp_user_access
.bitmap
= USER_ADDR_NULL
;
1246 tmp_user_access
.access
= CAST_USER_ADDR_T(accessp
->access
);
1247 tmp_user_access
.num_parents
= 0;
1248 user_access_structp
= &tmp_user_access
;
1250 } else if (arg_size
== sizeof(struct user32_ext_access_t
)) {
1251 struct user32_ext_access_t
*accessp
= (struct user32_ext_access_t
*)ap
->a_data
;
1253 // up-cast from a 32-bit version of the struct
1254 tmp_user_access
.flags
= accessp
->flags
;
1255 tmp_user_access
.num_files
= accessp
->num_files
;
1256 tmp_user_access
.map_size
= accessp
->map_size
;
1257 tmp_user_access
.num_parents
= accessp
->num_parents
;
1259 tmp_user_access
.file_ids
= CAST_USER_ADDR_T(accessp
->file_ids
);
1260 tmp_user_access
.bitmap
= CAST_USER_ADDR_T(accessp
->bitmap
);
1261 tmp_user_access
.access
= CAST_USER_ADDR_T(accessp
->access
);
1262 tmp_user_access
.parents
= CAST_USER_ADDR_T(accessp
->parents
);
1264 user_access_structp
= &tmp_user_access
;
1267 goto err_exit_bulk_access
;
1270 map_size
= user_access_structp
->map_size
;
1272 num_files
= user_access_structp
->num_files
;
1274 num_parents
= user_access_structp
->num_parents
;
1276 if (num_files
< 1) {
1277 goto err_exit_bulk_access
;
1279 if (num_files
> 1024) {
1281 goto err_exit_bulk_access
;
1284 if (num_parents
> 1024) {
1286 goto err_exit_bulk_access
;
1289 file_ids
= (int *) kalloc(sizeof(int) * num_files
);
1290 access
= (short *) kalloc(sizeof(short) * num_files
);
1292 bitmap
= (char *) kalloc(sizeof(char) * map_size
);
1296 parents
= (cnid_t
*) kalloc(sizeof(cnid_t
) * num_parents
);
1299 cache
.acache
= (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES
);
1300 cache
.haveaccess
= (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1302 if (file_ids
== NULL
|| access
== NULL
|| (map_size
!= 0 && bitmap
== NULL
) || cache
.acache
== NULL
|| cache
.haveaccess
== NULL
) {
1304 kfree(file_ids
, sizeof(int) * num_files
);
1307 kfree(bitmap
, sizeof(char) * map_size
);
1310 kfree(access
, sizeof(short) * num_files
);
1313 kfree(cache
.acache
, sizeof(int) * NUM_CACHE_ENTRIES
);
1315 if (cache
.haveaccess
) {
1316 kfree(cache
.haveaccess
, sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1319 kfree(parents
, sizeof(cnid_t
) * num_parents
);
1324 // make sure the bitmap is zero'ed out...
1326 bzero(bitmap
, (sizeof(char) * map_size
));
1329 if ((error
= copyin(user_access_structp
->file_ids
, (caddr_t
)file_ids
,
1330 num_files
* sizeof(int)))) {
1331 goto err_exit_bulk_access
;
1335 if ((error
= copyin(user_access_structp
->parents
, (caddr_t
)parents
,
1336 num_parents
* sizeof(cnid_t
)))) {
1337 goto err_exit_bulk_access
;
1341 flags
= user_access_structp
->flags
;
1342 if ((flags
& (F_OK
| R_OK
| W_OK
| X_OK
)) == 0) {
1346 /* check if we've been passed leaf node ids or parent ids */
1347 if (flags
& PARENT_IDS_FLAG
) {
1351 /* Check access to each file_id passed in */
1352 for (i
= 0; i
< num_files
; i
++) {
1354 cnid
= (cnid_t
) file_ids
[i
];
1356 /* root always has access */
1357 if ((!parents
) && (!suser(cred
, NULL
))) {
1363 /* do the lookup (checks the cnode hash, then the catalog) */
1364 error
= do_attr_lookup(hfsmp
, &cache
, cnid
, skip_cp
, &catkey
, &cnattr
);
1366 access
[i
] = (short) error
;
1371 // Check if the leaf matches one of the parent scopes
1372 leaf_index
= cache_binSearch(parents
, num_parents
-1, cnid
, NULL
);
1373 if (leaf_index
>= 0 && parents
[leaf_index
] == cnid
)
1374 prev_parent_check_ok
= 0;
1375 else if (leaf_index
>= 0)
1376 prev_parent_check_ok
= 1;
1379 // if the thing has acl's, do the full permission check
1380 if ((cnattr
.ca_recflags
& kHFSHasSecurityMask
) != 0) {
1383 /* get the vnode for this cnid */
1384 myErr
= hfs_vget(hfsmp
, cnid
, &cvp
, 0, 0);
1390 hfs_unlock(VTOC(cvp
));
1392 if (vnode_vtype(cvp
) == VDIR
) {
1393 myErr
= vnode_authorize(cvp
, NULL
, (KAUTH_VNODE_SEARCH
| KAUTH_VNODE_LIST_DIRECTORY
), context
);
1395 myErr
= vnode_authorize(cvp
, NULL
, KAUTH_VNODE_READ_DATA
, context
);
1404 /* before calling CheckAccess(), check the target file for read access */
1405 myPerms
= DerivePermissionSummary(cnattr
.ca_uid
, cnattr
.ca_gid
,
1406 cnattr
.ca_mode
, hfsmp
->hfs_mp
, cred
, p
);
1408 /* fail fast if no access */
1409 if ((myPerms
& flags
) == 0) {
1415 /* we were passed an array of parent ids */
1416 catkey
.hfsPlus
.parentID
= cnid
;
1419 /* if the last guy had the same parent and had access, we're done */
1420 if (i
> 0 && catkey
.hfsPlus
.parentID
== prevParent_cnid
&& access
[i
-1] == 0 && prev_parent_check_ok
) {
1426 myaccess
= do_access_check(hfsmp
, &error
, &cache
, catkey
.hfsPlus
.parentID
,
1427 skip_cp
, p
, cred
, context
,bitmap
, map_size
, parents
, num_parents
);
1429 if (myaccess
|| (error
== ESRCH
&& leaf_index
!= -1)) {
1430 access
[i
] = 0; // have access.. no errors to report
1432 access
[i
] = (error
!= 0 ? (short) error
: EACCES
);
1435 prevParent_cnid
= catkey
.hfsPlus
.parentID
;
1438 /* copyout the access array */
1439 if ((error
= copyout((caddr_t
)access
, user_access_structp
->access
,
1440 num_files
* sizeof (short)))) {
1441 goto err_exit_bulk_access
;
1443 if (map_size
&& bitmap
) {
1444 if ((error
= copyout((caddr_t
)bitmap
, user_access_structp
->bitmap
,
1445 map_size
* sizeof (char)))) {
1446 goto err_exit_bulk_access
;
1451 err_exit_bulk_access
:
1453 //printf("hfs: on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
1456 kfree(file_ids
, sizeof(int) * num_files
);
1458 kfree(parents
, sizeof(cnid_t
) * num_parents
);
1460 kfree(bitmap
, sizeof(char) * map_size
);
1462 kfree(access
, sizeof(short) * num_files
);
1464 kfree(cache
.acache
, sizeof(int) * NUM_CACHE_ENTRIES
);
1465 if (cache
.haveaccess
)
1466 kfree(cache
.haveaccess
, sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1472 /* end "bulk-access" support */
/*
 * Callback for use with freeze ioctl.
 */
static int
hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
{
	vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
/*
 * Control filesystem operating characteristics.
 */
int
hfs_vnop_ioctl( struct vnop_ioctl_args /* {
		vfs_context_t a_context;
	} */ *ap)
{
1498 struct vnode
* vp
= ap
->a_vp
;
1499 struct hfsmount
*hfsmp
= VTOHFS(vp
);
1500 vfs_context_t context
= ap
->a_context
;
1501 kauth_cred_t cred
= vfs_context_ucred(context
);
1502 proc_t p
= vfs_context_proc(context
);
1503 struct vfsstatfs
*vfsp
;
1505 off_t jnl_start
, jnl_size
;
1506 struct hfs_journal_info
*jip
;
1509 off_t uncompressed_size
= -1;
1510 int decmpfs_error
= 0;
1512 if (ap
->a_command
== F_RDADVISE
) {
1513 /* we need to inspect the decmpfs state of the file as early as possible */
1514 compressed
= hfs_file_is_compressed(VTOC(vp
), 0);
1516 if (VNODE_IS_RSRC(vp
)) {
1517 /* if this is the resource fork, treat it as if it were empty */
1518 uncompressed_size
= 0;
1520 decmpfs_error
= hfs_uncompressed_size_of_compressed_file(NULL
, vp
, 0, &uncompressed_size
, 0);
1521 if (decmpfs_error
!= 0) {
1522 /* failed to get the uncompressed size, we'll check for this later */
1523 uncompressed_size
= -1;
1528 #endif /* HFS_COMPRESSION */
1530 is64bit
= proc_is64bit(p
);
1535 if ((error
= cp_handle_vnop(VTOC(vp
), CP_WRITE_ACCESS
)) != 0) {
1539 #endif /* CONFIG_PROTECT */
1541 switch (ap
->a_command
) {
1545 struct vnode
*file_vp
;
1551 /* Caller must be owner of file system. */
1552 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1553 if (suser(cred
, NULL
) &&
1554 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1557 /* Target vnode must be file system's root. */
1558 if (!vnode_isvroot(vp
)) {
1561 bufptr
= (char *)ap
->a_data
;
1562 cnid
= strtoul(bufptr
, NULL
, 10);
1564 /* We need to call hfs_vfs_vget to leverage the code that will
1565 * fix the origin list for us if needed, as opposed to calling
1566 * hfs_vget, since we will need the parent for build_path call.
1569 if ((error
= hfs_vfs_vget(HFSTOVFS(hfsmp
), cnid
, &file_vp
, context
))) {
1572 error
= build_path(file_vp
, bufptr
, sizeof(pathname_t
), &outlen
, 0, context
);
1586 /* Caller must be owner of file system. */
1587 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1588 if (suser(cred
, NULL
) &&
1589 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1592 /* Target vnode must be file system's root. */
1593 if (!vnode_isvroot(vp
)) {
1596 linkfileid
= *(cnid_t
*)ap
->a_data
;
1597 if (linkfileid
< kHFSFirstUserCatalogNodeID
) {
1600 if ((error
= hfs_lookup_siblinglinks(hfsmp
, linkfileid
, &prevlinkid
, &nextlinkid
))) {
1603 if (ap
->a_command
== HFS_NEXT_LINK
) {
1604 *(cnid_t
*)ap
->a_data
= nextlinkid
;
1606 *(cnid_t
*)ap
->a_data
= prevlinkid
;
1611 case HFS_RESIZE_PROGRESS
: {
1613 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1614 if (suser(cred
, NULL
) &&
1615 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1616 return (EACCES
); /* must be owner of file system */
1618 if (!vnode_isvroot(vp
)) {
1621 /* file system must not be mounted read-only */
1622 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
1626 return hfs_resize_progress(hfsmp
, (u_int32_t
*)ap
->a_data
);
1629 case HFS_RESIZE_VOLUME
: {
1633 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1634 if (suser(cred
, NULL
) &&
1635 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1636 return (EACCES
); /* must be owner of file system */
1638 if (!vnode_isvroot(vp
)) {
1642 /* filesystem must not be mounted read only */
1643 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
1646 newsize
= *(u_int64_t
*)ap
->a_data
;
1647 cursize
= (u_int64_t
)hfsmp
->totalBlocks
* (u_int64_t
)hfsmp
->blockSize
;
1649 if (newsize
> cursize
) {
1650 return hfs_extendfs(hfsmp
, *(u_int64_t
*)ap
->a_data
, context
);
1651 } else if (newsize
< cursize
) {
1652 return hfs_truncatefs(hfsmp
, *(u_int64_t
*)ap
->a_data
, context
);
1657 case HFS_CHANGE_NEXT_ALLOCATION
: {
1658 int error
= 0; /* Assume success */
1661 if (vnode_vfsisrdonly(vp
)) {
1664 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1665 if (suser(cred
, NULL
) &&
1666 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1667 return (EACCES
); /* must be owner of file system */
1669 if (!vnode_isvroot(vp
)) {
1672 HFS_MOUNT_LOCK(hfsmp
, TRUE
);
1673 location
= *(u_int32_t
*)ap
->a_data
;
1674 if ((location
>= hfsmp
->allocLimit
) &&
1675 (location
!= HFS_NO_UPDATE_NEXT_ALLOCATION
)) {
1677 goto fail_change_next_allocation
;
1679 /* Return previous value. */
1680 *(u_int32_t
*)ap
->a_data
= hfsmp
->nextAllocation
;
1681 if (location
== HFS_NO_UPDATE_NEXT_ALLOCATION
) {
1682 /* On magic value for location, set nextAllocation to next block
1683 * after metadata zone and set flag in mount structure to indicate
1684 * that nextAllocation should not be updated again.
1686 if (hfsmp
->hfs_metazone_end
!= 0) {
1687 HFS_UPDATE_NEXT_ALLOCATION(hfsmp
, hfsmp
->hfs_metazone_end
+ 1);
1689 hfsmp
->hfs_flags
|= HFS_SKIP_UPDATE_NEXT_ALLOCATION
;
1691 hfsmp
->hfs_flags
&= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION
;
1692 HFS_UPDATE_NEXT_ALLOCATION(hfsmp
, location
);
1694 MarkVCBDirty(hfsmp
);
1695 fail_change_next_allocation
:
1696 HFS_MOUNT_UNLOCK(hfsmp
, TRUE
);
1701 case HFS_SETBACKINGSTOREINFO
: {
1702 struct vnode
* bsfs_rootvp
;
1703 struct vnode
* di_vp
;
1704 struct hfs_backingstoreinfo
*bsdata
;
1707 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
1710 if (hfsmp
->hfs_flags
& HFS_HAS_SPARSE_DEVICE
) {
1713 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1714 if (suser(cred
, NULL
) &&
1715 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1716 return (EACCES
); /* must be owner of file system */
1718 bsdata
= (struct hfs_backingstoreinfo
*)ap
->a_data
;
1719 if (bsdata
== NULL
) {
1722 if ((error
= file_vnode(bsdata
->backingfd
, &di_vp
))) {
1725 if ((error
= vnode_getwithref(di_vp
))) {
1726 file_drop(bsdata
->backingfd
);
1730 if (vnode_mount(vp
) == vnode_mount(di_vp
)) {
1731 (void)vnode_put(di_vp
);
1732 file_drop(bsdata
->backingfd
);
1737 * Obtain the backing fs root vnode and keep a reference
1738 * on it. This reference will be dropped in hfs_unmount.
1740 error
= VFS_ROOT(vnode_mount(di_vp
), &bsfs_rootvp
, NULL
); /* XXX use context! */
1742 (void)vnode_put(di_vp
);
1743 file_drop(bsdata
->backingfd
);
1746 vnode_ref(bsfs_rootvp
);
1747 vnode_put(bsfs_rootvp
);
1749 hfsmp
->hfs_backingfs_rootvp
= bsfs_rootvp
;
1751 hfsmp
->hfs_flags
|= HFS_HAS_SPARSE_DEVICE
;
		/*
		 * The free extent cache is managed differently for sparse devices.
		 * There is a window between when the volume is mounted and when the
		 * device is marked as sparse, so the free extent cache for this
		 * volume is currently initialized as a normal volume (sorted by block
		 * count).  Reset the cache so that it will be rebuilt again
		 * for the sparse device (sorted by start block).
		 */
1759 ResetVCBFreeExtCache(hfsmp
);
1761 hfsmp
->hfs_sparsebandblks
= bsdata
->bandsize
/ HFSTOVCB(hfsmp
)->blockSize
;
1762 hfsmp
->hfs_sparsebandblks
*= 4;
1764 vfs_markdependency(hfsmp
->hfs_mp
);
		/*
		 * If the sparse image is on a sparse image file (as opposed to a sparse
		 * bundle), then we may need to limit the free space to the maximum size
		 * of a file on that volume.  So we query (using pathconf), and if we get
		 * a meaningful result, we cache the number of blocks for later use in
		 * hfs_freeblks().
		 */
1773 hfsmp
->hfs_backingfs_maxblocks
= 0;
1774 if (vnode_vtype(di_vp
) == VREG
) {
1777 terr
= vn_pathconf(di_vp
, _PC_FILESIZEBITS
, &hostbits
, context
);
1778 if (terr
== 0 && hostbits
!= 0 && hostbits
< 64) {
1779 u_int64_t hostfilesizemax
= ((u_int64_t
)1) << hostbits
;
1781 hfsmp
->hfs_backingfs_maxblocks
= hostfilesizemax
/ hfsmp
->blockSize
;
1785 (void)vnode_put(di_vp
);
1786 file_drop(bsdata
->backingfd
);
1789 case HFS_CLRBACKINGSTOREINFO
: {
1790 struct vnode
* tmpvp
;
1792 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1793 if (suser(cred
, NULL
) &&
1794 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1795 return (EACCES
); /* must be owner of file system */
1797 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
1801 if ((hfsmp
->hfs_flags
& HFS_HAS_SPARSE_DEVICE
) &&
1802 hfsmp
->hfs_backingfs_rootvp
) {
1804 hfsmp
->hfs_flags
&= ~HFS_HAS_SPARSE_DEVICE
;
1805 tmpvp
= hfsmp
->hfs_backingfs_rootvp
;
1806 hfsmp
->hfs_backingfs_rootvp
= NULLVP
;
1807 hfsmp
->hfs_sparsebandblks
= 0;
1812 #endif /* HFS_SPARSE_DEV */
1817 mp
= vnode_mount(vp
);
1818 hfsmp
= VFSTOHFS(mp
);
1823 vfsp
= vfs_statfs(mp
);
1825 if (kauth_cred_getuid(cred
) != vfsp
->f_owner
&&
1826 !kauth_cred_issuser(cred
))
1829 lck_rw_lock_exclusive(&hfsmp
->hfs_insync
);
1831 // flush things before we get started to try and prevent
1832 // dirty data from being paged out while we're frozen.
1833 // note: can't do this after taking the lock as it will
1834 // deadlock against ourselves.
1835 vnode_iterate(mp
, 0, hfs_freezewrite_callback
, NULL
);
1836 hfs_lock_global (hfsmp
, HFS_EXCLUSIVE_LOCK
);
1838 // DO NOT call hfs_journal_flush() because that takes a
1839 // shared lock on the global exclusive lock!
1840 journal_flush(hfsmp
->jnl
, TRUE
);
1842 // don't need to iterate on all vnodes, we just need to
1843 // wait for writes to the system files and the device vnode
1845 // Now that journal flush waits for all metadata blocks to
1846 // be written out, waiting for btree writes is probably no
1848 if (HFSTOVCB(hfsmp
)->extentsRefNum
)
1849 vnode_waitforwrites(HFSTOVCB(hfsmp
)->extentsRefNum
, 0, 0, 0, "hfs freeze");
1850 if (HFSTOVCB(hfsmp
)->catalogRefNum
)
1851 vnode_waitforwrites(HFSTOVCB(hfsmp
)->catalogRefNum
, 0, 0, 0, "hfs freeze");
1852 if (HFSTOVCB(hfsmp
)->allocationsRefNum
)
1853 vnode_waitforwrites(HFSTOVCB(hfsmp
)->allocationsRefNum
, 0, 0, 0, "hfs freeze");
1854 if (hfsmp
->hfs_attribute_vp
)
1855 vnode_waitforwrites(hfsmp
->hfs_attribute_vp
, 0, 0, 0, "hfs freeze");
1856 vnode_waitforwrites(hfsmp
->hfs_devvp
, 0, 0, 0, "hfs freeze");
1858 hfsmp
->hfs_freezing_proc
= current_proc();
1864 vfsp
= vfs_statfs(vnode_mount(vp
));
1865 if (kauth_cred_getuid(cred
) != vfsp
->f_owner
&&
1866 !kauth_cred_issuser(cred
))
1869 // if we're not the one who froze the fs then we
1871 if (hfsmp
->hfs_freezing_proc
!= current_proc()) {
1875 // NOTE: if you add code here, also go check the
1876 // code that "thaws" the fs in hfs_vnop_close()
1878 hfsmp
->hfs_freezing_proc
= NULL
;
1879 hfs_unlock_global (hfsmp
);
1880 lck_rw_unlock_exclusive(&hfsmp
->hfs_insync
);
1885 case HFS_BULKACCESS_FSCTL
: {
1888 if (hfsmp
->hfs_flags
& HFS_STANDARD
) {
1893 size
= sizeof(struct user64_access_t
);
1895 size
= sizeof(struct user32_access_t
);
1898 return do_bulk_access_check(hfsmp
, vp
, ap
, size
, context
);
1901 case HFS_EXT_BULKACCESS_FSCTL
: {
1904 if (hfsmp
->hfs_flags
& HFS_STANDARD
) {
1909 size
= sizeof(struct user64_ext_access_t
);
1911 size
= sizeof(struct user32_ext_access_t
);
1914 return do_bulk_access_check(hfsmp
, vp
, ap
, size
, context
);
1917 case HFS_SET_XATTREXTENTS_STATE
: {
1920 if (ap
->a_data
== NULL
) {
1924 state
= *(int *)ap
->a_data
;
1926 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
		/*
		 * Super-user can enable or disable extent-based extended
		 * attribute support on a volume.
		 * Note: Starting Mac OS X 10.7, extent-based extended attributes
		 * are enabled by default, so any change will be transient only
		 * till the volume is remounted.
		 */
1939 if (state
== 0 || state
== 1)
1940 return hfs_set_volxattr(hfsmp
, HFS_SET_XATTREXTENTS_STATE
, state
);
1948 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
1951 error
= hfs_lock(VTOC(vp
), HFS_EXCLUSIVE_LOCK
);
1953 error
= hfs_fsync(vp
, MNT_WAIT
, TRUE
, p
);
1954 hfs_unlock(VTOC(vp
));
1961 register struct cnode
*cp
;
1964 if (!vnode_isreg(vp
))
1967 error
= hfs_lock(VTOC(vp
), HFS_EXCLUSIVE_LOCK
);
		/*
		 * used by regression test to determine if
		 * all the dirty pages (via write) have been cleaned
		 * after a call to 'fsync'.
		 */
1975 error
= is_file_clean(vp
, VTOF(vp
)->ff_size
);
1982 register struct radvisory
*ra
;
1983 struct filefork
*fp
;
1986 if (!vnode_isreg(vp
))
1989 ra
= (struct radvisory
*)(ap
->a_data
);
1992 /* Protect against a size change. */
1993 hfs_lock_truncate(VTOC(vp
), HFS_EXCLUSIVE_LOCK
);
1996 if (compressed
&& (uncompressed_size
== -1)) {
1997 /* fetching the uncompressed size failed above, so return the error */
1998 error
= decmpfs_error
;
1999 } else if ((compressed
&& (ra
->ra_offset
>= uncompressed_size
)) ||
2000 (!compressed
&& (ra
->ra_offset
>= fp
->ff_size
))) {
2003 #else /* HFS_COMPRESSION */
2004 if (ra
->ra_offset
>= fp
->ff_size
) {
2007 #endif /* HFS_COMPRESSION */
2009 error
= advisory_read(vp
, fp
->ff_size
, ra
->ra_offset
, ra
->ra_count
);
2012 hfs_unlock_truncate(VTOC(vp
), 0);
2016 case F_READBOOTSTRAP
:
2017 case F_WRITEBOOTSTRAP
:
2020 case _IOC(IOC_OUT
,'h', 4, 0): /* Create date in local time */
2023 *(user_time_t
*)(ap
->a_data
) = (user_time_t
) (to_bsd_time(VTOVCB(vp
)->localCreateDate
));
2026 *(user32_time_t
*)(ap
->a_data
) = (user32_time_t
) (to_bsd_time(VTOVCB(vp
)->localCreateDate
));
2031 case SPOTLIGHT_FSCTL_GET_MOUNT_TIME
:
2032 *(uint32_t *)ap
->a_data
= hfsmp
->hfs_mount_time
;
2035 case SPOTLIGHT_FSCTL_GET_LAST_MTIME
:
2036 *(uint32_t *)ap
->a_data
= hfsmp
->hfs_last_mounted_mtime
;
2039 case HFS_FSCTL_SET_VERY_LOW_DISK
:
2040 if (*(uint32_t *)ap
->a_data
>= hfsmp
->hfs_freespace_notify_warninglimit
) {
2044 hfsmp
->hfs_freespace_notify_dangerlimit
= *(uint32_t *)ap
->a_data
;
2047 case HFS_FSCTL_SET_LOW_DISK
:
2048 if ( *(uint32_t *)ap
->a_data
>= hfsmp
->hfs_freespace_notify_desiredlevel
2049 || *(uint32_t *)ap
->a_data
<= hfsmp
->hfs_freespace_notify_dangerlimit
) {
2054 hfsmp
->hfs_freespace_notify_warninglimit
= *(uint32_t *)ap
->a_data
;
2057 case HFS_FSCTL_SET_DESIRED_DISK
:
2058 if (*(uint32_t *)ap
->a_data
<= hfsmp
->hfs_freespace_notify_warninglimit
) {
2062 hfsmp
->hfs_freespace_notify_desiredlevel
= *(uint32_t *)ap
->a_data
;
2065 case HFS_VOLUME_STATUS
:
2066 *(uint32_t *)ap
->a_data
= hfsmp
->hfs_notification_conditions
;
2069 case HFS_SET_BOOT_INFO
:
2070 if (!vnode_isvroot(vp
))
2072 if (!kauth_cred_issuser(cred
) && (kauth_cred_getuid(cred
) != vfs_statfs(HFSTOVFS(hfsmp
))->f_owner
))
2073 return(EACCES
); /* must be superuser or owner of filesystem */
2074 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
2077 HFS_MOUNT_LOCK(hfsmp
, TRUE
);
2078 bcopy(ap
->a_data
, &hfsmp
->vcbFndrInfo
, sizeof(hfsmp
->vcbFndrInfo
));
2079 HFS_MOUNT_UNLOCK(hfsmp
, TRUE
);
2080 (void) hfs_flushvolumeheader(hfsmp
, MNT_WAIT
, 0);
2083 case HFS_GET_BOOT_INFO
:
2084 if (!vnode_isvroot(vp
))
2086 HFS_MOUNT_LOCK(hfsmp
, TRUE
);
2087 bcopy(&hfsmp
->vcbFndrInfo
, ap
->a_data
, sizeof(hfsmp
->vcbFndrInfo
));
2088 HFS_MOUNT_UNLOCK(hfsmp
, TRUE
);
2091 case HFS_MARK_BOOT_CORRUPT
:
		/*
		 * Mark the boot volume corrupt by setting
		 * kHFSVolumeInconsistentBit in the volume header.  This will
		 * force fsck_hfs on next mount.
		 */
2100 /* Allowed only on the root vnode of the boot volume */
2101 if (!(vfs_flags(HFSTOVFS(hfsmp
)) & MNT_ROOTFS
) ||
2102 !vnode_isvroot(vp
)) {
2105 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
2108 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
2109 hfs_mark_volume_inconsistent(hfsmp
);
2112 case HFS_FSCTL_GET_JOURNAL_INFO
:
2113 jip
= (struct hfs_journal_info
*)ap
->a_data
;
2118 if (hfsmp
->jnl
== NULL
) {
2122 jnl_start
= (off_t
)(hfsmp
->jnl_start
* HFSTOVCB(hfsmp
)->blockSize
) + (off_t
)HFSTOVCB(hfsmp
)->hfsPlusIOPosOffset
;
2123 jnl_size
= (off_t
)hfsmp
->jnl_size
;
2126 jip
->jstart
= jnl_start
;
2127 jip
->jsize
= jnl_size
;
2130 case HFS_SET_ALWAYS_ZEROFILL
: {
2131 struct cnode
*cp
= VTOC(vp
);
2133 if (*(int *)ap
->a_data
) {
2134 cp
->c_flag
|= C_ALWAYS_ZEROFILL
;
2136 cp
->c_flag
&= ~C_ALWAYS_ZEROFILL
;
2141 case HFS_DISABLE_METAZONE
: {
2142 /* Only root can disable metadata zone */
2146 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
2150 /* Disable metadata zone now */
2151 (void) hfs_metadatazone_init(hfsmp
, true);
2152 printf ("hfs: Disabling metadata zone on %s\n", hfsmp
->vcbVN
);
2167 hfs_vnop_select(__unused
struct vnop_select_args
*ap
)
2169 struct vnop_select_args {
2174 vfs_context_t a_context;
2179 * We should really check to see if I/O is possible.
/*
 * Converts a logical block number to a physical block, and optionally returns
 * the amount of remaining blocks in a run.  The logical block is based on hfsNode.logBlockSize.
 * The physical block number is based on the device block size, currently it's 512.
 * The block run is returned in logical blocks, and is the REMAINING amount of blocks.
 */
int
hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
{
2193 struct filefork
*fp
= VTOF(vp
);
2194 struct hfsmount
*hfsmp
= VTOHFS(vp
);
2195 int retval
= E_NONE
;
2196 u_int32_t logBlockSize
;
2197 size_t bytesContAvail
= 0;
2198 off_t blockposition
;
2203 * Check for underlying vnode requests and ensure that logical
2204 * to physical mapping is requested.
2207 *vpp
= hfsmp
->hfs_devvp
;
2211 logBlockSize
= GetLogicalBlockSize(vp
);
2212 blockposition
= (off_t
)bn
* logBlockSize
;
2214 lockExtBtree
= overflow_extents(fp
);
2217 lockflags
= hfs_systemfile_lock(hfsmp
, SFL_EXTENTS
, HFS_EXCLUSIVE_LOCK
);
2219 retval
= MacToVFSError(
2220 MapFileBlockC (HFSTOVCB(hfsmp
),
2228 hfs_systemfile_unlock(hfsmp
, lockflags
);
2230 if (retval
== E_NONE
) {
2231 /* Figure out how many read ahead blocks there are */
2233 if (can_cluster(logBlockSize
)) {
2234 /* Make sure this result never goes negative: */
2235 *runp
= (bytesContAvail
< logBlockSize
) ? 0 : (bytesContAvail
/ logBlockSize
) - 1;
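			/*
			 * Worked example (descriptive only): with logBlockSize = 4096 and
			 * MapFileBlockC() reporting bytesContAvail = 32768, the run is
			 * 32768/4096 - 1 = 7, i.e. seven more logical blocks are contiguous
			 * after the one being mapped.
			 */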
/*
 * Convert logical block number to file offset.
 */
hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
/*
	struct vnop_blktooff_args {
		vnode_t a_vp;
		daddr64_t a_lblkno;
		off_t *a_offset;
	};
*/
{
	if (ap->a_vp == NULL)
		return (EINVAL);

	*ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);

	return(0);
}

/*
 * Convert file offset to logical block number.
 */
hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
/*
	struct vnop_offtoblk_args {
		vnode_t a_vp;
		off_t a_offset;
		daddr64_t *a_lblkno;
	};
*/
{
	if (ap->a_vp == NULL)
		return (EINVAL);

	*ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));

	return(0);
}
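/*
 * Worked example (illustrative): with a 4096-byte logical block size,
 * hfs_vnop_blktooff maps block 3 to byte offset 3 * 4096 = 12288, and
 * hfs_vnop_offtoblk maps any offset in [12288, 16383] back to block 3,
 * since the integer division above truncates.
 */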
/*
 * Map file offset to physical block number.
 *
 * If this function is called for write operation, and if the file
 * had virtual blocks allocated (delayed allocation), real blocks
 * are allocated by calling ExtendFileC().
 *
 * If this function is called for read operation, and if the file
 * had virtual blocks allocated (delayed allocation), no change
 * to the size of file is done, and if required, rangelist is
 * searched for mapping.
 *
 * System file cnodes are expected to be locked (shared or exclusive).
 */
hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
/*
	struct vnop_blockmap_args {
		vfs_context_t a_context;
	};
*/
{
	struct vnode *vp = ap->a_vp;
	struct filefork *fp;
	struct hfsmount *hfsmp;
	size_t bytesContAvail = 0;
	int retval = E_NONE;
	struct rl_entry *invalid_range;
	enum rl_overlaptype overlaptype;
2327 if (VNODE_IS_RSRC(vp
)) {
2328 /* allow blockmaps to the resource fork */
2330 if ( hfs_file_is_compressed(VTOC(vp
), 1) ) { /* 1 == don't take the cnode lock */
2331 int state
= decmpfs_cnode_get_vnode_state(VTOCMP(vp
));
2333 case FILE_IS_COMPRESSED
:
2335 case FILE_IS_CONVERTING
:
2336 /* if FILE_IS_CONVERTING, we allow blockmap */
2339 printf("invalid state %d for compressed file\n", state
);
2344 #endif /* HFS_COMPRESSION */
2346 /* Do not allow blockmap operation on a directory */
2347 if (vnode_isdir(vp
)) {
2352 * Check for underlying vnode requests and ensure that logical
2353 * to physical mapping is requested.
2355 if (ap
->a_bpn
== NULL
)
2358 if ( !vnode_issystem(vp
) && !vnode_islnk(vp
) && !vnode_isswap(vp
)) {
2359 if (VTOC(vp
)->c_lockowner
!= current_thread()) {
2360 hfs_lock(VTOC(vp
), HFS_FORCE_LOCK
);
2369 /* Check virtual blocks only when performing write operation */
2370 if ((ap
->a_flags
& VNODE_WRITE
) && (fp
->ff_unallocblocks
!= 0)) {
2371 if (hfs_start_transaction(hfsmp
) != 0) {
2377 syslocks
= SFL_EXTENTS
| SFL_BITMAP
;
2379 } else if (overflow_extents(fp
)) {
2380 syslocks
= SFL_EXTENTS
;
2384 lockflags
= hfs_systemfile_lock(hfsmp
, syslocks
, HFS_EXCLUSIVE_LOCK
);
2387 * Check for any delayed allocations.
2389 if ((ap
->a_flags
& VNODE_WRITE
) && (fp
->ff_unallocblocks
!= 0)) {
2391 u_int32_t loanedBlocks
;
2394 // Make sure we have a transaction. It's possible
2395 // that we came in and fp->ff_unallocblocks was zero
2396 // but during the time we blocked acquiring the extents
2397 // btree, ff_unallocblocks became non-zero and so we
2398 // will need to start a transaction.
2400 if (started_tr
== 0) {
2402 hfs_systemfile_unlock(hfsmp
, lockflags
);
			/*
			 * Note: ExtendFileC will Release any blocks on loan and
			 * acquire real blocks.  So we ask to extend by zero bytes
			 * since ExtendFileC will account for the virtual blocks.
			 */
2414 loanedBlocks
= fp
->ff_unallocblocks
;
2415 retval
= ExtendFileC(hfsmp
, (FCB
*)fp
, 0, 0,
2416 kEFAllMask
| kEFNoClumpMask
, &actbytes
);
2419 fp
->ff_unallocblocks
= loanedBlocks
;
2420 cp
->c_blocks
+= loanedBlocks
;
2421 fp
->ff_blocks
+= loanedBlocks
;
2423 HFS_MOUNT_LOCK(hfsmp
, TRUE
);
2424 hfsmp
->loanedBlocks
+= loanedBlocks
;
2425 HFS_MOUNT_UNLOCK(hfsmp
, TRUE
);
2427 hfs_systemfile_unlock(hfsmp
, lockflags
);
2428 cp
->c_flag
|= C_MODIFIED
;
2430 (void) hfs_update(vp
, TRUE
);
2431 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
2433 hfs_end_transaction(hfsmp
);
2440 retval
= MapFileBlockC(hfsmp
, (FCB
*)fp
, ap
->a_size
, ap
->a_foffset
,
2441 ap
->a_bpn
, &bytesContAvail
);
2443 hfs_systemfile_unlock(hfsmp
, lockflags
);
2448 (void) hfs_update(vp
, TRUE
);
2449 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
2450 hfs_end_transaction(hfsmp
);
	/* On write, always return error because virtual blocks, if any,
	 * should have been allocated in ExtendFileC().  We do not
	 * allocate virtual blocks on read, therefore return error
	 * only if no virtual blocks are allocated.  Otherwise we search
	 * rangelist for zero-fills.
	 */
2460 if ((MacToVFSError(retval
) != ERANGE
) ||
2461 (ap
->a_flags
& VNODE_WRITE
) ||
2462 ((ap
->a_flags
& VNODE_READ
) && (fp
->ff_unallocblocks
== 0))) {
2466 /* Validate if the start offset is within logical file size */
2467 if (ap
->a_foffset
> fp
->ff_size
) {
2471 /* Searching file extents has failed for read operation, therefore
2472 * search rangelist for any uncommitted holes in the file.
2474 overlaptype
= rl_scan(&fp
->ff_invalidranges
, ap
->a_foffset
,
2475 ap
->a_foffset
+ (off_t
)(ap
->a_size
- 1),
2477 switch(overlaptype
) {
2478 case RL_OVERLAPISCONTAINED
:
2479 /* start_offset <= rl_start, end_offset >= rl_end */
2480 if (ap
->a_foffset
!= invalid_range
->rl_start
) {
2483 case RL_MATCHINGOVERLAP
:
2484 /* start_offset = rl_start, end_offset = rl_end */
2485 case RL_OVERLAPCONTAINSRANGE
:
2486 /* start_offset >= rl_start, end_offset <= rl_end */
2487 case RL_OVERLAPSTARTSBEFORE
:
2488 /* start_offset > rl_start, end_offset >= rl_start */
2489 if ((off_t
)fp
->ff_size
> (invalid_range
->rl_end
+ 1)) {
2490 bytesContAvail
= (invalid_range
->rl_end
+ 1) - ap
->a_foffset
;
2492 bytesContAvail
= fp
->ff_size
- ap
->a_foffset
;
2494 if (bytesContAvail
> ap
->a_size
) {
2495 bytesContAvail
= ap
->a_size
;
2497 *ap
->a_bpn
= (daddr64_t
)-1;
2500 case RL_OVERLAPENDSAFTER
:
2501 /* start_offset < rl_start, end_offset < rl_end */
2508 /* MapFileC() found a valid extent in the filefork. Search the
2509 * mapping information further for invalid file ranges
2511 overlaptype
= rl_scan(&fp
->ff_invalidranges
, ap
->a_foffset
,
2512 ap
->a_foffset
+ (off_t
)bytesContAvail
- 1,
2514 if (overlaptype
!= RL_NOOVERLAP
) {
2515 switch(overlaptype
) {
2516 case RL_MATCHINGOVERLAP
:
2517 case RL_OVERLAPCONTAINSRANGE
:
2518 case RL_OVERLAPSTARTSBEFORE
:
2519 /* There's no valid block for this byte offset */
2520 *ap
->a_bpn
= (daddr64_t
)-1;
2521 /* There's no point limiting the amount to be returned
2522 * if the invalid range that was hit extends all the way
2523 * to the EOF (i.e. there's no valid bytes between the
2524 * end of this range and the file's EOF):
2526 if (((off_t
)fp
->ff_size
> (invalid_range
->rl_end
+ 1)) &&
2527 ((size_t)(invalid_range
->rl_end
+ 1 - ap
->a_foffset
) < bytesContAvail
)) {
2528 bytesContAvail
= invalid_range
->rl_end
+ 1 - ap
->a_foffset
;
2532 case RL_OVERLAPISCONTAINED
:
2533 case RL_OVERLAPENDSAFTER
:
2534 /* The range of interest hits an invalid block before the end: */
2535 if (invalid_range
->rl_start
== ap
->a_foffset
) {
2536 /* There's actually no valid information to be had starting here: */
2537 *ap
->a_bpn
= (daddr64_t
)-1;
2538 if (((off_t
)fp
->ff_size
> (invalid_range
->rl_end
+ 1)) &&
2539 ((size_t)(invalid_range
->rl_end
+ 1 - ap
->a_foffset
) < bytesContAvail
)) {
2540 bytesContAvail
= invalid_range
->rl_end
+ 1 - ap
->a_foffset
;
2543 bytesContAvail
= invalid_range
->rl_start
- ap
->a_foffset
;
2550 if (bytesContAvail
> ap
->a_size
)
2551 bytesContAvail
= ap
->a_size
;
2557 *ap
->a_run
= bytesContAvail
;
2560 *(int *)ap
->a_poff
= 0;
2566 return (MacToVFSError(retval
));
/*
 * prepare and issue the I/O
 * buf_strategy knows how to deal
 * with requests that require
 * fragmented I/Os
 */
hfs_vnop_strategy(struct vnop_strategy_args *ap)
{
	buf_t bp = ap->a_bp;
	vnode_t vp = buf_vnode(bp);

#if CONFIG_PROTECT
	if ((cp = cp_get_protected_cnode(vp)) != NULL) {
		/*
		 * Some paths to hfs_vnop_strategy will take the cnode lock,
		 * and some won't. But since content protection is only enabled
		 * for files that (a) aren't system files and (b) are regular
		 * files, any valid cnode here will be unlocked.
		 */
		hfs_lock(cp, HFS_SHARED_LOCK);
		buf_setcpaddr(bp, cp->c_cpentry);
	}
#endif /* CONFIG_PROTECT */

	error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
hfs_minorupdate(struct vnode *vp) {
	struct cnode *cp = VTOC(vp);

	cp->c_flag &= ~C_MODIFIED;
	cp->c_touch_acctime = 0;
	cp->c_touch_chgtime = 0;
	cp->c_touch_modtime = 0;

	return 0;
}
do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context)
{
	register struct cnode *cp = VTOC(vp);
	struct filefork *fp = VTOF(vp);
	struct proc *p = vfs_context_proc(context);
	kauth_cred_t cred = vfs_context_ucred(context);
	off_t actualBytesAdded;
	u_int32_t fileblocks;
	struct hfsmount *hfsmp;

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
		 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);

	/* This should only happen with a corrupt filesystem */
	if ((off_t)fp->ff_size < 0)
		return (EINVAL);

	if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
		return (EFBIG);

	/* Files that are changing size are not hot file candidates. */
	if (hfsmp->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;
	}

	/*
	 * We cannot just check if fp->ff_size == length (as an optimization)
	 * since there may be extra physical blocks that also need truncation.
	 */
	if ((retval = hfs_getinoquota(cp)))
		return (retval);
2672 * Lengthen the size of the file. We must ensure that the
2673 * last byte of the file is allocated. Since the smallest
2674 * value of ff_size is 0, length will be at least 1.
2676 if (length
> (off_t
)fp
->ff_size
) {
2678 retval
= hfs_chkdq(cp
, (int64_t)(roundup(length
- filebytes
, blksize
)),
2684 * If we don't have enough physical space then
2685 * we need to extend the physical size.
2687 if (length
> filebytes
) {
2689 u_int32_t blockHint
= 0;
2691 /* All or nothing and don't round up to clumpsize. */
2692 eflags
= kEFAllMask
| kEFNoClumpMask
;
2694 if (cred
&& suser(cred
, NULL
) != 0)
2695 eflags
|= kEFReserveMask
; /* keep a reserve */
2698 * Allocate Journal and Quota files in metadata zone.
2700 if (filebytes
== 0 &&
2701 hfsmp
->hfs_flags
& HFS_METADATA_ZONE
&&
2702 hfs_virtualmetafile(cp
)) {
2703 eflags
|= kEFMetadataMask
;
2704 blockHint
= hfsmp
->hfs_metazone_start
;
2706 if (hfs_start_transaction(hfsmp
) != 0) {
2711 /* Protect extents b-tree and allocation bitmap */
2712 lockflags
= SFL_BITMAP
;
2713 if (overflow_extents(fp
))
2714 lockflags
|= SFL_EXTENTS
;
2715 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
2717 while ((length
> filebytes
) && (retval
== E_NONE
)) {
2718 bytesToAdd
= length
- filebytes
;
2719 retval
= MacToVFSError(ExtendFileC(VTOVCB(vp
),
2724 &actualBytesAdded
));
2726 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)blksize
;
2727 if (actualBytesAdded
== 0 && retval
== E_NONE
) {
2728 if (length
> filebytes
)
2734 hfs_systemfile_unlock(hfsmp
, lockflags
);
2738 (void) hfs_minorupdate(vp
);
2741 (void) hfs_update(vp
, TRUE
);
2742 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
2746 hfs_end_transaction(hfsmp
);
2751 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 7)) | DBG_FUNC_NONE
,
2752 (int)length
, (int)fp
->ff_size
, (int)filebytes
, 0, 0);
2755 if (!(flags
& IO_NOZEROFILL
)) {
2756 if (UBCINFOEXISTS(vp
) && (vnode_issystem(vp
) == 0) && retval
== E_NONE
) {
2757 struct rl_entry
*invalid_range
;
2760 zero_limit
= (fp
->ff_size
+ (PAGE_SIZE_64
- 1)) & ~PAGE_MASK_64
;
2761 if (length
< zero_limit
) zero_limit
= length
;
2763 if (length
> (off_t
)fp
->ff_size
) {
2766 /* Extending the file: time to fill out the current last page w. zeroes? */
2767 if ((fp
->ff_size
& PAGE_MASK_64
) &&
2768 (rl_scan(&fp
->ff_invalidranges
, fp
->ff_size
& ~PAGE_MASK_64
,
2769 fp
->ff_size
- 1, &invalid_range
) == RL_NOOVERLAP
)) {
2771 /* There's some valid data at the start of the (current) last page
2772 of the file, so zero out the remainder of that page to ensure the
2773 entire page contains valid data. Since there is no invalid range
2774 possible past the (current) eof, there's no need to remove anything
2775 from the invalid range list before calling cluster_write(): */
2777 retval
= cluster_write(vp
, (struct uio
*) 0, fp
->ff_size
, zero_limit
,
2778 fp
->ff_size
, (off_t
)0,
2779 (flags
& IO_SYNC
) | IO_HEADZEROFILL
| IO_NOZERODIRTY
);
2780 hfs_lock(cp
, HFS_FORCE_LOCK
);
2781 if (retval
) goto Err_Exit
;
2783 /* Merely invalidate the remaining area, if necessary: */
2784 if (length
> zero_limit
) {
2786 rl_add(zero_limit
, length
- 1, &fp
->ff_invalidranges
);
2787 cp
->c_zftimeout
= tv
.tv_sec
+ ZFTIMELIMIT
;
2790 /* The page containing the (current) eof is invalid: just add the
2791 remainder of the page to the invalid list, along with the area
2792 being newly allocated:
2795 rl_add(fp
->ff_size
, length
- 1, &fp
->ff_invalidranges
);
2796 cp
->c_zftimeout
= tv
.tv_sec
+ ZFTIMELIMIT
;
2800 panic("hfs_truncate: invoked on non-UBC object?!");
2803 cp
->c_touch_modtime
= TRUE
;
2804 fp
->ff_size
= length
;
2806 } else { /* Shorten the size of the file */
2808 if ((off_t
)fp
->ff_size
> length
) {
2809 /* Any space previously marked as invalid is now irrelevant: */
2810 rl_remove(length
, fp
->ff_size
- 1, &fp
->ff_invalidranges
);
2814 * Account for any unmapped blocks. Note that the new
2815 * file length can still end up with unmapped blocks.
2817 if (fp
->ff_unallocblocks
> 0) {
2818 u_int32_t finalblks
;
2819 u_int32_t loanedBlocks
;
2821 HFS_MOUNT_LOCK(hfsmp
, TRUE
);
2823 loanedBlocks
= fp
->ff_unallocblocks
;
2824 cp
->c_blocks
-= loanedBlocks
;
2825 fp
->ff_blocks
-= loanedBlocks
;
2826 fp
->ff_unallocblocks
= 0;
2828 hfsmp
->loanedBlocks
-= loanedBlocks
;
2830 finalblks
= (length
+ blksize
- 1) / blksize
;
2831 if (finalblks
> fp
->ff_blocks
) {
2832 /* calculate required unmapped blocks */
2833 loanedBlocks
= finalblks
- fp
->ff_blocks
;
2834 hfsmp
->loanedBlocks
+= loanedBlocks
;
2836 fp
->ff_unallocblocks
= loanedBlocks
;
2837 cp
->c_blocks
+= loanedBlocks
;
2838 fp
->ff_blocks
+= loanedBlocks
;
2840 HFS_MOUNT_UNLOCK(hfsmp
, TRUE
);
2844 * For a TBE process the deallocation of the file blocks is
2845 * delayed until the file is closed. And hfs_close calls
2846 * truncate with the IO_NDELAY flag set. So when IO_NDELAY
2847 * isn't set, we make sure this isn't a TBE process.
2849 if ((flags
& IO_NDELAY
) || (proc_tbe(p
) == 0)) {
2851 off_t savedbytes
= ((off_t
)fp
->ff_blocks
* (off_t
)blksize
);
2853 if (hfs_start_transaction(hfsmp
) != 0) {
2858 if (fp
->ff_unallocblocks
== 0) {
2859 /* Protect extents b-tree and allocation bitmap */
2860 lockflags
= SFL_BITMAP
;
2861 if (overflow_extents(fp
))
2862 lockflags
|= SFL_EXTENTS
;
2863 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
2865 retval
= MacToVFSError(TruncateFileC(VTOVCB(vp
), (FCB
*)fp
, length
, 0,
2866 FORK_IS_RSRC (fp
), FTOC(fp
)->c_fileid
, false));
2868 hfs_systemfile_unlock(hfsmp
, lockflags
);
2872 fp
->ff_size
= length
;
2875 (void) hfs_minorupdate(vp
);
2878 (void) hfs_update(vp
, TRUE
);
2879 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
2882 hfs_end_transaction(hfsmp
);
2884 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)blksize
;
2888 /* These are bytesreleased */
2889 (void) hfs_chkdq(cp
, (int64_t)-(savedbytes
- filebytes
), NOCRED
, 0);
2892 /* Only set update flag if the logical length changes */
2893 if ((off_t
)fp
->ff_size
!= length
)
2894 cp
->c_touch_modtime
= TRUE
;
2895 fp
->ff_size
= length
;
2897 if (cp
->c_mode
& (S_ISUID
| S_ISGID
)) {
2898 if (!vfs_context_issuser(context
)) {
2899 cp
->c_mode
&= ~(S_ISUID
| S_ISGID
);
2904 retval
= hfs_minorupdate(vp
);
2907 cp
->c_touch_chgtime
= TRUE
; /* status changed */
2908 cp
->c_touch_modtime
= TRUE
; /* file data was modified */
2909 retval
= hfs_update(vp
, MNT_WAIT
);
2912 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 7)) | DBG_FUNC_NONE
,
2913 -1, -1, -1, retval
, 0);
2918 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 7)) | DBG_FUNC_END
,
2919 (int)length
, (int)fp
->ff_size
, (int)filebytes
, retval
, 0);
/*
 * Preparation which must be done prior to deleting the catalog record
 * of a file or directory.  In order to make the on-disk state as safe as
 * possible, we remove the catalog entry before releasing the bitmap blocks
 * and the overflow extent records.  However, some work must be done prior
 * to deleting the catalog record.
 *
 * When calling this function, the cnode must exist both in memory and on-disk.
 * If there are both resource fork and data fork vnodes, this function should
 * be called on both.
 */
hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {

	struct filefork *fp = VTOF(vp);
	struct cnode *cp = VTOC(vp);

	/* Cannot truncate an HFS directory! */
	if (vnode_isdir(vp)) {
		return (EISDIR);
	}

	/*
	 * See the comment below in hfs_truncate for why we need to call
	 * setsize here.  Essentially we want to avoid pending IO if we
	 * already know that the blocks are going to be released here.
	 * This function is only called when totally removing all storage for a file, so
	 * we can take a shortcut and immediately setsize (0);
	 */
	ubc_setsize(vp, 0);
2957 /* This should only happen with a corrupt filesystem */
2958 if ((off_t
)fp
->ff_size
< 0)
2962 * We cannot just check if fp->ff_size == length (as an optimization)
2963 * since there may be extra physical blocks that also need truncation.
2966 if ((retval
= hfs_getinoquota(cp
))) {
2971 /* Wipe out any invalid ranges which have yet to be backed by disk */
2972 rl_remove(0, fp
->ff_size
- 1, &fp
->ff_invalidranges
);
2975 * Account for any unmapped blocks. Since we're deleting the
2976 * entire file, we don't have to worry about just shrinking
2977 * to a smaller number of borrowed blocks.
2979 if (fp
->ff_unallocblocks
> 0) {
2980 u_int32_t loanedBlocks
;
2982 HFS_MOUNT_LOCK(hfsmp
, TRUE
);
2984 loanedBlocks
= fp
->ff_unallocblocks
;
2985 cp
->c_blocks
-= loanedBlocks
;
2986 fp
->ff_blocks
-= loanedBlocks
;
2987 fp
->ff_unallocblocks
= 0;
2989 hfsmp
->loanedBlocks
-= loanedBlocks
;
2991 HFS_MOUNT_UNLOCK(hfsmp
, TRUE
);
/*
 * Special wrapper around calling TruncateFileC.  This function is usable
 * even when the catalog record does not exist any longer, making it ideal
 * for use when deleting a file.  The simplification here is that we know
 * that we are releasing all blocks.
 *
 * The caller is responsible for saving off a copy of the filefork(s)
 * embedded within the cnode prior to calling this function.  The pointers
 * supplied as arguments must be valid even if the cnode is no longer valid.
 */
hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
		     struct filefork *rsrcfork, u_int32_t fileid) {
3014 u_int32_t fileblocks
;
3019 blksize
= hfsmp
->blockSize
;
3022 if (datafork
->ff_blocks
> 0) {
3023 fileblocks
= datafork
->ff_blocks
;
3024 filebytes
= (off_t
)fileblocks
* (off_t
)blksize
;
3026 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3028 while (filebytes
> 0) {
3029 if (filebytes
> HFS_BIGFILE_SIZE
&& overflow_extents(datafork
)) {
3030 filebytes
-= HFS_BIGFILE_SIZE
;
3035 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3036 if (hfs_start_transaction(hfsmp
) != 0) {
3041 if (datafork
->ff_unallocblocks
== 0) {
3042 /* Protect extents b-tree and allocation bitmap */
3043 lockflags
= SFL_BITMAP
;
3044 if (overflow_extents(datafork
))
3045 lockflags
|= SFL_EXTENTS
;
3046 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
3048 error
= MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp
), datafork
, filebytes
, 1, 0, fileid
, false));
3050 hfs_systemfile_unlock(hfsmp
, lockflags
);
3053 datafork
->ff_size
= filebytes
;
3055 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3057 /* Finish the transaction and start over if necessary */
3058 hfs_end_transaction(hfsmp
);
3067 if (error
== 0 && (rsrcfork
!= NULL
) && rsrcfork
->ff_blocks
> 0) {
3068 fileblocks
= rsrcfork
->ff_blocks
;
3069 filebytes
= (off_t
)fileblocks
* (off_t
)blksize
;
3071 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3073 while (filebytes
> 0) {
3074 if (filebytes
> HFS_BIGFILE_SIZE
&& overflow_extents(rsrcfork
)) {
3075 filebytes
-= HFS_BIGFILE_SIZE
;
3080 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3081 if (hfs_start_transaction(hfsmp
) != 0) {
3086 if (rsrcfork
->ff_unallocblocks
== 0) {
3087 /* Protect extents b-tree and allocation bitmap */
3088 lockflags
= SFL_BITMAP
;
3089 if (overflow_extents(rsrcfork
))
3090 lockflags
|= SFL_EXTENTS
;
3091 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
3093 error
= MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp
), rsrcfork
, filebytes
, 1, 1, fileid
, false));
3095 hfs_systemfile_unlock(hfsmp
, lockflags
);
3098 rsrcfork
->ff_size
= filebytes
;
3100 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3102 /* Finish the transaction and start over if necessary */
3103 hfs_end_transaction(hfsmp
);
/*
 * Truncate a cnode to at most length size, freeing (or adding) the
 * disk blocks.
 */
hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
             int skipupdate, vfs_context_t context)
{
	struct filefork *fp = VTOF(vp);
	u_int32_t fileblocks;
	int blksize, error = 0;
	struct cnode *cp = VTOC(vp);

	/* Cannot truncate an HFS directory! */
	if (vnode_isdir(vp)) {
		return (EISDIR);
	}
	/* A swap file cannot change size. */
	if (vnode_isswap(vp) && (length != 0)) {
		return (EPERM);
	}

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	//
	// Have to do this here so that we don't wind up with
	// i/o pending for blocks that are about to be released
	// if we truncate the file.
	//
	// If skipsetsize is set, then the caller is responsible
	// for the ubc_setsize.
	//
	// Even if skipsetsize is set, if the length is zero we
	// want to call ubc_setsize() because as of SnowLeopard
	// it will no longer cause any page-ins and it will drop
	// any dirty pages so that we don't do any i/o that we
	// don't have to.  This also prevents a race where i/o
	// for truncated blocks may overwrite later data if the
	// blocks get reallocated to a different file.
	//
	if (!skipsetsize || length == 0)
		ubc_setsize(vp, length);

	// have to loop truncating or growing files that are
	// really big because otherwise transactions can get
	// enormous and consume too many kernel resources.
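	//
	// Illustrative note: HFS_BIGFILE_SIZE caps the work done per journal
	// transaction, so shrinking a fragmented file by roughly three times
	// HFS_BIGFILE_SIZE results in three separate do_hfs_truncate() calls
	// below, each wrapped in its own transaction.
	//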
	if (length < filebytes) {
		while (filebytes > length) {
			if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
				filebytes -= HFS_BIGFILE_SIZE;
			} else {
				filebytes = length;
			}
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
			if (error)
				break;
		}
	} else if (length > filebytes) {
		while (filebytes < length) {
			if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
				filebytes += HFS_BIGFILE_SIZE;
			} else {
				filebytes = length;
			}
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
			if (error)
				break;
		}
	} else /* Same logical size */ {

		error = do_hfs_truncate(vp, length, flags, skipupdate, context);
	}
	/* Files that are changing size are not hot file candidates. */
	if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;
	}
3204 * Preallocate file storage space.
3207 hfs_vnop_allocate(struct vnop_allocate_args
/* {
3211 off_t *a_bytesallocated;
3213 vfs_context_t a_context;
3216 struct vnode
*vp
= ap
->a_vp
;
3218 struct filefork
*fp
;
3220 off_t length
= ap
->a_length
;
3222 off_t moreBytesRequested
;
3223 off_t actualBytesAdded
;
3225 u_int32_t fileblocks
;
3226 int retval
, retval2
;
3227 u_int32_t blockHint
;
3228 u_int32_t extendFlags
; /* For call to ExtendFileC */
3229 struct hfsmount
*hfsmp
;
3230 kauth_cred_t cred
= vfs_context_ucred(ap
->a_context
);
3234 *(ap
->a_bytesallocated
) = 0;
3236 if (!vnode_isreg(vp
))
3238 if (length
< (off_t
)0)
3243 orig_ctime
= VTOC(vp
)->c_ctime
;
3245 check_for_tracked_file(vp
, orig_ctime
, ap
->a_length
== 0 ? NAMESPACE_HANDLER_TRUNCATE_OP
|NAMESPACE_HANDLER_DELETE_OP
: NAMESPACE_HANDLER_TRUNCATE_OP
, NULL
);
3247 hfs_lock_truncate(cp
, HFS_EXCLUSIVE_LOCK
);
3249 if ((retval
= hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
))) {
3257 fileblocks
= fp
->ff_blocks
;
3258 filebytes
= (off_t
)fileblocks
* (off_t
)vcb
->blockSize
;
3260 if ((ap
->a_flags
& ALLOCATEFROMVOL
) && (length
< filebytes
)) {
3265 /* Fill in the flags word for the call to Extend the file */
3267 extendFlags
= kEFNoClumpMask
;
3268 if (ap
->a_flags
& ALLOCATECONTIG
)
3269 extendFlags
|= kEFContigMask
;
3270 if (ap
->a_flags
& ALLOCATEALL
)
3271 extendFlags
|= kEFAllMask
;
3272 if (cred
&& suser(cred
, NULL
) != 0)
3273 extendFlags
|= kEFReserveMask
;
3274 if (hfs_virtualmetafile(cp
))
3275 extendFlags
|= kEFMetadataMask
;
3279 startingPEOF
= filebytes
;
3281 if (ap
->a_flags
& ALLOCATEFROMPEOF
)
3282 length
+= filebytes
;
3283 else if (ap
->a_flags
& ALLOCATEFROMVOL
)
3284 blockHint
= ap
->a_offset
/ VTOVCB(vp
)->blockSize
;
	/* If no changes are necessary, then we're done */
3287 if (filebytes
== length
)
3291 * Lengthen the size of the file. We must ensure that the
3292 * last byte of the file is allocated. Since the smallest
3293 * value of filebytes is 0, length will be at least 1.
3295 if (length
> filebytes
) {
3296 off_t total_bytes_added
= 0, orig_request_size
;
3298 orig_request_size
= moreBytesRequested
= length
- filebytes
;
3301 retval
= hfs_chkdq(cp
,
3302 (int64_t)(roundup(moreBytesRequested
, vcb
->blockSize
)),
3309 * Metadata zone checks.
3311 if (hfsmp
->hfs_flags
& HFS_METADATA_ZONE
) {
3313 * Allocate Journal and Quota files in metadata zone.
3315 if (hfs_virtualmetafile(cp
)) {
3316 blockHint
= hfsmp
->hfs_metazone_start
;
3317 } else if ((blockHint
>= hfsmp
->hfs_metazone_start
) &&
3318 (blockHint
<= hfsmp
->hfs_metazone_end
)) {
3320 * Move blockHint outside metadata zone.
3322 blockHint
= hfsmp
->hfs_metazone_end
+ 1;
3327 while ((length
> filebytes
) && (retval
== E_NONE
)) {
3328 off_t bytesRequested
;
3330 if (hfs_start_transaction(hfsmp
) != 0) {
3335 /* Protect extents b-tree and allocation bitmap */
3336 lockflags
= SFL_BITMAP
;
3337 if (overflow_extents(fp
))
3338 lockflags
|= SFL_EXTENTS
;
3339 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
3341 if (moreBytesRequested
>= HFS_BIGFILE_SIZE
) {
3342 bytesRequested
= HFS_BIGFILE_SIZE
;
3344 bytesRequested
= moreBytesRequested
;
3347 if (extendFlags
& kEFContigMask
) {
3348 // if we're on a sparse device, this will force it to do a
3349 // full scan to find the space needed.
3350 hfsmp
->hfs_flags
&= ~HFS_DID_CONTIG_SCAN
;
3353 retval
= MacToVFSError(ExtendFileC(vcb
,
3358 &actualBytesAdded
));
3360 if (retval
== E_NONE
) {
3361 *(ap
->a_bytesallocated
) += actualBytesAdded
;
3362 total_bytes_added
+= actualBytesAdded
;
3363 moreBytesRequested
-= actualBytesAdded
;
3364 if (blockHint
!= 0) {
3365 blockHint
+= actualBytesAdded
/ vcb
->blockSize
;
3368 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)vcb
->blockSize
;
3370 hfs_systemfile_unlock(hfsmp
, lockflags
);
3373 (void) hfs_update(vp
, TRUE
);
3374 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
3377 hfs_end_transaction(hfsmp
);
3382 * if we get an error and no changes were made then exit
3383 * otherwise we must do the hfs_update to reflect the changes
3385 if (retval
&& (startingPEOF
== filebytes
))
3389 * Adjust actualBytesAdded to be allocation block aligned, not
3390 * clump size aligned.
3391 * NOTE: So what we are reporting does not affect reality
3392 * until the file is closed, when we truncate the file to allocation
3395 if (total_bytes_added
!= 0 && orig_request_size
< total_bytes_added
)
3396 *(ap
->a_bytesallocated
) =
3397 roundup(orig_request_size
, (off_t
)vcb
->blockSize
);
3399 } else { /* Shorten the size of the file */
3401 if (fp
->ff_size
> length
) {
3403 * Any buffers that are past the truncation point need to be
3404 * invalidated (to maintain buffer cache consistency).
3408 retval
= hfs_truncate(vp
, length
, 0, 0, 0, ap
->a_context
);
3409 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)vcb
->blockSize
;
3412 * if we get an error and no changes were made then exit
3413 * otherwise we must do the hfs_update to reflect the changes
3415 if (retval
&& (startingPEOF
== filebytes
)) goto Err_Exit
;
3417 /* These are bytesreleased */
3418 (void) hfs_chkdq(cp
, (int64_t)-((startingPEOF
- filebytes
)), NOCRED
,0);
3421 if (fp
->ff_size
> filebytes
) {
3422 fp
->ff_size
= filebytes
;
3425 ubc_setsize(vp
, fp
->ff_size
);
3426 hfs_lock(cp
, HFS_FORCE_LOCK
);
3431 cp
->c_touch_chgtime
= TRUE
;
3432 cp
->c_touch_modtime
= TRUE
;
3433 retval2
= hfs_update(vp
, MNT_WAIT
);
3438 hfs_unlock_truncate(cp
, 0);
3445 * Pagein for HFS filesystem
3448 hfs_vnop_pagein(struct vnop_pagein_args
*ap
)
3450 struct vnop_pagein_args {
3453 vm_offset_t a_pl_offset,
3457 vfs_context_t a_context;
3463 struct filefork
*fp
;
3466 upl_page_info_t
*pl
;
3471 boolean_t truncate_lock_held
= FALSE
;
3472 boolean_t file_converted
= FALSE
;
3480 if ((error
= cp_handle_vnop(cp
, CP_READ_ACCESS
| CP_WRITE_ACCESS
)) != 0) {
3483 #endif /* CONFIG_PROTECT */
3485 if (ap
->a_pl
!= NULL
) {
3487 * this can only happen for swap files now that
3488 * we're asking for V2 paging behavior...
3489 * so don't need to worry about decompression, or
3490 * keeping track of blocks read or taking the truncate lock
3492 error
= cluster_pagein(vp
, ap
->a_pl
, ap
->a_pl_offset
, ap
->a_f_offset
,
3493 ap
->a_size
, (off_t
)fp
->ff_size
, ap
->a_flags
);
3499 * take truncate lock (shared/recursive) to guard against
3500 * zero-fill thru fsync interfering, but only for v2
3502 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
3503 * lock shared and we are allowed to recurse 1 level if this thread already
3504 * owns the lock exclusively... this can legally occur
3505 * if we are doing a shrinking ftruncate against a file
3506 * that is mapped private, and the pages being truncated
3507 * do not currently exist in the cache... in that case
3508 * we will have to page-in the missing pages in order
3509 * to provide them to the private mapping... we must
	 * also call hfs_unlock_truncate with a positive been_recursed
3511 * arg to indicate that if we have recursed, there is no need to drop
3512 * the lock. Allowing this simple recursion is necessary
3513 * in order to avoid a certain deadlock... since the ftruncate
3514 * already holds the truncate lock exclusively, if we try
3515 * to acquire it shared to protect the pagein path, we will
3518 * NOTE: The if () block below is a workaround in order to prevent a
3519 * VM deadlock. See rdar://7853471.
3521 * If we are in a forced unmount, then launchd will still have the
3522 * dyld_shared_cache file mapped as it is trying to reboot. If we
3523 * take the truncate lock here to service a page fault, then our
3524 * thread could deadlock with the forced-unmount. The forced unmount
3525 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
3526 * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount
3527 * thread will think it needs to copy all of the data out of the file
3528 * and into a VM copy object. If we hold the cnode lock here, then that
3529 * VM operation will not be able to proceed, because we'll set a busy page
3530 * before attempting to grab the lock. Note that this isn't as simple as "don't
3531 * call ubc_setsize" because doing that would just shift the problem to the
3532 * ubc_msync done before the vnode is reclaimed.
3534 * So, if a forced unmount on this volume is in flight AND the cnode is
3535 * marked C_DELETED, then just go ahead and do the page in without taking
3536 * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file
	 * that is not going to be available on the next mount, this seems like an
	 * OK solution from a correctness point of view, even though it is hacky.
3540 if (vfs_isforce(vp
->v_mount
)) {
3541 if (cp
->c_flag
& C_DELETED
) {
3542 /* If we don't get it, then just go ahead and operate without the lock */
3543 truncate_lock_held
= hfs_try_trunclock(cp
, HFS_RECURSE_TRUNCLOCK
);
3547 hfs_lock_truncate(cp
, HFS_RECURSE_TRUNCLOCK
);
3548 truncate_lock_held
= TRUE
;
3551 kret
= ubc_create_upl(vp
, ap
->a_f_offset
, ap
->a_size
, &upl
, &pl
, UPL_UBC_PAGEIN
| UPL_RET_ONLY_ABSENT
);
3553 if ((kret
!= KERN_SUCCESS
) || (upl
== (upl_t
) NULL
)) {
3560 * Scan from the back to find the last page in the UPL, so that we
3561 * aren't looking at a UPL that may have already been freed by the
3562 * preceding aborts/completions.
3564 for (pg_index
= ((isize
) / PAGE_SIZE
); pg_index
> 0;) {
3565 if (upl_page_present(pl
, --pg_index
))
3567 if (pg_index
== 0) {
3569 * no absent pages were found in the range specified
3570 * just abort the UPL to get rid of it and then we're done
3572 ubc_upl_abort_range(upl
, 0, isize
, UPL_ABORT_FREE_ON_EMPTY
);
3577 * initialize the offset variables before we touch the UPL.
3578 * f_offset is the position into the file, in bytes
3579 * offset is the position into the UPL, in bytes
3580 * pg_index is the pg# of the UPL we're operating on
3581 * isize is the offset into the UPL of the last page that is present.
3583 isize
= ((pg_index
+ 1) * PAGE_SIZE
);
3586 f_offset
= ap
->a_f_offset
;
3592 if ( !upl_page_present(pl
, pg_index
)) {
3594 * we asked for RET_ONLY_ABSENT, so it's possible
3595 * to get back empty slots in the UPL.
3596 * just skip over them
3598 f_offset
+= PAGE_SIZE
;
3599 offset
+= PAGE_SIZE
;
3606 * We know that we have at least one absent page.
3607 * Now checking to see how many in a row we have
3610 xsize
= isize
- PAGE_SIZE
;
3613 if ( !upl_page_present(pl
, pg_index
+ num_of_pages
))
3618 xsize
= num_of_pages
* PAGE_SIZE
;
3621 if (VNODE_IS_RSRC(vp
)) {
3622 /* allow pageins of the resource fork */
3624 int compressed
= hfs_file_is_compressed(VTOC(vp
), 1); /* 1 == don't take the cnode lock */
3627 if (truncate_lock_held
) {
3629 * can't hold the truncate lock when calling into the decmpfs layer
3630 * since it calls back into this layer... even though we're only
3631 * holding the lock in shared mode, and the re-entrant path only
3632 * takes the lock shared, we can deadlock if some other thread
3633 * tries to grab the lock exclusively in between.
3635 hfs_unlock_truncate(cp
, 1);
3636 truncate_lock_held
= FALSE
;
3639 ap
->a_pl_offset
= offset
;
3640 ap
->a_f_offset
= f_offset
;
3643 error
= decmpfs_pagein_compressed(ap
, &compressed
, VTOCMP(vp
));
				 * note that decmpfs_pagein_compressed can change the state of
3646 * 'compressed'... it will set it to 0 if the file is no longer
3647 * compressed once the compression lock is successfully taken
3648 * i.e. we would block on that lock while the file is being inflated
3652 /* successful page-in, update the access time */
3653 VTOC(vp
)->c_touch_acctime
= TRUE
;
3655 /* compressed files are not hot file candidates */
3656 if (VTOHFS(vp
)->hfc_stage
== HFC_RECORDING
) {
3657 fp
->ff_bytesread
= 0;
3659 } else if (error
== EAGAIN
) {
3661 * EAGAIN indicates someone else already holds the compression lock...
3662 * to avoid deadlocking, we'll abort this range of pages with an
3663 * indication that the pagein needs to be redriven
3665 ubc_upl_abort_range(upl
, (upl_offset_t
) offset
, xsize
, UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_RESTART
);
3667 goto pagein_next_range
;
3671 * Set file_converted only if the file became decompressed while we were
3672 * paging in. If it were still compressed, we would re-start the loop using the goto
				 * in the above block.  This avoids overloading truncate_lock_held as our retry_pagein
3674 * condition below, since we could have avoided taking the truncate lock to prevent
3675 * a deadlock in the force unmount case.
3677 file_converted
= TRUE
;
3680 if (file_converted
== TRUE
) {
3682 * the file was converted back to a regular file after we first saw it as compressed
3683 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
3684 * reset a_size so that we consider what remains of the original request
3685 * and null out a_upl and a_pl_offset.
3687 * We should only be able to get into this block if the decmpfs_pagein_compressed
3688 * successfully decompressed the range in question for this file.
3690 ubc_upl_abort_range(upl
, (upl_offset_t
) offset
, isize
, UPL_ABORT_FREE_ON_EMPTY
);
3694 ap
->a_pl_offset
= 0;
3696 /* Reset file_converted back to false so that we don't infinite-loop. */
3697 file_converted
= FALSE
;
3702 error
= cluster_pagein(vp
, upl
, offset
, f_offset
, xsize
, (off_t
)fp
->ff_size
, ap
->a_flags
);
3705 * Keep track of blocks read.
3707 if ( !vnode_isswap(vp
) && VTOHFS(vp
)->hfc_stage
== HFC_RECORDING
&& error
== 0) {
3709 int took_cnode_lock
= 0;
3711 if (ap
->a_f_offset
== 0 && fp
->ff_size
< PAGE_SIZE
)
3712 bytesread
= fp
->ff_size
;
3716 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
3717 if ((fp
->ff_bytesread
+ bytesread
) > 0x00000000ffffffff && cp
->c_lockowner
!= current_thread()) {
3718 hfs_lock(cp
, HFS_FORCE_LOCK
);
3719 took_cnode_lock
= 1;
3722 * If this file hasn't been seen since the start of
3723 * the current sampling period then start over.
3725 if (cp
->c_atime
< VTOHFS(vp
)->hfc_timebase
) {
3728 fp
->ff_bytesread
= bytesread
;
3730 cp
->c_atime
= tv
.tv_sec
;
3732 fp
->ff_bytesread
+= bytesread
;
3734 cp
->c_touch_acctime
= TRUE
;
3735 if (took_cnode_lock
)
3742 pg_index
+= num_of_pages
;
3748 if (truncate_lock_held
== TRUE
) {
3749 /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
3750 hfs_unlock_truncate(cp
, 1);
3757 * Pageout for HFS filesystem.
3760 hfs_vnop_pageout(struct vnop_pageout_args
*ap
)
3762 struct vnop_pageout_args {
3765 vm_offset_t a_pl_offset,
3769 vfs_context_t a_context;
3773 vnode_t vp
= ap
->a_vp
;
3775 struct filefork
*fp
;
3779 upl_page_info_t
* pl
;
3780 vm_offset_t a_pl_offset
;
3782 int is_pageoutv2
= 0;
3789 * Figure out where the file ends, for pageout purposes. If
3790 * ff_new_size > ff_size, then we're in the middle of extending the
3791 * file via a write, so it is safe (and necessary) that we be able
3792 * to pageout up to that point.
3794 filesize
= fp
->ff_size
;
3795 if (fp
->ff_new_size
> filesize
)
3796 filesize
= fp
->ff_new_size
;
3798 a_flags
= ap
->a_flags
;
3799 a_pl_offset
= ap
->a_pl_offset
;
3802 * we can tell if we're getting the new or old behavior from the UPL
3804 if ((upl
= ap
->a_pl
) == NULL
) {
3809 * we're in control of any UPL we commit
3810 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
3812 a_flags
&= ~UPL_NOCOMMIT
;
3816 * take truncate lock (shared) to guard against
3817 * zero-fill thru fsync interfering, but only for v2
3819 hfs_lock_truncate(cp
, HFS_SHARED_LOCK
);
3821 if (a_flags
& UPL_MSYNC
) {
3822 request_flags
= UPL_UBC_MSYNC
| UPL_RET_ONLY_DIRTY
;
3825 request_flags
= UPL_UBC_PAGEOUT
| UPL_RET_ONLY_DIRTY
;
3828 kret
= ubc_create_upl(vp
, ap
->a_f_offset
, ap
->a_size
, &upl
, &pl
, request_flags
);
3830 if ((kret
!= KERN_SUCCESS
) || (upl
== (upl_t
) NULL
)) {
3836 * from this point forward upl points at the UPL we're working with
	 * it was either passed in or we successfully created it
3841 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
3842 * UPL instead of relying on the UPL passed into us. We go ahead and do that here,
3843 * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for
3844 * N dirty ranges in the UPL. Note that this is almost a direct copy of the
3845 * logic in vnode_pageout except that we need to do it after grabbing the truncate
3846 * lock in HFS so that we don't lock invert ourselves.
3848 * Note that we can still get into this function on behalf of the default pager with
3849 * non-V2 behavior (swapfiles). However in that case, we did not grab locks above
3850 * since fsync and other writing threads will grab the locks, then mark the
3851 * relevant pages as busy. But the pageout codepath marks the pages as busy,
3852 * and THEN would attempt to grab the truncate lock, which would result in deadlock. So
3853 * we do not try to grab anything for the pre-V2 case, which should only be accessed
3854 * by the paging/VM system.
3866 f_offset
= ap
->a_f_offset
;
3869 * Scan from the back to find the last page in the UPL, so that we
3870 * aren't looking at a UPL that may have already been freed by the
3871 * preceding aborts/completions.
3873 for (pg_index
= ((isize
) / PAGE_SIZE
); pg_index
> 0;) {
3874 if (upl_page_present(pl
, --pg_index
))
3876 if (pg_index
== 0) {
3877 ubc_upl_abort_range(upl
, 0, isize
, UPL_ABORT_FREE_ON_EMPTY
);
3883 * initialize the offset variables before we touch the UPL.
3884 * a_f_offset is the position into the file, in bytes
3885 * offset is the position into the UPL, in bytes
3886 * pg_index is the pg# of the UPL we're operating on.
3887 * isize is the offset into the UPL of the last non-clean page.
3889 isize
= ((pg_index
+ 1) * PAGE_SIZE
);
3898 if ( !upl_page_present(pl
, pg_index
)) {
3900 * we asked for RET_ONLY_DIRTY, so it's possible
3901 * to get back empty slots in the UPL.
3902 * just skip over them
3904 f_offset
+= PAGE_SIZE
;
3905 offset
+= PAGE_SIZE
;
3911 if ( !upl_dirty_page(pl
, pg_index
)) {
3912 panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index
, upl
);
3916 * We know that we have at least one dirty page.
3917 * Now checking to see how many in a row we have
3920 xsize
= isize
- PAGE_SIZE
;
3923 if ( !upl_dirty_page(pl
, pg_index
+ num_of_pages
))
3928 xsize
= num_of_pages
* PAGE_SIZE
;
3930 if (!vnode_isswap(vp
)) {
3936 if (cp
->c_lockowner
!= current_thread()) {
3937 if ((retval
= hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
))) {
3939 * we're in the v2 path, so we are the
3940 * owner of the UPL... we may have already
3941 * processed some of the UPL, so abort it
3942 * from the current working offset to the
3945 ubc_upl_abort_range(upl
,
3947 ap
->a_size
- offset
,
3948 UPL_ABORT_FREE_ON_EMPTY
);
3953 end_of_range
= f_offset
+ xsize
- 1;
3955 if (end_of_range
>= filesize
) {
3956 end_of_range
= (off_t
)(filesize
- 1);
3958 if (f_offset
< filesize
) {
3959 rl_remove(f_offset
, end_of_range
, &fp
->ff_invalidranges
);
3960 cp
->c_flag
|= C_MODIFIED
; /* leof is dirty */
3966 if ((error
= cluster_pageout(vp
, upl
, offset
, f_offset
,
3967 xsize
, filesize
, a_flags
))) {
3974 pg_index
+= num_of_pages
;
3976 /* capture errnos bubbled out of cluster_pageout if they occurred */
3977 if (error_ret
!= 0) {
3980 } /* end block for v2 pageout behavior */
3982 if (!vnode_isswap(vp
)) {
3986 if (cp
->c_lockowner
!= current_thread()) {
3987 if ((retval
= hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
))) {
3988 if (!(a_flags
& UPL_NOCOMMIT
)) {
3989 ubc_upl_abort_range(upl
,
3992 UPL_ABORT_FREE_ON_EMPTY
);
3998 end_of_range
= ap
->a_f_offset
+ ap
->a_size
- 1;
4000 if (end_of_range
>= filesize
) {
4001 end_of_range
= (off_t
)(filesize
- 1);
4003 if (ap
->a_f_offset
< filesize
) {
4004 rl_remove(ap
->a_f_offset
, end_of_range
, &fp
->ff_invalidranges
);
4005 cp
->c_flag
|= C_MODIFIED
; /* leof is dirty */
4013 * just call cluster_pageout for old pre-v2 behavior
4015 retval
= cluster_pageout(vp
, upl
, a_pl_offset
, ap
->a_f_offset
,
4016 ap
->a_size
, filesize
, a_flags
);
4020 * If data was written, update the modification time of the file.
4021 * If setuid or setgid bits are set and this process is not the
4022 * superuser then clear the setuid and setgid bits as a precaution
4023 * against tampering.
4026 cp
->c_touch_modtime
= TRUE
;
4027 cp
->c_touch_chgtime
= TRUE
;
4028 if ((cp
->c_mode
& (S_ISUID
| S_ISGID
)) &&
4029 (vfs_context_suser(ap
->a_context
) != 0)) {
4030 hfs_lock(cp
, HFS_FORCE_LOCK
);
4031 cp
->c_mode
&= ~(S_ISUID
| S_ISGID
);
4038 /* release truncate lock (shared) */
4039 hfs_unlock_truncate(cp
, 0);
/*
 * Intercept B-Tree node writes to unswap them if necessary.
 */
hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
{
	register struct buf *bp = ap->a_bp;
	register struct vnode *vp = buf_vnode(bp);
	BlockDescriptor block;

	/* Trap B-Tree writes */
	if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
	    (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
	    (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
	    (vp == VTOHFS(vp)->hfc_filevp)) {
		/*
		 * Swap and validate the node if it is in native byte order.
		 * This is always true on big endian, so we always validate
		 * before writing here.  On little endian, the node typically has
		 * been swapped and validated when it was written to the journal,
		 * so we won't do anything here.
		 */
		if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
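			/*
			 * Explanatory note: the last two bytes of a B-tree node hold the
			 * offset of record 0, which is always 14 (sizeof(BTNodeDescriptor)).
			 * Reading 0x000e there as a host-order u_int16_t therefore indicates
			 * the node is still in host byte order and must be swapped to big
			 * endian before it goes to disk.
			 */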
			/* Prepare the block pointer */
			block.blockHeader = bp;
			block.buffer = (char *)buf_dataptr(bp);
			block.blockNum = buf_lblkno(bp);
			/* not found in cache ==> came from disk */
			block.blockReadFromDisk = (buf_fromcache(bp) == 0);
			block.blockSize = buf_count(bp);

			/* Endian un-swap B-Tree node */
			retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
			if (retval)
				panic("hfs_vnop_bwrite: about to write corrupt node!\n");
		}
	}

	/* This buffer shouldn't be locked anymore but if it is clear it */
	if ((buf_flags(bp) & B_LOCKED)) {
		if (VTOHFS(vp)->jnl) {
			panic("hfs: CLEARING the lock bit on bp %p\n", bp);
		}
		buf_clearflags(bp, B_LOCKED);
	}
	retval = vn_bwrite (ap);
/*
 * Relocate a file to a new location on disk
 *  cnode must be locked on entry
 *
 * Relocation occurs by cloning the file's data from its
 * current set of blocks to a new set of blocks. During
 * the relocation all of the blocks (old and new) are
 * owned by the file.
 *
 * -----------------     -----------------
 * |///////////////|     |               |     STEP 1 (acquire new blocks)
 * -----------------     -----------------
 *
 * -----------------     -----------------
 * |///////////////|     |///////////////|     STEP 2 (clone data)
 * -----------------     -----------------
 *
 *                       -----------------
 *                       |///////////////|     STEP 3 (head truncate blocks)
 *                       -----------------
 *
 * During steps 2 and 3 page-outs to file offsets less
 * than or equal to N are suspended.
 *
 * During step 3 page-ins to the file get suspended.
 */
4132 hfs_relocate(struct vnode
*vp
, u_int32_t blockHint
, kauth_cred_t cred
,
4136 struct filefork
*fp
;
4137 struct hfsmount
*hfsmp
;
4142 u_int32_t nextallocsave
;
4143 daddr64_t sector_a
, sector_b
;
4148 int took_trunc_lock
= 0;
4150 enum vtype vnodetype
;
4152 vnodetype
= vnode_vtype(vp
);
4153 if (vnodetype
!= VREG
&& vnodetype
!= VLNK
) {
4158 if (hfsmp
->hfs_flags
& HFS_FRAGMENTED_FREESPACE
) {
4164 if (fp
->ff_unallocblocks
)
4169 * <rdar://problem/9118426>
4170 * Disable HFS file relocation on content-protected filesystems
4172 if (cp_fs_protected (hfsmp
->hfs_mp
)) {
4177 /* If it's an SSD, also disable HFS relocation */
4178 if (hfsmp
->hfs_flags
& HFS_SSD
) {
4182 blksize
= hfsmp
->blockSize
;
4184 blockHint
= hfsmp
->nextAllocation
;
4186 if ((fp
->ff_size
> 0x7fffffff) ||
4187 ((fp
->ff_size
> blksize
) && vnodetype
== VLNK
)) {
4192 // We do not believe that this call to hfs_fsync() is
4193 // necessary and it causes a journal transaction
4194 // deadlock so we are removing it.
4196 //if (vnodetype == VREG && !vnode_issystem(vp)) {
4197 // retval = hfs_fsync(vp, MNT_WAIT, 0, p);
4202 if (!vnode_issystem(vp
) && (vnodetype
!= VLNK
)) {
4204 hfs_lock_truncate(cp
, HFS_EXCLUSIVE_LOCK
);
4205 /* Force lock since callers expects lock to be held. */
4206 if ((retval
= hfs_lock(cp
, HFS_FORCE_LOCK
))) {
4207 hfs_unlock_truncate(cp
, 0);
4210 /* No need to continue if file was removed. */
4211 if (cp
->c_flag
& C_NOEXISTS
) {
4212 hfs_unlock_truncate(cp
, 0);
4215 took_trunc_lock
= 1;
4217 headblks
= fp
->ff_blocks
;
4218 datablks
= howmany(fp
->ff_size
, blksize
);
4219 growsize
= datablks
* blksize
;
4220 eflags
= kEFContigMask
| kEFAllMask
| kEFNoClumpMask
;
4221 if (blockHint
>= hfsmp
->hfs_metazone_start
&&
4222 blockHint
<= hfsmp
->hfs_metazone_end
)
4223 eflags
|= kEFMetadataMask
;
4225 if (hfs_start_transaction(hfsmp
) != 0) {
4226 if (took_trunc_lock
)
4227 hfs_unlock_truncate(cp
, 0);
4232 * Protect the extents b-tree and the allocation bitmap
4233 * during MapFileBlockC and ExtendFileC operations.
4235 lockflags
= SFL_BITMAP
;
4236 if (overflow_extents(fp
))
4237 lockflags
|= SFL_EXTENTS
;
4238 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
4240 retval
= MapFileBlockC(hfsmp
, (FCB
*)fp
, 1, growsize
- 1, &sector_a
, NULL
);
4242 retval
= MacToVFSError(retval
);
4247 * STEP 1 - acquire new allocation blocks.
4249 nextallocsave
= hfsmp
->nextAllocation
;
4250 retval
= ExtendFileC(hfsmp
, (FCB
*)fp
, growsize
, blockHint
, eflags
, &newbytes
);
4251 if (eflags
& kEFMetadataMask
) {
4252 HFS_MOUNT_LOCK(hfsmp
, TRUE
);
4253 HFS_UPDATE_NEXT_ALLOCATION(hfsmp
, nextallocsave
);
4254 MarkVCBDirty(hfsmp
);
4255 HFS_MOUNT_UNLOCK(hfsmp
, TRUE
);
4258 retval
= MacToVFSError(retval
);
4260 cp
->c_flag
|= C_MODIFIED
;
4261 if (newbytes
< growsize
) {
4264 } else if (fp
->ff_blocks
< (headblks
+ datablks
)) {
4265 printf("hfs_relocate: allocation failed");
4270 retval
= MapFileBlockC(hfsmp
, (FCB
*)fp
, 1, growsize
, §or_b
, NULL
);
4272 retval
= MacToVFSError(retval
);
4273 } else if ((sector_a
+ 1) == sector_b
) {
4276 } else if ((eflags
& kEFMetadataMask
) &&
4277 ((((u_int64_t
)sector_b
* hfsmp
->hfs_logical_block_size
) / blksize
) >
4278 hfsmp
->hfs_metazone_end
)) {
4280 const char * filestr
;
4281 char emptystr
= '\0';
4283 if (cp
->c_desc
.cd_nameptr
!= NULL
) {
4284 filestr
= (const char *)&cp
->c_desc
.cd_nameptr
[0];
4285 } else if (vnode_name(vp
) != NULL
) {
4286 filestr
= vnode_name(vp
);
4288 filestr
= &emptystr
;
4295 /* Done with system locks and journal for now. */
4296 hfs_systemfile_unlock(hfsmp
, lockflags
);
4298 hfs_end_transaction(hfsmp
);
4303 * Check to see if failure is due to excessive fragmentation.
4305 if ((retval
== ENOSPC
) &&
4306 (hfs_freeblks(hfsmp
, 0) > (datablks
* 2))) {
4307 hfsmp
->hfs_flags
|= HFS_FRAGMENTED_FREESPACE
;
4312 * STEP 2 - clone file data into the new allocation blocks.
4315 if (vnodetype
== VLNK
)
4316 retval
= hfs_clonelink(vp
, blksize
, cred
, p
);
4317 else if (vnode_issystem(vp
))
4318 retval
= hfs_clonesysfile(vp
, headblks
, datablks
, blksize
, cred
, p
);
4320 retval
= hfs_clonefile(vp
, headblks
, datablks
, blksize
);
4322 /* Start transaction for step 3 or for a restore. */
4323 if (hfs_start_transaction(hfsmp
) != 0) {
4332 * STEP 3 - switch to cloned data and remove old blocks.
4334 lockflags
= SFL_BITMAP
;
4335 if (overflow_extents(fp
))
4336 lockflags
|= SFL_EXTENTS
;
4337 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
4339 retval
= HeadTruncateFile(hfsmp
, (FCB
*)fp
, headblks
);
4341 hfs_systemfile_unlock(hfsmp
, lockflags
);
4346 if (took_trunc_lock
)
4347 hfs_unlock_truncate(cp
, 0);
4350 hfs_systemfile_unlock(hfsmp
, lockflags
);
4354 /* Push cnode's new extent data to disk. */
4356 (void) hfs_update(vp
, MNT_WAIT
);
4359 if (cp
->c_cnid
< kHFSFirstUserCatalogNodeID
)
4360 (void) hfs_flushvolumeheader(hfsmp
, MNT_WAIT
, HFS_ALTFLUSH
);
4362 (void) hfs_flushvolumeheader(hfsmp
, MNT_NOWAIT
, 0);
4366 hfs_end_transaction(hfsmp
);
4371 if (fp
->ff_blocks
== headblks
) {
4372 if (took_trunc_lock
)
4373 hfs_unlock_truncate(cp
, 0);
4377 * Give back any newly allocated space.
4379 if (lockflags
== 0) {
4380 lockflags
= SFL_BITMAP
;
4381 if (overflow_extents(fp
))
4382 lockflags
|= SFL_EXTENTS
;
4383 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
4386 (void) TruncateFileC(hfsmp
, (FCB
*)fp
, fp
->ff_size
, 0, FORK_IS_RSRC(fp
),
4387 FTOC(fp
)->c_fileid
, false);
4389 hfs_systemfile_unlock(hfsmp
, lockflags
);
4392 if (took_trunc_lock
)
4393 hfs_unlock_truncate(cp
, 0);
4403 hfs_clonelink(struct vnode
*vp
, int blksize
, kauth_cred_t cred
, __unused
struct proc
*p
)
4405 struct buf
*head_bp
= NULL
;
4406 struct buf
*tail_bp
= NULL
;
4410 error
= (int)buf_meta_bread(vp
, (daddr64_t
)0, blksize
, cred
, &head_bp
);
4414 tail_bp
= buf_getblk(vp
, (daddr64_t
)1, blksize
, 0, 0, BLK_META
);
4415 if (tail_bp
== NULL
) {
4419 bcopy((char *)buf_dataptr(head_bp
), (char *)buf_dataptr(tail_bp
), blksize
);
4420 error
= (int)buf_bwrite(tail_bp
);
4423 buf_markinvalid(head_bp
);
4424 buf_brelse(head_bp
);
4426 (void) buf_invalidateblks(vp
, BUF_WRITE_DATA
, 0, 0);
/*
 * Clone a file's data within the file.
 */
hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
{
	writebase = blkstart * blksize;
	copysize = blkcnt * blksize;
	iosize = bufsize = MIN(copysize, 128 * 1024);
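	/*
	 * Example (illustrative): cloning a 1 MB fork proceeds in eight 128 KB
	 * passes; each pass below does an uncached cluster_read() of the old
	 * blocks followed by an uncached cluster_write() at writebase + offset.
	 */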
4452 hfs_unlock(VTOC(vp
));
4455 if ((error
= cp_handle_vnop(VTOC(vp
), CP_WRITE_ACCESS
)) != 0) {
4456 hfs_lock(VTOC(vp
), HFS_FORCE_LOCK
);
4459 #endif /* CONFIG_PROTECT */
4461 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&bufp
, bufsize
)) {
4462 hfs_lock(VTOC(vp
), HFS_FORCE_LOCK
);
4466 auio
= uio_create(1, 0, UIO_SYSSPACE
, UIO_READ
);
4468 while (offset
< copysize
) {
4469 iosize
= MIN(copysize
- offset
, iosize
);
4471 uio_reset(auio
, offset
, UIO_SYSSPACE
, UIO_READ
);
4472 uio_addiov(auio
, (uintptr_t)bufp
, iosize
);
4474 error
= cluster_read(vp
, auio
, copysize
, IO_NOCACHE
);
4476 printf("hfs_clonefile: cluster_read failed - %d\n", error
);
4479 if (uio_resid(auio
) != 0) {
4480 printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio
));
4485 uio_reset(auio
, writebase
+ offset
, UIO_SYSSPACE
, UIO_WRITE
);
4486 uio_addiov(auio
, (uintptr_t)bufp
, iosize
);
4488 error
= cluster_write(vp
, auio
, writebase
+ offset
,
4489 writebase
+ offset
+ iosize
,
4490 uio_offset(auio
), 0, IO_NOCACHE
| IO_SYNC
);
4492 printf("hfs_clonefile: cluster_write failed - %d\n", error
);
4495 if (uio_resid(auio
) != 0) {
4496 printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
	if ((blksize & PAGE_MASK)) {
		/*
		 * since the copy may not have started on a PAGE
		 * boundary (or may not have ended on one), we
		 * may have pages left in the cache since NOCACHE
		 * will let partially written pages linger...
		 * let's just flush the entire range to make sure
		 * we don't have any pages left that are beyond
		 * (or intersect) the real LEOF of this file
		 */
		ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
	} else {
		/*
		 * No need to call ubc_sync_range or hfs_invalbuf
		 * since the file was copied using IO_NOCACHE and
		 * the copy was done starting and ending on a page
		 * boundary in the file.
		 */
	}
	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

	hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
4530 * Clone a system (metadata) file.
4534 hfs_clonesysfile(struct vnode
*vp
, int blkstart
, int blkcnt
, int blksize
,
4535 kauth_cred_t cred
, struct proc
*p
)
4541 struct buf
*bp
= NULL
;
4544 daddr64_t start_blk
;
4551 iosize
= GetLogicalBlockSize(vp
);
4552 bufsize
= MIN(blkcnt
* blksize
, 1024 * 1024) & ~(iosize
- 1);
4553 breadcnt
= bufsize
/ iosize
;
4555 if (kmem_alloc(kernel_map
, (vm_offset_t
*)&bufp
, bufsize
)) {
4558 start_blk
= ((daddr64_t
)blkstart
* blksize
) / iosize
;
4559 last_blk
= ((daddr64_t
)blkcnt
* blksize
) / iosize
;
4562 while (blkno
< last_blk
) {
4564 * Read up to a megabyte
4567 for (i
= 0, blk
= blkno
; (i
< breadcnt
) && (blk
< last_blk
); ++i
, ++blk
) {
4568 error
= (int)buf_meta_bread(vp
, blk
, iosize
, cred
, &bp
);
4570 printf("hfs_clonesysfile: meta_bread error %d\n", error
);
4573 if (buf_count(bp
) != iosize
) {
4574 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp
));
4577 bcopy((char *)buf_dataptr(bp
), offset
, iosize
);
4579 buf_markinvalid(bp
);
4587 * Write up to a megabyte
4590 for (i
= 0; (i
< breadcnt
) && (blkno
< last_blk
); ++i
, ++blkno
) {
4591 bp
= buf_getblk(vp
, start_blk
+ blkno
, iosize
, 0, 0, BLK_META
);
4593 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk
+ blkno
);
4597 bcopy(offset
, (char *)buf_dataptr(bp
), iosize
);
4598 error
= (int)buf_bwrite(bp
);
4610 kmem_free(kernel_map
, (vm_offset_t
)bufp
, bufsize
);
4612 error
= hfs_fsync(vp
, MNT_WAIT
, 0, p
);