/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* @(#)hfs_readwrite.c 1.0
 *
 * (c) 1998-2001 Apple Computer, Inc. All Rights Reserved
 *
 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
 *
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/vfs_context.h>
#include <sys/fsevents.h>
#include <kern/kalloc.h>
#include <sys/sysctl.h>
#include <sys/fsctl.h>
#include <miscfs/specfs/specdev.h>
#include <sys/ubc_internal.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <sys/kdebug.h>

#include "hfs_attrlist.h"
#include "hfs_endian.h"
#include "hfs_fsctl.h"
#include "hfs_quota.h"
#include "hfscommon/headers/FileMgrInternal.h"
#include "hfscommon/headers/BTreesInternal.h"
#include "hfs_cnode.h"
#define can_cluster(size)  ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
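/*
 * Illustrative note: assuming MAXPHYSIO is the usual 128 KiB (that value is
 * an assumption here, defined elsewhere in the kernel headers),
 * can_cluster(4096) and can_cluster(65536) are true, while can_cluster(6000)
 * is false because 6000 is not a multiple of 4096.
 */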
enum {
	MAXHFSFILESIZE = 0x7FFFFFFF	/* this needs to go in the mount structure */
};

/* from bsd/hfs/hfs_vfsops.c */
extern int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);

static int hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
static int hfs_clonefile(struct vnode *, int, int, int);
static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
static int hfs_minorupdate(struct vnode *vp);
static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);

int flush_cache_on_write = 0;
SYSCTL_INT(_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
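/*
 * Note: because this uses SYSCTL_INT under _kern with OID_AUTO, the knob
 * surfaces to user space as "kern.flush_cache_on_write"; it can typically be
 * toggled with something like `sysctl -w kern.flush_cache_on_write=1`.
 * (The exact sysctl(8) invocation is an assumption, not taken from this file.)
 */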
/*
 * Read data from a file.
 */
hfs_vnop_read(struct vnop_read_args *ap)
	uio_t uio = ap->a_uio;
	struct vnode *vp = ap->a_vp;
	struct hfsmount *hfsmp;
	off_t start_resid = uio_resid(uio);
	off_t offset = uio_offset(uio);
	int took_truncate_lock = 0;

	/* Preflight checks */
	if (!vnode_isreg(vp)) {
		/* can only read regular files */
	if (start_resid == 0)
		return (0);		/* Nothing left to do */
		return (EINVAL);	/* can't read from a negative offset */

	if (VNODE_IS_RSRC(vp)) {
		if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
		/* otherwise read the resource fork normally */
		int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
		retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
			/* successful read, update the access time */
			VTOC(vp)->c_touch_acctime = TRUE;

			/* compressed files are not hot file candidates */
			if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
				VTOF(vp)->ff_bytesread = 0;
		/* otherwise the file was converted back to a regular file while we were reading it */
	} else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) {
		error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
#endif /* HFS_COMPRESSION */

	if ((retval = cp_handle_vnop (cp, CP_READ_ACCESS)) != 0) {

	/* Protect against a size change. */
	hfs_lock_truncate(cp, HFS_SHARED_LOCK);
	took_truncate_lock = 1;

	filesize = fp->ff_size;
	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
	if (offset > filesize) {
		if ((hfsmp->hfs_flags & HFS_STANDARD) &&
		    (offset > (off_t)MAXHFSFILESIZE)) {

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
		(int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);

	retval = cluster_read(vp, uio, filesize, ap->a_ioflag);

	cp->c_touch_acctime = TRUE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
		(int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);

	/*
	 * Keep track of blocks read.
	 */
	if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
		int took_cnode_lock = 0;

		bytesread = start_resid - uio_resid(uio);

		/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
		if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
			hfs_lock(cp, HFS_FORCE_LOCK);

		/*
		 * If this file hasn't been seen since the start of
		 * the current sampling period then start over.
		 */
		if (cp->c_atime < hfsmp->hfc_timebase) {
			fp->ff_bytesread = bytesread;
			cp->c_atime = tv.tv_sec;
			fp->ff_bytesread += bytesread;

	if (took_truncate_lock) {
		hfs_unlock_truncate(cp, 0);
/*
 * Write data to a file.
 */
hfs_vnop_write(struct vnop_write_args *ap)
	uio_t uio = ap->a_uio;
	struct vnode *vp = ap->a_vp;
	struct hfsmount *hfsmp;
	kauth_cred_t cred = NULL;
	off_t bytesToAdd = 0;
	off_t actualBytesAdded;
	int ioflag = ap->a_ioflag;
	int cnode_locked = 0;
	int partialwrite = 0;
	time_t orig_ctime = VTOC(vp)->c_ctime;
	int took_truncate_lock = 0;

	if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
		int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
			case FILE_IS_COMPRESSED:
			case FILE_IS_CONVERTING:
				/* if FILE_IS_CONVERTING, we allow writes but do not
				   bother with snapshots or else we will deadlock.
				 */
				printf("invalid state %d for compressed file\n", state);
	} else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) {
		error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);

	check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);

	// LP64todo - fix this! uio_resid may be 64-bit value
	resid = uio_resid(uio);
	offset = uio_offset(uio);

	if (!vnode_isreg(vp))
		return (EPERM);		/* Can only write regular files */

	if ((retval = cp_handle_vnop (cp, CP_WRITE_ACCESS)) != 0) {

	eflags = kEFDeferMask;	/* defer file block allocations */
	/*
	 * When the underlying device is sparse and space
	 * is low (< 8MB), stop doing delayed allocations
	 * and begin doing synchronous I/O.
	 */
	if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
	    (hfs_freeblks(hfsmp, 0) < 2048)) {
		eflags &= ~kEFDeferMask;
#endif /* HFS_SPARSE_DEV */

	/* Protect against a size change. */
	if (ioflag & IO_APPEND) {
		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
		hfs_lock_truncate(cp, HFS_SHARED_LOCK);
	took_truncate_lock = 1;

	if (ioflag & IO_APPEND) {
		uio_setoffset(uio, fp->ff_size);
		offset = fp->ff_size;
	if ((cp->c_flags & APPEND) && offset != fp->ff_size) {

	origFileSize = fp->ff_size;
	writelimit = offset + resid;
	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;

	/* If the truncate lock is shared, and if we either have virtual
	 * blocks or will need to extend the file, upgrade the truncate
	 * to exclusive lock.  If upgrade fails, we lose the lock and
	 * have to get exclusive lock again.  Note that we want to
	 * grab the truncate lock exclusive even if we're not allocating new blocks
	 * because we could still be growing past the LEOF.
	 */
	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
	    ((fp->ff_unallocblocks != 0) || (writelimit > origFileSize))) {
		/* Lock upgrade failed and we lost our shared lock, try again */
		if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
			/* Store the owner in the c_truncatelockowner field if we successfully upgrade */
			cp->c_truncatelockowner = current_thread();

	if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {

	if (cp->c_truncatelockowner == HFS_SHARED_OWNER) {
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
		(int)offset, uio_resid(uio), (int)fp->ff_size,

	/* Check if we do not need to extend the file */
	if (writelimit <= filebytes) {

	cred = vfs_context_ucred(ap->a_context);
	bytesToAdd = writelimit - filebytes;
	retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),

	if (hfs_start_transaction(hfsmp) != 0) {

	while (writelimit > filebytes) {
		bytesToAdd = writelimit - filebytes;
		if (cred && suser(cred, NULL) != 0)
			eflags |= kEFReserveMask;

		/* Protect extents b-tree and allocation bitmap */
		lockflags = SFL_BITMAP;
		if (overflow_extents(fp))
			lockflags |= SFL_EXTENTS;
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

		/* Files that are changing size are not hot file candidates. */
		if (hfsmp->hfc_stage == HFC_RECORDING) {
			fp->ff_bytesread = 0;
		retval = MacToVFSError(ExtendFileC (hfsmp, (FCB *)fp, bytesToAdd,
				0, eflags, &actualBytesAdded));

		hfs_systemfile_unlock(hfsmp, lockflags);

		if ((actualBytesAdded == 0) && (retval == E_NONE))
		if (retval != E_NONE)
		filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
			(int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);

	(void) hfs_update(vp, TRUE);
	(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
	(void) hfs_end_transaction(hfsmp);

	/*
	 * If we didn't grow the file enough try a partial write.
	 * POSIX expects this behavior.
	 */
	if ((retval == ENOSPC) && (filebytes > offset)) {
		uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
		writelimit = filebytes;

	if (retval == E_NONE) {
		struct rl_entry *invalid_range;

		if (writelimit > fp->ff_size)
			filesize = writelimit;
			filesize = fp->ff_size;

		lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);

		if (offset <= fp->ff_size) {
			zero_off = offset & ~PAGE_MASK_64;

			/* Check whether the area between zero_off and the start
			   of the transfer is invalid and should be zero-filled
			   as part of the transfer:
			 */
			if (offset > zero_off) {
				if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
					lflag |= IO_HEADZEROFILL;
			off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;

			/* The bytes between fp->ff_size and uio->uio_offset must never be
			   read without being zeroed.  The current last block is filled with zeroes
			   if it holds valid data but in all cases merely do a little bookkeeping
			   to track the area from the end of the current last page to the start of
			   the area actually written.  For the same reason only the bytes up to the
			   start of the page where this write will start is invalidated; any remainder
			   before uio->uio_offset is explicitly zeroed as part of the cluster_write.

			   Note that inval_start, the start of the page after the current EOF,
			   may be past the start of the write, in which case the zeroing
			   will be handled by the cluster_write of the actual data.
			 */
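			/*
			 * Worked example (illustrative, assuming 4 KiB pages): with
			 * fp->ff_size at 0x1234 and a write starting at offset 0x5000,
			 * inval_start rounds the EOF up to the next page boundary (0x2000)
			 * and inval_end rounds the write offset down to a page boundary
			 * (0x5000), so the pages in [0x2000, 0x5000) are the candidates for
			 * the invalid-range bookkeeping described above.
			 */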
			inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
			inval_end = offset & ~PAGE_MASK_64;
			zero_off = fp->ff_size;

			if ((fp->ff_size & PAGE_MASK_64) &&
			    (rl_scan(&fp->ff_invalidranges,
			             &invalid_range) != RL_NOOVERLAP)) {
				/* The page containing the EOF is not valid, so the
				   entire page must be made inaccessible now.  If the write
				   starts on a page beyond the page containing the eof
				   (inval_end > eof_page_base), add the
				   whole page to the range to be invalidated.  Otherwise
				   (i.e. if the write starts on the same page), zero-fill
				   the entire page explicitly now:
				 */
				if (inval_end > eof_page_base) {
					inval_start = eof_page_base;
					zero_off = eof_page_base;

			if (inval_start < inval_end) {
				/* There's some range of data that's going to be marked invalid */

				if (zero_off < inval_start) {
					/* The pages between inval_start and inval_end are going to be invalidated,
					   and the actual write will start on a page past inval_end.  Now's the last
					   chance to zero-fill the page containing the EOF:
					 */
					retval = cluster_write(vp, (uio_t) 0,
							fp->ff_size, inval_start,
							lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
					hfs_lock(cp, HFS_FORCE_LOCK);
					if (retval) goto ioerr_exit;
					offset = uio_offset(uio);

				/* Mark the remaining area of the newly allocated space as invalid: */
				rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
				cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
				zero_off = fp->ff_size = inval_end;

			if (offset > zero_off) lflag |= IO_HEADZEROFILL;

		/* Check to see whether the area between the end of the write and the end of
		   the page it falls in is invalid and should be zero-filled as part of the transfer:
		 */
		tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
		if (tail_off > filesize) tail_off = filesize;
		if (tail_off > writelimit) {
			if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
				lflag |= IO_TAILZEROFILL;

		/*
		 * if the write starts beyond the current EOF (possibly advanced in the
		 * zeroing of the last block, above), then we'll zero fill from the current EOF
		 * to where the write begins:
		 *
		 * NOTE: If (and ONLY if) the portion of the file about to be written is
		 * before the current EOF it might be marked as invalid now and must be
		 * made readable (removed from the invalid ranges) before cluster_write.
		 */
		io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
		if (io_start < fp->ff_size) {
			io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
			rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);

		/*
		 * We need to tell UBC the fork's new size BEFORE calling
		 * cluster_write, in case any of the new pages need to be
		 * paged out before cluster_write completes (which does happen
		 * in embedded systems due to extreme memory pressure).
		 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
		 * will be, so that it can pass that on to cluster_pageout, and
		 * allow those pageouts.
		 *
		 * We don't update ff_size yet since we don't want pageins to
		 * be able to see uninitialized data between the old and new
		 * EOF, until cluster_write has completed and initialized that
		 * part of the file.
		 *
		 * The vnode pager relies on the file size last given to UBC via
		 * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
		 * ff_size (whichever is larger).  NOTE: ff_new_size is always
		 * zero, unless we are extending the file via write.
		 */
		if (filesize > fp->ff_size) {
			fp->ff_new_size = filesize;
			ubc_setsize(vp, filesize);

		retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
				tail_off, lflag | IO_NOZERODIRTY);
			fp->ff_new_size = 0;	/* no longer extending; use ff_size */
			if (filesize > origFileSize) {
				ubc_setsize(vp, origFileSize);

		if (filesize > origFileSize) {
			fp->ff_size = filesize;

			/* Files that are changing size are not hot file candidates. */
			if (hfsmp->hfc_stage == HFC_RECORDING) {
				fp->ff_bytesread = 0;
		fp->ff_new_size = 0;	/* ff_size now has the correct size */

		/* If we wrote some bytes, then touch the change and mod times */
		if (resid > uio_resid(uio)) {
			cp->c_touch_chgtime = TRUE;
			cp->c_touch_modtime = TRUE;

		uio_setresid(uio, (uio_resid(uio) + bytesToAdd));

	// XXXdbg - see radar 4871353 for more info
	if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
		VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);

	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (cp->c_mode & (S_ISUID | S_ISGID)) {
		cred = vfs_context_ucred(ap->a_context);
		if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
			hfs_lock(cp, HFS_FORCE_LOCK);
			cp->c_mode &= ~(S_ISUID | S_ISGID);

	if (ioflag & IO_UNIT) {
		hfs_lock(cp, HFS_FORCE_LOCK);
		(void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
				0, 0, ap->a_context);
		// LP64todo - fix this! resid needs to be user_ssize_t
		uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
		uio_setresid(uio, resid);
		filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
	} else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
		hfs_lock(cp, HFS_FORCE_LOCK);
		retval = hfs_update(vp, TRUE);

	/* Updating vcbWrCnt doesn't need to be atomic. */

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
		(int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);

	if (took_truncate_lock) {
		hfs_unlock_truncate(cp, 0);
/* support for the "bulk-access" fcntl */

#define CACHE_LEVELS 16
#define NUM_CACHE_ENTRIES (64*16)
#define PARENT_IDS_FLAG 0x100

struct access_cache {
	int cachehits;	/* these two for statistics gathering */
	unsigned int *acache;
	unsigned char *haveaccess;
};

struct access_t {
	uid_t		uid;		/* IN: effective user id */
	short		flags;		/* IN: access requested (i.e. R_OK) */
	short		num_groups;	/* IN: number of groups user belongs to */
	int		num_files;	/* IN: number of files to process */
	int		*file_ids;	/* IN: array of file ids */
	gid_t		*groups;	/* IN: array of groups */
	short		*access;	/* OUT: access info for each file (0 for 'has access') */
} __attribute__((unavailable)); // this structure is for reference purposes only

struct user32_access_t {
	uid_t		uid;		/* IN: effective user id */
	short		flags;		/* IN: access requested (i.e. R_OK) */
	short		num_groups;	/* IN: number of groups user belongs to */
	int		num_files;	/* IN: number of files to process */
	user32_addr_t	file_ids;	/* IN: array of file ids */
	user32_addr_t	groups;		/* IN: array of groups */
	user32_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
};

struct user64_access_t {
	uid_t		uid;		/* IN: effective user id */
	short		flags;		/* IN: access requested (i.e. R_OK) */
	short		num_groups;	/* IN: number of groups user belongs to */
	int		num_files;	/* IN: number of files to process */
	user64_addr_t	file_ids;	/* IN: array of file ids */
	user64_addr_t	groups;		/* IN: array of groups */
	user64_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
};

// these are the "extended" versions of the above structures
// note that it is crucial that they be differently sized than
// the regular versions
struct ext_access_t {
	uint32_t	flags;		/* IN: access requested (i.e. R_OK) */
	uint32_t	num_files;	/* IN: number of files to process */
	uint32_t	map_size;	/* IN: size of the bit map */
	uint32_t	*file_ids;	/* IN: Array of file ids */
	char		*bitmap;	/* OUT: hash-bitmap of interesting directory ids */
	short		*access;	/* OUT: access info for each file (0 for 'has access') */
	uint32_t	num_parents;	/* future use */
	cnid_t		*parents;	/* future use */
} __attribute__((unavailable)); // this structure is for reference purposes only

struct user32_ext_access_t {
	uint32_t	flags;		/* IN: access requested (i.e. R_OK) */
	uint32_t	num_files;	/* IN: number of files to process */
	uint32_t	map_size;	/* IN: size of the bit map */
	user32_addr_t	file_ids;	/* IN: Array of file ids */
	user32_addr_t	bitmap;		/* OUT: hash-bitmap of interesting directory ids */
	user32_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
	uint32_t	num_parents;	/* future use */
	user32_addr_t	parents;	/* future use */
};

struct user64_ext_access_t {
	uint32_t	flags;		/* IN: access requested (i.e. R_OK) */
	uint32_t	num_files;	/* IN: number of files to process */
	uint32_t	map_size;	/* IN: size of the bit map */
	user64_addr_t	file_ids;	/* IN: array of file ids */
	user64_addr_t	bitmap;		/* OUT: hash-bitmap of interesting directory ids */
	user64_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
	uint32_t	num_parents;	/* future use */
	user64_addr_t	parents;	/* future use */
};
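/*
 * Illustrative sketch (assumptions, not part of this file): a 64-bit caller
 * would fill in the extended request and hand it to the bulk-access fsctl
 * roughly as follows; the user-space wrapper and selector spelling are
 * assumptions -- see hfs_fsctl.h for the real definitions.
 *
 *	struct user64_ext_access_t req = {
 *		.flags       = R_OK,
 *		.num_files   = nfiles,
 *		.map_size    = 0,
 *		.file_ids    = (user64_addr_t)(uintptr_t)ids,     // file IDs to test
 *		.bitmap      = 0,
 *		.access      = (user64_addr_t)(uintptr_t)results, // one short per file
 *		.num_parents = 0,
 *		.parents     = 0,
 *	};
 *	// fsctl(path, HFS_EXT_BULKACCESS_FSCTL, &req, 0);    // hypothetical invocation
 *
 * On return, results[i] is 0 when the caller has the requested access to
 * file_ids[i], or an errno-style value otherwise.
 */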
/*
 * Perform a binary search for the given parent_id.  Return value is
 * the index if there is a match.  If no_match_indexp is non-NULL it
 * will be assigned with the index to insert the item (even if it was
 * not found).
 */
static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
	unsigned int mid = ((hi - lo)/2) + lo;
	unsigned int this_id = array[mid];

	if (parent_id == this_id) {
	if (parent_id < this_id) {
	if (parent_id > this_id) {

	/* check if lo and hi converged on the match */
	if (parent_id == array[hi]) {
	if (no_match_indexp) {
		*no_match_indexp = hi;
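/*
 * Example (illustrative): searching an acache holding {10, 20, 40} for 20
 * returns its index; searching for 30 finds no match and, when
 * no_match_indexp is supplied, stores the slot where 30 would be inserted
 * (between 20 and 40).
 */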
lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
	int index, no_match_index;

	if (cache->numcached == 0) {
		return 0; // table is empty, so insert at index=0 and report no match

	if (cache->numcached > NUM_CACHE_ENTRIES) {
		/*printf("hfs: EGAD! numcached is %d... cut our losses and trim to %d\n",
		         cache->numcached, NUM_CACHE_ENTRIES);*/
		cache->numcached = NUM_CACHE_ENTRIES;

	hi = cache->numcached - 1;

	index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);

	/* if no existing entry found, find index for new one */
		index = no_match_index;
/*
 * Add a node to the access_cache at the given index (or do a lookup first
 * to find the index if -1 is passed in).  We currently do a replace rather
 * than an insert if the cache is full.
 */
add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
	int lookup_index = -1;

	/* need to do a lookup first if -1 passed for index */
	if (lookup_bucket(cache, &lookup_index, nodeID)) {
		if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
			// only update an entry if the previous access was ESRCH (i.e. a scope checking error)
			cache->haveaccess[lookup_index] = access;

		/* mission accomplished */
		index = lookup_index;

	/* if the cache is full, do a replace rather than an insert */
	if (cache->numcached >= NUM_CACHE_ENTRIES) {
		//printf("hfs: cache is full (%d). replace at index %d\n", cache->numcached, index);
		cache->numcached = NUM_CACHE_ENTRIES-1;

		if (index > cache->numcached) {
			// printf("hfs: index %d pinned to %d\n", index, cache->numcached);
			index = cache->numcached;

	if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {

	if (index >= 0 && index < cache->numcached) {
		/* only do bcopy if we're inserting */
		bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
		bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );

	cache->acache[index] = nodeID;
	cache->haveaccess[index] = access;
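/*
 * Example (illustrative): once numcached reaches NUM_CACHE_ENTRIES, adding a
 * new nodeID no longer shifts entries with bcopy; numcached is clamped and the
 * entry at the computed index is simply overwritten (replace instead of
 * insert).
 */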
snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
	struct cinfo *cip = (struct cinfo *)arg;

	cip->uid = attrp->ca_uid;
	cip->gid = attrp->ca_gid;
	cip->mode = attrp->ca_mode;
	cip->parentcnid = descp->cd_parentcnid;
	cip->recflags = attrp->ca_recflags;
/*
 * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id.  If the item
 * isn't incore, then go to the catalog.
 */
do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
    struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)

	/* if this id matches the one the fsctl was called with, skip the lookup */
	if (cnid == skip_cp->c_cnid) {
		cnattrp->ca_uid = skip_cp->c_uid;
		cnattrp->ca_gid = skip_cp->c_gid;
		cnattrp->ca_mode = skip_cp->c_mode;
		cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
		keyp->hfsPlus.parentID = skip_cp->c_parentcnid;

	/* otherwise, check the cnode hash in case the file/dir is incore */
	if (hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info) == 0) {
		cnattrp->ca_uid = c_info.uid;
		cnattrp->ca_gid = c_info.gid;
		cnattrp->ca_mode = c_info.mode;
		cnattrp->ca_recflags = c_info.recflags;
		keyp->hfsPlus.parentID = c_info.parentcnid;

	lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);

	/* lookup this cnid in the catalog */
	error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);

	hfs_systemfile_unlock(hfsmp, lockflags);
/*
 * Compute whether we have access to the given directory (nodeID) and all its parents.  Cache
 * up to CACHE_LEVELS as we progress towards the root.
 */
do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
    struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
    struct vfs_context *my_context,
    uint32_t num_parents)

	HFSCatalogNodeID thisNodeID;
	unsigned int myPerms;
	struct cat_attr cnattr;
	int cache_index = -1, scope_index = -1, scope_idx_start = -1;
	int i = 0, ids_to_cache = 0;
	int parent_ids[CACHE_LEVELS];

	while (thisNodeID >= kRootDirID) {
		myResult = 0;	/* default to "no access" */

		/* check the cache before resorting to hitting the catalog */

		/* ASSUMPTION: access info of cached entries is "final"... i.e. no need
		 * to look any further after hitting cached dir */
		if (lookup_bucket(cache, &cache_index, thisNodeID)) {
			myErr = cache->haveaccess[cache_index];
			if (scope_index != -1) {
				if (myErr == ESRCH) {
				scope_index = 0;	// so we'll just use the cache result
				scope_idx_start = ids_to_cache;
			myResult = (myErr == 0) ? 1 : 0;
			goto ExitThisRoutine;

			tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
			if (scope_index == -1)
			if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
				scope_idx_start = ids_to_cache;

		/* remember which parents we want to cache */
		if (ids_to_cache < CACHE_LEVELS) {
			parent_ids[ids_to_cache] = thisNodeID;
		// Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
		if (bitmap && map_size) {
			bitmap[(thisNodeID/8)%(map_size)] |= (1 << (thisNodeID & 7));

		/* do the lookup (checks the cnode hash, then the catalog) */
		myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
			goto ExitThisRoutine;	/* no access */

		/* Root always gets access. */
		if (suser(myp_ucred, NULL) == 0) {
			thisNodeID = catkey.hfsPlus.parentID;

		// if the thing has acl's, do the full permission check
		if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
			/* get the vnode for this cnid */
			myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
				goto ExitThisRoutine;

			thisNodeID = VTOC(vp)->c_parentcnid;

			hfs_unlock(VTOC(vp));

			if (vnode_vtype(vp) == VDIR) {
				myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
				myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);

				goto ExitThisRoutine;

			int mode = cnattr.ca_mode & S_IFMT;
			myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp, myp_ucred, theProcPtr);

			if (mode == S_IFDIR) {
				flags = R_OK | X_OK;
			if ( (myPerms & flags) != flags) {
				goto ExitThisRoutine;	/* no access */

			/* up the hierarchy we go */
			thisNodeID = catkey.hfsPlus.parentID;

	/* if here, we have access to this node */

	if (parents && myErr == 0 && scope_index == -1) {

	/* cache the parent directory(ies) */
	for (i = 0; i < ids_to_cache; i++) {
		if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
			add_node(cache, -1, parent_ids[i], ESRCH);
			add_node(cache, -1, parent_ids[i], myErr);
do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
    struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)

	/*
	 * NOTE: on entry, the vnode is locked.  In case this vnode
	 * happens to be in our list of file_ids, we'll note it and
	 * avoid calling hfs_chashget_nowait() on that id as that
	 * will cause a "locking against myself" panic.
	 */
	Boolean check_leaf = true;

	struct user64_ext_access_t *user_access_structp;
	struct user64_ext_access_t tmp_user_access;
	struct access_cache cache;

	int error = 0, prev_parent_check_ok = 1;
	unsigned int num_files = 0;
	int num_parents = 0;
	cnid_t *parents = NULL;
	cnid_t prevParent_cnid = 0;
	unsigned int myPerms;
	struct cat_attr cnattr;
	struct cnode *skip_cp = VTOC(vp);
	kauth_cred_t cred = vfs_context_ucred(context);
	proc_t p = vfs_context_proc(context);

	is64bit = proc_is64bit(p);

	/* initialize the local cache and buffers */
	cache.numcached = 0;
	cache.cachehits = 0;
	cache.acache = NULL;
	cache.haveaccess = NULL;

	/* struct copyin done during dispatch... need to copy file_id array separately */
	if (ap->a_data == NULL) {
		goto err_exit_bulk_access;

		if (arg_size != sizeof(struct user64_ext_access_t)) {
			goto err_exit_bulk_access;
		user_access_structp = (struct user64_ext_access_t *)ap->a_data;

	} else if (arg_size == sizeof(struct user32_access_t)) {
		struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;

		// convert an old style bulk-access struct to the new style
		tmp_user_access.flags = accessp->flags;
		tmp_user_access.num_files = accessp->num_files;
		tmp_user_access.map_size = 0;
		tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
		tmp_user_access.bitmap = USER_ADDR_NULL;
		tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
		tmp_user_access.num_parents = 0;
		user_access_structp = &tmp_user_access;

	} else if (arg_size == sizeof(struct user32_ext_access_t)) {
		struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;

		// up-cast from a 32-bit version of the struct
		tmp_user_access.flags = accessp->flags;
		tmp_user_access.num_files = accessp->num_files;
		tmp_user_access.map_size = accessp->map_size;
		tmp_user_access.num_parents = accessp->num_parents;
		tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
		tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
		tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
		tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);
		user_access_structp = &tmp_user_access;

		goto err_exit_bulk_access;

	map_size = user_access_structp->map_size;
	num_files = user_access_structp->num_files;
	num_parents = user_access_structp->num_parents;

	if (num_files < 1) {
		goto err_exit_bulk_access;
	if (num_files > 1024) {
		goto err_exit_bulk_access;
	if (num_parents > 1024) {
		goto err_exit_bulk_access;

	file_ids = (int *) kalloc(sizeof(int) * num_files);
	access = (short *) kalloc(sizeof(short) * num_files);
		bitmap = (char *) kalloc(sizeof(char) * map_size);
		parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
	cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
	cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);

	if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
			kfree(file_ids, sizeof(int) * num_files);
			kfree(bitmap, sizeof(char) * map_size);
			kfree(access, sizeof(short) * num_files);
			kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
		if (cache.haveaccess) {
			kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
			kfree(parents, sizeof(cnid_t) * num_parents);

	// make sure the bitmap is zero'ed out...
		bzero(bitmap, (sizeof(char) * map_size));

	if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
	                    num_files * sizeof(int)))) {
		goto err_exit_bulk_access;

	if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
	                    num_parents * sizeof(cnid_t)))) {
		goto err_exit_bulk_access;

	flags = user_access_structp->flags;
	if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {

	/* check if we've been passed leaf node ids or parent ids */
	if (flags & PARENT_IDS_FLAG) {

	/* Check access to each file_id passed in */
	for (i = 0; i < num_files; i++) {
		cnid = (cnid_t) file_ids[i];

		/* root always has access */
		if ((!parents) && (!suser(cred, NULL))) {

		/* do the lookup (checks the cnode hash, then the catalog) */
		error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
			access[i] = (short) error;

			// Check if the leaf matches one of the parent scopes
			leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
			if (leaf_index >= 0 && parents[leaf_index] == cnid)
				prev_parent_check_ok = 0;
			else if (leaf_index >= 0)
				prev_parent_check_ok = 1;

		// if the thing has acl's, do the full permission check
		if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
			/* get the vnode for this cnid */
			myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);

			hfs_unlock(VTOC(cvp));

			if (vnode_vtype(cvp) == VDIR) {
				myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
				myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);

			/* before calling CheckAccess(), check the target file for read access */
			myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
			                                  cnattr.ca_mode, hfsmp->hfs_mp, cred, p);

			/* fail fast if no access */
			if ((myPerms & flags) == 0) {

			/* we were passed an array of parent ids */
			catkey.hfsPlus.parentID = cnid;

		/* if the last guy had the same parent and had access, we're done */
		if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {

		myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
		                           skip_cp, p, cred, context, bitmap, map_size, parents, num_parents);

		if (myaccess || (error == ESRCH && leaf_index != -1)) {
			access[i] = 0;	// have access.. no errors to report
			access[i] = (error != 0 ? (short) error : EACCES);

		prevParent_cnid = catkey.hfsPlus.parentID;

	/* copyout the access array */
	if ((error = copyout((caddr_t)access, user_access_structp->access,
	                     num_files * sizeof (short)))) {
		goto err_exit_bulk_access;
	if (map_size && bitmap) {
		if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
		                     map_size * sizeof (char)))) {
			goto err_exit_bulk_access;

  err_exit_bulk_access:

	//printf("hfs: on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);

		kfree(file_ids, sizeof(int) * num_files);
		kfree(parents, sizeof(cnid_t) * num_parents);
		kfree(bitmap, sizeof(char) * map_size);
		kfree(access, sizeof(short) * num_files);
		kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
	if (cache.haveaccess)
		kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);

/* end "bulk-access" support */
/*
 * Callback for use with freeze ioctl.
 */
hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
	vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
/*
 * Control filesystem operating characteristics.
 */
hfs_vnop_ioctl( struct vnop_ioctl_args /* {
		vfs_context_t a_context;
	} */ *ap)
	struct vnode * vp = ap->a_vp;
	struct hfsmount *hfsmp = VTOHFS(vp);
	vfs_context_t context = ap->a_context;
	kauth_cred_t cred = vfs_context_ucred(context);
	proc_t p = vfs_context_proc(context);
	struct vfsstatfs *vfsp;
	off_t jnl_start, jnl_size;
	struct hfs_journal_info *jip;
	off_t uncompressed_size = -1;
	int decmpfs_error = 0;

	if (ap->a_command == F_RDADVISE) {
		/* we need to inspect the decmpfs state of the file as early as possible */
		compressed = hfs_file_is_compressed(VTOC(vp), 0);
			if (VNODE_IS_RSRC(vp)) {
				/* if this is the resource fork, treat it as if it were empty */
				uncompressed_size = 0;
				decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
				if (decmpfs_error != 0) {
					/* failed to get the uncompressed size, we'll check for this later */
					uncompressed_size = -1;
#endif /* HFS_COMPRESSION */

	is64bit = proc_is64bit(p);

	if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) {
#endif /* CONFIG_PROTECT */

	switch (ap->a_command) {

		struct vnode *file_vp;

		/* Caller must be owner of file system. */
		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
		    kauth_cred_getuid(cred) != vfsp->f_owner) {
		/* Target vnode must be file system's root. */
		if (!vnode_isvroot(vp)) {
		bufptr = (char *)ap->a_data;
		cnid = strtoul(bufptr, NULL, 10);

		/* We need to call hfs_vfs_vget to leverage the code that will
		 * fix the origin list for us if needed, as opposed to calling
		 * hfs_vget, since we will need the parent for build_path call.
		 */
		if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {

		error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context);

		/* Caller must be owner of file system. */
		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
		    kauth_cred_getuid(cred) != vfsp->f_owner) {
		/* Target vnode must be file system's root. */
		if (!vnode_isvroot(vp)) {
		linkfileid = *(cnid_t *)ap->a_data;
		if (linkfileid < kHFSFirstUserCatalogNodeID) {
		if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
		if (ap->a_command == HFS_NEXT_LINK) {
			*(cnid_t *)ap->a_data = nextlinkid;
			*(cnid_t *)ap->a_data = prevlinkid;

	case HFS_RESIZE_PROGRESS: {

		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
		    kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES);	/* must be owner of file system */
		if (!vnode_isvroot(vp)) {
		/* file system must not be mounted read-only */
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {

		return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);

	case HFS_RESIZE_VOLUME: {

		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
		    kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES);	/* must be owner of file system */
		if (!vnode_isvroot(vp)) {

		/* filesystem must not be mounted read only */
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
		newsize = *(u_int64_t *)ap->a_data;
		cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;

		if (newsize > cursize) {
			return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
		} else if (newsize < cursize) {
			return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);

	case HFS_CHANGE_NEXT_ALLOCATION: {
		int error = 0;	/* Assume success */

		if (vnode_vfsisrdonly(vp)) {
		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
		    kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES);	/* must be owner of file system */
		if (!vnode_isvroot(vp)) {
		HFS_MOUNT_LOCK(hfsmp, TRUE);
		location = *(u_int32_t *)ap->a_data;
		if ((location >= hfsmp->allocLimit) &&
		    (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
			goto fail_change_next_allocation;
		/* Return previous value. */
		*(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
		if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
			/* On magic value for location, set nextAllocation to next block
			 * after metadata zone and set flag in mount structure to indicate
			 * that nextAllocation should not be updated again.
			 */
			if (hfsmp->hfs_metazone_end != 0) {
				HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
			hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
			hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
			HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
		MarkVCBDirty(hfsmp);
fail_change_next_allocation:
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);

	case HFS_SETBACKINGSTOREINFO: {
		struct vnode * bsfs_rootvp;
		struct vnode * di_vp;
		struct hfs_backingstoreinfo *bsdata;

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
		if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
		    kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES);	/* must be owner of file system */
		bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
		if (bsdata == NULL) {
		if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
		if ((error = vnode_getwithref(di_vp))) {
			file_drop(bsdata->backingfd);

		if (vnode_mount(vp) == vnode_mount(di_vp)) {
			(void)vnode_put(di_vp);
			file_drop(bsdata->backingfd);

		/*
		 * Obtain the backing fs root vnode and keep a reference
		 * on it.  This reference will be dropped in hfs_unmount.
		 */
		error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL);	/* XXX use context! */
			(void)vnode_put(di_vp);
			file_drop(bsdata->backingfd);
		vnode_ref(bsfs_rootvp);
		vnode_put(bsfs_rootvp);

		hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;

		hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
		/* The free extent cache is managed differently for sparse devices.
		 * There is a window between when the volume is mounted and when the
		 * device is marked as sparse, so the free extent cache for this
		 * volume is currently initialized as normal volume (sorted by block
		 * count).  Reset the cache so that it will be rebuilt again
		 * for sparse device (sorted by start block).
		 */
		ResetVCBFreeExtCache(hfsmp);

		hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
		hfsmp->hfs_sparsebandblks *= 4;

		vfs_markdependency(hfsmp->hfs_mp);

		/*
		 * If the sparse image is on a sparse image file (as opposed to a sparse
		 * bundle), then we may need to limit the free space to the maximum size
		 * of a file on that volume.  So we query (using pathconf), and if we get
		 * a meaningful result, we cache the number of blocks for later use in
		 * hfs_freeblks().
		 */
		hfsmp->hfs_backingfs_maxblocks = 0;
		if (vnode_vtype(di_vp) == VREG) {
			terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
			if (terr == 0 && hostbits != 0 && hostbits < 64) {
				u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;

				hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;

		(void)vnode_put(di_vp);
		file_drop(bsdata->backingfd);

	case HFS_CLRBACKINGSTOREINFO: {
		struct vnode * tmpvp;

		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
		    kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES);	/* must be owner of file system */
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {

		if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
		    hfsmp->hfs_backingfs_rootvp) {

			hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
			tmpvp = hfsmp->hfs_backingfs_rootvp;
			hfsmp->hfs_backingfs_rootvp = NULLVP;
			hfsmp->hfs_sparsebandblks = 0;
#endif /* HFS_SPARSE_DEV */

		mp = vnode_mount(vp);
		hfsmp = VFSTOHFS(mp);
		vfsp = vfs_statfs(mp);

		if (kauth_cred_getuid(cred) != vfsp->f_owner &&
		    !kauth_cred_issuser(cred))

		lck_rw_lock_exclusive(&hfsmp->hfs_insync);

		// flush things before we get started to try and prevent
		// dirty data from being paged out while we're frozen.
		// note: can't do this after taking the lock as it will
		// deadlock against ourselves.
		vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
		hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);

		// DO NOT call hfs_journal_flush() because that takes a
		// shared lock on the global exclusive lock!
		journal_flush(hfsmp->jnl, TRUE);

		// don't need to iterate on all vnodes, we just need to
		// wait for writes to the system files and the device vnode
		//
		// Now that journal flush waits for all metadata blocks to
		// be written out, waiting for btree writes is probably no
		// longer required.
		if (HFSTOVCB(hfsmp)->extentsRefNum)
			vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
		if (HFSTOVCB(hfsmp)->catalogRefNum)
			vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
		if (HFSTOVCB(hfsmp)->allocationsRefNum)
			vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
		if (hfsmp->hfs_attribute_vp)
			vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
		vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");

		hfsmp->hfs_freezing_proc = current_proc();

		vfsp = vfs_statfs(vnode_mount(vp));
		if (kauth_cred_getuid(cred) != vfsp->f_owner &&
		    !kauth_cred_issuser(cred))

		// if we're not the one who froze the fs then we
		// can't thaw it.
		if (hfsmp->hfs_freezing_proc != current_proc()) {

		// NOTE: if you add code here, also go check the
		// code that "thaws" the fs in hfs_vnop_close()
		hfsmp->hfs_freezing_proc = NULL;
		hfs_unlock_global (hfsmp);
		lck_rw_unlock_exclusive(&hfsmp->hfs_insync);

	case HFS_BULKACCESS_FSCTL: {

		if (hfsmp->hfs_flags & HFS_STANDARD) {
			size = sizeof(struct user64_access_t);
			size = sizeof(struct user32_access_t);

		return do_bulk_access_check(hfsmp, vp, ap, size, context);

	case HFS_EXT_BULKACCESS_FSCTL: {

		if (hfsmp->hfs_flags & HFS_STANDARD) {
			size = sizeof(struct user64_ext_access_t);
			size = sizeof(struct user32_ext_access_t);

		return do_bulk_access_check(hfsmp, vp, ap, size, context);

	case HFS_SET_XATTREXTENTS_STATE: {

		if (ap->a_data == NULL) {
		state = *(int *)ap->a_data;

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
		/* Super-user can enable or disable extent-based extended
		 * attribute support on a volume
		 * Note: Starting Mac OS X 10.7, extent-based extended attributes
		 * are enabled by default, so any change will be transient only
		 * till the volume is remounted.
		 */
		if (state == 0 || state == 1)
			return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
			error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
			hfs_unlock(VTOC(vp));

		register struct cnode *cp;

		if (!vnode_isreg(vp))
		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
			/*
			 * used by regression test to determine if
			 * all the dirty pages (via write) have been cleaned
			 * after a call to 'fsync'.
			 */
			error = is_file_clean(vp, VTOF(vp)->ff_size);

		register struct radvisory *ra;
		struct filefork *fp;

		if (!vnode_isreg(vp))
		ra = (struct radvisory *)(ap->a_data);

		/* Protect against a size change. */
		hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK);
		if (compressed && (uncompressed_size == -1)) {
			/* fetching the uncompressed size failed above, so return the error */
			error = decmpfs_error;
		} else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
		           (!compressed && (ra->ra_offset >= fp->ff_size))) {
#else /* HFS_COMPRESSION */
		if (ra->ra_offset >= fp->ff_size) {
#endif /* HFS_COMPRESSION */

			error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
		hfs_unlock_truncate(VTOC(vp), 0);

	case F_READBOOTSTRAP:
	case F_WRITEBOOTSTRAP:
		struct vnode *devvp = NULL;
		user_fbootstraptransfer_t *user_bootstrapp;
		daddr64_t blockNumber;
		u_int32_t blockOffset;
		user_fbootstraptransfer_t user_bootstrap;

		if (!vnode_isvroot(vp))
		/* LP64 - when caller is a 64 bit process then we are passed a pointer
		 * to a user_fbootstraptransfer_t else we get a pointer to a
		 * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t
		 */
		if ((hfsmp->hfs_flags & HFS_READ_ONLY)
		    && (ap->a_command == F_WRITEBOOTSTRAP)) {
			user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data;
			user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data;
			user_bootstrapp = &user_bootstrap;
			user_bootstrap.fbt_offset = bootstrapp->fbt_offset;
			user_bootstrap.fbt_length = bootstrapp->fbt_length;
			user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer);

		if ((user_bootstrapp->fbt_offset < 0) || (user_bootstrapp->fbt_offset > 1024) ||
		    (user_bootstrapp->fbt_length > 1024)) {
		if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024)

		devvp = VTOHFS(vp)->hfs_devvp;
		auio = uio_create(1, user_bootstrapp->fbt_offset,
				is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32,
				(ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ);
		uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length);

		devBlockSize = vfs_devblocksize(vnode_mount(vp));

		while (uio_resid(auio) > 0) {
			blockNumber = uio_offset(auio) / devBlockSize;
			error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp);
				if (bp) buf_brelse(bp);

			blockOffset = uio_offset(auio) % devBlockSize;
			xfersize = devBlockSize - blockOffset;
			error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio);

			if (uio_rw(auio) == UIO_WRITE) {
				error = VNOP_BWRITE(bp);

	case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
			*(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
			*(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));

	case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
		*(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;

	case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
		*(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;

	case HFS_FSCTL_SET_VERY_LOW_DISK:
		if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
		hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;

	case HFS_FSCTL_SET_LOW_DISK:
		if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
		    || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
		hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;

	case HFS_FSCTL_SET_DESIRED_DISK:
		if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
		hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;

	case HFS_VOLUME_STATUS:
		*(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;

	case HFS_SET_BOOT_INFO:
		if (!vnode_isvroot(vp))
		if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
			return(EACCES);	/* must be superuser or owner of filesystem */
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
		HFS_MOUNT_LOCK(hfsmp, TRUE);
		bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
		(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);

	case HFS_GET_BOOT_INFO:
		if (!vnode_isvroot(vp))
		HFS_MOUNT_LOCK(hfsmp, TRUE);
		bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);

	case HFS_MARK_BOOT_CORRUPT:
		/* Mark the boot volume corrupt by setting
		 * kHFSVolumeInconsistentBit in the volume header.  This will
		 * force fsck_hfs on next mount.
		 */
		/* Allowed only on the root vnode of the boot volume */
		if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
		    !vnode_isvroot(vp)) {
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
		printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
		hfs_mark_volume_inconsistent(hfsmp);

	case HFS_FSCTL_GET_JOURNAL_INFO:
		jip = (struct hfs_journal_info *)ap->a_data;

		if (hfsmp->jnl == NULL) {
			jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
			jnl_size = (off_t)hfsmp->jnl_size;

		jip->jstart = jnl_start;
		jip->jsize = jnl_size;

	case HFS_SET_ALWAYS_ZEROFILL: {
		struct cnode *cp = VTOC(vp);

		if (*(int *)ap->a_data) {
			cp->c_flag |= C_ALWAYS_ZEROFILL;
			cp->c_flag &= ~C_ALWAYS_ZEROFILL;

	case HFS_DISABLE_METAZONE: {
		/* Only root can disable metadata zone */
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {

		/* Disable metadata zone now */
		(void) hfs_metadatazone_init(hfsmp, true);
		printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
hfs_vnop_select(__unused struct vnop_select_args *ap)
/*
	struct vnop_select_args {
		vfs_context_t a_context;
	};
*/
	/*
	 * We should really check to see if I/O is possible.
	 */
/*
 * Converts a logical block number to a physical block, and optionally returns
 * the amount of remaining blocks in a run.  The logical block is based on hfsNode.logBlockSize.
 * The physical block number is based on the device block size, currently it is 512.
 * The block run is returned in logical blocks, and is the REMAINING amount of blocks.
 */
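/*
 * Worked example (illustrative): with GetLogicalBlockSize() returning 4096,
 * logical block 10 corresponds to byte offset 10 * 4096 = 40960, which is
 * device block 40960 / 512 = 80 on a 512-byte-sector device; the same
 * arithmetic underlies hfs_vnop_blktooff() and hfs_vnop_offtoblk() below.
 */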
2220 hfs_bmap(struct vnode
*vp
, daddr_t bn
, struct vnode
**vpp
, daddr64_t
*bnp
, unsigned int *runp
)
2222 struct filefork
*fp
= VTOF(vp
);
2223 struct hfsmount
*hfsmp
= VTOHFS(vp
);
2224 int retval
= E_NONE
;
2225 u_int32_t logBlockSize
;
2226 size_t bytesContAvail
= 0;
2227 off_t blockposition
;
2232 * Check for underlying vnode requests and ensure that logical
2233 * to physical mapping is requested.
2236 *vpp
= hfsmp
->hfs_devvp
;
2240 logBlockSize
= GetLogicalBlockSize(vp
);
2241 blockposition
= (off_t
)bn
* logBlockSize
;
2243 lockExtBtree
= overflow_extents(fp
);
2246 lockflags
= hfs_systemfile_lock(hfsmp
, SFL_EXTENTS
, HFS_EXCLUSIVE_LOCK
);
2248 retval
= MacToVFSError(
2249 MapFileBlockC (HFSTOVCB(hfsmp
),
2257 hfs_systemfile_unlock(hfsmp
, lockflags
);
2259 if (retval
== E_NONE
) {
2260 /* Figure out how many read ahead blocks there are */
2262 if (can_cluster(logBlockSize
)) {
2263 /* Make sure this result never goes negative: */
2264 *runp
= (bytesContAvail
< logBlockSize
) ? 0 : (bytesContAvail
/ logBlockSize
) - 1;
/*
 * Convert logical block number to file offset.
 */
int
hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
/*
	struct vnop_blktooff_args {
		vnode_t a_vp;
		daddr64_t a_lblkno;
		off_t *a_offset;
	};
*/
{
	if (ap->a_vp == NULL)
		return (EINVAL);

	*ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);

	return (0);
}
/*
 * Convert file offset to logical block number.
 */
int
hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
/*
	struct vnop_offtoblk_args {
		vnode_t a_vp;
		off_t a_offset;
		daddr64_t *a_lblkno;
	};
*/
{
	if (ap->a_vp == NULL)
		return (EINVAL);

	*ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));

	return (0);
}
/*
 * Map file offset to physical block number.
 *
 * If this function is called for write operation, and if the file
 * had virtual blocks allocated (delayed allocation), real blocks
 * are allocated by calling ExtendFileC().
 *
 * If this function is called for read operation, and if the file
 * had virtual blocks allocated (delayed allocation), no change
 * to the size of file is done, and if required, rangelist is
 * searched for mapping.
 *
 * System file cnodes are expected to be locked (shared or exclusive).
 */
int
hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
/*
	struct vnop_blockmap_args {
		vnode_t a_vp;
		off_t a_foffset;
		size_t a_size;
		daddr64_t *a_bpn;
		size_t *a_run;
		void *a_poff;
		int a_flags;
		vfs_context_t a_context;
	};
*/
{
	struct vnode *vp = ap->a_vp;
	struct cnode *cp;
	struct filefork *fp;
	struct hfsmount *hfsmp;
	size_t bytesContAvail = 0;
	int retval = E_NONE;
	int syslocks = 0;
	int lockflags = 0;
	struct rl_entry *invalid_range;
	enum rl_overlaptype overlaptype;
	int started_tr = 0;
	int tooklock = 0;

#if HFS_COMPRESSION
	if (VNODE_IS_RSRC(vp)) {
		/* allow blockmaps to the resource fork */
	} else {
		if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
			int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
			switch(state) {
				case FILE_IS_COMPRESSED:
					return ENOTSUP;
				case FILE_IS_CONVERTING:
					/* if FILE_IS_CONVERTING, we allow blockmap */
					break;
				default:
					printf("invalid state %d for compressed file\n", state);
					return ENOTSUP;
			}
		}
	}
#endif /* HFS_COMPRESSION */

	/* Do not allow blockmap operation on a directory */
	if (vnode_isdir(vp)) {
		return (ENOTSUP);
	}

	/*
	 * Check for underlying vnode requests and ensure that logical
	 * to physical mapping is requested.
	 */
	if (ap->a_bpn == NULL)
		return (0);

	if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
		if (VTOC(vp)->c_lockowner != current_thread()) {
			hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
			tooklock = 1;
		}
	}
	hfsmp = VTOHFS(vp);
	cp = VTOC(vp);
	fp = VTOF(vp);

retry:
	/* Check virtual blocks only when performing write operation */
	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
		if (hfs_start_transaction(hfsmp) != 0) {
			retval = EINVAL;
			goto exit;
		} else {
			started_tr = 1;
		}
		syslocks = SFL_EXTENTS | SFL_BITMAP;

	} else if (overflow_extents(fp)) {
		syslocks = SFL_EXTENTS;
	}

	if (syslocks)
		lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);

	/*
	 * Check for any delayed allocations.
	 */
	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
		int64_t actbytes;
		u_int32_t loanedBlocks;

		//
		// Make sure we have a transaction.  It's possible
		// that we came in and fp->ff_unallocblocks was zero
		// but during the time we blocked acquiring the extents
		// btree, ff_unallocblocks became non-zero and so we
		// will need to start a transaction.
		//
		if (started_tr == 0) {
			if (syslocks) {
				hfs_systemfile_unlock(hfsmp, lockflags);
				syslocks = 0;
			}
			goto retry;
		}

		/*
		 * Note: ExtendFileC will Release any blocks on loan and
		 * acquire real blocks.  So we ask to extend by zero bytes
		 * since ExtendFileC will account for the virtual blocks.
		 */
		loanedBlocks = fp->ff_unallocblocks;
		retval = ExtendFileC(hfsmp, (FCB *)fp, 0, 0,
				     kEFAllMask | kEFNoClumpMask, &actbytes);
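		/*
		 * If converting the loaned blocks to real allocations failed,
		 * reinstate the loan below so that the in-memory block accounting
		 * (filefork, cnode, and mount-wide counters) stays consistent.
		 */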
		if (retval) {
			fp->ff_unallocblocks = loanedBlocks;
			cp->c_blocks += loanedBlocks;
			fp->ff_blocks += loanedBlocks;

			HFS_MOUNT_LOCK(hfsmp, TRUE);
			hfsmp->loanedBlocks += loanedBlocks;
			HFS_MOUNT_UNLOCK(hfsmp, TRUE);

			hfs_systemfile_unlock(hfsmp, lockflags);
			cp->c_flag |= C_MODIFIED;
			if (started_tr) {
				(void) hfs_update(vp, TRUE);
				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

				hfs_end_transaction(hfsmp);
				started_tr = 0;
			}
			goto exit;
		}
	}
	retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
	                       ap->a_bpn, &bytesContAvail);
	if (syslocks) {
		hfs_systemfile_unlock(hfsmp, lockflags);
		syslocks = 0;
	}

	if (started_tr) {
		(void) hfs_update(vp, TRUE);
		(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
		hfs_end_transaction(hfsmp);
		started_tr = 0;
	}
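	/*
	 * Note on the read-side handling below: when the mapping fails with
	 * ERANGE for a read and the file still has loaned (delayed-allocation)
	 * blocks, the invalid-range list is consulted and *ap->a_bpn is set to
	 * -1, which tells the caller the range is a hole to be zero-filled.
	 */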
	if (retval) {
		/* On write, always return error because virtual blocks, if any,
		 * should have been allocated in ExtendFileC().  We do not
		 * allocate virtual blocks on read, therefore return error
		 * only if no virtual blocks are allocated.  Otherwise we search
		 * rangelist for zero-fills.
		 */
		if ((MacToVFSError(retval) != ERANGE) ||
		    (ap->a_flags & VNODE_WRITE) ||
		    ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
			goto exit;
		}

		/* Validate if the start offset is within logical file size */
		if (ap->a_foffset > fp->ff_size) {
			goto exit;
		}

		/* Searching file extents has failed for read operation, therefore
		 * search rangelist for any uncommitted holes in the file.
		 */
		overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
		                      ap->a_foffset + (off_t)(ap->a_size - 1),
		                      &invalid_range);
		switch(overlaptype) {
		case RL_OVERLAPISCONTAINED:
			/* start_offset <= rl_start, end_offset >= rl_end */
			if (ap->a_foffset != invalid_range->rl_start) {
				break;
			}
		case RL_MATCHINGOVERLAP:
			/* start_offset = rl_start, end_offset = rl_end */
		case RL_OVERLAPCONTAINSRANGE:
			/* start_offset >= rl_start, end_offset <= rl_end */
		case RL_OVERLAPSTARTSBEFORE:
			/* start_offset > rl_start, end_offset >= rl_start */
			if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) {
				bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset;
			} else {
				bytesContAvail = fp->ff_size - ap->a_foffset;
			}
			if (bytesContAvail > ap->a_size) {
				bytesContAvail = ap->a_size;
			}
			*ap->a_bpn = (daddr64_t)-1;
			retval = 0;
			break;
		case RL_OVERLAPENDSAFTER:
			/* start_offset < rl_start, end_offset < rl_end */
		case RL_NOOVERLAP:
			break;
		}
		goto exit;
	}

	/* MapFileC() found a valid extent in the filefork.  Search the
	 * mapping information further for invalid file ranges.
	 */
	overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
	                      ap->a_foffset + (off_t)bytesContAvail - 1,
	                      &invalid_range);
	if (overlaptype != RL_NOOVERLAP) {
		switch(overlaptype) {
		case RL_MATCHINGOVERLAP:
		case RL_OVERLAPCONTAINSRANGE:
		case RL_OVERLAPSTARTSBEFORE:
			/* There's no valid block for this byte offset */
			*ap->a_bpn = (daddr64_t)-1;
			/* There's no point limiting the amount to be returned
			 * if the invalid range that was hit extends all the way
			 * to the EOF (i.e. there's no valid bytes between the
			 * end of this range and the file's EOF):
			 */
			if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
			    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
				bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
			}
			break;

		case RL_OVERLAPISCONTAINED:
		case RL_OVERLAPENDSAFTER:
			/* The range of interest hits an invalid block before the end: */
			if (invalid_range->rl_start == ap->a_foffset) {
				/* There's actually no valid information to be had starting here: */
				*ap->a_bpn = (daddr64_t)-1;
				if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
				    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
					bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
				}
			} else {
				bytesContAvail = invalid_range->rl_start - ap->a_foffset;
			}
			break;

		case RL_NOOVERLAP:
			break;
		} /* end switch */
		if (bytesContAvail > ap->a_size)
			bytesContAvail = ap->a_size;
	}

exit:
	if (retval == 0) {
		if (ap->a_run)
			*ap->a_run = bytesContAvail;

		if (ap->a_poff)
			*(int *)ap->a_poff = 0;
	}

	if (tooklock)
		hfs_unlock(cp);

	return (MacToVFSError(retval));
}
/*
 * prepare and issue the I/O
 * buf_strategy knows how to deal
 * with requests that require
 * fragmented I/Os
 */
int
hfs_vnop_strategy(struct vnop_strategy_args *ap)
{
	buf_t bp = ap->a_bp;
	vnode_t vp = buf_vnode(bp);
	int error = 0;

#if CONFIG_PROTECT
	cnode_t *cp = NULL;

	if ((cp = cp_get_protected_cnode(vp)) != NULL) {
		/*
		 * Some paths to hfs_vnop_strategy will take the cnode lock,
		 * and some won't. But since content protection is only enabled
		 * for files that (a) aren't system files and (b) are regular
		 * files, any valid cnode here will be unlocked.
		 */
		hfs_lock(cp, HFS_SHARED_LOCK);
		buf_setcpaddr(bp, cp->c_cpentry);
	}
#endif /* CONFIG_PROTECT */

	error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);

#if CONFIG_PROTECT
	if (cp) {
		hfs_unlock(cp);
	}
#endif

	return error;
}
static int
hfs_minorupdate(struct vnode *vp) {
	struct cnode *cp = VTOC(vp);
	cp->c_flag &= ~C_MODIFIED;
	cp->c_touch_acctime = 0;
	cp->c_touch_chgtime = 0;
	cp->c_touch_modtime = 0;

	return 0;
}
static int
do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context)
{
	register struct cnode *cp = VTOC(vp);
	struct filefork *fp = VTOF(vp);
	struct proc *p = vfs_context_proc(context);
	kauth_cred_t cred = vfs_context_ucred(context);
	int retval;
	off_t bytesToAdd;
	off_t actualBytesAdded;
	off_t filebytes;
	u_int32_t fileblocks;
	int blksize;
	struct hfsmount *hfsmp;
	int lockflags;

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
		 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);

	if (length < 0)
		return (EINVAL);

	/* This should only happen with a corrupt filesystem */
	if ((off_t)fp->ff_size < 0)
		return (EINVAL);

	if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
		return (EFBIG);

	hfsmp = VTOHFS(vp);

	retval = E_NONE;

	/* Files that are changing size are not hot file candidates. */
	if (hfsmp->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;
	}

	/*
	 * We cannot just check if fp->ff_size == length (as an optimization)
	 * since there may be extra physical blocks that also need truncation.
	 */
#if QUOTA
	if ((retval = hfs_getinoquota(cp)))
		return (retval);
#endif /* QUOTA */

	/*
	 * Lengthen the size of the file. We must ensure that the
	 * last byte of the file is allocated. Since the smallest
	 * value of ff_size is 0, length will be at least 1.
	 */
	if (length > (off_t)fp->ff_size) {
#if QUOTA
		retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
				   cred, 0);
		if (retval)
			goto Err_Exit;
#endif /* QUOTA */
		/*
		 * If we don't have enough physical space then
		 * we need to extend the physical size.
		 */
		if (length > filebytes) {
			int eflags;
			u_int32_t blockHint = 0;

			/* All or nothing and don't round up to clumpsize. */
			eflags = kEFAllMask | kEFNoClumpMask;

			if (cred && suser(cred, NULL) != 0)
				eflags |= kEFReserveMask;  /* keep a reserve */

			/*
			 * Allocate Journal and Quota files in metadata zone.
			 */
			if (filebytes == 0 &&
			    hfsmp->hfs_flags & HFS_METADATA_ZONE &&
			    hfs_virtualmetafile(cp)) {
				eflags |= kEFMetadataMask;
				blockHint = hfsmp->hfs_metazone_start;
			}
			if (hfs_start_transaction(hfsmp) != 0) {
				retval = EINVAL;
				goto Err_Exit;
			}

			/* Protect extents b-tree and allocation bitmap */
			lockflags = SFL_BITMAP;
			if (overflow_extents(fp))
				lockflags |= SFL_EXTENTS;
			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

			while ((length > filebytes) && (retval == E_NONE)) {
				bytesToAdd = length - filebytes;
				retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
						(FCB *)fp,
						bytesToAdd,
						blockHint,
						eflags,
						&actualBytesAdded));

				filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
				if (actualBytesAdded == 0 && retval == E_NONE) {
					if (length > filebytes)
						length = filebytes;
					break;
				}
			} /* endwhile */

			hfs_systemfile_unlock(hfsmp, lockflags);

			if (hfsmp->jnl) {
				if (skipupdate) {
					(void) hfs_minorupdate(vp);
				} else {
					(void) hfs_update(vp, TRUE);
					(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
				}
			}

			hfs_end_transaction(hfsmp);

			if (retval)
				goto Err_Exit;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
				(int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
		}

		if (!(flags & IO_NOZEROFILL)) {
			if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) {
				struct rl_entry *invalid_range;
				off_t zero_limit;

				zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
				if (length < zero_limit) zero_limit = length;
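				/*
				 * zero_limit is ff_size rounded up to the next page
				 * boundary, capped at the new length: only the tail of
				 * the current last page needs explicit zeroing here;
				 * anything beyond it is tracked in the invalid-range
				 * list instead.
				 */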
				if (length > (off_t)fp->ff_size) {
					struct timeval tv;

					/* Extending the file: time to fill out the current last page w. zeroes? */
					if ((fp->ff_size & PAGE_MASK_64) &&
					    (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
					    fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {

						/* There's some valid data at the start of the (current) last page
						   of the file, so zero out the remainder of that page to ensure the
						   entire page contains valid data.  Since there is no invalid range
						   possible past the (current) eof, there's no need to remove anything
						   from the invalid range list before calling cluster_write(): */
						hfs_unlock(cp);
						retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
								fp->ff_size, (off_t)0,
								(flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
						hfs_lock(cp, HFS_FORCE_LOCK);
						if (retval) goto Err_Exit;

						/* Merely invalidate the remaining area, if necessary: */
						if (length > zero_limit) {
							microuptime(&tv);
							rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
							cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
						}
					} else {
						/* The page containing the (current) eof is invalid: just add the
						   remainder of the page to the invalid list, along with the area
						   being newly allocated:
						 */
						microuptime(&tv);
						rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
						cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
					}
				}
			} else {
				panic("hfs_truncate: invoked on non-UBC object?!");
			}
		}
		cp->c_touch_modtime = TRUE;
		fp->ff_size = length;

	} else { /* Shorten the size of the file */
		if ((off_t)fp->ff_size > length) {
			/* Any space previously marked as invalid is now irrelevant: */
			rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
		}

		/*
		 * Account for any unmapped blocks. Note that the new
		 * file length can still end up with unmapped blocks.
		 */
		if (fp->ff_unallocblocks > 0) {
			u_int32_t finalblks;
			u_int32_t loanedBlocks;

			HFS_MOUNT_LOCK(hfsmp, TRUE);

			loanedBlocks = fp->ff_unallocblocks;
			cp->c_blocks -= loanedBlocks;
			fp->ff_blocks -= loanedBlocks;
			fp->ff_unallocblocks = 0;

			hfsmp->loanedBlocks -= loanedBlocks;

			finalblks = (length + blksize - 1) / blksize;
			if (finalblks > fp->ff_blocks) {
				/* calculate required unmapped blocks */
				loanedBlocks = finalblks - fp->ff_blocks;
				hfsmp->loanedBlocks += loanedBlocks;

				fp->ff_unallocblocks = loanedBlocks;
				cp->c_blocks += loanedBlocks;
				fp->ff_blocks += loanedBlocks;
			}
			HFS_MOUNT_UNLOCK(hfsmp, TRUE);
		}
		/*
		 * For a TBE process the deallocation of the file blocks is
		 * delayed until the file is closed.  And hfs_close calls
		 * truncate with the IO_NDELAY flag set.  So when IO_NDELAY
		 * isn't set, we make sure this isn't a TBE process.
		 */
		if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
#if QUOTA
			off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
#endif /* QUOTA */
			if (hfs_start_transaction(hfsmp) != 0) {
				retval = EINVAL;
				goto Err_Exit;
			}

			if (fp->ff_unallocblocks == 0) {
				/* Protect extents b-tree and allocation bitmap */
				lockflags = SFL_BITMAP;
				if (overflow_extents(fp))
					lockflags |= SFL_EXTENTS;
				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

				retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB *)fp, length, 0,
						FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));

				hfs_systemfile_unlock(hfsmp, lockflags);
			}
			if (hfsmp->jnl) {
				if (retval == 0) {
					fp->ff_size = length;
				}
				if (skipupdate) {
					(void) hfs_minorupdate(vp);
				} else {
					(void) hfs_update(vp, TRUE);
					(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
				}
			}
			hfs_end_transaction(hfsmp);

			filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
			if (retval)
				goto Err_Exit;
#if QUOTA
			/* These are bytesreleased */
			(void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
#endif /* QUOTA */
		}
		/* Only set update flag if the logical length changes */
		if ((off_t)fp->ff_size != length)
			cp->c_touch_modtime = TRUE;
		fp->ff_size = length;
	}
	if (cp->c_mode & (S_ISUID | S_ISGID)) {
		if (!vfs_context_issuser(context)) {
			cp->c_mode &= ~(S_ISUID | S_ISGID);
			skipupdate = 0;
		}
	}
	if (skipupdate) {
		retval = hfs_minorupdate(vp);
	} else {
		cp->c_touch_chgtime = TRUE;	/* status changed */
		cp->c_touch_modtime = TRUE;	/* file data was modified */
		retval = hfs_update(vp, MNT_WAIT);
	}
	if (retval) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
			-1, -1, -1, retval, 0);
	}

Err_Exit:

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
		(int)length, (int)fp->ff_size, (int)filebytes, retval, 0);

	return (retval);
}
/*
 * Preparation which must be done prior to deleting the catalog record
 * of a file or directory.  In order to make the on-disk state as safe as
 * possible, we remove the catalog entry before releasing the bitmap blocks
 * and the overflow extent records.  However, some work must be done prior
 * to deleting the catalog record.
 *
 * When calling this function, the cnode must exist both in memory and on-disk.
 * If there are both resource fork and data fork vnodes, this function should
 * be called on both.
 */
int
hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {

	struct filefork *fp = VTOF(vp);
	struct cnode *cp = VTOC(vp);
#if QUOTA
	int retval = 0;
#endif /* QUOTA */

	/* Cannot truncate an HFS directory! */
	if (vnode_isdir(vp)) {
		return (EISDIR);
	}

	/*
	 * See the comment below in hfs_truncate for why we need to call
	 * setsize here.  Essentially we want to avoid pending IO if we
	 * already know that the blocks are going to be released here.
	 * This function is only called when totally removing all storage for a file, so
	 * we can take a shortcut and immediately setsize (0);
	 */
	ubc_setsize(vp, 0);

	/* This should only happen with a corrupt filesystem */
	if ((off_t)fp->ff_size < 0)
		return (EINVAL);

	/*
	 * We cannot just check if fp->ff_size == length (as an optimization)
	 * since there may be extra physical blocks that also need truncation.
	 */
#if QUOTA
	if ((retval = hfs_getinoquota(cp))) {
		return (retval);
	}
#endif /* QUOTA */

	/* Wipe out any invalid ranges which have yet to be backed by disk */
	rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);

	/*
	 * Account for any unmapped blocks. Since we're deleting the
	 * entire file, we don't have to worry about just shrinking
	 * to a smaller number of borrowed blocks.
	 */
	if (fp->ff_unallocblocks > 0) {
		u_int32_t loanedBlocks;

		HFS_MOUNT_LOCK(hfsmp, TRUE);

		loanedBlocks = fp->ff_unallocblocks;
		cp->c_blocks -= loanedBlocks;
		fp->ff_blocks -= loanedBlocks;
		fp->ff_unallocblocks = 0;

		hfsmp->loanedBlocks -= loanedBlocks;

		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
	}

	return 0;
}
/*
 * Special wrapper around calling TruncateFileC.  This function is usable
 * even when the catalog record does not exist any longer, making it ideal
 * for use when deleting a file.  The simplification here is that we know
 * that we are releasing all blocks.
 *
 * The caller is responsible for saving off a copy of the filefork(s)
 * embedded within the cnode prior to calling this function.  The pointers
 * supplied as arguments must be valid even if the cnode is no longer valid.
 */
int
hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
		     struct filefork *rsrcfork, u_int32_t fileid) {

	off_t filebytes;
	u_int32_t fileblocks;
	int blksize = 0;
	int error = 0;
	int lockflags;

	blksize = hfsmp->blockSize;

	/* Data fork */
	if (datafork->ff_blocks > 0) {
		fileblocks = datafork->ff_blocks;
		filebytes = (off_t)fileblocks * (off_t)blksize;

		/* We killed invalid ranges and loaned blocks before we removed the catalog entry */

		while (filebytes > 0) {
			if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(datafork)) {
				filebytes -= HFS_BIGFILE_SIZE;
			} else {
				filebytes = 0;
			}

			/* Start a transaction, and wipe out as many blocks as we can in this iteration */
			if (hfs_start_transaction(hfsmp) != 0) {
				error = EINVAL;
				break;
			}

			if (datafork->ff_unallocblocks == 0) {
				/* Protect extents b-tree and allocation bitmap */
				lockflags = SFL_BITMAP;
				if (overflow_extents(datafork))
					lockflags |= SFL_EXTENTS;
				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));

				hfs_systemfile_unlock(hfsmp, lockflags);
			}
			if (error == 0) {
				datafork->ff_size = filebytes;
			}
			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			/* Finish the transaction and start over if necessary */
			hfs_end_transaction(hfsmp);

			if (error) {
				break;
			}
		}
	}

	/* Resource fork */
	if (error == 0 && (rsrcfork != NULL) && rsrcfork->ff_blocks > 0) {
		fileblocks = rsrcfork->ff_blocks;
		filebytes = (off_t)fileblocks * (off_t)blksize;

		/* We killed invalid ranges and loaned blocks before we removed the catalog entry */

		while (filebytes > 0) {
			if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(rsrcfork)) {
				filebytes -= HFS_BIGFILE_SIZE;
			} else {
				filebytes = 0;
			}

			/* Start a transaction, and wipe out as many blocks as we can in this iteration */
			if (hfs_start_transaction(hfsmp) != 0) {
				error = EINVAL;
				break;
			}

			if (rsrcfork->ff_unallocblocks == 0) {
				/* Protect extents b-tree and allocation bitmap */
				lockflags = SFL_BITMAP;
				if (overflow_extents(rsrcfork))
					lockflags |= SFL_EXTENTS;
				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));

				hfs_systemfile_unlock(hfsmp, lockflags);
			}
			if (error == 0) {
				rsrcfork->ff_size = filebytes;
			}
			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			/* Finish the transaction and start over if necessary */
			hfs_end_transaction(hfsmp);

			if (error) {
				break;
			}
		}
	}

	return error;
}
/*
 * Truncate a cnode to at most length size, freeing (or adding) the
 * disk blocks.
 */
int
hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
	     int skipupdate, vfs_context_t context)
{
	struct filefork *fp = VTOF(vp);
	off_t filebytes;
	u_int32_t fileblocks;
	int blksize, error = 0;
	struct cnode *cp = VTOC(vp);

	/* Cannot truncate an HFS directory! */
	if (vnode_isdir(vp)) {
		return (EISDIR);
	}
	/* A swap file cannot change size. */
	if (vnode_isswap(vp) && (length != 0)) {
		return (EPERM);
	}

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	//
	// Have to do this here so that we don't wind up with
	// i/o pending for blocks that are about to be released
	// if we truncate the file.
	//
	// If skipsetsize is set, then the caller is responsible
	// for the ubc_setsize.
	//
	// Even if skipsetsize is set, if the length is zero we
	// want to call ubc_setsize() because as of SnowLeopard
	// it will no longer cause any page-ins and it will drop
	// any dirty pages so that we don't do any i/o that we
	// don't have to.  This also prevents a race where i/o
	// for truncated blocks may overwrite later data if the
	// blocks get reallocated to a different file.
	//
	if (!skipsetsize || length == 0)
		ubc_setsize(vp, length);

	// have to loop truncating or growing files that are
	// really big because otherwise transactions can get
	// enormous and consume too many kernel resources.
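	//
	// Each pass below moves to an intermediate size at most
	// HFS_BIGFILE_SIZE away from the current physical size (when the fork
	// has overflow extents), so a single do_hfs_truncate() call never has
	// to cover an unbounded number of extents in one journal transaction.
	//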
	if (length < filebytes) {
		while (filebytes > length) {
			if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
				filebytes -= HFS_BIGFILE_SIZE;
			} else {
				filebytes = length;
			}
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
			if (error)
				break;
		}
	} else if (length > filebytes) {
		while (filebytes < length) {
			if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
				filebytes += HFS_BIGFILE_SIZE;
			} else {
				filebytes = length;
			}
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
			if (error)
				break;
		}
	} else /* Same logical size */ {

		error = do_hfs_truncate(vp, length, flags, skipupdate, context);
	}
	/* Files that are changing size are not hot file candidates. */
	if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;
	}

	return (error);
}
/*
 * Preallocate file storage space.
 */
int
hfs_vnop_allocate(struct vnop_allocate_args /* {
		vnode_t a_vp;
		off_t a_length;
		u_int32_t a_flags;
		off_t *a_bytesallocated;
		off_t a_offset;
		vfs_context_t a_context;
	} */ *ap)
{
	struct vnode *vp = ap->a_vp;
	struct cnode *cp;
	struct filefork *fp;
	ExtendedVCB *vcb;
	off_t length = ap->a_length;
	off_t startingPEOF;
	off_t moreBytesRequested;
	off_t actualBytesAdded;
	off_t filebytes;
	u_int32_t fileblocks;
	int retval, retval2;
	u_int32_t blockHint;
	u_int32_t extendFlags;	/* For call to ExtendFileC */
	struct hfsmount *hfsmp;
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	int lockflags;
	time_t orig_ctime;

	*(ap->a_bytesallocated) = 0;

	if (!vnode_isreg(vp))
		return (EISDIR);
	if (length < (off_t)0)
		return (EINVAL);

	cp = VTOC(vp);

	orig_ctime = VTOC(vp)->c_ctime;

	check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);

	hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);

	if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
		goto Err_Exit;
	}

	fp = VTOF(vp);
	hfsmp = VTOHFS(vp);
	vcb = VTOVCB(vp);

	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;

	if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
		retval = EINVAL;
		goto Err_Exit;
	}

	/* Fill in the flags word for the call to Extend the file */

	extendFlags = kEFNoClumpMask;
	if (ap->a_flags & ALLOCATECONTIG)
		extendFlags |= kEFContigMask;
	if (ap->a_flags & ALLOCATEALL)
		extendFlags |= kEFAllMask;
	if (cred && suser(cred, NULL) != 0)
		extendFlags |= kEFReserveMask;
	if (hfs_virtualmetafile(cp))
		extendFlags |= kEFMetadataMask;

	retval = E_NONE;
	blockHint = 0;
	startingPEOF = filebytes;

	if (ap->a_flags & ALLOCATEFROMPEOF)
		length += filebytes;
	else if (ap->a_flags & ALLOCATEFROMVOL)
		blockHint = ap->a_offset / VTOVCB(vp)->blockSize;

	/* If no changes are necessary, then we're done */
	if (filebytes == length)
		goto Std_Exit;

	/*
	 * Lengthen the size of the file. We must ensure that the
	 * last byte of the file is allocated. Since the smallest
	 * value of filebytes is 0, length will be at least 1.
	 */
	if (length > filebytes) {
		off_t total_bytes_added = 0, orig_request_size;

		orig_request_size = moreBytesRequested = length - filebytes;

#if QUOTA
		retval = hfs_chkdq(cp,
				(int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
				cred, 0);
		if (retval)
			goto Err_Exit;
#endif /* QUOTA */
		/*
		 * Metadata zone checks.
		 */
		if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
			/*
			 * Allocate Journal and Quota files in metadata zone.
			 */
			if (hfs_virtualmetafile(cp)) {
				blockHint = hfsmp->hfs_metazone_start;
			} else if ((blockHint >= hfsmp->hfs_metazone_start) &&
				   (blockHint <= hfsmp->hfs_metazone_end)) {
				/*
				 * Move blockHint outside metadata zone.
				 */
				blockHint = hfsmp->hfs_metazone_end + 1;
			}
		}

		while ((length > filebytes) && (retval == E_NONE)) {
			off_t bytesRequested;

			if (hfs_start_transaction(hfsmp) != 0) {
				retval = EINVAL;
				goto Err_Exit;
			}

			/* Protect extents b-tree and allocation bitmap */
			lockflags = SFL_BITMAP;
			if (overflow_extents(fp))
				lockflags |= SFL_EXTENTS;
			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

			if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
				bytesRequested = HFS_BIGFILE_SIZE;
			} else {
				bytesRequested = moreBytesRequested;
			}

			if (extendFlags & kEFContigMask) {
				// if we're on a sparse device, this will force it to do a
				// full scan to find the space needed.
				hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
			}

			retval = MacToVFSError(ExtendFileC(vcb,
					(FCB *)fp,
					bytesRequested,
					blockHint,
					extendFlags,
					&actualBytesAdded));

			if (retval == E_NONE) {
				*(ap->a_bytesallocated) += actualBytesAdded;
				total_bytes_added += actualBytesAdded;
				moreBytesRequested -= actualBytesAdded;
				if (blockHint != 0) {
					blockHint += actualBytesAdded / vcb->blockSize;
				}
			}
			filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;

			hfs_systemfile_unlock(hfsmp, lockflags);

			if (hfsmp->jnl) {
				(void) hfs_update(vp, TRUE);
				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
			}

			hfs_end_transaction(hfsmp);
		}

		/*
		 * if we get an error and no changes were made then exit
		 * otherwise we must do the hfs_update to reflect the changes
		 */
		if (retval && (startingPEOF == filebytes))
			goto Err_Exit;

		/*
		 * Adjust actualBytesAdded to be allocation block aligned, not
		 * clump size aligned.
		 * NOTE: So what we are reporting does not affect reality
		 * until the file is closed, when we truncate the file to allocation
		 * block size.
		 */
		if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
			*(ap->a_bytesallocated) =
				roundup(orig_request_size, (off_t)vcb->blockSize);

	} else { /* Shorten the size of the file */

		if (fp->ff_size > length) {
			/*
			 * Any buffers that are past the truncation point need to be
			 * invalidated (to maintain buffer cache consistency).
			 */
		}

		retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context);
		filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;

		/*
		 * if we get an error and no changes were made then exit
		 * otherwise we must do the hfs_update to reflect the changes
		 */
		if (retval && (startingPEOF == filebytes)) goto Err_Exit;
#if QUOTA
		/* These are bytesreleased */
		(void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED, 0);
#endif /* QUOTA */

		if (fp->ff_size > filebytes) {
			fp->ff_size = filebytes;

			hfs_unlock(cp);
			ubc_setsize(vp, fp->ff_size);
			hfs_lock(cp, HFS_FORCE_LOCK);
		}
	}

Std_Exit:
	cp->c_touch_chgtime = TRUE;
	cp->c_touch_modtime = TRUE;
	retval2 = hfs_update(vp, MNT_WAIT);

	if (retval == 0)
		retval = retval2;
Err_Exit:
	hfs_unlock_truncate(cp, 0);
	hfs_unlock(cp);
	return (retval);
}
/*
 * Pagein for HFS filesystem
 */
int
hfs_vnop_pagein(struct vnop_pagein_args *ap)
/*
	struct vnop_pagein_args {
		vnode_t       a_vp,
		upl_t         a_pl,
		vm_offset_t   a_pl_offset,
		off_t         a_f_offset,
		size_t        a_size,
		int           a_flags
		vfs_context_t a_context;
	};
*/
{
	vnode_t 	vp;
	struct cnode	*cp;
	struct filefork *fp;
	int		error = 0;
	upl_t 		upl;
	upl_page_info_t	*pl;
	off_t		f_offset;
	int		offset;
	int		isize;
	int		pg_index;
	boolean_t	truncate_lock_held = FALSE;
	boolean_t 	file_converted = FALSE;
	kern_return_t	kret;

	vp = ap->a_vp;
	cp = VTOC(vp);
	fp = VTOF(vp);

#if CONFIG_PROTECT
	if ((error = cp_handle_vnop(cp, CP_READ_ACCESS | CP_WRITE_ACCESS)) != 0) {
		return error;
	}
#endif /* CONFIG_PROTECT */

	if (ap->a_pl != NULL) {
		/*
		 * this can only happen for swap files now that
		 * we're asking for V2 paging behavior...
		 * so don't need to worry about decompression, or
		 * keeping track of blocks read or taking the truncate lock
		 */
		error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
				       ap->a_size, (off_t)fp->ff_size, ap->a_flags);
		goto pagein_done;
	}

retry_pagein:
	/*
	 * take truncate lock (shared/recursive) to guard against
	 * zero-fill thru fsync interfering, but only for v2
	 *
	 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
	 * lock shared and we are allowed to recurse 1 level if this thread already
	 * owns the lock exclusively... this can legally occur
	 * if we are doing a shrinking ftruncate against a file
	 * that is mapped private, and the pages being truncated
	 * do not currently exist in the cache... in that case
	 * we will have to page-in the missing pages in order
	 * to provide them to the private mapping... we must
	 * also call hfs_unlock_truncate with a positive been_recursed
	 * arg to indicate that if we have recursed, there is no need to drop
	 * the lock. Allowing this simple recursion is necessary
	 * in order to avoid a certain deadlock... since the ftruncate
	 * already holds the truncate lock exclusively, if we try
	 * to acquire it shared to protect the pagein path, we will
	 * deadlock.
	 *
	 * NOTE: The if () block below is a workaround in order to prevent a
	 * VM deadlock. See rdar://7853471.
	 *
	 * If we are in a forced unmount, then launchd will still have the
	 * dyld_shared_cache file mapped as it is trying to reboot.  If we
	 * take the truncate lock here to service a page fault, then our
	 * thread could deadlock with the forced-unmount.  The forced unmount
	 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
	 * marked C_DELETED, it will call ubc_setsize(0).  As a result, the unmount
	 * thread will think it needs to copy all of the data out of the file
	 * and into a VM copy object.  If we hold the cnode lock here, then that
	 * VM operation will not be able to proceed, because we'll set a busy page
	 * before attempting to grab the lock.  Note that this isn't as simple as "don't
	 * call ubc_setsize" because doing that would just shift the problem to the
	 * ubc_msync done before the vnode is reclaimed.
	 *
	 * So, if a forced unmount on this volume is in flight AND the cnode is
	 * marked C_DELETED, then just go ahead and do the page in without taking
	 * the lock (thus suspending pagein_v2 semantics temporarily).  Since it's on a file
	 * that is not going to be available on the next mount, this seems like an
	 * OK solution from a correctness point of view, even though it is hacky.
	 */
	if (vfs_isforce(vp->v_mount)) {
		if (cp->c_flag & C_DELETED) {
			/* If we don't get it, then just go ahead and operate without the lock */
			truncate_lock_held = hfs_try_trunclock(cp, HFS_RECURSE_TRUNCLOCK);
		}
	} else {
		hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK);
		truncate_lock_held = TRUE;
	}

	kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);

	if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
		error = EINVAL;
		goto pagein_done;
	}
	isize = ap->a_size;

	/*
	 * Scan from the back to find the last page in the UPL, so that we
	 * aren't looking at a UPL that may have already been freed by the
	 * preceding aborts/completions.
	 */
	for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
		if (upl_page_present(pl, --pg_index))
			break;
		if (pg_index == 0) {
			/*
			 * no absent pages were found in the range specified
			 * just abort the UPL to get rid of it and then we're done
			 */
			ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
			goto pagein_done;
		}
	}
	/*
	 * initialize the offset variables before we touch the UPL.
	 * f_offset is the position into the file, in bytes
	 * offset is the position into the UPL, in bytes
	 * pg_index is the pg# of the UPL we're operating on
	 * isize is the offset into the UPL of the last page that is present.
	 */
	isize = ((pg_index + 1) * PAGE_SIZE);
	pg_index = 0;
	offset = 0;
	f_offset = ap->a_f_offset;

	while (isize) {
		int  xsize;
		int  num_of_pages;

		if ( !upl_page_present(pl, pg_index)) {
			/*
			 * we asked for RET_ONLY_ABSENT, so it's possible
			 * to get back empty slots in the UPL.
			 * just skip over them
			 */
			f_offset += PAGE_SIZE;
			offset   += PAGE_SIZE;
			isize    -= PAGE_SIZE;
			pg_index++;

			continue;
		}
		/*
		 * We know that we have at least one absent page.
		 * Now checking to see how many in a row we have
		 */
		num_of_pages = 1;
		xsize = isize - PAGE_SIZE;

		while (xsize) {
			if ( !upl_page_present(pl, pg_index + num_of_pages))
				break;
			num_of_pages++;
			xsize -= PAGE_SIZE;
		}
		xsize = num_of_pages * PAGE_SIZE;
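		/*
		 * xsize now spans the contiguous run of absent pages starting at
		 * pg_index; the whole run is handed to decmpfs or cluster_pagein
		 * as a single request below.
		 */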
#if HFS_COMPRESSION
		if (VNODE_IS_RSRC(vp)) {
			/* allow pageins of the resource fork */
		} else {
			int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */

			if (compressed) {
				if (truncate_lock_held) {
					/*
					 * can't hold the truncate lock when calling into the decmpfs layer
					 * since it calls back into this layer... even though we're only
					 * holding the lock in shared mode, and the re-entrant path only
					 * takes the lock shared, we can deadlock if some other thread
					 * tries to grab the lock exclusively in between.
					 */
					hfs_unlock_truncate(cp, 1);
					truncate_lock_held = FALSE;
				}
				ap->a_pl = upl;
				ap->a_pl_offset = offset;
				ap->a_f_offset = f_offset;
				ap->a_size = xsize;

				error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
				/*
				 * note that decmpfs_pagein_compressed can change the state of
				 * 'compressed'... it will set it to 0 if the file is no longer
				 * compressed once the compression lock is successfully taken
				 * i.e. we would block on that lock while the file is being inflated
				 */
				if (compressed) {
					if (error == 0) {
						/* successful page-in, update the access time */
						VTOC(vp)->c_touch_acctime = TRUE;

						/* compressed files are not hot file candidates */
						if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
							fp->ff_bytesread = 0;
						}
					} else if (error == EAGAIN) {
						/*
						 * EAGAIN indicates someone else already holds the compression lock...
						 * to avoid deadlocking, we'll abort this range of pages with an
						 * indication that the pagein needs to be redriven
						 */
						ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
					}
					goto pagein_next_range;
				} else {
					/*
					 * Set file_converted only if the file became decompressed while we were
					 * paging in.  If it were still compressed, we would re-start the loop using the goto
					 * in the above block.  This avoids overloading truncate_lock_held as our retry_pagein
					 * condition below, since we could have avoided taking the truncate lock to prevent
					 * a deadlock in the force unmount case.
					 */
					file_converted = TRUE;
				}
			}
			if (file_converted == TRUE) {
				/*
				 * the file was converted back to a regular file after we first saw it as compressed
				 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
				 * reset a_size so that we consider what remains of the original request
				 * and null out a_upl and a_pl_offset.
				 *
				 * We should only be able to get into this block if the decmpfs_pagein_compressed
				 * successfully decompressed the range in question for this file.
				 */
				ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);

				ap->a_size = isize;
				ap->a_pl = NULL;
				ap->a_pl_offset = 0;

				/* Reset file_converted back to false so that we don't infinite-loop. */
				file_converted = FALSE;
				goto retry_pagein;
			}
		}
#endif /* HFS_COMPRESSION */

		error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);

		/*
		 * Keep track of blocks read.
		 */
		if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
			int bytesread;
			int took_cnode_lock = 0;

			if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
				bytesread = fp->ff_size;
			else
				bytesread = xsize;

			/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
			if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
				hfs_lock(cp, HFS_FORCE_LOCK);
				took_cnode_lock = 1;
			}
			/*
			 * If this file hasn't been seen since the start of
			 * the current sampling period then start over.
			 */
			if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
				struct timeval tv;

				fp->ff_bytesread = bytesread;
				microtime(&tv);
				cp->c_atime = tv.tv_sec;
			} else {
				fp->ff_bytesread += bytesread;
			}
			cp->c_touch_acctime = TRUE;
			if (took_cnode_lock)
				hfs_unlock(cp);
		}
pagein_next_range:
		f_offset += xsize;
		offset   += xsize;
		isize    -= xsize;
		pg_index += num_of_pages;

		error = 0;
	}

pagein_done:
	if (truncate_lock_held == TRUE) {
		/* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
		hfs_unlock_truncate(cp, 1);
	}

	return (error);
}
/*
 * Pageout for HFS filesystem.
 */
int
hfs_vnop_pageout(struct vnop_pageout_args *ap)
/*
	struct vnop_pageout_args {
		vnode_t       a_vp,
		upl_t         a_pl,
		vm_offset_t   a_pl_offset,
		off_t         a_f_offset,
		size_t        a_size,
		int           a_flags
		vfs_context_t a_context;
	};
*/
{
	vnode_t vp = ap->a_vp;
	struct cnode *cp;
	struct filefork *fp;
	int retval = 0;
	off_t filesize;
	upl_t upl;
	upl_page_info_t *pl;
	vm_offset_t a_pl_offset;
	int a_flags;
	int is_pageoutv2 = 0;
	kern_return_t kret;

	cp = VTOC(vp);
	fp = VTOF(vp);

	/*
	 * Figure out where the file ends, for pageout purposes.  If
	 * ff_new_size > ff_size, then we're in the middle of extending the
	 * file via a write, so it is safe (and necessary) that we be able
	 * to pageout up to that point.
	 */
	filesize = fp->ff_size;
	if (fp->ff_new_size > filesize)
		filesize = fp->ff_new_size;

	a_flags = ap->a_flags;
	a_pl_offset = ap->a_pl_offset;

	/*
	 * we can tell if we're getting the new or old behavior from the UPL
	 */
	if ((upl = ap->a_pl) == NULL) {
		int request_flags;

		is_pageoutv2 = 1;
		/*
		 * we're in control of any UPL we commit
		 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
		 */
		a_flags &= ~UPL_NOCOMMIT;
		a_pl_offset = 0;

		/*
		 * take truncate lock (shared) to guard against
		 * zero-fill thru fsync interfering, but only for v2
		 */
		hfs_lock_truncate(cp, HFS_SHARED_LOCK);

		if (a_flags & UPL_MSYNC) {
			request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
		} else {
			request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
		}

		kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);

		if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
			retval = EINVAL;
			goto pageout_done;
		}
	}
	/*
	 * from this point forward upl points at the UPL we're working with
	 * it was either passed in or we successfully created it
	 */

	/*
	 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
	 * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
	 * scanning for dirty ranges.  We'll issue our own N cluster_pageout calls, for
	 * N dirty ranges in the UPL. Note that this is almost a direct copy of the
	 * logic in vnode_pageout except that we need to do it after grabbing the truncate
	 * lock in HFS so that we don't lock invert ourselves.
	 *
	 * Note that we can still get into this function on behalf of the default pager with
	 * non-V2 behavior (swapfiles).  However in that case, we did not grab locks above
	 * since fsync and other writing threads will grab the locks, then mark the
	 * relevant pages as busy.  But the pageout codepath marks the pages as busy,
	 * and THEN would attempt to grab the truncate lock, which would result in deadlock.  So
	 * we do not try to grab anything for the pre-V2 case, which should only be accessed
	 * by the paging/VM system.
	 */

	if (is_pageoutv2) {
		off_t f_offset;
		int offset;
		int isize;
		int pg_index;
		int error;
		int error_ret = 0;

		isize = ap->a_size;
		f_offset = ap->a_f_offset;

		/*
		 * Scan from the back to find the last page in the UPL, so that we
		 * aren't looking at a UPL that may have already been freed by the
		 * preceding aborts/completions.
		 */
		for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
			if (upl_page_present(pl, --pg_index))
				break;
			if (pg_index == 0) {
				ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
				goto pageout_done;
			}
		}

		/*
		 * initialize the offset variables before we touch the UPL.
		 * a_f_offset is the position into the file, in bytes
		 * offset is the position into the UPL, in bytes
		 * pg_index is the pg# of the UPL we're operating on.
		 * isize is the offset into the UPL of the last non-clean page.
		 */
		isize = ((pg_index + 1) * PAGE_SIZE);

		offset = 0;
		pg_index = 0;

		while (isize) {
			int  xsize;
			int  num_of_pages;

			if ( !upl_page_present(pl, pg_index)) {
				/*
				 * we asked for RET_ONLY_DIRTY, so it's possible
				 * to get back empty slots in the UPL.
				 * just skip over them
				 */
				f_offset += PAGE_SIZE;
				offset   += PAGE_SIZE;
				isize    -= PAGE_SIZE;
				pg_index++;

				continue;
			}
			if ( !upl_dirty_page(pl, pg_index)) {
				panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
			}

			/*
			 * We know that we have at least one dirty page.
			 * Now checking to see how many in a row we have
			 */
			num_of_pages = 1;
			xsize = isize - PAGE_SIZE;

			while (xsize) {
				if ( !upl_dirty_page(pl, pg_index + num_of_pages))
					break;
				num_of_pages++;
				xsize -= PAGE_SIZE;
			}
			xsize = num_of_pages * PAGE_SIZE;

			if (!vnode_isswap(vp)) {
				off_t end_of_range;
				int tooklock = 0;

				if (cp->c_lockowner != current_thread()) {
					if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
						/*
						 * we're in the v2 path, so we are the
						 * owner of the UPL... we may have already
						 * processed some of the UPL, so abort it
						 * from the current working offset to the
						 * end of the UPL
						 */
						ubc_upl_abort_range(upl,
								    offset,
								    ap->a_size - offset,
								    UPL_ABORT_FREE_ON_EMPTY);
						goto pageout_done;
					}
					tooklock = 1;
				}
				end_of_range = f_offset + xsize - 1;

				if (end_of_range >= filesize) {
					end_of_range = (off_t)(filesize - 1);
				}
				if (f_offset < filesize) {
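					/*
					 * The pages in this range are about to be written to
					 * disk, so any invalid (not-yet-zero-filled) ranges they
					 * cover can be dropped from the cnode's invalid-range list.
					 */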
					rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
					cp->c_flag |= C_MODIFIED;  /* leof is dirty */
				}
				if (tooklock) {
					hfs_unlock(cp);
				}
			}
			if ((error = cluster_pageout(vp, upl, offset, f_offset,
							xsize, filesize, a_flags))) {
				if (error_ret == 0)
					error_ret = error;
			}
			f_offset += xsize;
			offset   += xsize;
			isize    -= xsize;
			pg_index += num_of_pages;
		}
		/* capture errnos bubbled out of cluster_pageout if they occurred */
		if (error_ret != 0) {
			retval = error_ret;
		}
	} /* end block for v2 pageout behavior */
	else {
		if (!vnode_isswap(vp)) {
			off_t end_of_range;
			int tooklock = 0;

			if (cp->c_lockowner != current_thread()) {
				if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
					if (!(a_flags & UPL_NOCOMMIT)) {
						ubc_upl_abort_range(upl,
								    a_pl_offset,
								    ap->a_size,
								    UPL_ABORT_FREE_ON_EMPTY);
					}
					goto pageout_done;
				}
				tooklock = 1;
			}
			end_of_range = ap->a_f_offset + ap->a_size - 1;

			if (end_of_range >= filesize) {
				end_of_range = (off_t)(filesize - 1);
			}
			if (ap->a_f_offset < filesize) {
				rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
				cp->c_flag |= C_MODIFIED;  /* leof is dirty */
			}

			if (tooklock) {
				hfs_unlock(cp);
			}
		}
		/*
		 * just call cluster_pageout for old pre-v2 behavior
		 */
		retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
					 ap->a_size, filesize, a_flags);
	}

	/*
	 * If data was written, update the modification time of the file.
	 * If setuid or setgid bits are set and this process is not the
	 * superuser then clear the setuid and setgid bits as a precaution
	 * against tampering.
	 */
	if (retval == 0) {
		cp->c_touch_modtime = TRUE;
		cp->c_touch_chgtime = TRUE;
		if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
		    (vfs_context_suser(ap->a_context) != 0)) {
			hfs_lock(cp, HFS_FORCE_LOCK);
			cp->c_mode &= ~(S_ISUID | S_ISGID);
			hfs_unlock(cp);
		}
	}

pageout_done:
	if (is_pageoutv2) {
		/* release truncate lock (shared) */
		hfs_unlock_truncate(cp, 0);
	}
	return (retval);
}
/*
 * Intercept B-Tree node writes to unswap them if necessary.
 */
int
hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
{
	int retval = 0;
	register struct buf *bp = ap->a_bp;
	register struct vnode *vp = buf_vnode(bp);
	BlockDescriptor block;

	/* Trap B-Tree writes */
	if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
	    (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
	    (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
	    (vp == VTOHFS(vp)->hfc_filevp)) {

		/*
		 * Swap and validate the node if it is in native byte order.
		 * This will always be true on big endian, so we always validate
		 * before writing here.  On little endian, the node typically has
		 * been swapped and validated when it was written to the journal,
		 * so we won't do anything here.
		 */
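		/*
		 * The check below looks at the last two bytes of the node, which
		 * hold the offset of the node's first record.  A value of 0x000e
		 * (14, the size of the BTNodeDescriptor) read in host byte order
		 * means the node is still in native order and must be swapped to
		 * big endian before it goes to disk.
		 */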
		if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
			/* Prepare the block pointer */
			block.blockHeader = bp;
			block.buffer = (char *)buf_dataptr(bp);
			block.blockNum = buf_lblkno(bp);
			/* not found in cache ==> came from disk */
			block.blockReadFromDisk = (buf_fromcache(bp) == 0);
			block.blockSize = buf_count(bp);

			/* Endian un-swap B-Tree node */
			retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
			if (retval)
				panic("hfs_vnop_bwrite: about to write corrupt node!\n");
		}
	}

	/* This buffer shouldn't be locked anymore but if it is clear it */
	if ((buf_flags(bp) & B_LOCKED)) {
		if (VTOHFS(vp)->jnl) {
			panic("hfs: CLEARING the lock bit on bp %p\n", bp);
		}
		buf_clearflags(bp, B_LOCKED);
	}
	retval = vn_bwrite (ap);

	return (retval);
}
/*
 * Relocate a file to a new location on disk
 *  cnode must be locked on entry
 *
 * Relocation occurs by cloning the file's data from its
 * current set of blocks to a new set of blocks. During
 * the relocation all of the blocks (old and new) are
 * owned by the file.
 *
 * -----------------     -----------------
 * |///////////////|     |               |     STEP 1 (acquire new blocks)
 * -----------------     -----------------
 *
 * -----------------     -----------------
 * |///////////////|     |///////////////|     STEP 2 (clone data)
 * -----------------     -----------------
 *
 *                       -----------------
 *                       |///////////////|     STEP 3 (head truncate blocks)
 *                       -----------------
 *
 * During steps 2 and 3 page-outs to file offsets less
 * than or equal to N are suspended.
 *
 * During step 3 page-ins to the file get suspended.
 */
int
hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
	struct proc *p)
{
	struct cnode *cp;
	struct filefork *fp;
	struct hfsmount *hfsmp;
	u_int32_t headblks;
	u_int32_t datablks;
	u_int32_t blksize;
	u_int32_t growsize;
	u_int32_t nextallocsave;
	daddr64_t sector_a, sector_b;
	int eflags;
	off_t newbytes;
	int retval;
	int lockflags = 0;
	int took_trunc_lock = 0;
	int started_tr = 0;
	enum vtype vnodetype;

	vnodetype = vnode_vtype(vp);
	if (vnodetype != VREG && vnodetype != VLNK) {
		return (EPERM);
	}

	hfsmp = VTOHFS(vp);
	if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
		return (ENOSPC);
	}

	cp = VTOC(vp);
	fp = VTOF(vp);
	if (fp->ff_unallocblocks)
		return (EINVAL);

#if CONFIG_PROTECT
	/*
	 * <rdar://problem/9118426>
	 * Disable HFS file relocation on content-protected filesystems
	 */
	if (cp_fs_protected (hfsmp->hfs_mp)) {
		return EINVAL;
	}
#endif
	/* If it's an SSD, also disable HFS relocation */
	if (hfsmp->hfs_flags & HFS_SSD) {
		return EINVAL;
	}

	blksize = hfsmp->blockSize;
	if (blockHint == 0)
		blockHint = hfsmp->nextAllocation;

	if ((fp->ff_size > 0x7fffffff) ||
	    ((fp->ff_size > blksize) && vnodetype == VLNK)) {
		return (EFBIG);
	}

	//
	// We do not believe that this call to hfs_fsync() is
	// necessary and it causes a journal transaction
	// deadlock so we are removing it.
	//
	//if (vnodetype == VREG && !vnode_issystem(vp)) {
	//	retval = hfs_fsync(vp, MNT_WAIT, 0, p);
	//	if (retval)
	//		return (retval);
	//}

	if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
		hfs_unlock(cp);
		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
		/* Force lock since callers expects lock to be held. */
		if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
			hfs_unlock_truncate(cp, 0);
			return (retval);
		}
		/* No need to continue if file was removed. */
		if (cp->c_flag & C_NOEXISTS) {
			hfs_unlock_truncate(cp, 0);
			return (ENOENT);
		}
		took_trunc_lock = 1;
	}
	headblks = fp->ff_blocks;
	datablks = howmany(fp->ff_size, blksize);
	growsize = datablks * blksize;
	eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
	if (blockHint >= hfsmp->hfs_metazone_start &&
	    blockHint <= hfsmp->hfs_metazone_end)
		eflags |= kEFMetadataMask;

	if (hfs_start_transaction(hfsmp) != 0) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, 0);
		return (EINVAL);
	}
	started_tr = 1;
	/*
	 * Protect the extents b-tree and the allocation bitmap
	 * during MapFileBlockC and ExtendFileC operations.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

	retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
	if (retval) {
		retval = MacToVFSError(retval);
		goto out;
	}

	/*
	 * STEP 1 - acquire new allocation blocks.
	 */
	nextallocsave = hfsmp->nextAllocation;
	retval = ExtendFileC(hfsmp, (FCB *)fp, growsize, blockHint, eflags, &newbytes);
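	/*
	 * If this was a metadata-zone allocation, restore the volume's rolling
	 * nextAllocation pointer afterwards so that ordinary allocations are
	 * not steered into the metadata zone by this relocation.
	 */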
	if (eflags & kEFMetadataMask) {
		HFS_MOUNT_LOCK(hfsmp, TRUE);
		HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
		MarkVCBDirty(hfsmp);
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
	}

	retval = MacToVFSError(retval);
	if (retval == 0) {
		cp->c_flag |= C_MODIFIED;
		if (newbytes < growsize) {
			retval = ENOSPC;
			goto restore;
		} else if (fp->ff_blocks < (headblks + datablks)) {
			printf("hfs_relocate: allocation failed");
			retval = ENOSPC;
			goto restore;
		}

		retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
		if (retval) {
			retval = MacToVFSError(retval);
		} else if ((sector_a + 1) == sector_b) {
			retval = ENOSPC;
			goto restore;
		} else if ((eflags & kEFMetadataMask) &&
		           ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
		              hfsmp->hfs_metazone_end)) {
			const char * filestr;
			char emptystr = '\0';

			if (cp->c_desc.cd_nameptr != NULL) {
				filestr = (const char *)&cp->c_desc.cd_nameptr[0];
			} else if (vnode_name(vp) != NULL) {
				filestr = vnode_name(vp);
			} else {
				filestr = &emptystr;
			}
			retval = ENOSPC;
			goto restore;
		}
	}
	/* Done with system locks and journal for now. */
	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;
	hfs_end_transaction(hfsmp);
	started_tr = 0;

	if (retval) {
		/*
		 * Check to see if failure is due to excessive fragmentation.
		 */
		if ((retval == ENOSPC) &&
		    (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
			hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
		}
		goto out;
	}
	/*
	 * STEP 2 - clone file data into the new allocation blocks.
	 */

	if (vnodetype == VLNK)
		retval = hfs_clonelink(vp, blksize, cred, p);
	else if (vnode_issystem(vp))
		retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
	else
		retval = hfs_clonefile(vp, headblks, datablks, blksize);

	/* Start transaction for step 3 or for a restore. */
	if (hfs_start_transaction(hfsmp) != 0) {
		retval = EINVAL;
		goto out;
	}
	started_tr = 1;
	if (retval)
		goto restore;

	/*
	 * STEP 3 - switch to cloned data and remove old blocks.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

	retval = HeadTruncateFile(hfsmp, (FCB *)fp, headblks);

	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;
	if (retval)
		goto restore;
out:
	if (took_trunc_lock)
		hfs_unlock_truncate(cp, 0);

	if (lockflags) {
		hfs_systemfile_unlock(hfsmp, lockflags);
		lockflags = 0;
	}

	/* Push cnode's new extent data to disk. */
	if (retval == 0) {
		(void) hfs_update(vp, MNT_WAIT);
	}
	if (hfsmp->jnl) {
		if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
			(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
		else
			(void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
	}
exit:
	if (started_tr)
		hfs_end_transaction(hfsmp);

	return (retval);

restore:
	if (fp->ff_blocks == headblks) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, 0);
		goto exit;
	}
	/*
	 * Give back any newly allocated space.
	 */
	if (lockflags == 0) {
		lockflags = SFL_BITMAP;
		if (overflow_extents(fp))
			lockflags |= SFL_EXTENTS;
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
	}

	(void) TruncateFileC(hfsmp, (FCB *)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
			     FTOC(fp)->c_fileid, false);

	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;

	if (took_trunc_lock)
		hfs_unlock_truncate(cp, 0);
	goto exit;
}
static int
hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
{
	struct buf *head_bp = NULL;
	struct buf *tail_bp = NULL;
	int error;

	error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);
	if (error)
		goto out;

	tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
	if (tail_bp == NULL) {
		error = EIO;
		goto out;
	}
	bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
	error = (int)buf_bwrite(tail_bp);
out:
	if (head_bp) {
		buf_markinvalid(head_bp);
		buf_brelse(head_bp);
	}
	(void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);

	return (error);
}
/*
 * Clone a file's data within the file.
 */
static int
hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
{
	caddr_t  bufp;
	size_t   bufsize;
	size_t   copysize;
	size_t   iosize;
	size_t   offset;
	off_t    writebase;
	uio_t    auio;
	int      error = 0;

	writebase = blkstart * blksize;
	copysize = blkcnt * blksize;
	iosize = bufsize = MIN(copysize, 128 * 1024);
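	/*
	 * The clone is driven through the cluster layer in chunks of at most
	 * 128 KB, staged through a temporary kernel buffer; IO_NOCACHE keeps
	 * the copy from polluting the UBC with pages for the old blocks.
	 */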
	offset = 0;

	hfs_unlock(VTOC(vp));

#if CONFIG_PROTECT
	if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) {
		hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
		return (error);
	}
#endif /* CONFIG_PROTECT */

	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
		hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
		return (ENOMEM);
	}

	auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);

	while (offset < copysize) {
		iosize = MIN(copysize - offset, iosize);

		uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, (uintptr_t)bufp, iosize);

		error = cluster_read(vp, auio, copysize, IO_NOCACHE);
		if (error) {
			printf("hfs_clonefile: cluster_read failed - %d\n", error);
			break;
		}
		if (uio_resid(auio) != 0) {
			printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio));
			error = EIO;
			break;
		}

		uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
		uio_addiov(auio, (uintptr_t)bufp, iosize);

		error = cluster_write(vp, auio, writebase + offset,
		                      writebase + offset + iosize,
		                      uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
		if (error) {
			printf("hfs_clonefile: cluster_write failed - %d\n", error);
			break;
		}
		if (uio_resid(auio) != 0) {
			printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
			error = EIO;
			break;
		}
		offset += iosize;
	}
	uio_free(auio);

	if ((blksize & PAGE_MASK)) {
		/*
		 * since the copy may not have started on a PAGE
		 * boundary (or may not have ended on one), we
		 * may have pages left in the cache since NOCACHE
		 * will let partially written pages linger...
		 * lets just flush the entire range to make sure
		 * we don't have any pages left that are beyond
		 * (or intersect) the real LEOF of this file
		 */
		ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
	} else {
		/*
		 * No need to call ubc_sync_range or hfs_invalbuf
		 * since the file was copied using IO_NOCACHE and
		 * the copy was done starting and ending on a page
		 * boundary in the file.
		 */
	}
	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

	hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
	return (error);
}
/*
 * Clone a system (metadata) file.
 */
static int
hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
                 kauth_cred_t cred, struct proc *p)
{
	caddr_t  bufp;
	char    *offset;
	size_t   bufsize;
	size_t   iosize;
	struct buf *bp = NULL;
	daddr64_t  blkno;
	daddr64_t  blk;
	daddr64_t  start_blk;
	daddr64_t  last_blk;
	int  breadcnt;
	int  i;
	int  error = 0;

	iosize = GetLogicalBlockSize(vp);
	bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
	breadcnt = bufsize / iosize;

	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
		return (ENOMEM);
	}
	start_blk = ((daddr64_t)blkstart * blksize) / iosize;
	last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;
	blkno = 0;

	while (blkno < last_blk) {
		/*
		 * Read up to a megabyte
		 */
		offset = bufp;
		for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
			error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
			if (error) {
				printf("hfs_clonesysfile: meta_bread error %d\n", error);
				goto out;
			}
			if (buf_count(bp) != iosize) {
				printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
				goto out;
			}
			bcopy((char *)buf_dataptr(bp), offset, iosize);

			buf_markinvalid(bp);
			buf_brelse(bp);
			bp = NULL;

			offset += iosize;
		}

		/*
		 * Write up to a megabyte
		 */
		offset = bufp;
		for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
			bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
			if (bp == NULL) {
				printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
				error = EIO;
				goto out;
			}
			bcopy(offset, (char *)buf_dataptr(bp), iosize);
			error = (int)buf_bwrite(bp);
			bp = NULL;
			if (error)
				goto out;
			offset += iosize;
		}
	}
out:
	if (bp) {
		buf_brelse(bp);
	}

	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

	error = hfs_fsync(vp, MNT_WAIT, 0, p);

	return (error);
}