/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*	@(#)hfs_readwrite.c	1.0
 *
 *	(c) 1998-2001 Apple Computer, Inc.  All Rights Reserved
 *
 *	hfs_readwrite.c -- vnode operations to deal with reading and writing files.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/vfs_context.h>
#include <sys/fsevents.h>
#include <kern/kalloc.h>
#include <sys/sysctl.h>
#include <sys/fsctl.h>
#include <miscfs/specfs/specdev.h>
#include <sys/ubc_internal.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <sys/kdebug.h>

#include "hfs_attrlist.h"
#include "hfs_endian.h"
#include "hfs_fsctl.h"
#include "hfs_quota.h"
#include "hfscommon/headers/FileMgrInternal.h"
#include "hfscommon/headers/BTreesInternal.h"
#include "hfs_cnode.h"
#define can_cluster(size)  ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
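/*
 * can_cluster() reports whether a fork's logical block size is a multiple of
 * 4K and no larger than half of MAXPHYSIO, i.e. whether it is suitable for
 * clustered I/O; hfs_bmap() below uses it before computing a read-ahead run.
 * For example, the common 4096-byte logical block size passes this test.
 */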
enum {
    MAXHFSFILESIZE = 0x7FFFFFFF		/* this needs to go in the mount structure */
};

/* from bsd/hfs/hfs_vfsops.c */
extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);

static int hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
static int hfs_clonefile(struct vnode *, int, int, int);
static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
static int hfs_minorupdate(struct vnode *vp);
static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);

int flush_cache_on_write = 0;
SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
/*
 * Read data from a file.
 */
int
hfs_vnop_read(struct vnop_read_args *ap)
{
    uio_t uio = ap->a_uio;
    struct vnode *vp = ap->a_vp;
    struct hfsmount *hfsmp;
    off_t start_resid = uio_resid(uio);
    off_t offset = uio_offset(uio);

    /* Preflight checks */
    if (!vnode_isreg(vp)) {
        /* can only read regular files */
    }
    if (start_resid == 0)
        return (0);		/* Nothing left to do */
    if (offset < 0)
        return (EINVAL);	/* cant read from a negative offset */

#if HFS_COMPRESSION
    if (VNODE_IS_RSRC(vp)) {
        if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
        }
        /* otherwise read the resource fork normally */
    } else {
        int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
        if (compressed) {
            retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
            /* successful read, update the access time */
            VTOC(vp)->c_touch_acctime = TRUE;

            /* compressed files are not hot file candidates */
            if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
                VTOF(vp)->ff_bytesread = 0;
            }
        }
        /* otherwise the file was converted back to a regular file while we were reading it */
    }
#endif /* HFS_COMPRESSION */

    /* Protect against a size change. */
    hfs_lock_truncate(cp, 0);

    filesize = fp->ff_size;
    filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
    if (offset > filesize) {
        if ((hfsmp->hfs_flags & HFS_STANDARD) &&
            (offset > (off_t)MAXHFSFILESIZE)) {
        }
    }

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
        (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);

    retval = cluster_read(vp, uio, filesize, ap->a_ioflag);

    cp->c_touch_acctime = TRUE;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
        (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);

    /*
     * Keep track of blocks read.
     */
    if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
        int took_cnode_lock = 0;

        bytesread = start_resid - uio_resid(uio);

        /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
        if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
            hfs_lock(cp, HFS_FORCE_LOCK);
        }
        /*
         * If this file hasn't been seen since the start of
         * the current sampling period then start over.
         */
        if (cp->c_atime < hfsmp->hfc_timebase) {
            fp->ff_bytesread = bytesread;
            cp->c_atime = tv.tv_sec;
        } else {
            fp->ff_bytesread += bytesread;
        }
    }
    hfs_unlock_truncate(cp, 0);
}
/*
 * Write data to a file.
 */
int
hfs_vnop_write(struct vnop_write_args *ap)
{
    uio_t uio = ap->a_uio;
    struct vnode *vp = ap->a_vp;
    struct hfsmount *hfsmp;
    kauth_cred_t cred = NULL;
    off_t bytesToAdd = 0;
    off_t actualBytesAdded;
    int ioflag = ap->a_ioflag;
    int cnode_locked = 0;
    int partialwrite = 0;
    int exclusive_lock = 0;

    if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
        int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
        switch (state) {
            case FILE_IS_COMPRESSED:
            case FILE_IS_CONVERTING:
                /* if FILE_IS_CONVERTING, we allow writes */
                break;
            default:
                printf("invalid state %d for compressed file\n", state);
        }
    }

    // LP64todo - fix this! uio_resid may be 64-bit value
    resid = uio_resid(uio);
    offset = uio_offset(uio);

    if (ioflag & IO_APPEND) {
    }
    if (!vnode_isreg(vp))
        return (EPERM);		/* Can only write regular files */

    eflags = kEFDeferMask;	/* defer file block allocations */
#ifdef HFS_SPARSE_DEV
    /*
     * When the underlying device is sparse and space
     * is low (< 8MB), stop doing delayed allocations
     * and begin doing synchronous I/O.
     */
    if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
        (hfs_freeblks(hfsmp, 0) < 2048)) {
        eflags &= ~kEFDeferMask;
    }
#endif /* HFS_SPARSE_DEV */

    /* Protect against a size change. */
    hfs_lock_truncate(cp, exclusive_lock);

    if (ioflag & IO_APPEND) {
        uio_setoffset(uio, fp->ff_size);
        offset = fp->ff_size;
    }
    if ((cp->c_flags & APPEND) && offset != fp->ff_size) {
    }

    origFileSize = fp->ff_size;
    writelimit = offset + resid;
    filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;

    /* If the truncate lock is shared, and if we either have virtual
     * blocks or will need to extend the file, upgrade the truncate
     * to exclusive lock.  If the upgrade fails, we lose the lock and
     * have to take the exclusive lock again.  Note that we want to
     * grab the truncate lock exclusive even if we're not allocating new blocks
     * because we could still be growing past the LEOF.
     */
    if ((exclusive_lock == 0) &&
        ((fp->ff_unallocblocks != 0) || (writelimit > origFileSize))) {
        /* Lock upgrade failed and we lost our shared lock, try again */
        if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
        }
    }
    if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
    }

    if (!exclusive_lock) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
            (int)offset, uio_resid(uio), (int)fp->ff_size,
            (int)filebytes, 0);
    }

    /* Check if we do not need to extend the file */
    if (writelimit <= filebytes) {
    }

    cred = vfs_context_ucred(ap->a_context);
    bytesToAdd = writelimit - filebytes;

    retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
                       cred, 0);

    if (hfs_start_transaction(hfsmp) != 0) {
    }

    while (writelimit > filebytes) {
        bytesToAdd = writelimit - filebytes;
        if (cred && suser(cred, NULL) != 0)
            eflags |= kEFReserveMask;

        /* Protect extents b-tree and allocation bitmap */
        lockflags = SFL_BITMAP;
        if (overflow_extents(fp))
            lockflags |= SFL_EXTENTS;
        lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

        /* Files that are changing size are not hot file candidates. */
        if (hfsmp->hfc_stage == HFC_RECORDING) {
            fp->ff_bytesread = 0;
        }
        retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
                0, eflags, &actualBytesAdded));

        hfs_systemfile_unlock(hfsmp, lockflags);

        if ((actualBytesAdded == 0) && (retval == E_NONE))
            retval = ENOSPC;
        if (retval != E_NONE)
            break;
        filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
            (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
    }
    (void) hfs_update(vp, TRUE);
    (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
    (void) hfs_end_transaction(hfsmp);

    /*
     * If we didn't grow the file enough try a partial write.
     * POSIX expects this behavior.
     */
    if ((retval == ENOSPC) && (filebytes > offset)) {
        uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
        writelimit = filebytes;
    }
    if (retval == E_NONE) {
        struct rl_entry *invalid_range;

        if (writelimit > fp->ff_size)
            filesize = writelimit;
        else
            filesize = fp->ff_size;

        lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);

        if (offset <= fp->ff_size) {
            zero_off = offset & ~PAGE_MASK_64;

            /* Check whether the area between the zero_offset and the start
               of the transfer is invalid and should be zero-filled
               as part of the transfer:
             */
            if (offset > zero_off) {
                if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
                    lflag |= IO_HEADZEROFILL;
            }
        } else {
            off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;

            /* The bytes between fp->ff_size and uio->uio_offset must never be
               read without being zeroed.  The current last block is filled with zeroes
               if it holds valid data but in all cases merely do a little bookkeeping
               to track the area from the end of the current last page to the start of
               the area actually written.  For the same reason only the bytes up to the
               start of the page where this write will start is invalidated; any remainder
               before uio->uio_offset is explicitly zeroed as part of the cluster_write.

               Note that inval_start, the start of the page after the current EOF,
               may be past the start of the write, in which case the zeroing
               will be handled by the cluster_write of the actual data.
             */
            inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
            inval_end = offset & ~PAGE_MASK_64;
            zero_off = fp->ff_size;

            if ((fp->ff_size & PAGE_MASK_64) &&
                (rl_scan(&fp->ff_invalidranges,
                         eof_page_base,
                         fp->ff_size - 1,
                         &invalid_range) != RL_NOOVERLAP)) {
                /* The page containing the EOF is not valid, so the
                   entire page must be made inaccessible now.  If the write
                   starts on a page beyond the page containing the eof
                   (inval_end > eof_page_base), add the
                   whole page to the range to be invalidated.  Otherwise
                   (i.e. if the write starts on the same page), zero-fill
                   the entire page explicitly now:
                 */
                if (inval_end > eof_page_base) {
                    inval_start = eof_page_base;
                } else {
                    zero_off = eof_page_base;
                }
            }

            if (inval_start < inval_end) {
                /* There's some range of data that's going to be marked invalid */

                if (zero_off < inval_start) {
                    /* The pages between inval_start and inval_end are going to be invalidated,
                       and the actual write will start on a page past inval_end.  Now's the last
                       chance to zero-fill the page containing the EOF:
                     */
                    retval = cluster_write(vp, (uio_t) 0,
                            fp->ff_size, inval_start,
                            lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
                    hfs_lock(cp, HFS_FORCE_LOCK);
                    if (retval) goto ioerr_exit;
                    offset = uio_offset(uio);
                }

                /* Mark the remaining area of the newly allocated space as invalid: */
                rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
                cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
                zero_off = fp->ff_size = inval_end;
            }

            if (offset > zero_off) lflag |= IO_HEADZEROFILL;
        }

        /* Check to see whether the area between the end of the write and the end of
           the page it falls in is invalid and should be zero-filled as part of the transfer:
         */
        tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
        if (tail_off > filesize) tail_off = filesize;
        if (tail_off > writelimit) {
            if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
                lflag |= IO_TAILZEROFILL;
            }
        }

        /*
         * if the write starts beyond the current EOF (possibly advanced in the
         * zeroing of the last block, above), then we'll zero fill from the current EOF
         * to where the write begins:
         *
         * NOTE: If (and ONLY if) the portion of the file about to be written is
         * before the current EOF it might be marked as invalid now and must be
         * made readable (removed from the invalid ranges) before cluster_write
         * tries to write it.
         */
        io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
        if (io_start < fp->ff_size) {
            io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
            rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
        }

        /*
         * We need to tell UBC the fork's new size BEFORE calling
         * cluster_write, in case any of the new pages need to be
         * paged out before cluster_write completes (which does happen
         * in embedded systems due to extreme memory pressure).
         * Similarly, we need to tell hfs_vnop_pageout what the new EOF
         * will be, so that it can pass that on to cluster_pageout, and
         * allow those pageouts.
         *
         * We don't update ff_size yet since we don't want pageins to
         * be able to see uninitialized data between the old and new
         * EOF, until cluster_write has completed and initialized that
         * part of the file.
         *
         * The vnode pager relies on the file size last given to UBC via
         * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
         * ff_size (whichever is larger).  NOTE: ff_new_size is always
         * zero, unless we are extending the file via write.
         */
        if (filesize > fp->ff_size) {
            fp->ff_new_size = filesize;
            ubc_setsize(vp, filesize);
        }
        retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
                tail_off, lflag | IO_NOZERODIRTY);
        if (retval) {
            fp->ff_new_size = 0;	/* no longer extending; use ff_size */
            if (filesize > origFileSize) {
                ubc_setsize(vp, origFileSize);
            }
        }
        if (filesize > origFileSize) {
            fp->ff_size = filesize;

            /* Files that are changing size are not hot file candidates. */
            if (hfsmp->hfc_stage == HFC_RECORDING) {
                fp->ff_bytesread = 0;
            }
        }
        fp->ff_new_size = 0;	/* ff_size now has the correct size */

        /* If we wrote some bytes, then touch the change and mod times */
        if (resid > uio_resid(uio)) {
            cp->c_touch_chgtime = TRUE;
            cp->c_touch_modtime = TRUE;
        }
    }
    if (partialwrite) {
        uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
    }

    // XXXdbg - see radar 4871353 for more info
    if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
        VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
    }

ioerr_exit:
    /*
     * If we successfully wrote any data, and we are not the superuser
     * we clear the setuid and setgid bits as a precaution against
     * tampering.
     */
    if (cp->c_mode & (S_ISUID | S_ISGID)) {
        cred = vfs_context_ucred(ap->a_context);
        if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
            hfs_lock(cp, HFS_FORCE_LOCK);
            cp->c_mode &= ~(S_ISUID | S_ISGID);
        }
    }
    if (ioflag & IO_UNIT) {
        hfs_lock(cp, HFS_FORCE_LOCK);
        (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
                           0, 0, ap->a_context);
        // LP64todo - fix this!  resid needs to be user_ssize_t
        uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
        uio_setresid(uio, resid);
        filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
    } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
        hfs_lock(cp, HFS_FORCE_LOCK);
        retval = hfs_update(vp, TRUE);
    }
    /* Updating vcbWrCnt doesn't need to be atomic. */

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
        (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);

    hfs_unlock_truncate(cp, exclusive_lock);
}
/* support for the "bulk-access" fcntl */

#define CACHE_LEVELS 16
#define NUM_CACHE_ENTRIES (64*16)
#define PARENT_IDS_FLAG 0x100
struct access_cache {
    int numcached;
    int cachehits;		/* these two for statistics gathering */
    int lookups;
    unsigned int *acache;
    unsigned char *haveaccess;
};

struct access_t {
    uid_t	uid;			/* IN: effective user id */
    short	flags;			/* IN: access requested (i.e. R_OK) */
    short	num_groups;		/* IN: number of groups user belongs to */
    int		num_files;		/* IN: number of files to process */
    int		*file_ids;		/* IN: array of file ids */
    gid_t	*groups;		/* IN: array of groups */
    short	*access;		/* OUT: access info for each file (0 for 'has access') */
} __attribute__((unavailable)); // this structure is for reference purposes only

struct user32_access_t {
    uid_t		uid;		/* IN: effective user id */
    short		flags;		/* IN: access requested (i.e. R_OK) */
    short		num_groups;	/* IN: number of groups user belongs to */
    int			num_files;	/* IN: number of files to process */
    user32_addr_t	file_ids;	/* IN: array of file ids */
    user32_addr_t	groups;		/* IN: array of groups */
    user32_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
};

struct user64_access_t {
    uid_t		uid;		/* IN: effective user id */
    short		flags;		/* IN: access requested (i.e. R_OK) */
    short		num_groups;	/* IN: number of groups user belongs to */
    int			num_files;	/* IN: number of files to process */
    user64_addr_t	file_ids;	/* IN: array of file ids */
    user64_addr_t	groups;		/* IN: array of groups */
    user64_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
};

// these are the "extended" versions of the above structures
// note that it is crucial that they be sized differently than
// the regular versions
struct ext_access_t {
    uint32_t	flags;			/* IN: access requested (i.e. R_OK) */
    uint32_t	num_files;		/* IN: number of files to process */
    uint32_t	map_size;		/* IN: size of the bit map */
    uint32_t	*file_ids;		/* IN: Array of file ids */
    char	*bitmap;		/* OUT: hash-bitmap of interesting directory ids */
    short	*access;		/* OUT: access info for each file (0 for 'has access') */
    uint32_t	num_parents;		/* future use */
    cnid_t	*parents;		/* future use */
} __attribute__((unavailable)); // this structure is for reference purposes only

struct user32_ext_access_t {
    uint32_t		flags;		/* IN: access requested (i.e. R_OK) */
    uint32_t		num_files;	/* IN: number of files to process */
    uint32_t		map_size;	/* IN: size of the bit map */
    user32_addr_t	file_ids;	/* IN: Array of file ids */
    user32_addr_t	bitmap;		/* OUT: hash-bitmap of interesting directory ids */
    user32_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
    uint32_t		num_parents;	/* future use */
    user32_addr_t	parents;	/* future use */
};

struct user64_ext_access_t {
    uint32_t		flags;		/* IN: access requested (i.e. R_OK) */
    uint32_t		num_files;	/* IN: number of files to process */
    uint32_t		map_size;	/* IN: size of the bit map */
    user64_addr_t	file_ids;	/* IN: array of file ids */
    user64_addr_t	bitmap;		/* OUT: hash-bitmap of interesting directory ids */
    user64_addr_t	access;		/* OUT: access info for each file (0 for 'has access') */
    uint32_t		num_parents;	/* future use */
    user64_addr_t	parents;	/* future use */
};
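/*
 * Userspace reaches this interface through fsctl(2) with the
 * HFS_EXT_BULKACCESS_FSCTL (or legacy HFS_BULKACCESS_FSCTL) selector and one
 * of the structures above.  A minimal sketch, assuming a userspace-visible
 * equivalent of ext_access_t and two hypothetical file IDs:
 *
 *	struct ext_access_t args = { 0 };
 *	uint32_t ids[2] = { id_a, id_b };
 *	short results[2];
 *	args.flags     = R_OK;
 *	args.num_files = 2;
 *	args.file_ids  = ids;
 *	args.access    = results;
 *	fsctl("/Volumes/SomeHFSVolume", HFS_EXT_BULKACCESS_FSCTL, &args, 0);
 *	// results[i] == 0 means "has access" for ids[i]
 */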
/*
 * Perform a binary search for the given parent_id. Return value is
 * the index if there is a match.  If no_match_indexp is non-NULL it
 * will be assigned with the index to insert the item (even if it was
 * not found).
 */
static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
{
        unsigned int mid = ((hi - lo)/2) + lo;
        unsigned int this_id = array[mid];

        if (parent_id == this_id) {
        }
        if (parent_id < this_id) {
        }
        if (parent_id > this_id) {
        }

    /* check if lo and hi converged on the match */
    if (parent_id == array[hi]) {
    }
    if (no_match_indexp) {
        *no_match_indexp = hi;
    }
static int
lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
{
    int index, no_match_index;

    if (cache->numcached == 0) {
        return 0; // table is empty, so insert at index=0 and report no match
    }

    if (cache->numcached > NUM_CACHE_ENTRIES) {
        /*printf("hfs: EGAD! numcached is %d... cut our losses and trim to %d\n",
          cache->numcached, NUM_CACHE_ENTRIES);*/
        cache->numcached = NUM_CACHE_ENTRIES;
    }

    hi = cache->numcached - 1;

    index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);

    /* if no existing entry found, find index for new one */
    index = no_match_index;
/*
 * Add a node to the access_cache at the given index (or do a lookup first
 * to find the index if -1 is passed in). We currently do a replace rather
 * than an insert if the cache is full.
 */
static void
add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
{
    int lookup_index = -1;

    /* need to do a lookup first if -1 passed for index */
    if (index == -1) {
        if (lookup_bucket(cache, &lookup_index, nodeID)) {
            if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
                // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
                cache->haveaccess[lookup_index] = access;
            }
            /* mission accomplished */
        } else {
            index = lookup_index;
        }
    }

    /* if the cache is full, do a replace rather than an insert */
    if (cache->numcached >= NUM_CACHE_ENTRIES) {
        //printf("hfs: cache is full (%d). replace at index %d\n", cache->numcached, index);
        cache->numcached = NUM_CACHE_ENTRIES-1;

        if (index > cache->numcached) {
            // printf("hfs: index %d pinned to %d\n", index, cache->numcached);
            index = cache->numcached;
        }
    }

    if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
    }

    if (index >= 0 && index < cache->numcached) {
        /* only do bcopy if we're inserting */
        bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
        bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
    }

    cache->acache[index] = nodeID;
    cache->haveaccess[index] = access;
}
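/*
 * Typical use of the pair above, as a sketch: callers first probe with
 * lookup_bucket() and, on a miss, record the verdict for a directory with
 * add_node(cache, -1, dir_id, err) so later files under the same parent can
 * reuse the cached answer -- see do_access_check() below.
 */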
static int
snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
{
    struct cinfo *cip = (struct cinfo *)arg;

    cip->uid = attrp->ca_uid;
    cip->gid = attrp->ca_gid;
    cip->mode = attrp->ca_mode;
    cip->parentcnid = descp->cd_parentcnid;
    cip->recflags = attrp->ca_recflags;
/*
 * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
 * isn't in-core, then go to the catalog.
 */
static int
do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
    struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
{
    /* if this id matches the one the fsctl was called with, skip the lookup */
    if (cnid == skip_cp->c_cnid) {
        cnattrp->ca_uid = skip_cp->c_uid;
        cnattrp->ca_gid = skip_cp->c_gid;
        cnattrp->ca_mode = skip_cp->c_mode;
        cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
        keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
    } else {
        /* otherwise, check the cnode hash in case the file/dir is in-core */
        if (hfs_chash_snoop(hfsmp, cnid, snoop_callback, &c_info) == 0) {
            cnattrp->ca_uid = c_info.uid;
            cnattrp->ca_gid = c_info.gid;
            cnattrp->ca_mode = c_info.mode;
            cnattrp->ca_recflags = c_info.recflags;
            keyp->hfsPlus.parentID = c_info.parentcnid;
        } else {
            lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);

            /* lookup this cnid in the catalog */
            error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);

            hfs_systemfile_unlock(hfsmp, lockflags);
        }
    }
/*
 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
 * up to CACHE_LEVELS as we progress towards the root.
 */
static int
do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
    struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
    struct vfs_context *my_context,
    char *bitmap,
    uint32_t map_size,
    cnid_t *parents,
    uint32_t num_parents)
{
    HFSCatalogNodeID thisNodeID;
    unsigned int myPerms;
    struct cat_attr cnattr;
    int cache_index = -1, scope_index = -1, scope_idx_start = -1;
    int i = 0, ids_to_cache = 0;
    int parent_ids[CACHE_LEVELS];

    while (thisNodeID >= kRootDirID) {
        myResult = 0;	/* default to "no access" */

        /* check the cache before resorting to hitting the catalog */

        /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
         * to look any further after hitting cached dir */

        if (lookup_bucket(cache, &cache_index, thisNodeID)) {
            myErr = cache->haveaccess[cache_index];
            if (scope_index != -1) {
                if (myErr == ESRCH) {
                }
            } else {
                scope_index = 0;	// so we'll just use the cache result
                scope_idx_start = ids_to_cache;
            }
            myResult = (myErr == 0) ? 1 : 0;
            goto ExitThisRoutine;
        }

        if (parents) {
            tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
            if (scope_index == -1)
                scope_index = tmp;
            if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
                scope_idx_start = ids_to_cache;
            }
        }

        /* remember which parents we want to cache */
        if (ids_to_cache < CACHE_LEVELS) {
            parent_ids[ids_to_cache] = thisNodeID;
        }
        // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
        if (bitmap && map_size) {
            bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
        }

        /* do the lookup (checks the cnode hash, then the catalog) */
        myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
        if (myErr) {
            goto ExitThisRoutine;	/* no access */
        }

        /* Root always gets access. */
        if (suser(myp_ucred, NULL) == 0) {
            thisNodeID = catkey.hfsPlus.parentID;
            myResult = 1;
            continue;
        }

        // if the thing has acl's, do the full permission check
        if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
            /* get the vnode for this cnid */
            myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0);
            if (myErr) {
                goto ExitThisRoutine;
            }

            thisNodeID = VTOC(vp)->c_parentcnid;

            hfs_unlock(VTOC(vp));

            if (vnode_vtype(vp) == VDIR) {
                myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
            } else {
                myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
            }
            if (myErr) {
                goto ExitThisRoutine;
            }
        } else {
            myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
                    cnattr.ca_mode, hfsmp->hfs_mp,
                    myp_ucred, theProcPtr);

            if (cnattr.ca_mode & S_IFDIR) {
                flags = R_OK | X_OK;
            }
            if ( (myPerms & flags) != flags) {
                goto ExitThisRoutine;	/* no access */
            }

            /* up the hierarchy we go */
            thisNodeID = catkey.hfsPlus.parentID;
        }
    }

    /* if here, we have access to this node */
    myResult = 1;

ExitThisRoutine:
    if (parents && myErr == 0 && scope_index == -1) {
    }

    /* cache the parent directory(ies) */
    for (i = 0; i < ids_to_cache; i++) {
        if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
            add_node(cache, -1, parent_ids[i], ESRCH);
        } else {
            add_node(cache, -1, parent_ids[i], myErr);
        }
    }
static int
do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
    struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
{
    /*
     * NOTE: on entry, the vnode is locked. In case this vnode
     * happens to be in our list of file_ids, we'll note it and
     * avoid calling hfs_chashget_nowait() on that id as that
     * will cause a "locking against myself" panic.
     */
    Boolean check_leaf = true;

    struct user64_ext_access_t *user_access_structp;
    struct user64_ext_access_t tmp_user_access;
    struct access_cache cache;

    int error = 0, prev_parent_check_ok=1;
    unsigned int num_files = 0;
    int num_parents = 0;
    cnid_t *parents=NULL;
    cnid_t prevParent_cnid = 0;
    unsigned int myPerms;
    struct cat_attr cnattr;
    struct cnode *skip_cp = VTOC(vp);
    kauth_cred_t cred = vfs_context_ucred(context);
    proc_t p = vfs_context_proc(context);

    is64bit = proc_is64bit(p);

    /* initialize the local cache and buffers */
    cache.numcached = 0;
    cache.cachehits = 0;
    cache.acache = NULL;
    cache.haveaccess = NULL;

    /* struct copyin done during dispatch... need to copy file_id array separately */
    if (ap->a_data == NULL) {
        goto err_exit_bulk_access;
    }

    if (is64bit) {
        if (arg_size != sizeof(struct user64_ext_access_t)) {
            goto err_exit_bulk_access;
        }

        user_access_structp = (struct user64_ext_access_t *)ap->a_data;

    } else if (arg_size == sizeof(struct user32_access_t)) {
        struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;

        // convert an old style bulk-access struct to the new style
        tmp_user_access.flags = accessp->flags;
        tmp_user_access.num_files = accessp->num_files;
        tmp_user_access.map_size = 0;
        tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
        tmp_user_access.bitmap = USER_ADDR_NULL;
        tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
        tmp_user_access.num_parents = 0;
        user_access_structp = &tmp_user_access;

    } else if (arg_size == sizeof(struct user32_ext_access_t)) {
        struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;

        // up-cast from a 32-bit version of the struct
        tmp_user_access.flags = accessp->flags;
        tmp_user_access.num_files = accessp->num_files;
        tmp_user_access.map_size = accessp->map_size;
        tmp_user_access.num_parents = accessp->num_parents;

        tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
        tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
        tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
        tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);

        user_access_structp = &tmp_user_access;
    } else {
        goto err_exit_bulk_access;
    }

    map_size = user_access_structp->map_size;
    num_files = user_access_structp->num_files;
    num_parents= user_access_structp->num_parents;

    if (num_files < 1) {
        goto err_exit_bulk_access;
    }
    if (num_files > 1024) {
        goto err_exit_bulk_access;
    }
    if (num_parents > 1024) {
        goto err_exit_bulk_access;
    }

    file_ids = (int *) kalloc(sizeof(int) * num_files);
    access = (short *) kalloc(sizeof(short) * num_files);
    bitmap = (char *) kalloc(sizeof(char) * map_size);
    parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);

    cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
    cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);

    if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
        kfree(file_ids, sizeof(int) * num_files);
        kfree(bitmap, sizeof(char) * map_size);
        kfree(access, sizeof(short) * num_files);
        kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
        if (cache.haveaccess) {
            kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
        }
        kfree(parents, sizeof(cnid_t) * num_parents);
    }

    // make sure the bitmap is zero'ed out...
    bzero(bitmap, (sizeof(char) * map_size));

    if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
                num_files * sizeof(int)))) {
        goto err_exit_bulk_access;
    }
    if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
                num_parents * sizeof(cnid_t)))) {
        goto err_exit_bulk_access;
    }

    flags = user_access_structp->flags;
    if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
    }

    /* check if we've been passed leaf node ids or parent ids */
    if (flags & PARENT_IDS_FLAG) {
    }

    /* Check access to each file_id passed in */
    for (i = 0; i < num_files; i++) {
        cnid = (cnid_t) file_ids[i];

        /* root always has access */
        if ((!parents) && (!suser(cred, NULL))) {
        }

        /* do the lookup (checks the cnode hash, then the catalog) */
        error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
        if (error) {
            access[i] = (short) error;
        }

        if (parents) {
            // Check if the leaf matches one of the parent scopes
            leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
            if (leaf_index >= 0 && parents[leaf_index] == cnid)
                prev_parent_check_ok = 0;
            else if (leaf_index >= 0)
                prev_parent_check_ok = 1;
        }

        // if the thing has acl's, do the full permission check
        if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
            /* get the vnode for this cnid */
            myErr = hfs_vget(hfsmp, cnid, &cvp, 0);

            hfs_unlock(VTOC(cvp));

            if (vnode_vtype(cvp) == VDIR) {
                myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
            } else {
                myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
            }
        } else {
            /* before calling CheckAccess(), check the target file for read access */
            myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
                    cnattr.ca_mode, hfsmp->hfs_mp, cred, p);

            /* fail fast if no access */
            if ((myPerms & flags) == 0) {
            }
        }

        /* we were passed an array of parent ids */
        catkey.hfsPlus.parentID = cnid;

        /* if the last guy had the same parent and had access, we're done */
        if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
        }

        myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
                skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);

        if (myaccess || (error == ESRCH && leaf_index != -1)) {
            access[i] = 0; // have access.. no errors to report
        } else {
            access[i] = (error != 0 ? (short) error : EACCES);
        }

        prevParent_cnid = catkey.hfsPlus.parentID;
    }

    /* copyout the access array */
    if ((error = copyout((caddr_t)access, user_access_structp->access,
                num_files * sizeof (short)))) {
        goto err_exit_bulk_access;
    }
    if (map_size && bitmap) {
        if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
                    map_size * sizeof (char)))) {
            goto err_exit_bulk_access;
        }
    }

err_exit_bulk_access:

    //printf("hfs:  on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);

    kfree(file_ids, sizeof(int) * num_files);
    kfree(parents, sizeof(cnid_t) * num_parents);
    kfree(bitmap, sizeof(char) * map_size);
    kfree(access, sizeof(short) * num_files);
    kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
    if (cache.haveaccess)
        kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);


/* end "bulk-access" support */
/*
 * Callback for use with freeze ioctl.
 */
static int
hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
{
    vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
}
/*
 * Control filesystem operating characteristics.
 */
int
hfs_vnop_ioctl( struct vnop_ioctl_args /* {
        vfs_context_t a_context;
    } */ *ap)
{
    struct vnode * vp = ap->a_vp;
    struct hfsmount *hfsmp = VTOHFS(vp);
    vfs_context_t context = ap->a_context;
    kauth_cred_t cred = vfs_context_ucred(context);
    proc_t p = vfs_context_proc(context);
    struct vfsstatfs *vfsp;
    off_t jnl_start, jnl_size;
    struct hfs_journal_info *jip;
#if HFS_COMPRESSION
    off_t uncompressed_size = -1;
    int decmpfs_error = 0;

    if (ap->a_command == F_RDADVISE) {
        /* we need to inspect the decmpfs state of the file as early as possible */
        compressed = hfs_file_is_compressed(VTOC(vp), 0);
        if (VNODE_IS_RSRC(vp)) {
            /* if this is the resource fork, treat it as if it were empty */
            uncompressed_size = 0;
        } else {
            decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
            if (decmpfs_error != 0) {
                /* failed to get the uncompressed size, we'll check for this later */
                uncompressed_size = -1;
            }
        }
    }
#endif /* HFS_COMPRESSION */

    is64bit = proc_is64bit(p);

    switch (ap->a_command) {

    case HFS_GETPATH: {
        struct vnode *file_vp;

        /* Caller must be owner of file system. */
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
        }
        /* Target vnode must be file system's root. */
        if (!vnode_isvroot(vp)) {
        }
        bufptr = (char *)ap->a_data;
        cnid = strtoul(bufptr, NULL, 10);

        /* We need to call hfs_vfs_vget to leverage the code that will
         * fix the origin list for us if needed, as opposed to calling
         * hfs_vget, since we will need the parent for build_path call.
         */
        if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
        }
        error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context);
    }

    case HFS_PREV_LINK:
    case HFS_NEXT_LINK: {
        /* Caller must be owner of file system. */
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
        }
        /* Target vnode must be file system's root. */
        if (!vnode_isvroot(vp)) {
        }
        linkfileid = *(cnid_t *)ap->a_data;
        if (linkfileid < kHFSFirstUserCatalogNodeID) {
        }
        if ((error = hfs_lookuplink(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
        }
        if (ap->a_command == HFS_NEXT_LINK) {
            *(cnid_t *)ap->a_data = nextlinkid;
        } else {
            *(cnid_t *)ap->a_data = prevlinkid;
        }
    }

    case HFS_RESIZE_PROGRESS: {

        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
            return (EACCES); /* must be owner of file system */
        }
        if (!vnode_isvroot(vp)) {
        }
        /* file system must not be mounted read-only */
        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }

        return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
    }

    case HFS_RESIZE_VOLUME: {

        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
            return (EACCES); /* must be owner of file system */
        }
        if (!vnode_isvroot(vp)) {
        }

        /* filesystem must not be mounted read only */
        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }
        newsize = *(u_int64_t *)ap->a_data;
        cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;

        if (newsize > cursize) {
            return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
        } else if (newsize < cursize) {
            return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
        }
    }

    case HFS_CHANGE_NEXT_ALLOCATION: {
        int error = 0;		/* Assume success */

        if (vnode_vfsisrdonly(vp)) {
        }
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
            return (EACCES); /* must be owner of file system */
        }
        if (!vnode_isvroot(vp)) {
        }
        HFS_MOUNT_LOCK(hfsmp, TRUE);
        location = *(u_int32_t *)ap->a_data;
        if ((location >= hfsmp->allocLimit) &&
            (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
            goto fail_change_next_allocation;
        }
        /* Return previous value. */
        *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
        if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
            /* On magic value for location, set nextAllocation to next block
             * after metadata zone and set flag in mount structure to indicate
             * that nextAllocation should not be updated again.
             */
            if (hfsmp->hfs_metazone_end != 0) {
                HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
            }
            hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
        } else {
            hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
            HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
        }
        MarkVCBDirty(hfsmp);
fail_change_next_allocation:
        HFS_MOUNT_UNLOCK(hfsmp, TRUE);
    }

#ifdef HFS_SPARSE_DEV
    case HFS_SETBACKINGSTOREINFO: {
        struct vnode * bsfs_rootvp;
        struct vnode * di_vp;
        struct hfs_backingstoreinfo *bsdata;

        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }
        if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
        }
        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
            return (EACCES); /* must be owner of file system */
        }
        bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
        if (bsdata == NULL) {
        }
        if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
        }
        if ((error = vnode_getwithref(di_vp))) {
            file_drop(bsdata->backingfd);
        }

        if (vnode_mount(vp) == vnode_mount(di_vp)) {
            (void)vnode_put(di_vp);
            file_drop(bsdata->backingfd);
        }

        /*
         * Obtain the backing fs root vnode and keep a reference
         * on it.  This reference will be dropped in hfs_unmount.
         */
        error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
        if (error) {
            (void)vnode_put(di_vp);
            file_drop(bsdata->backingfd);
        }
        vnode_ref(bsfs_rootvp);
        vnode_put(bsfs_rootvp);

        hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
        hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
        hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
        hfsmp->hfs_sparsebandblks *= 4;

        vfs_markdependency(hfsmp->hfs_mp);

        /*
         * If the sparse image is on a sparse image file (as opposed to a sparse
         * bundle), then we may need to limit the free space to the maximum size
         * of a file on that volume. So we query (using pathconf), and if we get
         * a meaningful result, we cache the number of blocks for later use in
         * hfs_freeblks().
         */
        hfsmp->hfs_backingfs_maxblocks = 0;
        if (vnode_vtype(di_vp) == VREG) {
            terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
            if (terr == 0 && hostbits != 0 && hostbits < 64) {
                u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;

                hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
            }
        }

        (void)vnode_put(di_vp);
        file_drop(bsdata->backingfd);
    }

    case HFS_CLRBACKINGSTOREINFO: {
        struct vnode * tmpvp;

        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        if (suser(cred, NULL) &&
            kauth_cred_getuid(cred) != vfsp->f_owner) {
            return (EACCES); /* must be owner of file system */
        }
        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }

        if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
            hfsmp->hfs_backingfs_rootvp) {

            hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
            tmpvp = hfsmp->hfs_backingfs_rootvp;
            hfsmp->hfs_backingfs_rootvp = NULLVP;
            hfsmp->hfs_sparsebandblks = 0;
        }
    }
#endif /* HFS_SPARSE_DEV */

    case F_FREEZE_FS: {
        mp = vnode_mount(vp);
        hfsmp = VFSTOHFS(mp);

        vfsp = vfs_statfs(mp);

        if (kauth_cred_getuid(cred) != vfsp->f_owner &&
            !kauth_cred_issuser(cred))
            return (EACCES);

        lck_rw_lock_exclusive(&hfsmp->hfs_insync);

        // flush things before we get started to try and prevent
        // dirty data from being paged out while we're frozen.
        // note: can't do this after taking the lock as it will
        // deadlock against ourselves.
        vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
        hfs_global_exclusive_lock_acquire(hfsmp);

        // DO NOT call hfs_journal_flush() because that takes a
        // shared lock on the global exclusive lock!
        journal_flush(hfsmp->jnl);

        // don't need to iterate on all vnodes, we just need to
        // wait for writes to the system files and the device vnode
        if (HFSTOVCB(hfsmp)->extentsRefNum)
            vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
        if (HFSTOVCB(hfsmp)->catalogRefNum)
            vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
        if (HFSTOVCB(hfsmp)->allocationsRefNum)
            vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
        if (hfsmp->hfs_attribute_vp)
            vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
        vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");

        hfsmp->hfs_freezing_proc = current_proc();
    }

    case F_THAW_FS: {
        vfsp = vfs_statfs(vnode_mount(vp));
        if (kauth_cred_getuid(cred) != vfsp->f_owner &&
            !kauth_cred_issuser(cred))
            return (EACCES);

        // if we're not the one who froze the fs then we
        // can't thaw it.
        if (hfsmp->hfs_freezing_proc != current_proc()) {
        }

        // NOTE: if you add code here, also go check the
        // code that "thaws" the fs in hfs_vnop_close()

        hfsmp->hfs_freezing_proc = NULL;
        hfs_global_exclusive_lock_release(hfsmp);
        lck_rw_unlock_exclusive(&hfsmp->hfs_insync);
    }

    case HFS_BULKACCESS_FSCTL: {
        if (hfsmp->hfs_flags & HFS_STANDARD) {
        }
        if (is64bit) {
            size = sizeof(struct user64_access_t);
        } else {
            size = sizeof(struct user32_access_t);
        }

        return do_bulk_access_check(hfsmp, vp, ap, size, context);
    }

    case HFS_EXT_BULKACCESS_FSCTL: {
        if (hfsmp->hfs_flags & HFS_STANDARD) {
        }
        if (is64bit) {
            size = sizeof(struct user64_ext_access_t);
        } else {
            size = sizeof(struct user32_ext_access_t);
        }

        return do_bulk_access_check(hfsmp, vp, ap, size, context);
    }

    case HFS_SETACLSTATE: {
        if (ap->a_data == NULL) {
        }

        vfsp = vfs_statfs(HFSTOVFS(hfsmp));
        state = *(int *)ap->a_data;

        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }
        // super-user can enable or disable acl's on a volume.
        // the volume owner can only enable acl's
        if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) {
        }
        if (state == 0 || state == 1)
            return hfs_set_volxattr(hfsmp, HFS_SETACLSTATE, state);
    }

    case HFS_SET_XATTREXTENTS_STATE: {
        if (ap->a_data == NULL) {
        }

        state = *(int *)ap->a_data;

        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }

        /* Super-user can enable or disable extent-based extended
         * attribute support on a volume
         */
        if (state == 0 || state == 1)
            return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
    }

    case F_FULLFSYNC: {
        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }
        error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
        error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
        hfs_unlock(VTOC(vp));
    }

    case F_CHKCLEAN: {
        register struct cnode *cp;

        if (!vnode_isreg(vp))
            return EINVAL;

        error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
        /*
         * used by regression test to determine if
         * all the dirty pages (via write) have been cleaned
         * after a call to 'fsync'.
         */
        error = is_file_clean(vp, VTOF(vp)->ff_size);
    }

    case F_RDADVISE: {
        register struct radvisory *ra;
        struct filefork *fp;

        if (!vnode_isreg(vp))
            return EINVAL;

        ra = (struct radvisory *)(ap->a_data);

        /* Protect against a size change. */
        hfs_lock_truncate(VTOC(vp), TRUE);

#if HFS_COMPRESSION
        if (compressed && (uncompressed_size == -1)) {
            /* fetching the uncompressed size failed above, so return the error */
            error = decmpfs_error;
        } else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
                   (!compressed && (ra->ra_offset >= fp->ff_size))) {
        }
#else /* HFS_COMPRESSION */
        if (ra->ra_offset >= fp->ff_size) {
        }
#endif /* HFS_COMPRESSION */

        error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);

        hfs_unlock_truncate(VTOC(vp), TRUE);
    }

    case F_READBOOTSTRAP:
    case F_WRITEBOOTSTRAP:
    {
        struct vnode *devvp = NULL;
        user_fbootstraptransfer_t *user_bootstrapp;
        daddr64_t blockNumber;
        u_int32_t blockOffset;
        user_fbootstraptransfer_t user_bootstrap;

        if (!vnode_isvroot(vp))
            return (EINVAL);
        /* LP64 - when caller is a 64 bit process then we are passed a pointer
         * to a user_fbootstraptransfer_t else we get a pointer to a
         * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t
         */
        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }
        if (is64bit) {
            user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data;
        } else {
            user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data;
            user_bootstrapp = &user_bootstrap;
            user_bootstrap.fbt_offset = bootstrapp->fbt_offset;
            user_bootstrap.fbt_length = bootstrapp->fbt_length;
            user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer);
        }

        if ((user_bootstrapp->fbt_offset < 0) || (user_bootstrapp->fbt_offset > 1024) ||
            (user_bootstrapp->fbt_length > 1024)) {
        }

        if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024)
            return EINVAL;

        devvp = VTOHFS(vp)->hfs_devvp;
        auio = uio_create(1, user_bootstrapp->fbt_offset,
                          is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32,
                          (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ);
        uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length);

        devBlockSize = vfs_devblocksize(vnode_mount(vp));

        while (uio_resid(auio) > 0) {
            blockNumber = uio_offset(auio) / devBlockSize;
            error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp);
            if (error) {
                if (bp) buf_brelse(bp);
            }

            blockOffset = uio_offset(auio) % devBlockSize;
            xfersize = devBlockSize - blockOffset;
            error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio);

            if (uio_rw(auio) == UIO_WRITE) {
                error = VNOP_BWRITE(bp);
            }
        }
    }

    case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
    {
        if (is64bit) {
            *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
        } else {
            *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
        }
    }

    case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
        *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
        break;

    case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
        *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
        break;

    case HFS_FSCTL_SET_VERY_LOW_DISK:
        if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
        }
        hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
        break;

    case HFS_FSCTL_SET_LOW_DISK:
        if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
            || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
        }
        hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
        break;

    case HFS_FSCTL_SET_DESIRED_DISK:
        if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
        }
        hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
        break;

    case HFS_VOLUME_STATUS:
        *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
        break;

    case HFS_SET_BOOT_INFO:
        if (!vnode_isvroot(vp))
            return (EINVAL);
        if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
            return(EACCES);	/* must be superuser or owner of filesystem */
        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }
        HFS_MOUNT_LOCK(hfsmp, TRUE);
        bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
        HFS_MOUNT_UNLOCK(hfsmp, TRUE);
        (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
        break;

    case HFS_GET_BOOT_INFO:
        if (!vnode_isvroot(vp))
            return (EINVAL);
        HFS_MOUNT_LOCK(hfsmp, TRUE);
        bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
        HFS_MOUNT_UNLOCK(hfsmp, TRUE);
        break;

    case HFS_MARK_BOOT_CORRUPT:
        /* Mark the boot volume corrupt by setting
         * kHFSVolumeInconsistentBit in the volume header.  This will
         * force fsck_hfs on next mount.
         */

        /* Allowed only on the root vnode of the boot volume */
        if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
            !vnode_isvroot(vp)) {
        }
        if (hfsmp->hfs_flags & HFS_READ_ONLY) {
        }
        printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
        hfs_mark_volume_inconsistent(hfsmp);
        break;

    case HFS_FSCTL_GET_JOURNAL_INFO:
        jip = (struct hfs_journal_info*)ap->a_data;

        if (hfsmp->jnl == NULL) {
        }
        jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
        jnl_size = (off_t)hfsmp->jnl_size;

        jip->jstart = jnl_start;
        jip->jsize = jnl_size;
        break;

    case HFS_SET_ALWAYS_ZEROFILL: {
        struct cnode *cp = VTOC(vp);

        if (*(int *)ap->a_data) {
            cp->c_flag |= C_ALWAYS_ZEROFILL;
        } else {
            cp->c_flag &= ~C_ALWAYS_ZEROFILL;
        }
        break;
    }

    default:
        return (ENOTTY);
    }
}
int
hfs_vnop_select(__unused struct vnop_select_args *ap)
/*
    struct vnop_select_args {
        vfs_context_t a_context;
    };
*/
{
    /*
     * We should really check to see if I/O is possible.
     */
    return (1);
}
/*
 * Converts a logical block number to a physical block, and optionally returns
 * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
 * The physical block number is based on the device block size, currently it's 512.
 * The block run is returned in logical blocks, and is the REMAINING amount of blocks
 * in the run.
 */
int
hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
{
    struct filefork *fp = VTOF(vp);
    struct hfsmount *hfsmp = VTOHFS(vp);
    int retval = E_NONE;
    u_int32_t logBlockSize;
    size_t bytesContAvail = 0;
    off_t blockposition;

    /*
     * Check for underlying vnode requests and ensure that logical
     * to physical mapping is requested.
     */
    if (vpp != NULL)
        *vpp = hfsmp->hfs_devvp;
    if (bnp == NULL)
        return (0);

    logBlockSize = GetLogicalBlockSize(vp);
    blockposition = (off_t)bn * logBlockSize;

    lockExtBtree = overflow_extents(fp);

    if (lockExtBtree)
        lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);

    retval = MacToVFSError(
                MapFileBlockC (HFSTOVCB(hfsmp),

    if (lockExtBtree)
        hfs_systemfile_unlock(hfsmp, lockflags);

    if (retval == E_NONE) {
        /* Figure out how many read ahead blocks there are */
        if (can_cluster(logBlockSize)) {
            /* Make sure this result never goes negative: */
            *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
        }
    }
/*
 * Convert logical block number to file offset.
 */
int
hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
/*
    struct vnop_blktooff_args {
    };
*/
{
    if (ap->a_vp == NULL)
        return (EINVAL);
    *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
}
/*
 * Convert file offset to logical block number.
 */
int
hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
/*
    struct vnop_offtoblk_args {
        daddr64_t *a_lblkno;
    };
*/
{
    if (ap->a_vp == NULL)
        return (EINVAL);
    *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
}
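/*
 * Both conversions above are simple scaling by the fork's logical block size.
 * For example, with a 4096-byte logical block size, logical block 3 maps to
 * offset 12288, and offset 12288 maps back to logical block 3 (the division
 * truncates toward zero, so offset 12289 is still block 3).
 */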
2248 * Map file offset to physical block number.
2250 * If this function is called for write operation, and if the file
2251 * had virtual blocks allocated (delayed allocation), real blocks
2252 * are allocated by calling ExtendFileC().
2254 * If this function is called for read operation, and if the file
2255 * had virtual blocks allocated (delayed allocation), no change
2256 * to the size of file is done, and if required, rangelist is
2257 * searched for mapping.
2259 * System file cnodes are expected to be locked (shared or exclusive).
hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
/*
	struct vnop_blockmap_args {
		vfs_context_t a_context;
	};
*/
{
	struct vnode *vp = ap->a_vp;
	struct filefork *fp;
	struct hfsmount *hfsmp;
	size_t bytesContAvail = 0;
	int retval = E_NONE;
	struct rl_entry *invalid_range;
	enum rl_overlaptype overlaptype;

	if (VNODE_IS_RSRC(vp)) {
		/* allow blockmaps to the resource fork */
	if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
		int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
		case FILE_IS_COMPRESSED:
		case FILE_IS_CONVERTING:
			/* if FILE_IS_CONVERTING, we allow blockmap */
			printf("invalid state %d for compressed file\n", state);
#endif /* HFS_COMPRESSION */

	/* Do not allow blockmap operation on a directory */
	if (vnode_isdir(vp)) {

	/*
	 * Check for underlying vnode requests and ensure that logical
	 * to physical mapping is requested.
	 */
	if (ap->a_bpn == NULL)

	if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
		if (VTOC(vp)->c_lockowner != current_thread()) {
			hfs_lock(VTOC(vp), HFS_FORCE_LOCK);

	/* Check virtual blocks only when performing write operation */
	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
		if (hfs_start_transaction(hfsmp) != 0) {
		syslocks = SFL_EXTENTS | SFL_BITMAP;
	} else if (overflow_extents(fp)) {
		syslocks = SFL_EXTENTS;

	lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
	/*
	 * Check for any delayed allocations.
	 */
	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
		u_int32_t loanedBlocks;

		//
		// Make sure we have a transaction.  It's possible
		// that we came in and fp->ff_unallocblocks was zero
		// but during the time we blocked acquiring the extents
		// btree, ff_unallocblocks became non-zero and so we
		// will need to start a transaction.
		//
		if (started_tr == 0) {
			hfs_systemfile_unlock(hfsmp, lockflags);

		/*
		 * Note: ExtendFileC will release any blocks on loan and
		 * acquire real blocks.  So we ask to extend by zero bytes
		 * since ExtendFileC will account for the virtual blocks.
		 */

		loanedBlocks = fp->ff_unallocblocks;
		retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
				     kEFAllMask | kEFNoClumpMask, &actbytes);

			fp->ff_unallocblocks = loanedBlocks;
			cp->c_blocks += loanedBlocks;
			fp->ff_blocks += loanedBlocks;

			HFS_MOUNT_LOCK(hfsmp, TRUE);
			hfsmp->loanedBlocks += loanedBlocks;
			HFS_MOUNT_UNLOCK(hfsmp, TRUE);

			hfs_systemfile_unlock(hfsmp, lockflags);
			cp->c_flag |= C_MODIFIED;
			(void) hfs_update(vp, TRUE);
			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			hfs_end_transaction(hfsmp);

	retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
			       ap->a_bpn, &bytesContAvail);

	hfs_systemfile_unlock(hfsmp, lockflags);

	(void) hfs_update(vp, TRUE);
	(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
	hfs_end_transaction(hfsmp);
	/*
	 * On write, always return the error because virtual blocks, if any,
	 * should have been allocated in ExtendFileC().  We do not
	 * allocate virtual blocks on read, therefore return the error
	 * only if no virtual blocks are allocated.  Otherwise we search
	 * the rangelist for zero-fills.
	 */
	if ((MacToVFSError(retval) != ERANGE) ||
	    (ap->a_flags & VNODE_WRITE) ||
	    ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {

	/* Validate that the start offset is within the logical file size */
	if (ap->a_foffset > fp->ff_size) {

	/*
	 * Searching the file extents failed for a read operation, therefore
	 * search the rangelist for any uncommitted holes in the file.
	 */
	overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
			      ap->a_foffset + (off_t)(ap->a_size - 1),

	switch (overlaptype) {
	case RL_OVERLAPISCONTAINED:
		/* start_offset <= rl_start, end_offset >= rl_end */
		if (ap->a_foffset != invalid_range->rl_start) {
	case RL_MATCHINGOVERLAP:
		/* start_offset = rl_start, end_offset = rl_end */
	case RL_OVERLAPCONTAINSRANGE:
		/* start_offset >= rl_start, end_offset <= rl_end */
	case RL_OVERLAPSTARTSBEFORE:
		/* start_offset > rl_start, end_offset >= rl_start */
		if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) {
			bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset;
		} else {
			bytesContAvail = fp->ff_size - ap->a_foffset;
		}
		if (bytesContAvail > ap->a_size) {
			bytesContAvail = ap->a_size;
		}
		*ap->a_bpn = (daddr64_t)-1;
	case RL_OVERLAPENDSAFTER:
		/* start_offset < rl_start, end_offset < rl_end */
	/*
	 * MapFileBlockC() found a valid extent in the filefork.  Search the
	 * mapping information further for invalid file ranges.
	 */
	overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
			      ap->a_foffset + (off_t)bytesContAvail - 1,
	if (overlaptype != RL_NOOVERLAP) {
		switch (overlaptype) {
		case RL_MATCHINGOVERLAP:
		case RL_OVERLAPCONTAINSRANGE:
		case RL_OVERLAPSTARTSBEFORE:
			/* There's no valid block for this byte offset */
			*ap->a_bpn = (daddr64_t)-1;
			/*
			 * There's no point limiting the amount to be returned
			 * if the invalid range that was hit extends all the way
			 * to the EOF (i.e. there are no valid bytes between the
			 * end of this range and the file's EOF):
			 */
			if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
			    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
				bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
		case RL_OVERLAPISCONTAINED:
		case RL_OVERLAPENDSAFTER:
			/* The range of interest hits an invalid block before the end: */
			if (invalid_range->rl_start == ap->a_foffset) {
				/* There's actually no valid information to be had starting here: */
				*ap->a_bpn = (daddr64_t)-1;
				if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
				    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
					bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
			} else {
				bytesContAvail = invalid_range->rl_start - ap->a_foffset;
			}

	if (bytesContAvail > ap->a_size)
		bytesContAvail = ap->a_size;

	*ap->a_run = bytesContAvail;

	*(int *)ap->a_poff = 0;

	return (MacToVFSError(retval));
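/*
 * Illustrative sketch (not part of the original source): how a blockmap
 * result is typically interpreted by a caller.  On the read path above, a
 * byte range that falls inside an uncommitted (invalid) range gets *a_bpn
 * set to -1, meaning "no physical block yet, treat as zero-fill", and the
 * run length is clamped to the requested size.  All values below are assumed
 * example numbers.
 */
#include <stdint.h>
#include <stdio.h>

static void blockmap_result_example(void)
{
	int64_t bpn = -1;               /* physical block, -1 == hole/zero-fill */
	size_t  bytesContAvail = 65536; /* contiguous bytes reported by the mapping */
	size_t  requested = 16384;      /* size the caller asked to map */

	if (bytesContAvail > requested)
		bytesContAvail = requested;

	if (bpn == -1)
		printf("zero-fill %zu bytes\n", bytesContAvail);
	else
		printf("read %zu bytes starting at device block %lld\n",
		    bytesContAvail, (long long)bpn);
}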
/*
 * Prepare and issue the I/O.
 * buf_strategy knows how to deal
 * with requests that require
 */
hfs_vnop_strategy(struct vnop_strategy_args *ap)
{
	buf_t bp = ap->a_bp;
	vnode_t vp = buf_vnode(bp);

	return (buf_strategy(VTOHFS(vp)->hfs_devvp, ap));
}
hfs_minorupdate(struct vnode *vp) {
	struct cnode *cp = VTOC(vp);

	cp->c_flag &= ~C_MODIFIED;
	cp->c_touch_acctime = 0;
	cp->c_touch_chgtime = 0;
	cp->c_touch_modtime = 0;
do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context)
{
	register struct cnode *cp = VTOC(vp);
	struct filefork *fp = VTOF(vp);
	struct proc *p = vfs_context_proc(context);
	kauth_cred_t cred = vfs_context_ucred(context);
	off_t actualBytesAdded;
	u_int32_t fileblocks;
	struct hfsmount *hfsmp;

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
		     (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
	/* This should only happen with a corrupt filesystem */
	if ((off_t)fp->ff_size < 0)

	if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))

	/* Files that are changing size are not hot file candidates. */
	if (hfsmp->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;

	/*
	 * We cannot just check if fp->ff_size == length (as an optimization)
	 * since there may be extra physical blocks that also need truncation.
	 */
	if ((retval = hfs_getinoquota(cp)))

	/*
	 * Lengthen the size of the file.  We must ensure that the
	 * last byte of the file is allocated.  Since the smallest
	 * value of ff_size is 0, length will be at least 1.
	 */
	if (length > (off_t)fp->ff_size) {
		retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),

		/*
		 * If we don't have enough physical space then
		 * we need to extend the physical size.
		 */
		if (length > filebytes) {
			u_int32_t blockHint = 0;

			/* All or nothing and don't round up to clumpsize. */
			eflags = kEFAllMask | kEFNoClumpMask;

			if (cred && suser(cred, NULL) != 0)
				eflags |= kEFReserveMask;  /* keep a reserve */
			/*
			 * Allocate Journal and Quota files in metadata zone.
			 */
			if (filebytes == 0 &&
			    hfsmp->hfs_flags & HFS_METADATA_ZONE &&
			    hfs_virtualmetafile(cp)) {
				eflags |= kEFMetadataMask;
				blockHint = hfsmp->hfs_metazone_start;
			}
			if (hfs_start_transaction(hfsmp) != 0) {

			/* Protect extents b-tree and allocation bitmap */
			lockflags = SFL_BITMAP;
			if (overflow_extents(fp))
				lockflags |= SFL_EXTENTS;
			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

			while ((length > filebytes) && (retval == E_NONE)) {
				bytesToAdd = length - filebytes;
				retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
								   &actualBytesAdded));
				filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
				if (actualBytesAdded == 0 && retval == E_NONE) {
					if (length > filebytes)

			hfs_systemfile_unlock(hfsmp, lockflags);

				(void) hfs_minorupdate(vp);

				(void) hfs_update(vp, TRUE);
				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			hfs_end_transaction(hfsmp);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
		     (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
	if (!(flags & IO_NOZEROFILL)) {
		if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) {
			struct rl_entry *invalid_range;

			zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
			if (length < zero_limit) zero_limit = length;

			if (length > (off_t)fp->ff_size) {
				/* Extending the file: time to fill out the current last page w. zeroes? */
				if ((fp->ff_size & PAGE_MASK_64) &&
				    (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
				    fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {

					/* There's some valid data at the start of the (current) last page
					   of the file, so zero out the remainder of that page to ensure the
					   entire page contains valid data.  Since there is no invalid range
					   possible past the (current) eof, there's no need to remove anything
					   from the invalid range list before calling cluster_write():	*/
					retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
							fp->ff_size, (off_t)0,
							(flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
					hfs_lock(cp, HFS_FORCE_LOCK);
					if (retval) goto Err_Exit;

					/* Merely invalidate the remaining area, if necessary: */
					if (length > zero_limit) {
						rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
						cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
					}
				} else {
					/* The page containing the (current) eof is invalid: just add the
					   remainder of the page to the invalid list, along with the area
					   being newly allocated:
					 */
					rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
					cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
				}
			}
		} else {
			panic("hfs_truncate: invoked on non-UBC object?!");
		}
	}
	cp->c_touch_modtime = TRUE;
	fp->ff_size = length;
	} else { /* Shorten the size of the file */

		if ((off_t)fp->ff_size > length) {
			/* Any space previously marked as invalid is now irrelevant: */
			rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
		}

		/*
		 * Account for any unmapped blocks. Note that the new
		 * file length can still end up with unmapped blocks.
		 */
		if (fp->ff_unallocblocks > 0) {
			u_int32_t finalblks;
			u_int32_t loanedBlocks;

			HFS_MOUNT_LOCK(hfsmp, TRUE);

			loanedBlocks = fp->ff_unallocblocks;
			cp->c_blocks -= loanedBlocks;
			fp->ff_blocks -= loanedBlocks;
			fp->ff_unallocblocks = 0;

			hfsmp->loanedBlocks -= loanedBlocks;

			finalblks = (length + blksize - 1) / blksize;
			if (finalblks > fp->ff_blocks) {
				/* calculate required unmapped blocks */
				loanedBlocks = finalblks - fp->ff_blocks;
				hfsmp->loanedBlocks += loanedBlocks;

				fp->ff_unallocblocks = loanedBlocks;
				cp->c_blocks += loanedBlocks;
				fp->ff_blocks += loanedBlocks;
			}
			HFS_MOUNT_UNLOCK(hfsmp, TRUE);
		/*
		 * For a TBE process the deallocation of the file blocks is
		 * delayed until the file is closed.  And hfs_close calls
		 * truncate with the IO_NDELAY flag set.  So when IO_NDELAY
		 * isn't set, we make sure this isn't a TBE process.
		 */
		if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
			off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);

			if (hfs_start_transaction(hfsmp) != 0) {

			if (fp->ff_unallocblocks == 0) {
				/* Protect extents b-tree and allocation bitmap */
				lockflags = SFL_BITMAP;
				if (overflow_extents(fp))
					lockflags |= SFL_EXTENTS;
				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

				retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
						(FCB*)fp, length, false));

				hfs_systemfile_unlock(hfsmp, lockflags);
			}
			fp->ff_size = length;

			(void) hfs_minorupdate(vp);

			(void) hfs_update(vp, TRUE);
			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			hfs_end_transaction(hfsmp);

			filebytes = (off_t)fp->ff_blocks * (off_t)blksize;

			/* These are bytesreleased */
			(void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
		}
		/* Only set update flag if the logical length changes */
		if ((off_t)fp->ff_size != length)
			cp->c_touch_modtime = TRUE;
		fp->ff_size = length;
	if (cp->c_mode & (S_ISUID | S_ISGID)) {
		if (!vfs_context_issuser(context)) {
			cp->c_mode &= ~(S_ISUID | S_ISGID);
		}
	}
	retval = hfs_minorupdate(vp);

	cp->c_touch_chgtime = TRUE;	/* status changed */
	cp->c_touch_modtime = TRUE;	/* file data was modified */
	retval = hfs_update(vp, MNT_WAIT);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
		     -1, -1, -1, retval, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
		     (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
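/*
 * Illustrative sketch (not part of the original source): the zero-fill limit
 * used above when a file grows is the old EOF rounded up to the next page
 * boundary, clamped to the new length.  The page size and file sizes below
 * are assumed example values.
 */
#include <stdint.h>
#include <assert.h>

static void zero_limit_example(void)
{
	const int64_t page_size = 4096;
	const int64_t page_mask = page_size - 1;

	int64_t old_eof = 10000;   /* fp->ff_size before growing the file */
	int64_t new_len = 50000;   /* requested length */

	/* Round the old EOF up to a page boundary, as the code above does. */
	int64_t zero_limit = (old_eof + page_mask) & ~page_mask;
	if (new_len < zero_limit)
		zero_limit = new_len;

	assert(zero_limit == 12288);
	/* Bytes old_eof..zero_limit-1 get zeroed via cluster_write();
	 * zero_limit..new_len-1 is merely recorded as an invalid range. */
}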
/*
 * Truncate a cnode to at most length size, freeing (or adding) the
 */
hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
	     int skipupdate, vfs_context_t context)
{
	struct filefork *fp = VTOF(vp);
	u_int32_t fileblocks;
	int blksize, error = 0;
	struct cnode *cp = VTOC(vp);

	/* Cannot truncate an HFS directory! */
	if (vnode_isdir(vp)) {

	/* A swap file cannot change size. */
	if (vnode_isswap(vp) && (length != 0)) {

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	//
	// Have to do this here so that we don't wind up with
	// i/o pending for blocks that are about to be released
	// if we truncate the file.
	//
	// If skipsetsize is set, then the caller is responsible
	// for the ubc_setsize.
	//
	// Even if skipsetsize is set, if the length is zero we
	// want to call ubc_setsize() because as of SnowLeopard
	// it will no longer cause any page-ins and it will drop
	// any dirty pages so that we don't do any i/o that we
	// don't have to.  This also prevents a race where i/o
	// for truncated blocks may overwrite later data if the
	// blocks get reallocated to a different file.
	//
	if (!skipsetsize || length == 0)
		ubc_setsize(vp, length);
	//
	// have to loop truncating or growing files that are
	// really big because otherwise transactions can get
	// enormous and consume too many kernel resources.
	//
	if (length < filebytes) {
		while (filebytes > length) {
			if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
				filebytes -= HFS_BIGFILE_SIZE;
			}
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
		}
	} else if (length > filebytes) {
		while (filebytes < length) {
			if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
				filebytes += HFS_BIGFILE_SIZE;
			}
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
		}
	} else /* Same logical size */ {
		error = do_hfs_truncate(vp, length, flags, skipupdate, context);
	}
	/* Files that are changing size are not hot file candidates. */
	if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;
	}
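/*
 * Illustrative sketch (not part of the original source): the loop above walks
 * the on-disk size toward the target in large fixed-size steps so a single
 * journal transaction never covers too much work.  The chunk size and file
 * sizes below are assumed example values, and the final "land on the target"
 * step is spelled out explicitly here.
 */
#include <stdint.h>
#include <stdio.h>

static void chunked_truncate_example(void)
{
	const int64_t chunk = 0x40000000LL;    /* assumed stand-in for HFS_BIGFILE_SIZE */
	int64_t filebytes = 5LL * chunk + 123; /* current physical size */
	int64_t length = 100;                  /* target size */
	int steps = 0;

	while (filebytes > length) {
		if ((filebytes - length) > chunk)
			filebytes -= chunk;        /* one transaction-sized step */
		else
			filebytes = length;        /* final step lands on the target */
		steps++;                           /* do_hfs_truncate() would run here */
	}
	printf("reached target in %d steps\n", steps);
}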
/*
 * Preallocate file storage space.
 */
hfs_vnop_allocate(struct vnop_allocate_args /* {
		off_t *a_bytesallocated;
		vfs_context_t a_context;
	} */ *ap)
{
	struct vnode *vp = ap->a_vp;
	struct filefork *fp;
	off_t length = ap->a_length;
	off_t moreBytesRequested;
	off_t actualBytesAdded;
	u_int32_t fileblocks;
	int retval, retval2;
	u_int32_t blockHint;
	u_int32_t extendFlags;   /* For call to ExtendFileC */
	struct hfsmount *hfsmp;
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);

	*(ap->a_bytesallocated) = 0;

	if (!vnode_isreg(vp))

	if (length < (off_t)0)

	hfs_lock_truncate(cp, TRUE);

	if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {

	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
	if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {

	/* Fill in the flags word for the call to Extend the file */

	extendFlags = kEFNoClumpMask;
	if (ap->a_flags & ALLOCATECONTIG)
		extendFlags |= kEFContigMask;
	if (ap->a_flags & ALLOCATEALL)
		extendFlags |= kEFAllMask;
	if (cred && suser(cred, NULL) != 0)
		extendFlags |= kEFReserveMask;
	if (hfs_virtualmetafile(cp))
		extendFlags |= kEFMetadataMask;

	startingPEOF = filebytes;

	if (ap->a_flags & ALLOCATEFROMPEOF)
		length += filebytes;
	else if (ap->a_flags & ALLOCATEFROMVOL)
		blockHint = ap->a_offset / VTOVCB(vp)->blockSize;

	/* If no changes are necessary, then we're done */
	if (filebytes == length)

	/*
	 * Lengthen the size of the file.  We must ensure that the
	 * last byte of the file is allocated.  Since the smallest
	 * value of filebytes is 0, length will be at least 1.
	 */
	if (length > filebytes) {
		off_t total_bytes_added = 0, orig_request_size;

		orig_request_size = moreBytesRequested = length - filebytes;

		retval = hfs_chkdq(cp,
				(int64_t)(roundup(moreBytesRequested, vcb->blockSize)),

		/*
		 * Metadata zone checks.
		 */
		if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
			/*
			 * Allocate Journal and Quota files in metadata zone.
			 */
			if (hfs_virtualmetafile(cp)) {
				blockHint = hfsmp->hfs_metazone_start;
			} else if ((blockHint >= hfsmp->hfs_metazone_start) &&
				   (blockHint <= hfsmp->hfs_metazone_end)) {
				/*
				 * Move blockHint outside metadata zone.
				 */
				blockHint = hfsmp->hfs_metazone_end + 1;
		while ((length > filebytes) && (retval == E_NONE)) {
			off_t bytesRequested;

			if (hfs_start_transaction(hfsmp) != 0) {

			/* Protect extents b-tree and allocation bitmap */
			lockflags = SFL_BITMAP;
			if (overflow_extents(fp))
				lockflags |= SFL_EXTENTS;
			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

			if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
				bytesRequested = HFS_BIGFILE_SIZE;
			} else {
				bytesRequested = moreBytesRequested;
			}

			if (extendFlags & kEFContigMask) {
				// if we're on a sparse device, this will force it to do a
				// full scan to find the space needed.
				hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
			}

			retval = MacToVFSError(ExtendFileC(vcb,
					&actualBytesAdded));

			if (retval == E_NONE) {
				*(ap->a_bytesallocated) += actualBytesAdded;
				total_bytes_added += actualBytesAdded;
				moreBytesRequested -= actualBytesAdded;
				if (blockHint != 0) {
					blockHint += actualBytesAdded / vcb->blockSize;
				}
			}
			filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;

			hfs_systemfile_unlock(hfsmp, lockflags);

			(void) hfs_update(vp, TRUE);
			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			hfs_end_transaction(hfsmp);
		}

		/*
		 * if we get an error and no changes were made then exit
		 * otherwise we must do the hfs_update to reflect the changes
		 */
		if (retval && (startingPEOF == filebytes))

		/*
		 * Adjust actualBytesAdded to be allocation block aligned, not
		 * clump size aligned.
		 * NOTE: So what we are reporting does not affect reality
		 * until the file is closed, when we truncate the file to allocation
		 */
		if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
			*(ap->a_bytesallocated) =
				roundup(orig_request_size, (off_t)vcb->blockSize);
	} else { /* Shorten the size of the file */

		if (fp->ff_size > length) {
			/*
			 * Any buffers that are past the truncation point need to be
			 * invalidated (to maintain buffer cache consistency).
			 */
		}

		retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context);
		filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;

		/*
		 * if we get an error and no changes were made then exit
		 * otherwise we must do the hfs_update to reflect the changes
		 */
		if (retval && (startingPEOF == filebytes)) goto Err_Exit;

		/* These are bytesreleased */
		(void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED, 0);

		if (fp->ff_size > filebytes) {
			fp->ff_size = filebytes;

			ubc_setsize(vp, fp->ff_size);
			hfs_lock(cp, HFS_FORCE_LOCK);
		}
	}

	cp->c_touch_chgtime = TRUE;
	cp->c_touch_modtime = TRUE;
	retval2 = hfs_update(vp, MNT_WAIT);

	hfs_unlock_truncate(cp, TRUE);
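/*
 * Illustrative sketch (not part of the original source): the preallocation
 * path reports bytes rounded up to allocation-block alignment rather than
 * the clump-aligned amount that was physically grabbed.  The block size and
 * request sizes below are assumed example values.
 */
#include <stdint.h>
#include <assert.h>

static int64_t roundup_to_block(int64_t value, int64_t blocksize)
{
	/* Same effect as the roundup(value, blocksize) used above. */
	return ((value + blocksize - 1) / blocksize) * blocksize;
}

static void allocate_report_example(void)
{
	int64_t block = 4096;               /* assumed allocation block size */
	int64_t orig_request_size = 10000;  /* bytes the caller asked for */
	int64_t total_bytes_added = 65536;  /* clump-aligned amount actually allocated */

	int64_t reported = total_bytes_added;
	if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
		reported = roundup_to_block(orig_request_size, block);

	assert(reported == 12288);
}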
/*
 * Pagein for HFS filesystem
 */
hfs_vnop_pagein(struct vnop_pagein_args *ap)
/*
	struct vnop_pagein_args {
		vm_offset_t a_pl_offset,
		vfs_context_t a_context;
	};
*/
{
	vnode_t vp = ap->a_vp;

	if (VNODE_IS_RSRC(vp)) {
		/* allow pageins of the resource fork */

	int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */

	error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));

		/* successful page-in, update the access time */
		VTOC(vp)->c_touch_acctime = TRUE;

		/* compressed files are not hot file candidates */
		if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
			VTOF(vp)->ff_bytesread = 0;

	/* otherwise the file was converted back to a regular file while we were reading it */

	error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
			       ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags);
	/*
	 * Keep track of blocks read.
	 */
	if (!vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
		struct filefork *fp;
		int took_cnode_lock = 0;

		if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
			bytesread = fp->ff_size;
		else
			bytesread = ap->a_size;

		/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
		if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
			hfs_lock(cp, HFS_FORCE_LOCK);
			took_cnode_lock = 1;
		}
		/*
		 * If this file hasn't been seen since the start of
		 * the current sampling period then start over.
		 */
		if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
			fp->ff_bytesread = bytesread;
			cp->c_atime = tv.tv_sec;
		} else {
			fp->ff_bytesread += bytesread;
		}
		cp->c_touch_acctime = TRUE;
		if (took_cnode_lock)
/*
 * Pageout for HFS filesystem.
 */
hfs_vnop_pageout(struct vnop_pageout_args *ap)
/*
	struct vnop_pageout_args {
		vm_offset_t a_pl_offset,
		vfs_context_t a_context;
	};
*/
{
	vnode_t vp = ap->a_vp;
	struct filefork *fp;
	upl_page_info_t *pl;
	vm_offset_t a_pl_offset;
	int is_pageoutv2 = 0;

	/*
	 * Figure out where the file ends, for pageout purposes.  If
	 * ff_new_size > ff_size, then we're in the middle of extending the
	 * file via a write, so it is safe (and necessary) that we be able
	 * to pageout up to that point.
	 */
	filesize = fp->ff_size;
	if (fp->ff_new_size > filesize)
		filesize = fp->ff_new_size;

	a_flags = ap->a_flags;
	a_pl_offset = ap->a_pl_offset;

	/*
	 * we can tell if we're getting the new or old behavior from the UPL
	 */
	if ((upl = ap->a_pl) == NULL) {

		/*
		 * we're in control of any UPL we commit
		 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
		 */
		a_flags &= ~UPL_NOCOMMIT;

		/*
		 * take truncate lock (shared) to guard against
		 * zero-fill thru fsync interfering, but only for v2
		 */
		hfs_lock_truncate(cp, 0);
		if (a_flags & UPL_MSYNC) {
			request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
		} else {
			request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
		}

		kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);

		if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {

	/*
	 * from this point forward upl points at the UPL we're working with
	 * it was either passed in or we successfully created it
	 */

	/*
	 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
	 * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
	 * scanning for dirty ranges.  We'll issue our own N cluster_pageout calls, for
	 * N dirty ranges in the UPL.  Note that this is almost a direct copy of the
	 * logic in vnode_pageout except that we need to do it after grabbing the truncate
	 * lock in HFS so that we don't lock invert ourselves.
	 *
	 * Note that we can still get into this function on behalf of the default pager with
	 * non-V2 behavior (swapfiles).  However in that case, we did not grab locks above
	 * since fsync and other writing threads will grab the locks, then mark the
	 * relevant pages as busy.  But the pageout codepath marks the pages as busy,
	 * and THEN would attempt to grab the truncate lock, which would result in deadlock.  So
	 * we do not try to grab anything for the pre-V2 case, which should only be accessed
	 * by the paging/VM system.
	 */
		f_offset = ap->a_f_offset;

		/*
		 * Scan from the back to find the last page in the UPL, so that we
		 * aren't looking at a UPL that may have already been freed by the
		 * preceding aborts/completions.
		 */
		for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
			if (upl_page_present(pl, --pg_index))
				break;
		}
		if (pg_index == 0) {
			ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
		}

		/*
		 * initialize the offset variables before we touch the UPL.
		 * a_f_offset is the position into the file, in bytes
		 * offset is the position into the UPL, in bytes
		 * pg_index is the pg# of the UPL we're operating on.
		 * isize is the offset into the UPL of the last non-clean page.
		 */
		isize = ((pg_index + 1) * PAGE_SIZE);

			if ( !upl_page_present(pl, pg_index)) {
				/*
				 * we asked for RET_ONLY_DIRTY, so it's possible
				 * to get back empty slots in the UPL.
				 * just skip over them
				 */
				f_offset += PAGE_SIZE;
				offset   += PAGE_SIZE;
			}
			if ( !upl_dirty_page(pl, pg_index)) {
				panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
			}

			/*
			 * We know that we have at least one dirty page.
			 * Now checking to see how many in a row we have
			 */
			xsize = isize - PAGE_SIZE;

				if ( !upl_dirty_page(pl, pg_index + num_of_pages))

			xsize = num_of_pages * PAGE_SIZE;

			if (!vnode_isswap(vp)) {

				if (cp->c_lockowner != current_thread()) {
					if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
						/*
						 * we're in the v2 path, so we are the
						 * owner of the UPL... we may have already
						 * processed some of the UPL, so abort it
						 * from the current working offset to the
						 */
						ubc_upl_abort_range(upl,
								    ap->a_size - offset,
								    UPL_ABORT_FREE_ON_EMPTY);

				end_of_range = f_offset + xsize - 1;

				if (end_of_range >= filesize) {
					end_of_range = (off_t)(filesize - 1);
				}
				if (f_offset < filesize) {
					rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
					cp->c_flag |= C_MODIFIED;  /* leof is dirty */
				}

			if ((error = cluster_pageout(vp, upl, offset, f_offset,
						     xsize, filesize, a_flags))) {

			pg_index += num_of_pages;

			/* capture errnos bubbled out of cluster_pageout if they occurred */
			if (error_ret != 0) {
	} /* end block for v2 pageout behavior */
		if (!vnode_isswap(vp)) {

			if (cp->c_lockowner != current_thread()) {
				if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
					if (!(a_flags & UPL_NOCOMMIT)) {
						ubc_upl_abort_range(upl,
								    UPL_ABORT_FREE_ON_EMPTY);

			end_of_range = ap->a_f_offset + ap->a_size - 1;

			if (end_of_range >= filesize) {
				end_of_range = (off_t)(filesize - 1);
			}
			if (ap->a_f_offset < filesize) {
				rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
				cp->c_flag |= C_MODIFIED;  /* leof is dirty */
			}

		/*
		 * just call cluster_pageout for old pre-v2 behavior
		 */
		retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
					 ap->a_size, filesize, a_flags);
	/*
	 * If data was written, update the modification time of the file.
	 * If setuid or setgid bits are set and this process is not the
	 * superuser then clear the setuid and setgid bits as a precaution
	 * against tampering.
	 */
	cp->c_touch_modtime = TRUE;
	cp->c_touch_chgtime = TRUE;
	if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
	    (vfs_context_suser(ap->a_context) != 0)) {
		hfs_lock(cp, HFS_FORCE_LOCK);
		cp->c_mode &= ~(S_ISUID | S_ISGID);
	}

	/* release truncate lock (shared) */
	hfs_unlock_truncate(cp, 0);
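/*
 * Illustrative sketch (not part of the original source): the V2 pageout path
 * above walks the UPL's page list, skips absent slots, and groups consecutive
 * dirty pages into one cluster_pageout call per run.  The bitmap below is an
 * assumed stand-in for upl_page_present()/upl_dirty_page().
 */
#include <stddef.h>
#include <stdio.h>

static void dirty_run_scan_example(void)
{
	/* 0 = absent slot, 1 = dirty page (RET_ONLY_DIRTY hands back no clean pages) */
	const int page[] = { 1, 1, 0, 0, 1, 1, 1, 0, 1 };
	const size_t npages = sizeof(page) / sizeof(page[0]);
	const size_t page_size = 4096;

	for (size_t pg_index = 0; pg_index < npages; ) {
		if (!page[pg_index]) {        /* empty slot: just skip over it */
			pg_index++;
			continue;
		}
		size_t num_of_pages = 1;      /* count how many dirty pages in a row */
		while (pg_index + num_of_pages < npages && page[pg_index + num_of_pages])
			num_of_pages++;

		printf("pageout %zu bytes at page %zu\n",
		    num_of_pages * page_size, pg_index);
		pg_index += num_of_pages;     /* one cluster_pageout() per run */
	}
}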
/*
 * Intercept B-Tree node writes to unswap them if necessary.
 */
hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
{
	register struct buf *bp = ap->a_bp;
	register struct vnode *vp = buf_vnode(bp);
	BlockDescriptor block;

	/* Trap B-Tree writes */
	if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
	    (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
	    (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
	    (vp == VTOHFS(vp)->hfc_filevp)) {

		/*
		 * Swap and validate the node if it is in native byte order.
		 * This is always true on big endian, so we always validate
		 * before writing here.  On little endian, the node typically has
		 * been swapped and validated when it was written to the journal,
		 * so we won't do anything here.
		 */
		if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
			/* Prepare the block pointer */
			block.blockHeader = bp;
			block.buffer = (char *)buf_dataptr(bp);
			block.blockNum = buf_lblkno(bp);
			/* not found in cache ==> came from disk */
			block.blockReadFromDisk = (buf_fromcache(bp) == 0);
			block.blockSize = buf_count(bp);

			/* Endian un-swap B-Tree node */
			retval = hfs_swap_BTNode(&block, vp, kSwapBTNodeHostToBig, false);
				panic("hfs_vnop_bwrite: about to write corrupt node!\n");
		}
	}

	/* This buffer shouldn't be locked anymore but if it is clear it */
	if ((buf_flags(bp) & B_LOCKED)) {
		if (VTOHFS(vp)->jnl) {
			panic("hfs: CLEARING the lock bit on bp %p\n", bp);
		}
		buf_clearflags(bp, B_LOCKED);
	}
	retval = vn_bwrite(ap);
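/*
 * Illustrative sketch (not part of the original source): the check above
 * reads the last u_int16_t of the node buffer, which holds the offset of the
 * node's first record.  When the buffer is still in host byte order that
 * value reads back as 0x000e, so the node still needs the host-to-big swap;
 * once swapped, the same native read would not match.  The buffer below is a
 * tiny assumed stand-in, not a real B-tree node.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static void btree_swap_check_example(void)
{
	unsigned char node[512];
	memset(node, 0, sizeof(node));

	uint16_t first_record_offset = 0x000e;     /* host-order value */
	memcpy(node + sizeof(node) - 2, &first_record_offset, 2);

	uint16_t trailer;
	memcpy(&trailer, node + sizeof(node) - 2, 2);

	if (trailer == 0x000e)
		printf("node is in native byte order: swap before writing\n");
	else
		printf("node already swapped for disk\n");
}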
/*
 * Relocate a file to a new location on disk
 *  cnode must be locked on entry
 *
 * Relocation occurs by cloning the file's data from its
 * current set of blocks to a new set of blocks. During
 * the relocation all of the blocks (old and new) are
 * owned by the file.
 *
 *	-----------------     -----------------
 *	|///////////////|     |               |     STEP 1 (acquire new blocks)
 *	-----------------     -----------------
 *
 *	-----------------     -----------------
 *	|///////////////|     |///////////////|     STEP 2 (clone data)
 *	-----------------     -----------------
 *
 *	                      |///////////////|     STEP 3 (head truncate blocks)
 *
 * During steps 2 and 3 page-outs to file offsets less
 * than or equal to N are suspended.
 *
 * During step 3 page-ins to the file get suspended.
 */
hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
	struct filefork *fp;
	struct hfsmount *hfsmp;
	u_int32_t nextallocsave;
	daddr64_t sector_a, sector_b;
	int took_trunc_lock = 0;
	enum vtype vnodetype;

	vnodetype = vnode_vtype(vp);
	if (vnodetype != VREG && vnodetype != VLNK) {

	if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {

	if (fp->ff_unallocblocks)

	blksize = hfsmp->blockSize;

	blockHint = hfsmp->nextAllocation;

	if ((fp->ff_size > 0x7fffffff) ||
	    ((fp->ff_size > blksize) && vnodetype == VLNK)) {

	//
	// We do not believe that this call to hfs_fsync() is
	// necessary and it causes a journal transaction
	// deadlock so we are removing it.
	//
	//if (vnodetype == VREG && !vnode_issystem(vp)) {
	//	retval = hfs_fsync(vp, MNT_WAIT, 0, p);

	if (!vnode_issystem(vp) && (vnodetype != VLNK)) {

		hfs_lock_truncate(cp, TRUE);
		/* Force lock since callers expect the lock to be held. */
		if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
			hfs_unlock_truncate(cp, TRUE);

		/* No need to continue if file was removed. */
		if (cp->c_flag & C_NOEXISTS) {
			hfs_unlock_truncate(cp, TRUE);

		took_trunc_lock = 1;

	headblks = fp->ff_blocks;
	datablks = howmany(fp->ff_size, blksize);
	growsize = datablks * blksize;
	eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
	if (blockHint >= hfsmp->hfs_metazone_start &&
	    blockHint <= hfsmp->hfs_metazone_end)
		eflags |= kEFMetadataMask;
	if (hfs_start_transaction(hfsmp) != 0) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, TRUE);

	/*
	 * Protect the extents b-tree and the allocation bitmap
	 * during MapFileBlockC and ExtendFileC operations.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

	retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
		retval = MacToVFSError(retval);

	/*
	 * STEP 1 - acquire new allocation blocks.
	 */
	nextallocsave = hfsmp->nextAllocation;
	retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
	if (eflags & kEFMetadataMask) {
		HFS_MOUNT_LOCK(hfsmp, TRUE);
		HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
		MarkVCBDirty(hfsmp);
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
	}

	retval = MacToVFSError(retval);
		cp->c_flag |= C_MODIFIED;
		if (newbytes < growsize) {
		} else if (fp->ff_blocks < (headblks + datablks)) {
			printf("hfs_relocate: allocation failed");
		retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
			retval = MacToVFSError(retval);
	} else if ((sector_a + 1) == sector_b) {
	} else if ((eflags & kEFMetadataMask) &&
	           ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
	              hfsmp->hfs_metazone_end)) {
		const char * filestr;
		char emptystr = '\0';

		if (cp->c_desc.cd_nameptr != NULL) {
			filestr = (const char *)&cp->c_desc.cd_nameptr[0];
		} else if (vnode_name(vp) != NULL) {
			filestr = vnode_name(vp);
		} else {
			filestr = &emptystr;
		}
	}
	/* Done with system locks and journal for now. */
	hfs_systemfile_unlock(hfsmp, lockflags);
	hfs_end_transaction(hfsmp);

		/*
		 * Check to see if failure is due to excessive fragmentation.
		 */
		if ((retval == ENOSPC) &&
		    (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
			hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
		}
	/*
	 * STEP 2 - clone file data into the new allocation blocks.
	 */
	if (vnodetype == VLNK)
		retval = hfs_clonelink(vp, blksize, cred, p);
	else if (vnode_issystem(vp))
		retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
	else
		retval = hfs_clonefile(vp, headblks, datablks, blksize);

	/* Start transaction for step 3 or for a restore. */
	if (hfs_start_transaction(hfsmp) != 0) {

	/*
	 * STEP 3 - switch to cloned data and remove old blocks.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

	retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);

	hfs_systemfile_unlock(hfsmp, lockflags);

	if (took_trunc_lock)
		hfs_unlock_truncate(cp, TRUE);

	hfs_systemfile_unlock(hfsmp, lockflags);

	/* Push cnode's new extent data to disk. */
	(void) hfs_update(vp, MNT_WAIT);

	if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
		(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
	else
		(void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);

	hfs_end_transaction(hfsmp);

	if (fp->ff_blocks == headblks) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, TRUE);

	/*
	 * Give back any newly allocated space.
	 */
	if (lockflags == 0) {
		lockflags = SFL_BITMAP;
		if (overflow_extents(fp))
			lockflags |= SFL_EXTENTS;
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
	}

	(void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false);

	hfs_systemfile_unlock(hfsmp, lockflags);

	if (took_trunc_lock)
		hfs_unlock_truncate(cp, TRUE);
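/*
 * Illustrative sketch (not part of the original source): block accounting
 * across the three relocation steps described above.  headblks is the
 * original block count, datablks the number of blocks the data needs; all
 * numbers are assumed examples.
 */
#include <stdint.h>
#include <assert.h>

static void relocate_accounting_example(void)
{
	uint32_t headblks = 100;              /* blocks owned before relocation */
	uint32_t datablks = 100;              /* blocks needed for the data */
	uint32_t ff_blocks = headblks;

	ff_blocks += datablks;                /* STEP 1: acquire new blocks       */
	assert(ff_blocks == headblks + datablks);

	/* STEP 2: clone data; the file still owns both the old and new blocks. */

	ff_blocks -= headblks;                /* STEP 3: head-truncate old blocks */
	assert(ff_blocks == datablks);
}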
hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
{
	struct buf *head_bp = NULL;
	struct buf *tail_bp = NULL;

	error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);

	tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
	if (tail_bp == NULL) {

	bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
	error = (int)buf_bwrite(tail_bp);

	buf_markinvalid(head_bp);
	buf_brelse(head_bp);

	(void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
/*
 * Clone a file's data within the file.
 *
 */
hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
{
	writebase = blkstart * blksize;
	copysize = blkcnt * blksize;
	iosize = bufsize = MIN(copysize, 128 * 1024);

	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {

	hfs_unlock(VTOC(vp));

	auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);

	while (offset < copysize) {
		iosize = MIN(copysize - offset, iosize);

		uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, (uintptr_t)bufp, iosize);

		error = cluster_read(vp, auio, copysize, IO_NOCACHE);
			printf("hfs_clonefile: cluster_read failed - %d\n", error);
		if (uio_resid(auio) != 0) {
			printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio));
		}

		uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
		uio_addiov(auio, (uintptr_t)bufp, iosize);

		error = cluster_write(vp, auio, writebase + offset,
				      writebase + offset + iosize,
				      uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
			printf("hfs_clonefile: cluster_write failed - %d\n", error);
		if (uio_resid(auio) != 0) {
			printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
		}
	}

	if ((blksize & PAGE_MASK)) {
		/*
		 * since the copy may not have started on a PAGE
		 * boundary (or may not have ended on one), we
		 * may have pages left in the cache since NOCACHE
		 * will let partially written pages linger...
		 * lets just flush the entire range to make sure
		 * we don't have any pages left that are beyond
		 * (or intersect) the real LEOF of this file
		 */
		ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
	} else {
		/*
		 * No need to call ubc_sync_range or hfs_invalbuf
		 * since the file was copied using IO_NOCACHE and
		 * the copy was done starting and ending on a page
		 * boundary in the file.
		 */
	}
	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

	hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
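/*
 * Illustrative sketch (not part of the original source): the clone loop above
 * copies the fork in fixed-size chunks (128 KB here), clamping the final
 * chunk to whatever remains, and writing each chunk at the same offset within
 * the newly allocated blocks.  The sizes below are assumed example values and
 * the copy is simulated with plain memory buffers instead of cluster I/O.
 */
#include <stdint.h>
#include <string.h>
#include <assert.h>

static void clone_copy_loop_example(void)
{
	enum { CHUNK = 128 * 1024 };
	static char src[300 * 1024];           /* stand-in for the old blocks */
	static char dst[300 * 1024];           /* stand-in for the new blocks */
	static char buf[CHUNK];                /* bounce buffer, like bufp above */

	size_t copysize = sizeof(src);
	size_t offset = 0;

	memset(src, 0xAB, sizeof(src));

	while (offset < copysize) {
		size_t iosize = copysize - offset;
		if (iosize > CHUNK)
			iosize = CHUNK;                  /* clamp the last chunk */

		memcpy(buf, src + offset, iosize);       /* "cluster_read"  */
		memcpy(dst + offset, buf, iosize);       /* "cluster_write" */
		offset += iosize;
	}
	assert(memcmp(src, dst, copysize) == 0);
}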
/*
 * Clone a system (metadata) file.
 *
 */
hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
		 kauth_cred_t cred, struct proc *p)
{
	struct buf *bp = NULL;
	daddr64_t start_blk;

	iosize = GetLogicalBlockSize(vp);
	bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
	breadcnt = bufsize / iosize;

	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {

	start_blk = ((daddr64_t)blkstart * blksize) / iosize;
	last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;

	while (blkno < last_blk) {
		/*
		 * Read up to a megabyte
		 */
		for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
			error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
				printf("hfs_clonesysfile: meta_bread error %d\n", error);
			if (buf_count(bp) != iosize) {
				printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
			}
			bcopy((char *)buf_dataptr(bp), offset, iosize);

			buf_markinvalid(bp);
		}

		/*
		 * Write up to a megabyte
		 */
		for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
			bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
				printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
			bcopy(offset, (char *)buf_dataptr(bp), iosize);
			error = (int)buf_bwrite(bp);
		}
	}

	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

	error = hfs_fsync(vp, MNT_WAIT, 0, p);