bsd/hfs/hfs_readwrite.c

   1 /*
   2  * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*      @(#)hfs_readwrite.c     1.0
  29  *
  30  *      (c) 1998-2001 Apple Computer, Inc.  All Rights Reserved
  31  *
  32  *      hfs_readwrite.c -- vnode operations to deal with reading and writing files.
  33  *
  34  */
  35
  36 #include <sys/param.h>
  37 #include <sys/systm.h>
  38 #include <sys/resourcevar.h>
  39 #include <sys/kernel.h>
  40 #include <sys/fcntl.h>
  41 #include <sys/filedesc.h>
  42 #include <sys/stat.h>
  43 #include <sys/buf.h>
  44 #include <sys/proc.h>
  45 #include <sys/kauth.h>
  46 #include <sys/vnode.h>
  47 #include <sys/vnode_internal.h>
  48 #include <sys/uio.h>
  49 #include <sys/vfs_context.h>
  50 #include <sys/fsevents.h>
  51 #include <kern/kalloc.h>
  52 #include <sys/disk.h>
  53 #include <sys/sysctl.h>
  54 #include <sys/fsctl.h>
  55
  56 #include <miscfs/specfs/specdev.h>
  57
  58 #include <sys/ubc.h>
  59 #include <sys/ubc_internal.h>
  60
  61 #include <vm/vm_pageout.h>
  62 #include <vm/vm_kern.h>
  63
  64 #include <sys/kdebug.h>
  65
  66 #include        "hfs.h"
  67 #include        "hfs_attrlist.h"
  68 #include        "hfs_endian.h"
  69 #include        "hfs_fsctl.h"
  70 #include        "hfs_quota.h"
  71 #include        "hfscommon/headers/FileMgrInternal.h"
  72 #include        "hfscommon/headers/BTreesInternal.h"
  73 #include        "hfs_cnode.h"
  74 #include        "hfs_dbg.h"
  75
  76 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
  77
  78 enum {
  79         MAXHFSFILESIZE = 0x7FFFFFFF             /* this needs to go in the mount structure */
  80 };
  81
  82 /* from bsd/hfs/hfs_vfsops.c */
  83 extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
  84
  85 static int  hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
  86 static int  hfs_clonefile(struct vnode *, int, int, int);
  87 static int  hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
  88 static int  hfs_minorupdate(struct vnode *vp);
  89 static int  do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
  90
  91
  92 int flush_cache_on_write = 0;
  93 SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
  94
  95
  96 /*
  97  * Read data from a file.
  98  */
  99 int
 100 hfs_vnop_read(struct vnop_read_args *ap)
 101 {
 102         uio_t uio = ap->a_uio;
 103         struct vnode *vp = ap->a_vp;
 104         struct cnode *cp;
 105         struct filefork *fp;
 106         struct hfsmount *hfsmp;
 107         off_t filesize;
 108         off_t filebytes;
 109         off_t start_resid = uio_resid(uio);
 110         off_t offset = uio_offset(uio);
 111         int retval = 0;
 112
 113         /* Preflight checks */
 114         if (!vnode_isreg(vp)) {
 115                 /* can only read regular files */
 116                 if (vnode_isdir(vp))
 117                         return (EISDIR);
 118                 else
 119                         return (EPERM);
 120         }
 121         if (start_resid == 0)
 122                 return (0);             /* Nothing left to do */
 123         if (offset < 0)
 124                 return (EINVAL);        /* cant read from a negative offset */
 125
 126 #if HFS_COMPRESSION
 127         if (VNODE_IS_RSRC(vp)) {
 128                 if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
 129                         return 0;
 130                 }
 131                 /* otherwise read the resource fork normally */
 132         } else {
 133                 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
 134                 if (compressed) {
 135                         retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
 136                         if (compressed) {
 137                                 if (retval == 0) {
 138                                         /* successful read, update the access time */
 139                                         VTOC(vp)->c_touch_acctime = TRUE;
 140
 141                                         /* compressed files are not hot file candidates */
 142                                         if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
 143                                                 VTOF(vp)->ff_bytesread = 0;
 144                                         }
 145                                 }
 146                                 return retval;
 147                         }
 148                         /* otherwise the file was converted back to a regular file while we were reading it */
 149                         retval = 0;
 150                 }
 151         }
 152 #endif /* HFS_COMPRESSION */
 153
 154         cp = VTOC(vp);
 155         fp = VTOF(vp);
 156         hfsmp = VTOHFS(vp);
 157
 158         /* Protect against a size change. */
 159         hfs_lock_truncate(cp, 0);
 160
 161         filesize = fp->ff_size;
 162         filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
 163         if (offset > filesize) {
 164                 if ((hfsmp->hfs_flags & HFS_STANDARD) &&
 165                     (offset > (off_t)MAXHFSFILESIZE)) {
 166                         retval = EFBIG;
 167                 }
 168                 goto exit;
 169         }
 170
 171         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
 172                 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
 173
 174         retval = cluster_read(vp, uio, filesize, ap->a_ioflag);
 175
 176         cp->c_touch_acctime = TRUE;
 177
 178         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
 179                 (int)uio_offset(uio), uio_resid(uio), (int)filesize,  (int)filebytes, 0);
 180
 181         /*
 182          * Keep track blocks read
 183          */
 184         if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
 185                 int took_cnode_lock = 0;
 186                 off_t bytesread;
 187
 188                 bytesread = start_resid - uio_resid(uio);
 189
 190                 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
 191                 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
 192                         hfs_lock(cp, HFS_FORCE_LOCK);
 193                         took_cnode_lock = 1;
 194                 }
 195                 /*
 196                  * If this file hasn't been seen since the start of
 197                  * the current sampling period then start over.
 198                  */
 199                 if (cp->c_atime < hfsmp->hfc_timebase) {
 200                         struct timeval tv;
 201
 202                         fp->ff_bytesread = bytesread;
 203                         microtime(&tv);
 204                         cp->c_atime = tv.tv_sec;
 205                 } else {
 206                         fp->ff_bytesread += bytesread;
 207                 }
 208                 if (took_cnode_lock)
 209                         hfs_unlock(cp);
 210         }
 211 exit:
 212         hfs_unlock_truncate(cp, 0);
 213         return (retval);
 214 }
 215
 216 /*
 217  * Write data to a file.
 218  */
 219 int
 220 hfs_vnop_write(struct vnop_write_args *ap)
 221 {
 222         uio_t uio = ap->a_uio;
 223         struct vnode *vp = ap->a_vp;
 224         struct cnode *cp;
 225         struct filefork *fp;
 226         struct hfsmount *hfsmp;
 227         kauth_cred_t cred = NULL;
 228         off_t origFileSize;
 229         off_t writelimit;
 230         off_t bytesToAdd = 0;
 231         off_t actualBytesAdded;
 232         off_t filebytes;
 233         off_t offset;
 234         ssize_t resid;
 235         int eflags;
 236         int ioflag = ap->a_ioflag;
 237         int retval = 0;
 238         int lockflags;
 239         int cnode_locked = 0;
 240         int partialwrite = 0;
 241         int exclusive_lock = 0;
 242
 243 #if HFS_COMPRESSION
 244         if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
 245                 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
 246                 switch(state) {
 247                         case FILE_IS_COMPRESSED:
 248                                 return EACCES;
 249                         case FILE_IS_CONVERTING:
 250                                 /* if FILE_IS_CONVERTING, we allow writes */
 251                                 break;
 252                         default:
 253                                 printf("invalid state %d for compressed file\n", state);
 254                                 /* fall through */
 255                 }
 256         }
 257 #endif
 258
 259         // LP64todo - fix this! uio_resid may be 64-bit value
 260         resid = uio_resid(uio);
 261         offset = uio_offset(uio);
 262
 263         if (ioflag & IO_APPEND) {
 264             exclusive_lock = 1;
 265         }
 266
 267         if (offset < 0)
 268                 return (EINVAL);
 269         if (resid == 0)
 270                 return (E_NONE);
 271         if (!vnode_isreg(vp))
 272                 return (EPERM);  /* Can only write regular files */
 273
 274         cp = VTOC(vp);
 275         fp = VTOF(vp);
 276         hfsmp = VTOHFS(vp);
 277
 278         eflags = kEFDeferMask;  /* defer file block allocations */
 279 #ifdef HFS_SPARSE_DEV
 280         /*
 281          * When the underlying device is sparse and space
 282          * is low (< 8MB), stop doing delayed allocations
 283          * and begin doing synchronous I/O.
 284          */
 285         if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
 286             (hfs_freeblks(hfsmp, 0) < 2048)) {
 287                 eflags &= ~kEFDeferMask;
 288                 ioflag |= IO_SYNC;
 289         }
 290 #endif /* HFS_SPARSE_DEV */
 291
 292 again:
 293         /* Protect against a size change. */
 294         hfs_lock_truncate(cp, exclusive_lock);
 295
 296         if (ioflag & IO_APPEND) {
 297                 uio_setoffset(uio, fp->ff_size);
 298                 offset = fp->ff_size;
 299         }
 300         if ((cp->c_flags & APPEND) && offset != fp->ff_size) {
 301                 retval = EPERM;
 302                 goto exit;
 303         }
 304
 305         origFileSize = fp->ff_size;
 306         writelimit = offset + resid;
 307         filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
 308
 309         /* If the truncate lock is shared, and if we either have virtual
 310          * blocks or will need to extend the file, upgrade the truncate
 311          * to exclusive lock.  If upgrade fails, we lose the lock and
 312          * have to get exclusive lock again.  Note that we want to
 313          * grab the truncate lock exclusive even if we're not allocating new blocks
 314          * because we could still be growing past the LEOF.
 315          */
 316         if ((exclusive_lock == 0) &&
 317             ((fp->ff_unallocblocks != 0) || (writelimit > origFileSize))) {
 318                 exclusive_lock = 1;
 319                 /* Lock upgrade failed and we lost our shared lock, try again */
 320                 if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
 321                         goto again;
 322                 }
 323         }
 324
 325         if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
 326                 goto exit;
 327         }
 328         cnode_locked = 1;
 329
 330         if (!exclusive_lock) {
 331                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
 332                              (int)offset, uio_resid(uio), (int)fp->ff_size,
 333                              (int)filebytes, 0);
 334         }
 335
 336         /* Check if we do not need to extend the file */
 337         if (writelimit <= filebytes) {
 338                 goto sizeok;
 339         }
 340
 341         cred = vfs_context_ucred(ap->a_context);
 342         bytesToAdd = writelimit - filebytes;
 343
 344 #if QUOTA
 345         retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
 346                            cred, 0);
 347         if (retval)
 348                 goto exit;
 349 #endif /* QUOTA */
 350
 351         if (hfs_start_transaction(hfsmp) != 0) {
 352                 retval = EINVAL;
 353                 goto exit;
 354         }
 355
 356         while (writelimit > filebytes) {
 357                 bytesToAdd = writelimit - filebytes;
 358                 if (cred && suser(cred, NULL) != 0)
 359                         eflags |= kEFReserveMask;
 360
 361                 /* Protect extents b-tree and allocation bitmap */
 362                 lockflags = SFL_BITMAP;
 363                 if (overflow_extents(fp))
 364                         lockflags |= SFL_EXTENTS;
 365                 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
 366
 367                 /* Files that are changing size are not hot file candidates. */
 368                 if (hfsmp->hfc_stage == HFC_RECORDING) {
 369                         fp->ff_bytesread = 0;
 370                 }
 371                 retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
 372                                 0, eflags, &actualBytesAdded));
 373
 374                 hfs_systemfile_unlock(hfsmp, lockflags);
 375
 376                 if ((actualBytesAdded == 0) && (retval == E_NONE))
 377                         retval = ENOSPC;
 378                 if (retval != E_NONE)
 379                         break;
 380                 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
 381                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
 382                         (int)offset, uio_resid(uio), (int)fp->ff_size,  (int)filebytes, 0);
 383         }
 384         (void) hfs_update(vp, TRUE);
 385         (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
 386         (void) hfs_end_transaction(hfsmp);
 387
 388         /*
 389          * If we didn't grow the file enough try a partial write.
 390          * POSIX expects this behavior.
 391          */
 392         if ((retval == ENOSPC) && (filebytes > offset)) {
 393                 retval = 0;
 394                 partialwrite = 1;
 395                 uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
 396                 resid -= bytesToAdd;
 397                 writelimit = filebytes;
 398         }
 399 sizeok:
 400         if (retval == E_NONE) {
 401                 off_t filesize;
 402                 off_t zero_off;
 403                 off_t tail_off;
 404                 off_t inval_start;
 405                 off_t inval_end;
 406                 off_t io_start;
 407                 int lflag;
 408                 struct rl_entry *invalid_range;
 409
 410                 if (writelimit > fp->ff_size)
 411                         filesize = writelimit;
 412                 else
 413                         filesize = fp->ff_size;
 414
 415                 lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
 416
 417                 if (offset <= fp->ff_size) {
 418                         zero_off = offset & ~PAGE_MASK_64;
 419
 420                         /* Check to see whether the area between the zero_offset and the start
 421                            of the transfer to see whether is invalid and should be zero-filled
 422                            as part of the transfer:
 423                          */
 424                         if (offset > zero_off) {
 425                                 if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
 426                                         lflag |= IO_HEADZEROFILL;
 427                         }
 428                 } else {
 429                         off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;
 430
 431                         /* The bytes between fp->ff_size and uio->uio_offset must never be
 432                            read without being zeroed.  The current last block is filled with zeroes
 433                            if it holds valid data but in all cases merely do a little bookkeeping
 434                            to track the area from the end of the current last page to the start of
 435                            the area actually written.  For the same reason only the bytes up to the
 436                            start of the page where this write will start is invalidated; any remainder
 437                            before uio->uio_offset is explicitly zeroed as part of the cluster_write.
 438
 439                            Note that inval_start, the start of the page after the current EOF,
 440                            may be past the start of the write, in which case the zeroing
 441                            will be handled by the cluser_write of the actual data.
 442                          */
 443                         inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
 444                         inval_end = offset & ~PAGE_MASK_64;
 445                         zero_off = fp->ff_size;
 446
 447                         if ((fp->ff_size & PAGE_MASK_64) &&
 448                                 (rl_scan(&fp->ff_invalidranges,
 449                                                         eof_page_base,
 450                                                         fp->ff_size - 1,
 451                                                         &invalid_range) != RL_NOOVERLAP)) {
 452                                 /* The page containing the EOF is not valid, so the
 453                                    entire page must be made inaccessible now.  If the write
 454                                    starts on a page beyond the page containing the eof
 455                                    (inval_end > eof_page_base), add the
 456                                    whole page to the range to be invalidated.  Otherwise
 457                                    (i.e. if the write starts on the same page), zero-fill
 458                                    the entire page explicitly now:
 459                                  */
 460                                 if (inval_end > eof_page_base) {
 461                                         inval_start = eof_page_base;
 462                                 } else {
 463                                         zero_off = eof_page_base;
 464                                 };
 465                         };
 466
 467                         if (inval_start < inval_end) {
 468                                 struct timeval tv;
 469                                 /* There's some range of data that's going to be marked invalid */
 470
 471                                 if (zero_off < inval_start) {
 472                                         /* The pages between inval_start and inval_end are going to be invalidated,
 473                                            and the actual write will start on a page past inval_end.  Now's the last
 474                                            chance to zero-fill the page containing the EOF:
 475                                          */
 476                                         hfs_unlock(cp);
 477                                         cnode_locked = 0;
 478                                         retval = cluster_write(vp, (uio_t) 0,
 479                                                         fp->ff_size, inval_start,
 480                                                         zero_off, (off_t)0,
 481                                                         lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
 482                                         hfs_lock(cp, HFS_FORCE_LOCK);
 483                                         cnode_locked = 1;
 484                                         if (retval) goto ioerr_exit;
 485                                         offset = uio_offset(uio);
 486                                 };
 487
 488                                 /* Mark the remaining area of the newly allocated space as invalid: */
 489                                 rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
 490                                 microuptime(&tv);
 491                                 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
 492                                 zero_off = fp->ff_size = inval_end;
 493                         };
 494
 495                         if (offset > zero_off) lflag |= IO_HEADZEROFILL;
 496                 };
 497
 498                 /* Check to see whether the area between the end of the write and the end of
 499                    the page it falls in is invalid and should be zero-filled as part of the transfer:
 500                  */
 501                 tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
 502                 if (tail_off > filesize) tail_off = filesize;
 503                 if (tail_off > writelimit) {
 504                         if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
 505                                 lflag |= IO_TAILZEROFILL;
 506                         };
 507                 };
 508
 509                 /*
 510                  * if the write starts beyond the current EOF (possibly advanced in the
 511                  * zeroing of the last block, above), then we'll zero fill from the current EOF
 512                  * to where the write begins:
 513                  *
 514                  * NOTE: If (and ONLY if) the portion of the file about to be written is
 515                  *       before the current EOF it might be marked as invalid now and must be
 516                  *       made readable (removed from the invalid ranges) before cluster_write
 517                  *       tries to write it:
 518                  */
 519                 io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
 520                 if (io_start < fp->ff_size) {
 521                         off_t io_end;
 522
 523                         io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
 524                         rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
 525                 };
 526
 527                 hfs_unlock(cp);
 528                 cnode_locked = 0;
 529
 530                 /*
 531                  * We need to tell UBC the fork's new size BEFORE calling
 532                  * cluster_write, in case any of the new pages need to be
 533                  * paged out before cluster_write completes (which does happen
 534                  * in embedded systems due to extreme memory pressure).
 535                  * Similarly, we need to tell hfs_vnop_pageout what the new EOF
 536                  * will be, so that it can pass that on to cluster_pageout, and
 537                  * allow those pageouts.
 538                  *
 539                  * We don't update ff_size yet since we don't want pageins to
 540                  * be able to see uninitialized data between the old and new
 541                  * EOF, until cluster_write has completed and initialized that
 542                  * part of the file.
 543                  *
 544                  * The vnode pager relies on the file size last given to UBC via
 545                  * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
 546                  * ff_size (whichever is larger).  NOTE: ff_new_size is always
 547                  * zero, unless we are extending the file via write.
 548                  */
 549                 if (filesize > fp->ff_size) {
 550                         fp->ff_new_size = filesize;
 551                         ubc_setsize(vp, filesize);
 552                 }
 553                 retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
 554                                 tail_off, lflag | IO_NOZERODIRTY);
 555                 if (retval) {
 556                         fp->ff_new_size = 0;    /* no longer extending; use ff_size */
 557                         if (filesize > origFileSize) {
 558                                 ubc_setsize(vp, origFileSize);
 559                         }
 560                         goto ioerr_exit;
 561                 }
 562
 563                 if (filesize > origFileSize) {
 564                         fp->ff_size = filesize;
 565
 566                         /* Files that are changing size are not hot file candidates. */
 567                         if (hfsmp->hfc_stage == HFC_RECORDING) {
 568                                 fp->ff_bytesread = 0;
 569                         }
 570                 }
 571                 fp->ff_new_size = 0;    /* ff_size now has the correct size */
 572
 573                 /* If we wrote some bytes, then touch the change and mod times */
 574                 if (resid > uio_resid(uio)) {
 575                         cp->c_touch_chgtime = TRUE;
 576                         cp->c_touch_modtime = TRUE;
 577                 }
 578         }
 579         if (partialwrite) {
 580                 uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
 581                 resid += bytesToAdd;
 582         }
 583
 584         // XXXdbg - see radar 4871353 for more info
 585         {
 586             if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
 587                 VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
 588             }
 589         }
 590
 591 ioerr_exit:
 592         /*
 593          * If we successfully wrote any data, and we are not the superuser
 594          * we clear the setuid and setgid bits as a precaution against
 595          * tampering.
 596          */
 597         if (cp->c_mode & (S_ISUID | S_ISGID)) {
 598                 cred = vfs_context_ucred(ap->a_context);
 599                 if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
 600                         if (!cnode_locked) {
 601                                 hfs_lock(cp, HFS_FORCE_LOCK);
 602                                 cnode_locked = 1;
 603                         }
 604                         cp->c_mode &= ~(S_ISUID | S_ISGID);
 605                 }
 606         }
 607         if (retval) {
 608                 if (ioflag & IO_UNIT) {
 609                         if (!cnode_locked) {
 610                                 hfs_lock(cp, HFS_FORCE_LOCK);
 611                                 cnode_locked = 1;
 612                         }
 613                         (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
 614                                            0, 0, ap->a_context);
 615                         // LP64todo - fix this!  resid needs to by user_ssize_t
 616                         uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
 617                         uio_setresid(uio, resid);
 618                         filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
 619                 }
 620         } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
 621                 if (!cnode_locked) {
 622                         hfs_lock(cp, HFS_FORCE_LOCK);
 623                         cnode_locked = 1;
 624                 }
 625                 retval = hfs_update(vp, TRUE);
 626         }
 627         /* Updating vcbWrCnt doesn't need to be atomic. */
 628         hfsmp->vcbWrCnt++;
 629
 630         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
 631                 (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
 632 exit:
 633         if (cnode_locked)
 634                 hfs_unlock(cp);
 635         hfs_unlock_truncate(cp, exclusive_lock);
 636         return (retval);
 637 }
 638
 639 /* support for the "bulk-access" fcntl */
 640
 641 #define CACHE_LEVELS 16
 642 #define NUM_CACHE_ENTRIES (64*16)
 643 #define PARENT_IDS_FLAG 0x100
 644
 645 struct access_cache {
 646        int numcached;
 647        int cachehits; /* these two for statistics gathering */
 648        int lookups;
 649        unsigned int *acache;
 650        unsigned char *haveaccess;
 651 };
 652
 653 struct access_t {
 654         uid_t     uid;              /* IN: effective user id */
 655         short     flags;            /* IN: access requested (i.e. R_OK) */
 656         short     num_groups;       /* IN: number of groups user belongs to */
 657         int       num_files;        /* IN: number of files to process */
 658         int       *file_ids;        /* IN: array of file ids */
 659         gid_t     *groups;          /* IN: array of groups */
 660         short     *access;          /* OUT: access info for each file (0 for 'has access') */
 661 } __attribute__((unavailable)); // this structure is for reference purposes only
 662
 663 struct user32_access_t {
 664         uid_t     uid;              /* IN: effective user id */
 665         short     flags;            /* IN: access requested (i.e. R_OK) */
 666         short     num_groups;       /* IN: number of groups user belongs to */
 667         int       num_files;        /* IN: number of files to process */
 668         user32_addr_t      file_ids;        /* IN: array of file ids */
 669         user32_addr_t      groups;          /* IN: array of groups */
 670         user32_addr_t      access;          /* OUT: access info for each file (0 for 'has access') */
 671 };
 672
 673 struct user64_access_t {
 674         uid_t           uid;                    /* IN: effective user id */
 675         short           flags;                  /* IN: access requested (i.e. R_OK) */
 676         short           num_groups;             /* IN: number of groups user belongs to */
 677         int             num_files;              /* IN: number of files to process */
 678         user64_addr_t   file_ids;               /* IN: array of file ids */
 679         user64_addr_t   groups;                 /* IN: array of groups */
 680         user64_addr_t   access;                 /* OUT: access info for each file (0 for 'has access') */
 681 };
 682
 683
 684 // these are the "extended" versions of the above structures
 685 // note that it is crucial that they be different sized than
 686 // the regular version
 687 struct ext_access_t {
 688         uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
 689         uint32_t   num_files;       /* IN: number of files to process */
 690         uint32_t   map_size;        /* IN: size of the bit map */
 691         uint32_t  *file_ids;        /* IN: Array of file ids */
 692         char      *bitmap;          /* OUT: hash-bitmap of interesting directory ids */
 693         short     *access;          /* OUT: access info for each file (0 for 'has access') */
 694         uint32_t   num_parents;   /* future use */
 695         cnid_t      *parents;   /* future use */
 696 } __attribute__((unavailable)); // this structure is for reference purposes only
 697
 698 struct user32_ext_access_t {
 699         uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
 700         uint32_t   num_files;       /* IN: number of files to process */
 701         uint32_t   map_size;        /* IN: size of the bit map */
 702         user32_addr_t  file_ids;        /* IN: Array of file ids */
 703         user32_addr_t     bitmap;          /* OUT: hash-bitmap of interesting directory ids */
 704         user32_addr_t access;          /* OUT: access info for each file (0 for 'has access') */
 705         uint32_t   num_parents;   /* future use */
 706         user32_addr_t parents;   /* future use */
 707 };
 708
 709 struct user64_ext_access_t {
 710         uint32_t      flags;        /* IN: access requested (i.e. R_OK) */
 711         uint32_t      num_files;    /* IN: number of files to process */
 712         uint32_t      map_size;     /* IN: size of the bit map */
 713         user64_addr_t   file_ids;     /* IN: array of file ids */
 714         user64_addr_t   bitmap;       /* IN: array of groups */
 715         user64_addr_t   access;       /* OUT: access info for each file (0 for 'has access') */
 716         uint32_t      num_parents;/* future use */
 717         user64_addr_t   parents;/* future use */
 718 };
 719
 720
 721 /*
 722  * Perform a binary search for the given parent_id. Return value is
 723  * the index if there is a match.  If no_match_indexp is non-NULL it
 724  * will be assigned with the index to insert the item (even if it was
 725  * not found).
 726  */
 727 static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
 728 {
 729     int index=-1;
 730     unsigned int lo=0;
 731
 732     do {
 733         unsigned int mid = ((hi - lo)/2) + lo;
 734         unsigned int this_id = array[mid];
 735
 736         if (parent_id == this_id) {
 737             hi = mid;
 738             break;
 739         }
 740
 741         if (parent_id < this_id) {
 742             hi = mid;
 743             continue;
 744         }
 745
 746         if (parent_id > this_id) {
 747             lo = mid + 1;
 748             continue;
 749         }
 750     } while(lo < hi);
 751
 752     /* check if lo and hi converged on the match */
 753     if (parent_id == array[hi]) {
 754         index = hi;
 755     }
 756
 757     if (no_match_indexp) {
 758         *no_match_indexp = hi;
 759     }
 760
 761     return index;
 762 }
 763
 764
 765 static int
 766 lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
 767 {
 768     unsigned int hi;
 769     int matches = 0;
 770     int index, no_match_index;
 771
 772     if (cache->numcached == 0) {
 773         *indexp = 0;
 774         return 0; // table is empty, so insert at index=0 and report no match
 775     }
 776
 777     if (cache->numcached > NUM_CACHE_ENTRIES) {
 778         /*printf("hfs: EGAD! numcached is %d... cut our losses and trim to %d\n",
 779           cache->numcached, NUM_CACHE_ENTRIES);*/
 780         cache->numcached = NUM_CACHE_ENTRIES;
 781     }
 782
 783     hi = cache->numcached - 1;
 784
 785     index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
 786
 787     /* if no existing entry found, find index for new one */
 788     if (index == -1) {
 789         index = no_match_index;
 790         matches = 0;
 791     } else {
 792         matches = 1;
 793     }
 794
 795     *indexp = index;
 796     return matches;
 797 }
 798
 799 /*
 800  * Add a node to the access_cache at the given index (or do a lookup first
 801  * to find the index if -1 is passed in). We currently do a replace rather
 802  * than an insert if the cache is full.
 803  */
 804 static void
 805 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
 806 {
 807     int lookup_index = -1;
 808
 809     /* need to do a lookup first if -1 passed for index */
 810     if (index == -1) {
 811         if (lookup_bucket(cache, &lookup_index, nodeID)) {
 812             if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
 813                 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
 814                 cache->haveaccess[lookup_index] = access;
 815             }
 816
 817             /* mission accomplished */
 818             return;
 819         } else {
 820             index = lookup_index;
 821         }
 822
 823     }
 824
 825     /* if the cache is full, do a replace rather than an insert */
 826     if (cache->numcached >= NUM_CACHE_ENTRIES) {
 827         //printf("hfs: cache is full (%d). replace at index %d\n", cache->numcached, index);
 828         cache->numcached = NUM_CACHE_ENTRIES-1;
 829
 830         if (index > cache->numcached) {
 831             //    printf("hfs: index %d pinned to %d\n", index, cache->numcached);
 832             index = cache->numcached;
 833         }
 834     }
 835
 836     if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
 837         index++;
 838     }
 839
 840     if (index >= 0 && index < cache->numcached) {
 841         /* only do bcopy if we're inserting */
 842         bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
 843         bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
 844     }
 845
 846     cache->acache[index] = nodeID;
 847     cache->haveaccess[index] = access;
 848     cache->numcached++;
 849 }
 850
 851
 852 struct cinfo {
 853     uid_t   uid;
 854     gid_t   gid;
 855     mode_t  mode;
 856     cnid_t  parentcnid;
 857     u_int16_t recflags;
 858 };
 859
 860 static int
 861 snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
 862 {
 863     struct cinfo *cip = (struct cinfo *)arg;
 864
 865     cip->uid = attrp->ca_uid;
 866     cip->gid = attrp->ca_gid;
 867     cip->mode = attrp->ca_mode;
 868     cip->parentcnid = descp->cd_parentcnid;
 869     cip->recflags = attrp->ca_recflags;
 870
 871     return (0);
 872 }
 873
 874 /*
 875  * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
 876  * isn't incore, then go to the catalog.
 877  */
 878 static int
 879 do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
 880     struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
 881 {
 882     int error = 0;
 883
 884     /* if this id matches the one the fsctl was called with, skip the lookup */
 885     if (cnid == skip_cp->c_cnid) {
 886         cnattrp->ca_uid = skip_cp->c_uid;
 887         cnattrp->ca_gid = skip_cp->c_gid;
 888         cnattrp->ca_mode = skip_cp->c_mode;
 889         cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
 890         keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
 891     } else {
 892         struct cinfo c_info;
 893
 894         /* otherwise, check the cnode hash incase the file/dir is incore */
 895         if (hfs_chash_snoop(hfsmp, cnid, snoop_callback, &c_info) == 0) {
 896             cnattrp->ca_uid = c_info.uid;
 897             cnattrp->ca_gid = c_info.gid;
 898             cnattrp->ca_mode = c_info.mode;
 899             cnattrp->ca_recflags = c_info.recflags;
 900             keyp->hfsPlus.parentID = c_info.parentcnid;
 901         } else {
 902             int lockflags;
 903
 904             lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
 905
 906             /* lookup this cnid in the catalog */
 907             error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
 908
 909             hfs_systemfile_unlock(hfsmp, lockflags);
 910
 911             cache->lookups++;
 912         }
 913     }
 914
 915     return (error);
 916 }
 917
 918
 919 /*
 920  * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
 921  * up to CACHE_LEVELS as we progress towards the root.
 922  */
 923 static int
 924 do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
 925     struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
 926     struct vfs_context *my_context,
 927     char *bitmap,
 928     uint32_t map_size,
 929     cnid_t* parents,
 930     uint32_t num_parents)
 931 {
 932     int                     myErr = 0;
 933     int                     myResult;
 934     HFSCatalogNodeID        thisNodeID;
 935     unsigned int            myPerms;
 936     struct cat_attr         cnattr;
 937     int                     cache_index = -1, scope_index = -1, scope_idx_start = -1;
 938     CatalogKey              catkey;
 939
 940     int i = 0, ids_to_cache = 0;
 941     int parent_ids[CACHE_LEVELS];
 942
 943     thisNodeID = nodeID;
 944     while (thisNodeID >=  kRootDirID) {
 945         myResult = 0;   /* default to "no access" */
 946
 947         /* check the cache before resorting to hitting the catalog */
 948
 949         /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
 950          * to look any further after hitting cached dir */
 951
 952         if (lookup_bucket(cache, &cache_index, thisNodeID)) {
 953             cache->cachehits++;
 954             myErr = cache->haveaccess[cache_index];
 955             if (scope_index != -1) {
 956                 if (myErr == ESRCH) {
 957                     myErr = 0;
 958                 }
 959             } else {
 960                 scope_index = 0;   // so we'll just use the cache result
 961                 scope_idx_start = ids_to_cache;
 962             }
 963             myResult = (myErr == 0) ? 1 : 0;
 964             goto ExitThisRoutine;
 965         }
 966
 967
 968         if (parents) {
 969             int tmp;
 970             tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
 971             if (scope_index == -1)
 972                 scope_index = tmp;
 973             if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
 974                 scope_idx_start = ids_to_cache;
 975             }
 976         }
 977
 978         /* remember which parents we want to cache */
 979         if (ids_to_cache < CACHE_LEVELS) {
 980             parent_ids[ids_to_cache] = thisNodeID;
 981             ids_to_cache++;
 982         }
 983         // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
 984         if (bitmap && map_size) {
 985             bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
 986         }
 987
 988
 989         /* do the lookup (checks the cnode hash, then the catalog) */
 990         myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
 991         if (myErr) {
 992             goto ExitThisRoutine; /* no access */
 993         }
 994
 995         /* Root always gets access. */
 996         if (suser(myp_ucred, NULL) == 0) {
 997                 thisNodeID = catkey.hfsPlus.parentID;
 998                 myResult = 1;
 999                 continue;
1000         }
1001
1002         // if the thing has acl's, do the full permission check
1003         if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1004             struct vnode *vp;
1005
1006             /* get the vnode for this cnid */
1007             myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0);
1008             if ( myErr ) {
1009                 myResult = 0;
1010                 goto ExitThisRoutine;
1011             }
1012
1013             thisNodeID = VTOC(vp)->c_parentcnid;
1014
1015             hfs_unlock(VTOC(vp));
1016
1017             if (vnode_vtype(vp) == VDIR) {
1018                 myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
1019             } else {
1020                 myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
1021             }
1022
1023             vnode_put(vp);
1024             if (myErr) {
1025                 myResult = 0;
1026                 goto ExitThisRoutine;
1027             }
1028         } else {
1029             unsigned int flags;
1030
1031             myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1032                 cnattr.ca_mode, hfsmp->hfs_mp,
1033                 myp_ucred, theProcPtr);
1034
1035             if (cnattr.ca_mode & S_IFDIR) {
1036                 flags = R_OK | X_OK;
1037             } else {
1038                 flags = R_OK;
1039             }
1040             if ( (myPerms & flags) != flags) {
1041                 myResult = 0;
1042                 myErr = EACCES;
1043                 goto ExitThisRoutine;   /* no access */
1044             }
1045
1046             /* up the hierarchy we go */
1047             thisNodeID = catkey.hfsPlus.parentID;
1048         }
1049     }
1050
1051     /* if here, we have access to this node */
1052     myResult = 1;
1053
1054   ExitThisRoutine:
1055     if (parents && myErr == 0 && scope_index == -1) {
1056         myErr = ESRCH;
1057     }
1058
1059     if (myErr) {
1060         myResult = 0;
1061     }
1062     *err = myErr;
1063
1064     /* cache the parent directory(ies) */
1065     for (i = 0; i < ids_to_cache; i++) {
1066         if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
1067             add_node(cache, -1, parent_ids[i], ESRCH);
1068         } else {
1069             add_node(cache, -1, parent_ids[i], myErr);
1070         }
1071     }
1072
1073     return (myResult);
1074 }
1075
1076 static int
1077 do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
1078     struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
1079 {
1080     boolean_t is64bit;
1081
1082     /*
1083      * NOTE: on entry, the vnode is locked. Incase this vnode
1084      * happens to be in our list of file_ids, we'll note it
1085      * avoid calling hfs_chashget_nowait() on that id as that
1086      * will cause a "locking against myself" panic.
1087      */
1088     Boolean check_leaf = true;
1089
1090     struct user64_ext_access_t *user_access_structp;
1091     struct user64_ext_access_t tmp_user_access;
1092     struct access_cache cache;
1093
1094     int error = 0, prev_parent_check_ok=1;
1095     unsigned int i;
1096
1097     short flags;
1098     unsigned int num_files = 0;
1099     int map_size = 0;
1100     int num_parents = 0;
1101     int *file_ids=NULL;
1102     short *access=NULL;
1103     char *bitmap=NULL;
1104     cnid_t *parents=NULL;
1105     int leaf_index;
1106
1107     cnid_t cnid;
1108     cnid_t prevParent_cnid = 0;
1109     unsigned int myPerms;
1110     short myaccess = 0;
1111     struct cat_attr cnattr;
1112     CatalogKey catkey;
1113     struct cnode *skip_cp = VTOC(vp);
1114     kauth_cred_t cred = vfs_context_ucred(context);
1115     proc_t p = vfs_context_proc(context);
1116
1117     is64bit = proc_is64bit(p);
1118
1119     /* initialize the local cache and buffers */
1120     cache.numcached = 0;
1121     cache.cachehits = 0;
1122     cache.lookups = 0;
1123     cache.acache = NULL;
1124     cache.haveaccess = NULL;
1125
1126     /* struct copyin done during dispatch... need to copy file_id array separately */
1127     if (ap->a_data == NULL) {
1128         error = EINVAL;
1129         goto err_exit_bulk_access;
1130     }
1131
1132     if (is64bit) {
1133         if (arg_size != sizeof(struct user64_ext_access_t)) {
1134             error = EINVAL;
1135             goto err_exit_bulk_access;
1136         }
1137
1138         user_access_structp = (struct user64_ext_access_t *)ap->a_data;
1139
1140     } else if (arg_size == sizeof(struct user32_access_t)) {
1141         struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
1142
1143         // convert an old style bulk-access struct to the new style
1144         tmp_user_access.flags     = accessp->flags;
1145         tmp_user_access.num_files = accessp->num_files;
1146         tmp_user_access.map_size  = 0;
1147         tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
1148         tmp_user_access.bitmap    = USER_ADDR_NULL;
1149         tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
1150         tmp_user_access.num_parents = 0;
1151         user_access_structp = &tmp_user_access;
1152
1153     } else if (arg_size == sizeof(struct user32_ext_access_t)) {
1154         struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
1155
1156         // up-cast from a 32-bit version of the struct
1157         tmp_user_access.flags     = accessp->flags;
1158         tmp_user_access.num_files = accessp->num_files;
1159         tmp_user_access.map_size  = accessp->map_size;
1160         tmp_user_access.num_parents  = accessp->num_parents;
1161
1162         tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
1163         tmp_user_access.bitmap    = CAST_USER_ADDR_T(accessp->bitmap);
1164         tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
1165         tmp_user_access.parents    = CAST_USER_ADDR_T(accessp->parents);
1166
1167         user_access_structp = &tmp_user_access;
1168     } else {
1169         error = EINVAL;
1170         goto err_exit_bulk_access;
1171     }
1172
1173     map_size = user_access_structp->map_size;
1174
1175     num_files = user_access_structp->num_files;
1176
1177     num_parents= user_access_structp->num_parents;
1178
1179     if (num_files < 1) {
1180         goto err_exit_bulk_access;
1181     }
1182     if (num_files > 1024) {
1183         error = EINVAL;
1184         goto err_exit_bulk_access;
1185     }
1186
1187     if (num_parents > 1024) {
1188         error = EINVAL;
1189         goto err_exit_bulk_access;
1190     }
1191
1192     file_ids = (int *) kalloc(sizeof(int) * num_files);
1193     access = (short *) kalloc(sizeof(short) * num_files);
1194     if (map_size) {
1195         bitmap = (char *) kalloc(sizeof(char) * map_size);
1196     }
1197
1198     if (num_parents) {
1199         parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
1200     }
1201
1202     cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
1203     cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1204
1205     if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
1206         if (file_ids) {
1207             kfree(file_ids, sizeof(int) * num_files);
1208         }
1209         if (bitmap) {
1210             kfree(bitmap, sizeof(char) * map_size);
1211         }
1212         if (access) {
1213             kfree(access, sizeof(short) * num_files);
1214         }
1215         if (cache.acache) {
1216             kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1217         }
1218         if (cache.haveaccess) {
1219             kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1220         }
1221         if (parents) {
1222             kfree(parents, sizeof(cnid_t) * num_parents);
1223         }
1224         return ENOMEM;
1225     }
1226
1227     // make sure the bitmap is zero'ed out...
1228     if (bitmap) {
1229         bzero(bitmap, (sizeof(char) * map_size));
1230     }
1231
1232     if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1233                 num_files * sizeof(int)))) {
1234         goto err_exit_bulk_access;
1235     }
1236
1237     if (num_parents) {
1238         if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1239                     num_parents * sizeof(cnid_t)))) {
1240             goto err_exit_bulk_access;
1241         }
1242     }
1243
1244     flags = user_access_structp->flags;
1245     if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1246         flags = R_OK;
1247     }
1248
1249     /* check if we've been passed leaf node ids or parent ids */
1250     if (flags & PARENT_IDS_FLAG) {
1251         check_leaf = false;
1252     }
1253
1254     /* Check access to each file_id passed in */
1255     for (i = 0; i < num_files; i++) {
1256         leaf_index=-1;
1257         cnid = (cnid_t) file_ids[i];
1258
1259         /* root always has access */
1260         if ((!parents) && (!suser(cred, NULL))) {
1261             access[i] = 0;
1262             continue;
1263         }
1264
1265         if (check_leaf) {
1266             /* do the lookup (checks the cnode hash, then the catalog) */
1267             error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
1268             if (error) {
1269                 access[i] = (short) error;
1270                 continue;
1271             }
1272
1273             if (parents) {
1274                 // Check if the leaf matches one of the parent scopes
1275                 leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1276                 if (leaf_index >= 0 && parents[leaf_index] == cnid)
1277                     prev_parent_check_ok = 0;
1278                 else if (leaf_index >= 0)
1279                     prev_parent_check_ok = 1;
1280             }
1281
1282             // if the thing has acl's, do the full permission check
1283             if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1284                 struct vnode *cvp;
1285                 int myErr = 0;
1286                 /* get the vnode for this cnid */
1287                 myErr = hfs_vget(hfsmp, cnid, &cvp, 0);
1288                 if ( myErr ) {
1289                     access[i] = myErr;
1290                     continue;
1291                 }
1292
1293                 hfs_unlock(VTOC(cvp));
1294
1295                 if (vnode_vtype(cvp) == VDIR) {
1296                     myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1297                 } else {
1298                     myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1299                 }
1300
1301                 vnode_put(cvp);
1302                 if (myErr) {
1303                     access[i] = myErr;
1304                     continue;
1305                 }
1306             } else {
1307                 /* before calling CheckAccess(), check the target file for read access */
1308                 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1309                     cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1310
1311                 /* fail fast if no access */
1312                 if ((myPerms & flags) == 0) {
1313                     access[i] = EACCES;
1314                     continue;
1315                 }
1316             }
1317         } else {
1318             /* we were passed an array of parent ids */
1319             catkey.hfsPlus.parentID = cnid;
1320         }
1321
1322         /* if the last guy had the same parent and had access, we're done */
1323         if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
1324             cache.cachehits++;
1325             access[i] = 0;
1326             continue;
1327         }
1328
1329         myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1330             skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
1331
1332         if (myaccess || (error == ESRCH && leaf_index != -1)) {
1333             access[i] = 0; // have access.. no errors to report
1334         } else {
1335             access[i] = (error != 0 ? (short) error : EACCES);
1336         }
1337
1338         prevParent_cnid = catkey.hfsPlus.parentID;
1339     }
1340
1341     /* copyout the access array */
1342     if ((error = copyout((caddr_t)access, user_access_structp->access,
1343                 num_files * sizeof (short)))) {
1344         goto err_exit_bulk_access;
1345     }
1346     if (map_size && bitmap) {
1347         if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1348                     map_size * sizeof (char)))) {
1349             goto err_exit_bulk_access;
1350         }
1351     }
1352
1353
1354   err_exit_bulk_access:
1355
1356     //printf("hfs: on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
1357
1358     if (file_ids)
1359         kfree(file_ids, sizeof(int) * num_files);
1360     if (parents)
1361         kfree(parents, sizeof(cnid_t) * num_parents);
1362     if (bitmap)
1363         kfree(bitmap, sizeof(char) * map_size);
1364     if (access)
1365         kfree(access, sizeof(short) * num_files);
1366     if (cache.acache)
1367         kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1368     if (cache.haveaccess)
1369         kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1370
1371     return (error);
1372 }
1373
1374
1375 /* end "bulk-access" support */
1376
1377
1378 /*
1379  * Callback for use with freeze ioctl.
1380  */
1381 static int
1382 hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
1383 {
1384         vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
1385
1386         return 0;
1387 }
1388
1389 /*
1390  * Control filesystem operating characteristics.
1391  */
1392 int
1393 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1394                 vnode_t a_vp;
1395                 int  a_command;
1396                 caddr_t  a_data;
1397                 int  a_fflag;
1398                 vfs_context_t a_context;
1399         } */ *ap)
1400 {
1401         struct vnode * vp = ap->a_vp;
1402         struct hfsmount *hfsmp = VTOHFS(vp);
1403         vfs_context_t context = ap->a_context;
1404         kauth_cred_t cred = vfs_context_ucred(context);
1405         proc_t p = vfs_context_proc(context);
1406         struct vfsstatfs *vfsp;
1407         boolean_t is64bit;
1408         off_t jnl_start, jnl_size;
1409         struct hfs_journal_info *jip;
1410 #if HFS_COMPRESSION
1411         int compressed = 0;
1412         off_t uncompressed_size = -1;
1413         int decmpfs_error = 0;
1414
1415         if (ap->a_command == F_RDADVISE) {
1416                 /* we need to inspect the decmpfs state of the file as early as possible */
1417                 compressed = hfs_file_is_compressed(VTOC(vp), 0);
1418                 if (compressed) {
1419                         if (VNODE_IS_RSRC(vp)) {
1420                                 /* if this is the resource fork, treat it as if it were empty */
1421                                 uncompressed_size = 0;
1422                         } else {
1423                                 decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
1424                                 if (decmpfs_error != 0) {
1425                                         /* failed to get the uncompressed size, we'll check for this later */
1426                                         uncompressed_size = -1;
1427                                 }
1428                         }
1429                 }
1430         }
1431 #endif /* HFS_COMPRESSION */
1432
1433         is64bit = proc_is64bit(p);
1434
1435         switch (ap->a_command) {
1436
1437         case HFS_GETPATH:
1438         {
1439                 struct vnode *file_vp;
1440                 cnid_t  cnid;
1441                 int  outlen;
1442                 char *bufptr;
1443                 int error;
1444
1445                 /* Caller must be owner of file system. */
1446                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1447                 if (suser(cred, NULL) &&
1448                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1449                         return (EACCES);
1450                 }
1451                 /* Target vnode must be file system's root. */
1452                 if (!vnode_isvroot(vp)) {
1453                         return (EINVAL);
1454                 }
1455                 bufptr = (char *)ap->a_data;
1456                 cnid = strtoul(bufptr, NULL, 10);
1457
1458                 /* We need to call hfs_vfs_vget to leverage the code that will
1459                  * fix the origin list for us if needed, as opposed to calling
1460                  * hfs_vget, since we will need the parent for build_path call.
1461                  */
1462
1463                 if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1464                         return (error);
1465                 }
1466                 error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context);
1467                 vnode_put(file_vp);
1468
1469                 return (error);
1470         }
1471
1472         case HFS_PREV_LINK:
1473         case HFS_NEXT_LINK:
1474         {
1475                 cnid_t linkfileid;
1476                 cnid_t nextlinkid;
1477                 cnid_t prevlinkid;
1478                 int error;
1479
1480                 /* Caller must be owner of file system. */
1481                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1482                 if (suser(cred, NULL) &&
1483                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1484                         return (EACCES);
1485                 }
1486                 /* Target vnode must be file system's root. */
1487                 if (!vnode_isvroot(vp)) {
1488                         return (EINVAL);
1489                 }
1490                 linkfileid = *(cnid_t *)ap->a_data;
1491                 if (linkfileid < kHFSFirstUserCatalogNodeID) {
1492                         return (EINVAL);
1493                 }
1494                 if ((error = hfs_lookuplink(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1495                         return (error);
1496                 }
1497                 if (ap->a_command == HFS_NEXT_LINK) {
1498                         *(cnid_t *)ap->a_data = nextlinkid;
1499                 } else {
1500                         *(cnid_t *)ap->a_data = prevlinkid;
1501                 }
1502                 return (0);
1503         }
1504
1505         case HFS_RESIZE_PROGRESS: {
1506
1507                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1508                 if (suser(cred, NULL) &&
1509                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1510                         return (EACCES); /* must be owner of file system */
1511                 }
1512                 if (!vnode_isvroot(vp)) {
1513                         return (EINVAL);
1514                 }
1515                 /* file system must not be mounted read-only */
1516                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1517                         return (EROFS);
1518                 }
1519
1520                 return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1521         }
1522
1523         case HFS_RESIZE_VOLUME: {
1524                 u_int64_t newsize;
1525                 u_int64_t cursize;
1526
1527                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1528                 if (suser(cred, NULL) &&
1529                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1530                         return (EACCES); /* must be owner of file system */
1531                 }
1532                 if (!vnode_isvroot(vp)) {
1533                         return (EINVAL);
1534                 }
1535
1536                 /* filesystem must not be mounted read only */
1537                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1538                         return (EROFS);
1539                 }
1540                 newsize = *(u_int64_t *)ap->a_data;
1541                 cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1542
1543                 if (newsize > cursize) {
1544                         return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1545                 } else if (newsize < cursize) {
1546                         return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1547                 } else {
1548                         return (0);
1549                 }
1550         }
1551         case HFS_CHANGE_NEXT_ALLOCATION: {
1552                 int error = 0;          /* Assume success */
1553                 u_int32_t location;
1554
1555                 if (vnode_vfsisrdonly(vp)) {
1556                         return (EROFS);
1557                 }
1558                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1559                 if (suser(cred, NULL) &&
1560                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1561                         return (EACCES); /* must be owner of file system */
1562                 }
1563                 if (!vnode_isvroot(vp)) {
1564                         return (EINVAL);
1565                 }
1566                 HFS_MOUNT_LOCK(hfsmp, TRUE);
1567                 location = *(u_int32_t *)ap->a_data;
1568                 if ((location >= hfsmp->allocLimit) &&
1569                         (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1570                         error = EINVAL;
1571                         goto fail_change_next_allocation;
1572                 }
1573                 /* Return previous value. */
1574                 *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1575                 if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1576                         /* On magic value for location, set nextAllocation to next block
1577                          * after metadata zone and set flag in mount structure to indicate
1578                          * that nextAllocation should not be updated again.
1579                          */
1580                         if (hfsmp->hfs_metazone_end != 0) {
1581                                 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1582                         }
1583                         hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1584                 } else {
1585                         hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1586                         HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1587                 }
1588                 MarkVCBDirty(hfsmp);
1589 fail_change_next_allocation:
1590                 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
1591                 return (error);
1592         }
1593
1594 #ifdef HFS_SPARSE_DEV
1595         case HFS_SETBACKINGSTOREINFO: {
1596                 struct vnode * bsfs_rootvp;
1597                 struct vnode * di_vp;
1598                 struct hfs_backingstoreinfo *bsdata;
1599                 int error = 0;
1600
1601                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1602                         return (EROFS);
1603                 }
1604                 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1605                         return (EALREADY);
1606                 }
1607                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1608                 if (suser(cred, NULL) &&
1609                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1610                         return (EACCES); /* must be owner of file system */
1611                 }
1612                 bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1613                 if (bsdata == NULL) {
1614                         return (EINVAL);
1615                 }
1616                 if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1617                         return (error);
1618                 }
1619                 if ((error = vnode_getwithref(di_vp))) {
1620                         file_drop(bsdata->backingfd);
1621                         return(error);
1622                 }
1623
1624                 if (vnode_mount(vp) == vnode_mount(di_vp)) {
1625                         (void)vnode_put(di_vp);
1626                         file_drop(bsdata->backingfd);
1627                         return (EINVAL);
1628                 }
1629
1630                 /*
1631                  * Obtain the backing fs root vnode and keep a reference
1632                  * on it.  This reference will be dropped in hfs_unmount.
1633                  */
1634                 error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
1635                 if (error) {
1636                         (void)vnode_put(di_vp);
1637                         file_drop(bsdata->backingfd);
1638                         return (error);
1639                 }
1640                 vnode_ref(bsfs_rootvp);
1641                 vnode_put(bsfs_rootvp);
1642
1643                 hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
1644                 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1645                 hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
1646                 hfsmp->hfs_sparsebandblks *= 4;
1647
1648                 vfs_markdependency(hfsmp->hfs_mp);
1649
1650                 /*
1651                  * If the sparse image is on a sparse image file (as opposed to a sparse
1652                  * bundle), then we may need to limit the free space to the maximum size
1653                  * of a file on that volume.  So we query (using pathconf), and if we get
1654                  * a meaningful result, we cache the number of blocks for later use in
1655                  * hfs_freeblks().
1656                  */
1657                 hfsmp->hfs_backingfs_maxblocks = 0;
1658                 if (vnode_vtype(di_vp) == VREG) {
1659                         int terr;
1660                         int hostbits;
1661                         terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
1662                         if (terr == 0 && hostbits != 0 && hostbits < 64) {
1663                                 u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
1664
1665                                 hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
1666                         }
1667                 }
1668
1669                 (void)vnode_put(di_vp);
1670                 file_drop(bsdata->backingfd);
1671                 return (0);
1672         }
1673         case HFS_CLRBACKINGSTOREINFO: {
1674                 struct vnode * tmpvp;
1675
1676                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1677                 if (suser(cred, NULL) &&
1678                         kauth_cred_getuid(cred) != vfsp->f_owner) {
1679                         return (EACCES); /* must be owner of file system */
1680                 }
1681                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1682                         return (EROFS);
1683                 }
1684
1685                 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
1686                     hfsmp->hfs_backingfs_rootvp) {
1687
1688                         hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
1689                         tmpvp = hfsmp->hfs_backingfs_rootvp;
1690                         hfsmp->hfs_backingfs_rootvp = NULLVP;
1691                         hfsmp->hfs_sparsebandblks = 0;
1692                         vnode_rele(tmpvp);
1693                 }
1694                 return (0);
1695         }
1696 #endif /* HFS_SPARSE_DEV */
1697
1698         case F_FREEZE_FS: {
1699                 struct mount *mp;
1700
1701                 mp = vnode_mount(vp);
1702                 hfsmp = VFSTOHFS(mp);
1703
1704                 if (!(hfsmp->jnl))
1705                         return (ENOTSUP);
1706
1707                 vfsp = vfs_statfs(mp);
1708
1709                 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
1710                         !kauth_cred_issuser(cred))
1711                         return (EACCES);
1712
1713                 lck_rw_lock_exclusive(&hfsmp->hfs_insync);
1714
1715                 // flush things before we get started to try and prevent
1716                 // dirty data from being paged out while we're frozen.
1717                 // note: can't do this after taking the lock as it will
1718                 // deadlock against ourselves.
1719                 vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
1720                 hfs_global_exclusive_lock_acquire(hfsmp);
1721
1722                 // DO NOT call hfs_journal_flush() because that takes a
1723                 // shared lock on the global exclusive lock!
1724                 journal_flush(hfsmp->jnl);
1725
1726                 // don't need to iterate on all vnodes, we just need to
1727                 // wait for writes to the system files and the device vnode
1728                 if (HFSTOVCB(hfsmp)->extentsRefNum)
1729                     vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
1730                 if (HFSTOVCB(hfsmp)->catalogRefNum)
1731                     vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
1732                 if (HFSTOVCB(hfsmp)->allocationsRefNum)
1733                     vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
1734                 if (hfsmp->hfs_attribute_vp)
1735                     vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
1736                 vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");
1737
1738                 hfsmp->hfs_freezing_proc = current_proc();
1739
1740                 return (0);
1741         }
1742
1743         case F_THAW_FS: {
1744                 vfsp = vfs_statfs(vnode_mount(vp));
1745                 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
1746                         !kauth_cred_issuser(cred))
1747                         return (EACCES);
1748
1749                 // if we're not the one who froze the fs then we
1750                 // can't thaw it.
1751                 if (hfsmp->hfs_freezing_proc != current_proc()) {
1752                     return EPERM;
1753                 }
1754
1755                 // NOTE: if you add code here, also go check the
1756                 //       code that "thaws" the fs in hfs_vnop_close()
1757                 //
1758                 hfsmp->hfs_freezing_proc = NULL;
1759                 hfs_global_exclusive_lock_release(hfsmp);
1760                 lck_rw_unlock_exclusive(&hfsmp->hfs_insync);
1761
1762                 return (0);
1763         }
1764
1765         case HFS_BULKACCESS_FSCTL: {
1766             int size;
1767
1768             if (hfsmp->hfs_flags & HFS_STANDARD) {
1769                 return EINVAL;
1770             }
1771
1772             if (is64bit) {
1773                 size = sizeof(struct user64_access_t);
1774             } else {
1775                 size = sizeof(struct user32_access_t);
1776             }
1777
1778             return do_bulk_access_check(hfsmp, vp, ap, size, context);
1779         }
1780
1781         case HFS_EXT_BULKACCESS_FSCTL: {
1782             int size;
1783
1784             if (hfsmp->hfs_flags & HFS_STANDARD) {
1785                 return EINVAL;
1786             }
1787
1788             if (is64bit) {
1789                 size = sizeof(struct user64_ext_access_t);
1790             } else {
1791                 size = sizeof(struct user32_ext_access_t);
1792             }
1793
1794             return do_bulk_access_check(hfsmp, vp, ap, size, context);
1795         }
1796
1797         case HFS_SETACLSTATE: {
1798                 int state;
1799
1800                 if (ap->a_data == NULL) {
1801                         return (EINVAL);
1802                 }
1803
1804                 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1805                 state = *(int *)ap->a_data;
1806
1807                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1808                         return (EROFS);
1809                 }
1810                 // super-user can enable or disable acl's on a volume.
1811                 // the volume owner can only enable acl's
1812                 if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) {
1813                         return (EPERM);
1814                 }
1815                 if (state == 0 || state == 1)
1816                         return hfs_set_volxattr(hfsmp, HFS_SETACLSTATE, state);
1817                 else
1818                         return (EINVAL);
1819         }
1820
1821         case HFS_SET_XATTREXTENTS_STATE: {
1822                 int state;
1823
1824                 if (ap->a_data == NULL) {
1825                         return (EINVAL);
1826                 }
1827
1828                 state = *(int *)ap->a_data;
1829
1830                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1831                         return (EROFS);
1832                 }
1833
1834                 /* Super-user can enable or disable extent-based extended
1835                  * attribute support on a volume
1836                  */
1837                 if (!is_suser()) {
1838                         return (EPERM);
1839                 }
1840                 if (state == 0 || state == 1)
1841                         return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
1842                 else
1843                         return (EINVAL);
1844         }
1845
1846         case F_FULLFSYNC: {
1847                 int error;
1848
1849                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1850                         return (EROFS);
1851                 }
1852                 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
1853                 if (error == 0) {
1854                         error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
1855                         hfs_unlock(VTOC(vp));
1856                 }
1857
1858                 return error;
1859         }
1860
1861         case F_CHKCLEAN: {
1862                 register struct cnode *cp;
1863                 int error;
1864
1865                 if (!vnode_isreg(vp))
1866                         return EINVAL;
1867
1868                 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
1869                 if (error == 0) {
1870                         cp = VTOC(vp);
1871                         /*
1872                          * used by regression test to determine if
1873                          * all the dirty pages (via write) have been cleaned
1874                          * after a call to 'fsysnc'.
1875                          */
1876                         error = is_file_clean(vp, VTOF(vp)->ff_size);
1877                         hfs_unlock(cp);
1878                 }
1879                 return (error);
1880         }
1881
1882         case F_RDADVISE: {
1883                 register struct radvisory *ra;
1884                 struct filefork *fp;
1885                 int error;
1886
1887                 if (!vnode_isreg(vp))
1888                         return EINVAL;
1889
1890                 ra = (struct radvisory *)(ap->a_data);
1891                 fp = VTOF(vp);
1892
1893                 /* Protect against a size change. */
1894                 hfs_lock_truncate(VTOC(vp), TRUE);
1895
1896 #if HFS_COMPRESSION
1897                 if (compressed && (uncompressed_size == -1)) {
1898                         /* fetching the uncompressed size failed above, so return the error */
1899                         error = decmpfs_error;
1900                 } else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
1901                                    (!compressed && (ra->ra_offset >= fp->ff_size))) {
1902                         error = EFBIG;
1903                 }
1904 #else /* HFS_COMPRESSION */
1905                 if (ra->ra_offset >= fp->ff_size) {
1906                         error = EFBIG;
1907                 }
1908 #endif /* HFS_COMPRESSION */
1909                 else {
1910                         error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
1911                 }
1912
1913                 hfs_unlock_truncate(VTOC(vp), TRUE);
1914                 return (error);
1915         }
1916
1917         case F_READBOOTSTRAP:
1918         case F_WRITEBOOTSTRAP:
1919         {
1920             struct vnode *devvp = NULL;
1921             user_fbootstraptransfer_t *user_bootstrapp;
1922             int devBlockSize;
1923             int error;
1924             uio_t auio;
1925             daddr64_t blockNumber;
1926             u_int32_t blockOffset;
1927             u_int32_t xfersize;
1928             struct buf *bp;
1929             user_fbootstraptransfer_t user_bootstrap;
1930
1931                 if (!vnode_isvroot(vp))
1932                         return (EINVAL);
1933                 /* LP64 - when caller is a 64 bit process then we are passed a pointer
1934                  * to a user_fbootstraptransfer_t else we get a pointer to a
1935                  * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t
1936                  */
1937                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1938                         return (EROFS);
1939                 }
1940                 if (is64bit) {
1941                         user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data;
1942                 }
1943                 else {
1944                 user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data;
1945                         user_bootstrapp = &user_bootstrap;
1946                         user_bootstrap.fbt_offset = bootstrapp->fbt_offset;
1947                         user_bootstrap.fbt_length = bootstrapp->fbt_length;
1948                         user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer);
1949                 }
1950                 if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024)
1951                         return EINVAL;
1952
1953             devvp = VTOHFS(vp)->hfs_devvp;
1954                 auio = uio_create(1, user_bootstrapp->fbt_offset,
1955                                                   is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32,
1956                                                   (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ);
1957                 uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length);
1958
1959             devBlockSize = vfs_devblocksize(vnode_mount(vp));
1960
1961             while (uio_resid(auio) > 0) {
1962                         blockNumber = uio_offset(auio) / devBlockSize;
1963                         error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp);
1964                         if (error) {
1965                                 if (bp) buf_brelse(bp);
1966                                 uio_free(auio);
1967                                 return error;
1968                         };
1969
1970                         blockOffset = uio_offset(auio) % devBlockSize;
1971                         xfersize = devBlockSize - blockOffset;
1972                         error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio);
1973                         if (error) {
1974                                 buf_brelse(bp);
1975                                 uio_free(auio);
1976                                 return error;
1977                         };
1978                         if (uio_rw(auio) == UIO_WRITE) {
1979                                 error = VNOP_BWRITE(bp);
1980                                 if (error) {
1981                                         uio_free(auio);
1982                         return error;
1983                                 }
1984                         } else {
1985                                 buf_brelse(bp);
1986                         };
1987                 };
1988                 uio_free(auio);
1989         };
1990         return 0;
1991
1992         case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
1993         {
1994                 if (is64bit) {
1995                         *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
1996                 }
1997                 else {
1998                         *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
1999                 }
2000                 return 0;
2001         }
2002
2003         case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
2004             *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
2005             break;
2006
2007         case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
2008             *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
2009             break;
2010
2011         case HFS_FSCTL_SET_VERY_LOW_DISK:
2012             if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
2013                 return EINVAL;
2014             }
2015
2016             hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
2017             break;
2018
2019         case HFS_FSCTL_SET_LOW_DISK:
2020             if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
2021                 || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
2022
2023                 return EINVAL;
2024             }
2025
2026             hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
2027             break;
2028
2029         case HFS_FSCTL_SET_DESIRED_DISK:
2030             if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
2031                 return EINVAL;
2032             }
2033
2034             hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
2035             break;
2036
2037         case HFS_VOLUME_STATUS:
2038             *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
2039             break;
2040
2041         case HFS_SET_BOOT_INFO:
2042                 if (!vnode_isvroot(vp))
2043                         return(EINVAL);
2044                 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
2045                         return(EACCES); /* must be superuser or owner of filesystem */
2046                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2047                         return (EROFS);
2048                 }
2049                 HFS_MOUNT_LOCK(hfsmp, TRUE);
2050                 bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
2051                 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
2052                 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
2053                 break;
2054
2055         case HFS_GET_BOOT_INFO:
2056                 if (!vnode_isvroot(vp))
2057                         return(EINVAL);
2058                 HFS_MOUNT_LOCK(hfsmp, TRUE);
2059                 bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
2060                 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
2061                 break;
2062
2063         case HFS_MARK_BOOT_CORRUPT:
2064                 /* Mark the boot volume corrupt by setting
2065                  * kHFSVolumeInconsistentBit in the volume header.  This will
2066                  * force fsck_hfs on next mount.
2067                  */
2068                 if (!is_suser()) {
2069                         return EACCES;
2070                 }
2071
2072                 /* Allowed only on the root vnode of the boot volume */
2073                 if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
2074                     !vnode_isvroot(vp)) {
2075                         return EINVAL;
2076                 }
2077                 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2078                         return (EROFS);
2079                 }
2080                 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
2081                 hfs_mark_volume_inconsistent(hfsmp);
2082                 break;
2083
2084         case HFS_FSCTL_GET_JOURNAL_INFO:
2085                 jip = (struct hfs_journal_info*)ap->a_data;
2086
2087                 if (vp == NULLVP)
2088                         return EINVAL;
2089
2090             if (hfsmp->jnl == NULL) {
2091                         jnl_start = 0;
2092                         jnl_size  = 0;
2093             } else {
2094                         jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
2095                         jnl_size  = (off_t)hfsmp->jnl_size;
2096             }
2097
2098                 jip->jstart = jnl_start;
2099                 jip->jsize = jnl_size;
2100                 break;
2101
2102         case HFS_SET_ALWAYS_ZEROFILL: {
2103             struct cnode *cp = VTOC(vp);
2104
2105             if (*(int *)ap->a_data) {
2106                 cp->c_flag |= C_ALWAYS_ZEROFILL;
2107             } else {
2108                 cp->c_flag &= ~C_ALWAYS_ZEROFILL;
2109             }
2110             break;
2111         }
2112
2113         default:
2114                 return (ENOTTY);
2115         }
2116
2117         return 0;
2118 }
2119
2120 /*
2121  * select
2122  */
2123 int
2124 hfs_vnop_select(__unused struct vnop_select_args *ap)
2125 /*
2126         struct vnop_select_args {
2127                 vnode_t a_vp;
2128                 int  a_which;
2129                 int  a_fflags;
2130                 void *a_wql;
2131                 vfs_context_t a_context;
2132         };
2133 */
2134 {
2135         /*
2136          * We should really check to see if I/O is possible.
2137          */
2138         return (1);
2139 }
2140
2141 /*
2142  * Converts a logical block number to a physical block, and optionally returns
2143  * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
2144  * The physical block number is based on the device block size, currently its 512.
2145  * The block run is returned in logical blocks, and is the REMAINING amount of blocks
2146  */
2147 int
2148 hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
2149 {
2150         struct filefork *fp = VTOF(vp);
2151         struct hfsmount *hfsmp = VTOHFS(vp);
2152         int  retval = E_NONE;
2153         u_int32_t  logBlockSize;
2154         size_t  bytesContAvail = 0;
2155         off_t  blockposition;
2156         int lockExtBtree;
2157         int lockflags = 0;
2158
2159         /*
2160          * Check for underlying vnode requests and ensure that logical
2161          * to physical mapping is requested.
2162          */
2163         if (vpp != NULL)
2164                 *vpp = hfsmp->hfs_devvp;
2165         if (bnp == NULL)
2166                 return (0);
2167
2168         logBlockSize = GetLogicalBlockSize(vp);
2169         blockposition = (off_t)bn * logBlockSize;
2170
2171         lockExtBtree = overflow_extents(fp);
2172
2173         if (lockExtBtree)
2174                 lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2175
2176         retval = MacToVFSError(
2177                             MapFileBlockC (HFSTOVCB(hfsmp),
2178                                             (FCB*)fp,
2179                                             MAXPHYSIO,
2180                                             blockposition,
2181                                             bnp,
2182                                             &bytesContAvail));
2183
2184         if (lockExtBtree)
2185                 hfs_systemfile_unlock(hfsmp, lockflags);
2186
2187         if (retval == E_NONE) {
2188                 /* Figure out how many read ahead blocks there are */
2189                 if (runp != NULL) {
2190                         if (can_cluster(logBlockSize)) {
2191                                 /* Make sure this result never goes negative: */
2192                                 *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
2193                         } else {
2194                                 *runp = 0;
2195                         }
2196                 }
2197         }
2198         return (retval);
2199 }
2200
2201 /*
2202  * Convert logical block number to file offset.
2203  */
2204 int
2205 hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
2206 /*
2207         struct vnop_blktooff_args {
2208                 vnode_t a_vp;
2209                 daddr64_t a_lblkno;
2210                 off_t *a_offset;
2211         };
2212 */
2213 {
2214         if (ap->a_vp == NULL)
2215                 return (EINVAL);
2216         *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
2217
2218         return(0);
2219 }
2220
2221 /*
2222  * Convert file offset to logical block number.
2223  */
2224 int
2225 hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
2226 /*
2227         struct vnop_offtoblk_args {
2228                 vnode_t a_vp;
2229                 off_t a_offset;
2230                 daddr64_t *a_lblkno;
2231         };
2232 */
2233 {
2234         if (ap->a_vp == NULL)
2235                 return (EINVAL);
2236         *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
2237
2238         return(0);
2239 }
2240
2241 /*
2242  * Map file offset to physical block number.
2243  *
2244  * If this function is called for write operation, and if the file
2245  * had virtual blocks allocated (delayed allocation), real blocks
2246  * are allocated by calling ExtendFileC().
2247  *
2248  * If this function is called for read operation, and if the file
2249  * had virtual blocks allocated (delayed allocation), no change
2250  * to the size of file is done, and if required, rangelist is
2251  * searched for mapping.
2252  *
2253  * System file cnodes are expected to be locked (shared or exclusive).
2254  */
2255 int
2256 hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
2257 /*
2258         struct vnop_blockmap_args {
2259                 vnode_t a_vp;
2260                 off_t a_foffset;
2261                 size_t a_size;
2262                 daddr64_t *a_bpn;
2263                 size_t *a_run;
2264                 void *a_poff;
2265                 int a_flags;
2266                 vfs_context_t a_context;
2267         };
2268 */
2269 {
2270         struct vnode *vp = ap->a_vp;
2271         struct cnode *cp;
2272         struct filefork *fp;
2273         struct hfsmount *hfsmp;
2274         size_t bytesContAvail = 0;
2275         int retval = E_NONE;
2276         int syslocks = 0;
2277         int lockflags = 0;
2278         struct rl_entry *invalid_range;
2279         enum rl_overlaptype overlaptype;
2280         int started_tr = 0;
2281         int tooklock = 0;
2282
2283 #if HFS_COMPRESSION
2284         if (VNODE_IS_RSRC(vp)) {
2285                 /* allow blockmaps to the resource fork */
2286         } else {
2287                 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
2288                         int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
2289                         switch(state) {
2290                                 case FILE_IS_COMPRESSED:
2291                                         return ENOTSUP;
2292                                 case FILE_IS_CONVERTING:
2293                                         /* if FILE_IS_CONVERTING, we allow blockmap */
2294                                         break;
2295                                 default:
2296                                         printf("invalid state %d for compressed file\n", state);
2297                                         /* fall through */
2298                         }
2299                 }
2300         }
2301 #endif /* HFS_COMPRESSION */
2302
2303         /* Do not allow blockmap operation on a directory */
2304         if (vnode_isdir(vp)) {
2305                 return (ENOTSUP);
2306         }
2307
2308         /*
2309          * Check for underlying vnode requests and ensure that logical
2310          * to physical mapping is requested.
2311          */
2312         if (ap->a_bpn == NULL)
2313                 return (0);
2314
2315         if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
2316                 if (VTOC(vp)->c_lockowner != current_thread()) {
2317                         hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
2318                         tooklock = 1;
2319                 }
2320         }
2321         hfsmp = VTOHFS(vp);
2322         cp = VTOC(vp);
2323         fp = VTOF(vp);
2324
2325 retry:
2326         /* Check virtual blocks only when performing write operation */
2327         if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2328                 if (hfs_start_transaction(hfsmp) != 0) {
2329                         retval = EINVAL;
2330                         goto exit;
2331                 } else {
2332                         started_tr = 1;
2333                 }
2334                 syslocks = SFL_EXTENTS | SFL_BITMAP;
2335
2336         } else if (overflow_extents(fp)) {
2337                 syslocks = SFL_EXTENTS;
2338         }
2339
2340         if (syslocks)
2341                 lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
2342
2343         /*
2344          * Check for any delayed allocations.
2345          */
2346         if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2347                 int64_t actbytes;
2348                 u_int32_t loanedBlocks;
2349
2350                 //
2351                 // Make sure we have a transaction.  It's possible
2352                 // that we came in and fp->ff_unallocblocks was zero
2353                 // but during the time we blocked acquiring the extents
2354                 // btree, ff_unallocblocks became non-zero and so we
2355                 // will need to start a transaction.
2356                 //
2357                 if (started_tr == 0) {
2358                         if (syslocks) {
2359                                 hfs_systemfile_unlock(hfsmp, lockflags);
2360                                 syslocks = 0;
2361                         }
2362                         goto retry;
2363                 }
2364
2365                 /*
2366                  * Note: ExtendFileC will Release any blocks on loan and
2367                  * aquire real blocks.  So we ask to extend by zero bytes
2368                  * since ExtendFileC will account for the virtual blocks.
2369                  */
2370
2371                 loanedBlocks = fp->ff_unallocblocks;
2372                 retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
2373                                      kEFAllMask | kEFNoClumpMask, &actbytes);
2374
2375                 if (retval) {
2376                         fp->ff_unallocblocks = loanedBlocks;
2377                         cp->c_blocks += loanedBlocks;
2378                         fp->ff_blocks += loanedBlocks;
2379
2380                         HFS_MOUNT_LOCK(hfsmp, TRUE);
2381                         hfsmp->loanedBlocks += loanedBlocks;
2382                         HFS_MOUNT_UNLOCK(hfsmp, TRUE);
2383
2384                         hfs_systemfile_unlock(hfsmp, lockflags);
2385                         cp->c_flag |= C_MODIFIED;
2386                         if (started_tr) {
2387                                 (void) hfs_update(vp, TRUE);
2388                                 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2389
2390                                 hfs_end_transaction(hfsmp);
2391                                 started_tr = 0;
2392                         }
2393                         goto exit;
2394                 }
2395         }
2396
2397         retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
2398                                ap->a_bpn, &bytesContAvail);
2399         if (syslocks) {
2400                 hfs_systemfile_unlock(hfsmp, lockflags);
2401                 syslocks = 0;
2402         }
2403
2404         if (started_tr) {
2405                 (void) hfs_update(vp, TRUE);
2406                 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2407                 hfs_end_transaction(hfsmp);
2408                 started_tr = 0;
2409         }
2410         if (retval) {
2411                 /* On write, always return error because virtual blocks, if any,
2412                  * should have been allocated in ExtendFileC().  We do not
2413                  * allocate virtual blocks on read, therefore return error
2414                  * only if no virtual blocks are allocated.  Otherwise we search
2415                  * rangelist for zero-fills
2416                  */
2417                 if ((MacToVFSError(retval) != ERANGE) ||
2418                     (ap->a_flags & VNODE_WRITE) ||
2419                     ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
2420                         goto exit;
2421                 }
2422
2423                 /* Validate if the start offset is within logical file size */
2424                 if (ap->a_foffset > fp->ff_size) {
2425                         goto exit;
2426                 }
2427
2428                 /* Searching file extents has failed for read operation, therefore
2429                  * search rangelist for any uncommitted holes in the file.
2430                  */
2431                 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
2432                                       ap->a_foffset + (off_t)(ap->a_size - 1),
2433                                       &invalid_range);
2434                 switch(overlaptype) {
2435                 case RL_OVERLAPISCONTAINED:
2436                         /* start_offset <= rl_start, end_offset >= rl_end */
2437                         if (ap->a_foffset != invalid_range->rl_start) {
2438                                 break;
2439                         }
2440                 case RL_MATCHINGOVERLAP:
2441                         /* start_offset = rl_start, end_offset = rl_end */
2442                 case RL_OVERLAPCONTAINSRANGE:
2443                         /* start_offset >= rl_start, end_offset <= rl_end */
2444                 case RL_OVERLAPSTARTSBEFORE:
2445                         /* start_offset > rl_start, end_offset >= rl_start */
2446                         if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) {
2447                                 bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset;
2448                         } else {
2449                                 bytesContAvail = fp->ff_size - ap->a_foffset;
2450                         }
2451                         if (bytesContAvail > ap->a_size) {
2452                                 bytesContAvail = ap->a_size;
2453                         }
2454                         *ap->a_bpn = (daddr64_t)-1;
2455                         retval = 0;
2456                         break;
2457                 case RL_OVERLAPENDSAFTER:
2458                         /* start_offset < rl_start, end_offset < rl_end */
2459                 case RL_NOOVERLAP:
2460                         break;
2461                 }
2462                 goto exit;
2463         }
2464
2465         /* MapFileC() found a valid extent in the filefork.  Search the
2466          * mapping information further for invalid file ranges
2467          */
2468         overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
2469                               ap->a_foffset + (off_t)bytesContAvail - 1,
2470                               &invalid_range);
2471         if (overlaptype != RL_NOOVERLAP) {
2472                 switch(overlaptype) {
2473                 case RL_MATCHINGOVERLAP:
2474                 case RL_OVERLAPCONTAINSRANGE:
2475                 case RL_OVERLAPSTARTSBEFORE:
2476                         /* There's no valid block for this byte offset */
2477                         *ap->a_bpn = (daddr64_t)-1;
2478                         /* There's no point limiting the amount to be returned
2479                          * if the invalid range that was hit extends all the way
2480                          * to the EOF (i.e. there's no valid bytes between the
2481                          * end of this range and the file's EOF):
2482                          */
2483                         if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
2484                             ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
2485                                 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
2486                         }
2487                         break;
2488
2489                 case RL_OVERLAPISCONTAINED:
2490                 case RL_OVERLAPENDSAFTER:
2491                         /* The range of interest hits an invalid block before the end: */
2492                         if (invalid_range->rl_start == ap->a_foffset) {
2493                                 /* There's actually no valid information to be had starting here: */
2494                                 *ap->a_bpn = (daddr64_t)-1;
2495                                 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
2496                                     ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
2497                                         bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
2498                                 }
2499                         } else {
2500                                 bytesContAvail = invalid_range->rl_start - ap->a_foffset;
2501                         }
2502                         break;
2503
2504                 case RL_NOOVERLAP:
2505                         break;
2506                 } /* end switch */
2507                 if (bytesContAvail > ap->a_size)
2508                         bytesContAvail = ap->a_size;
2509         }
2510
2511 exit:
2512         if (retval == 0) {
2513                 if (ap->a_run)
2514                         *ap->a_run = bytesContAvail;
2515
2516                 if (ap->a_poff)
2517                         *(int *)ap->a_poff = 0;
2518         }
2519
2520         if (tooklock)
2521                 hfs_unlock(cp);
2522
2523         return (MacToVFSError(retval));
2524 }
2525
2526
2527 /*
2528  * prepare and issue the I/O
2529  * buf_strategy knows how to deal
2530  * with requests that require
2531  * fragmented I/Os
2532  */
2533 int
2534 hfs_vnop_strategy(struct vnop_strategy_args *ap)
2535 {
2536         buf_t   bp = ap->a_bp;
2537         vnode_t vp = buf_vnode(bp);
2538
2539         return (buf_strategy(VTOHFS(vp)->hfs_devvp, ap));
2540 }
2541
2542 static int
2543 hfs_minorupdate(struct vnode *vp) {
2544         struct cnode *cp = VTOC(vp);
2545         cp->c_flag &= ~C_MODIFIED;
2546         cp->c_touch_acctime = 0;
2547         cp->c_touch_chgtime = 0;
2548         cp->c_touch_modtime = 0;
2549
2550         return 0;
2551 }
2552
2553 static int
2554 do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context)
2555 {
2556         register struct cnode *cp = VTOC(vp);
2557         struct filefork *fp = VTOF(vp);
2558         struct proc *p = vfs_context_proc(context);;
2559         kauth_cred_t cred = vfs_context_ucred(context);
2560         int retval;
2561         off_t bytesToAdd;
2562         off_t actualBytesAdded;
2563         off_t filebytes;
2564         u_int32_t fileblocks;
2565         int blksize;
2566         struct hfsmount *hfsmp;
2567         int lockflags;
2568
2569         blksize = VTOVCB(vp)->blockSize;
2570         fileblocks = fp->ff_blocks;
2571         filebytes = (off_t)fileblocks * (off_t)blksize;
2572
2573         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
2574                  (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
2575
2576         if (length < 0)
2577                 return (EINVAL);
2578
2579         /* This should only happen with a corrupt filesystem */
2580         if ((off_t)fp->ff_size < 0)
2581                 return (EINVAL);
2582
2583         if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
2584                 return (EFBIG);
2585
2586         hfsmp = VTOHFS(vp);
2587
2588         retval = E_NONE;
2589
2590         /* Files that are changing size are not hot file candidates. */
2591         if (hfsmp->hfc_stage == HFC_RECORDING) {
2592                 fp->ff_bytesread = 0;
2593         }
2594
2595         /*
2596          * We cannot just check if fp->ff_size == length (as an optimization)
2597          * since there may be extra physical blocks that also need truncation.
2598          */
2599 #if QUOTA
2600         if ((retval = hfs_getinoquota(cp)))
2601                 return(retval);
2602 #endif /* QUOTA */
2603
2604         /*
2605          * Lengthen the size of the file. We must ensure that the
2606          * last byte of the file is allocated. Since the smallest
2607          * value of ff_size is 0, length will be at least 1.
2608          */
2609         if (length > (off_t)fp->ff_size) {
2610 #if QUOTA
2611                 retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
2612                                    cred, 0);
2613                 if (retval)
2614                         goto Err_Exit;
2615 #endif /* QUOTA */
2616                 /*
2617                  * If we don't have enough physical space then
2618                  * we need to extend the physical size.
2619                  */
2620                 if (length > filebytes) {
2621                         int eflags;
2622                         u_int32_t blockHint = 0;
2623
2624                         /* All or nothing and don't round up to clumpsize. */
2625                         eflags = kEFAllMask | kEFNoClumpMask;
2626
2627                         if (cred && suser(cred, NULL) != 0)
2628                                 eflags |= kEFReserveMask;  /* keep a reserve */
2629
2630                         /*
2631                          * Allocate Journal and Quota files in metadata zone.
2632                          */
2633                         if (filebytes == 0 &&
2634                             hfsmp->hfs_flags & HFS_METADATA_ZONE &&
2635                             hfs_virtualmetafile(cp)) {
2636                                 eflags |= kEFMetadataMask;
2637                                 blockHint = hfsmp->hfs_metazone_start;
2638                         }
2639                         if (hfs_start_transaction(hfsmp) != 0) {
2640                             retval = EINVAL;
2641                             goto Err_Exit;
2642                         }
2643
2644                         /* Protect extents b-tree and allocation bitmap */
2645                         lockflags = SFL_BITMAP;
2646                         if (overflow_extents(fp))
2647                                 lockflags |= SFL_EXTENTS;
2648                         lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
2649
2650                         while ((length > filebytes) && (retval == E_NONE)) {
2651                                 bytesToAdd = length - filebytes;
2652                                 retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
2653                                                     (FCB*)fp,
2654                                                     bytesToAdd,
2655                                                     blockHint,
2656                                                     eflags,
2657                                                     &actualBytesAdded));
2658
2659                                 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
2660                                 if (actualBytesAdded == 0 && retval == E_NONE) {
2661                                         if (length > filebytes)
2662                                                 length = filebytes;
2663                                         break;
2664                                 }
2665                         } /* endwhile */
2666
2667                         hfs_systemfile_unlock(hfsmp, lockflags);
2668
2669                         if (hfsmp->jnl) {
2670                                 if (skipupdate) {
2671                                         (void) hfs_minorupdate(vp);
2672                                 }
2673                                 else {
2674                                         (void) hfs_update(vp, TRUE);
2675                                         (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2676                                 }
2677                         }
2678
2679                         hfs_end_transaction(hfsmp);
2680
2681                         if (retval)
2682                                 goto Err_Exit;
2683
2684                         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
2685                                 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
2686                 }
2687
2688                 if (!(flags & IO_NOZEROFILL)) {
2689                         if (UBCINFOEXISTS(vp)  && (vnode_issystem(vp) == 0) && retval == E_NONE) {
2690                                 struct rl_entry *invalid_range;
2691                                 off_t zero_limit;
2692
2693                                 zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
2694                                 if (length < zero_limit) zero_limit = length;
2695
2696                                 if (length > (off_t)fp->ff_size) {
2697                                         struct timeval tv;
2698
2699                                         /* Extending the file: time to fill out the current last page w. zeroes? */
2700                                         if ((fp->ff_size & PAGE_MASK_64) &&
2701                                             (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
2702                                             fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {
2703
2704                                                 /* There's some valid data at the start of the (current) last page
2705                                                    of the file, so zero out the remainder of that page to ensure the
2706                                                    entire page contains valid data.  Since there is no invalid range
2707                                                    possible past the (current) eof, there's no need to remove anything
2708                                                    from the invalid range list before calling cluster_write():  */
2709                                                 hfs_unlock(cp);
2710                                                 retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
2711                                                                 fp->ff_size, (off_t)0,
2712                                                                 (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
2713                                                 hfs_lock(cp, HFS_FORCE_LOCK);
2714                                                 if (retval) goto Err_Exit;
2715
2716                                                 /* Merely invalidate the remaining area, if necessary: */
2717                                                 if (length > zero_limit) {
2718                                                         microuptime(&tv);
2719                                                         rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
2720                                                         cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
2721                                                 }
2722                                         } else {
2723                                         /* The page containing the (current) eof is invalid: just add the
2724                                            remainder of the page to the invalid list, along with the area
2725                                            being newly allocated:
2726                                          */
2727                                         microuptime(&tv);
2728                                         rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
2729                                         cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
2730                                         };
2731                                 }
2732                         } else {
2733                                         panic("hfs_truncate: invoked on non-UBC object?!");
2734                         };
2735                 }
2736                 cp->c_touch_modtime = TRUE;
2737                 fp->ff_size = length;
2738
2739         } else { /* Shorten the size of the file */
2740
2741                 if ((off_t)fp->ff_size > length) {
2742                         /* Any space previously marked as invalid is now irrelevant: */
2743                         rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
2744                 }
2745
2746                 /*
2747                  * Account for any unmapped blocks. Note that the new
2748                  * file length can still end up with unmapped blocks.
2749                  */
2750                 if (fp->ff_unallocblocks > 0) {
2751                         u_int32_t finalblks;
2752                         u_int32_t loanedBlocks;
2753
2754                         HFS_MOUNT_LOCK(hfsmp, TRUE);
2755
2756                         loanedBlocks = fp->ff_unallocblocks;
2757                         cp->c_blocks -= loanedBlocks;
2758                         fp->ff_blocks -= loanedBlocks;
2759                         fp->ff_unallocblocks = 0;
2760
2761                         hfsmp->loanedBlocks -= loanedBlocks;
2762
2763                         finalblks = (length + blksize - 1) / blksize;
2764                         if (finalblks > fp->ff_blocks) {
2765                                 /* calculate required unmapped blocks */
2766                                 loanedBlocks = finalblks - fp->ff_blocks;
2767                                 hfsmp->loanedBlocks += loanedBlocks;
2768
2769                                 fp->ff_unallocblocks = loanedBlocks;
2770                                 cp->c_blocks += loanedBlocks;
2771                                 fp->ff_blocks += loanedBlocks;
2772                         }
2773                         HFS_MOUNT_UNLOCK(hfsmp, TRUE);
2774                 }
2775
2776                 /*
2777                  * For a TBE process the deallocation of the file blocks is
2778                  * delayed until the file is closed.  And hfs_close calls
2779                  * truncate with the IO_NDELAY flag set.  So when IO_NDELAY
2780                  * isn't set, we make sure this isn't a TBE process.
2781                  */
2782                 if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
2783 #if QUOTA
2784                   off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
2785 #endif /* QUOTA */
2786                   if (hfs_start_transaction(hfsmp) != 0) {
2787                       retval = EINVAL;
2788                       goto Err_Exit;
2789                   }
2790
2791                         if (fp->ff_unallocblocks == 0) {
2792                                 /* Protect extents b-tree and allocation bitmap */
2793                                 lockflags = SFL_BITMAP;
2794                                 if (overflow_extents(fp))
2795                                         lockflags |= SFL_EXTENTS;
2796                                 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
2797
2798                                 retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
2799                                                 (FCB*)fp, length, false));
2800
2801                                 hfs_systemfile_unlock(hfsmp, lockflags);
2802                         }
2803                         if (hfsmp->jnl) {
2804                                 if (retval == 0) {
2805                                         fp->ff_size = length;
2806                                 }
2807                                 if (skipupdate) {
2808                                         (void) hfs_minorupdate(vp);
2809                                 }
2810                                 else {
2811                                         (void) hfs_update(vp, TRUE);
2812                                         (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2813                                 }
2814                         }
2815                         hfs_end_transaction(hfsmp);
2816
2817                         filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
2818                         if (retval)
2819                                 goto Err_Exit;
2820 #if QUOTA
2821                         /* These are bytesreleased */
2822                         (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
2823 #endif /* QUOTA */
2824                 }
2825                 /* Only set update flag if the logical length changes */
2826                 if ((off_t)fp->ff_size != length)
2827                         cp->c_touch_modtime = TRUE;
2828                 fp->ff_size = length;
2829         }
2830         if (cp->c_mode & (S_ISUID | S_ISGID)) {
2831                 if (!vfs_context_issuser(context)) {
2832                         cp->c_mode &= ~(S_ISUID | S_ISGID);
2833                         skipupdate = 0;
2834                 }
2835         }
2836         if (skipupdate) {
2837                 retval = hfs_minorupdate(vp);
2838         }
2839         else {
2840                 cp->c_touch_chgtime = TRUE;     /* status changed */
2841                 cp->c_touch_modtime = TRUE;     /* file data was modified */
2842                 retval = hfs_update(vp, MNT_WAIT);
2843         }
2844         if (retval) {
2845                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
2846                      -1, -1, -1, retval, 0);
2847         }
2848
2849 Err_Exit:
2850
2851         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
2852                  (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
2853
2854         return (retval);
2855 }
2856
2857
2858
2859 /*
2860  * Truncate a cnode to at most length size, freeing (or adding) the
2861  * disk blocks.
2862  */
2863 __private_extern__
2864 int
2865 hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
2866              int skipupdate, vfs_context_t context)
2867 {
2868         struct filefork *fp = VTOF(vp);
2869         off_t filebytes;
2870         u_int32_t fileblocks;
2871         int blksize, error = 0;
2872         struct cnode *cp = VTOC(vp);
2873
2874         /* Cannot truncate an HFS directory! */
2875         if (vnode_isdir(vp)) {
2876                 return (EISDIR);
2877         }
2878         /* A swap file cannot change size. */
2879         if (vnode_isswap(vp) && (length != 0)) {
2880                 return (EPERM);
2881         }
2882
2883         blksize = VTOVCB(vp)->blockSize;
2884         fileblocks = fp->ff_blocks;
2885         filebytes = (off_t)fileblocks * (off_t)blksize;
2886
2887         //
2888         // Have to do this here so that we don't wind up with
2889         // i/o pending for blocks that are about to be released
2890         // if we truncate the file.
2891         //
2892         // If skipsetsize is set, then the caller is responsible
2893         // for the ubc_setsize.
2894         //
2895         // Even if skipsetsize is set, if the length is zero we
2896         // want to call ubc_setsize() because as of SnowLeopard
2897         // it will no longer cause any page-ins and it will drop
2898         // any dirty pages so that we don't do any i/o that we
2899         // don't have to.  This also prevents a race where i/o
2900         // for truncated blocks may overwrite later data if the
2901         // blocks get reallocated to a different file.
2902         //
2903         if (!skipsetsize || length == 0)
2904                 ubc_setsize(vp, length);
2905
2906         // have to loop truncating or growing files that are
2907         // really big because otherwise transactions can get
2908         // enormous and consume too many kernel resources.
2909
2910         if (length < filebytes) {
2911                 while (filebytes > length) {
2912                         if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
2913                                 filebytes -= HFS_BIGFILE_SIZE;
2914                         } else {
2915                                 filebytes = length;
2916                         }
2917                         cp->c_flag |= C_FORCEUPDATE;
2918                         error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
2919                         if (error)
2920                                 break;
2921                 }
2922         } else if (length > filebytes) {
2923                 while (filebytes < length) {
2924                         if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
2925                                 filebytes += HFS_BIGFILE_SIZE;
2926                         } else {
2927                                 filebytes = length;
2928                         }
2929                         cp->c_flag |= C_FORCEUPDATE;
2930                         error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
2931                         if (error)
2932                                 break;
2933                 }
2934         } else /* Same logical size */ {
2935
2936                 error = do_hfs_truncate(vp, length, flags, skipupdate, context);
2937         }
2938         /* Files that are changing size are not hot file candidates. */
2939         if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
2940                 fp->ff_bytesread = 0;
2941         }
2942
2943         return (error);
2944 }
2945
2946
2947
2948 /*
2949  * Preallocate file storage space.
2950  */
2951 int
2952 hfs_vnop_allocate(struct vnop_allocate_args /* {
2953                 vnode_t a_vp;
2954                 off_t a_length;
2955                 u_int32_t  a_flags;
2956                 off_t *a_bytesallocated;
2957                 off_t a_offset;
2958                 vfs_context_t a_context;
2959         } */ *ap)
2960 {
2961         struct vnode *vp = ap->a_vp;
2962         struct cnode *cp;
2963         struct filefork *fp;
2964         ExtendedVCB *vcb;
2965         off_t length = ap->a_length;
2966         off_t startingPEOF;
2967         off_t moreBytesRequested;
2968         off_t actualBytesAdded;
2969         off_t filebytes;
2970         u_int32_t fileblocks;
2971         int retval, retval2;
2972         u_int32_t blockHint;
2973         u_int32_t extendFlags;   /* For call to ExtendFileC */
2974         struct hfsmount *hfsmp;
2975         kauth_cred_t cred = vfs_context_ucred(ap->a_context);
2976         int lockflags;
2977
2978         *(ap->a_bytesallocated) = 0;
2979
2980         if (!vnode_isreg(vp))
2981                 return (EISDIR);
2982         if (length < (off_t)0)
2983                 return (EINVAL);
2984
2985         cp = VTOC(vp);
2986
2987         hfs_lock_truncate(cp, TRUE);
2988
2989         if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
2990                 goto Err_Exit;
2991         }
2992
2993         fp = VTOF(vp);
2994         hfsmp = VTOHFS(vp);
2995         vcb = VTOVCB(vp);
2996
2997         fileblocks = fp->ff_blocks;
2998         filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
2999
3000         if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
3001                 retval = EINVAL;
3002                 goto Err_Exit;
3003         }
3004
3005         /* Fill in the flags word for the call to Extend the file */
3006
3007         extendFlags = kEFNoClumpMask;
3008         if (ap->a_flags & ALLOCATECONTIG)
3009                 extendFlags |= kEFContigMask;
3010         if (ap->a_flags & ALLOCATEALL)
3011                 extendFlags |= kEFAllMask;
3012         if (cred && suser(cred, NULL) != 0)
3013                 extendFlags |= kEFReserveMask;
3014         if (hfs_virtualmetafile(cp))
3015                 extendFlags |= kEFMetadataMask;
3016
3017         retval = E_NONE;
3018         blockHint = 0;
3019         startingPEOF = filebytes;
3020
3021         if (ap->a_flags & ALLOCATEFROMPEOF)
3022                 length += filebytes;
3023         else if (ap->a_flags & ALLOCATEFROMVOL)
3024                 blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
3025
3026         /* If no changes are necesary, then we're done */
3027         if (filebytes == length)
3028                 goto Std_Exit;
3029
3030         /*
3031          * Lengthen the size of the file. We must ensure that the
3032          * last byte of the file is allocated. Since the smallest
3033          * value of filebytes is 0, length will be at least 1.
3034          */
3035         if (length > filebytes) {
3036                 off_t total_bytes_added = 0, orig_request_size;
3037
3038                 orig_request_size = moreBytesRequested = length - filebytes;
3039
3040 #if QUOTA
3041                 retval = hfs_chkdq(cp,
3042                                 (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
3043                                 cred, 0);
3044                 if (retval)
3045                         goto Err_Exit;
3046
3047 #endif /* QUOTA */
3048                 /*
3049                  * Metadata zone checks.
3050                  */
3051                 if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
3052                         /*
3053                          * Allocate Journal and Quota files in metadata zone.
3054                          */
3055                         if (hfs_virtualmetafile(cp)) {
3056                                 blockHint = hfsmp->hfs_metazone_start;
3057                         } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
3058                                    (blockHint <= hfsmp->hfs_metazone_end)) {
3059                                 /*
3060                                  * Move blockHint outside metadata zone.
3061                                  */
3062                                 blockHint = hfsmp->hfs_metazone_end + 1;
3063                         }
3064                 }
3065
3066
3067                 while ((length > filebytes) && (retval == E_NONE)) {
3068                     off_t bytesRequested;
3069
3070                     if (hfs_start_transaction(hfsmp) != 0) {
3071                         retval = EINVAL;
3072                         goto Err_Exit;
3073                     }
3074
3075                     /* Protect extents b-tree and allocation bitmap */
3076                     lockflags = SFL_BITMAP;
3077                     if (overflow_extents(fp))
3078                         lockflags |= SFL_EXTENTS;
3079                     lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3080
3081                     if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
3082                         bytesRequested = HFS_BIGFILE_SIZE;
3083                     } else {
3084                         bytesRequested = moreBytesRequested;
3085                     }
3086
3087                     if (extendFlags & kEFContigMask) {
3088                             // if we're on a sparse device, this will force it to do a
3089                             // full scan to find the space needed.
3090                             hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
3091                     }
3092
3093                     retval = MacToVFSError(ExtendFileC(vcb,
3094                                                 (FCB*)fp,
3095                                                 bytesRequested,
3096                                                 blockHint,
3097                                                 extendFlags,
3098                                                 &actualBytesAdded));
3099
3100                     if (retval == E_NONE) {
3101                         *(ap->a_bytesallocated) += actualBytesAdded;
3102                         total_bytes_added += actualBytesAdded;
3103                         moreBytesRequested -= actualBytesAdded;
3104                         if (blockHint != 0) {
3105                             blockHint += actualBytesAdded / vcb->blockSize;
3106                         }
3107                     }
3108                     filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
3109
3110                     hfs_systemfile_unlock(hfsmp, lockflags);
3111
3112                     if (hfsmp->jnl) {
3113                         (void) hfs_update(vp, TRUE);
3114                         (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3115                     }
3116
3117                     hfs_end_transaction(hfsmp);
3118                 }
3119
3120
3121                 /*
3122                  * if we get an error and no changes were made then exit
3123                  * otherwise we must do the hfs_update to reflect the changes
3124                  */
3125                 if (retval && (startingPEOF == filebytes))
3126                         goto Err_Exit;
3127
3128                 /*
3129                  * Adjust actualBytesAdded to be allocation block aligned, not
3130                  * clump size aligned.
3131                  * NOTE: So what we are reporting does not affect reality
3132                  * until the file is closed, when we truncate the file to allocation
3133                  * block size.
3134                  */
3135                 if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
3136                         *(ap->a_bytesallocated) =
3137                                 roundup(orig_request_size, (off_t)vcb->blockSize);
3138
3139         } else { /* Shorten the size of the file */
3140
3141                 if (fp->ff_size > length) {
3142                         /*
3143                          * Any buffers that are past the truncation point need to be
3144                          * invalidated (to maintain buffer cache consistency).
3145                          */
3146                 }
3147
3148                 retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context);
3149                 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
3150
3151                 /*
3152                  * if we get an error and no changes were made then exit
3153                  * otherwise we must do the hfs_update to reflect the changes
3154                  */
3155                 if (retval && (startingPEOF == filebytes)) goto Err_Exit;
3156 #if QUOTA
3157                 /* These are  bytesreleased */
3158                 (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
3159 #endif /* QUOTA */
3160
3161                 if (fp->ff_size > filebytes) {
3162                         fp->ff_size = filebytes;
3163
3164                         hfs_unlock(cp);
3165                         ubc_setsize(vp, fp->ff_size);
3166                         hfs_lock(cp, HFS_FORCE_LOCK);
3167                 }
3168         }
3169
3170 Std_Exit:
3171         cp->c_touch_chgtime = TRUE;
3172         cp->c_touch_modtime = TRUE;
3173         retval2 = hfs_update(vp, MNT_WAIT);
3174
3175         if (retval == 0)
3176                 retval = retval2;
3177 Err_Exit:
3178         hfs_unlock_truncate(cp, TRUE);
3179         hfs_unlock(cp);
3180         return (retval);
3181 }
3182
3183
3184 /*
3185  * Pagein for HFS filesystem
3186  */
3187 int
3188 hfs_vnop_pagein(struct vnop_pagein_args *ap)
3189 /*
3190         struct vnop_pagein_args {
3191                 vnode_t a_vp,
3192                 upl_t         a_pl,
3193                 vm_offset_t   a_pl_offset,
3194                 off_t         a_f_offset,
3195                 size_t        a_size,
3196                 int           a_flags
3197                 vfs_context_t a_context;
3198         };
3199 */
3200 {
3201         vnode_t vp = ap->a_vp;
3202         int error;
3203
3204 #if HFS_COMPRESSION
3205         if (VNODE_IS_RSRC(vp)) {
3206                 /* allow pageins of the resource fork */
3207         } else {
3208                 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
3209                 if (compressed) {
3210                         error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
3211                         if (compressed) {
3212                                 if (error == 0) {
3213                                         /* successful page-in, update the access time */
3214                                         VTOC(vp)->c_touch_acctime = TRUE;
3215
3216                                         /* compressed files are not hot file candidates */
3217                                         if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
3218                                                 VTOF(vp)->ff_bytesread = 0;
3219                                         }
3220                                 }
3221                                 return error;
3222                         }
3223                         /* otherwise the file was converted back to a regular file while we were reading it */
3224                 }
3225         }
3226 #endif
3227
3228         error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
3229                                ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags);
3230         /*
3231          * Keep track of blocks read.
3232          */
3233         if (!vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
3234                 struct cnode *cp;
3235                 struct filefork *fp;
3236                 int bytesread;
3237                 int took_cnode_lock = 0;
3238
3239                 cp = VTOC(vp);
3240                 fp = VTOF(vp);
3241
3242                 if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
3243                         bytesread = fp->ff_size;
3244                 else
3245                         bytesread = ap->a_size;
3246
3247                 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
3248                 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
3249                         hfs_lock(cp, HFS_FORCE_LOCK);
3250                         took_cnode_lock = 1;
3251                 }
3252                 /*
3253                  * If this file hasn't been seen since the start of
3254                  * the current sampling period then start over.
3255                  */
3256                 if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
3257                         struct timeval tv;
3258
3259                         fp->ff_bytesread = bytesread;
3260                         microtime(&tv);
3261                         cp->c_atime = tv.tv_sec;
3262                 } else {
3263                         fp->ff_bytesread += bytesread;
3264                 }
3265                 cp->c_touch_acctime = TRUE;
3266                 if (took_cnode_lock)
3267                         hfs_unlock(cp);
3268         }
3269         return (error);
3270 }
3271
3272 /*
3273  * Pageout for HFS filesystem.
3274  */
3275 int
3276 hfs_vnop_pageout(struct vnop_pageout_args *ap)
3277 /*
3278         struct vnop_pageout_args {
3279            vnode_t a_vp,
3280            upl_t         a_pl,
3281            vm_offset_t   a_pl_offset,
3282            off_t         a_f_offset,
3283            size_t        a_size,
3284            int           a_flags
3285            vfs_context_t a_context;
3286         };
3287 */
3288 {
3289         vnode_t vp = ap->a_vp;
3290         struct cnode *cp;
3291         struct filefork *fp;
3292         int retval = 0;
3293         off_t filesize;
3294         upl_t           upl;
3295         upl_page_info_t* pl;
3296         vm_offset_t     a_pl_offset;
3297         int             a_flags;
3298         int is_pageoutv2 = 0;
3299         kern_return_t kret;
3300
3301         cp = VTOC(vp);
3302         fp = VTOF(vp);
3303
3304         /*
3305          * Figure out where the file ends, for pageout purposes.  If
3306          * ff_new_size > ff_size, then we're in the middle of extending the
3307          * file via a write, so it is safe (and necessary) that we be able
3308          * to pageout up to that point.
3309          */
3310         filesize = fp->ff_size;
3311         if (fp->ff_new_size > filesize)
3312                 filesize = fp->ff_new_size;
3313
3314         a_flags = ap->a_flags;
3315         a_pl_offset = ap->a_pl_offset;
3316
3317         /*
3318          * we can tell if we're getting the new or old behavior from the UPL
3319          */
3320         if ((upl = ap->a_pl) == NULL) {
3321                 int request_flags;
3322
3323                 is_pageoutv2 = 1;
3324                 /*
3325                  * we're in control of any UPL we commit
3326                  * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
3327                  */
3328                 a_flags &= ~UPL_NOCOMMIT;
3329                 a_pl_offset = 0;
3330
3331                 /*
3332                  * take truncate lock (shared) to guard against
3333                  * zero-fill thru fsync interfering, but only for v2
3334                  */
3335                 hfs_lock_truncate(cp, 0);
3336
3337                 if (a_flags & UPL_MSYNC) {
3338                         request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
3339                 }
3340                 else {
3341                         request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
3342                 }
3343                 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
3344
3345                 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
3346                         retval = EINVAL;
3347                         goto pageout_done;
3348                 }
3349         }
3350         /*
3351          * from this point forward upl points at the UPL we're working with
3352          * it was either passed in or we succesfully created it
3353          */
3354
3355         /*
3356          * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
3357          * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
3358          * scanning for dirty ranges.  We'll issue our own N cluster_pageout calls, for
3359          * N dirty ranges in the UPL.  Note that this is almost a direct copy of the
3360          * logic in vnode_pageout except that we need to do it after grabbing the truncate
3361          * lock in HFS so that we don't lock invert ourselves.
3362          *
3363          * Note that we can still get into this function on behalf of the default pager with
3364          * non-V2 behavior (swapfiles).  However in that case, we did not grab locks above
3365          * since fsync and other writing threads will grab the locks, then mark the
3366          * relevant pages as busy.  But the pageout codepath marks the pages as busy,
3367          * and THEN would attempt to grab the truncate lock, which would result in deadlock.  So
3368          * we do not try to grab anything for the pre-V2 case, which should only be accessed
3369          * by the paging/VM system.
3370          */
3371
3372         if (is_pageoutv2) {
3373                 off_t f_offset;
3374                 int offset;
3375                 int isize;
3376                 int pg_index;
3377                 int error;
3378                 int error_ret = 0;
3379
3380                 isize = ap->a_size;
3381                 f_offset = ap->a_f_offset;
3382
3383                 /*
3384                  * Scan from the back to find the last page in the UPL, so that we
3385                  * aren't looking at a UPL that may have already been freed by the
3386                  * preceding aborts/completions.
3387                  */
3388                 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
3389                         if (upl_page_present(pl, --pg_index))
3390                                 break;
3391                         if (pg_index == 0) {
3392                                 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
3393                                 goto pageout_done;
3394                         }
3395                 }
3396
3397                 /*
3398                  * initialize the offset variables before we touch the UPL.
3399                  * a_f_offset is the position into the file, in bytes
3400                  * offset is the position into the UPL, in bytes
3401                  * pg_index is the pg# of the UPL we're operating on.
3402                  * isize is the offset into the UPL of the last non-clean page.
3403                  */
3404                 isize = ((pg_index + 1) * PAGE_SIZE);
3405
3406                 offset = 0;
3407                 pg_index = 0;
3408
3409                 while (isize) {
3410                         int  xsize;
3411                         int  num_of_pages;
3412
3413                         if ( !upl_page_present(pl, pg_index)) {
3414                                 /*
3415                                  * we asked for RET_ONLY_DIRTY, so it's possible
3416                                  * to get back empty slots in the UPL.
3417                                  * just skip over them
3418                                  */
3419                                 f_offset += PAGE_SIZE;
3420                                 offset   += PAGE_SIZE;
3421                                 isize    -= PAGE_SIZE;
3422                                 pg_index++;
3423
3424                                 continue;
3425                         }
3426                         if ( !upl_dirty_page(pl, pg_index)) {
3427                                 panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
3428                         }
3429
3430                         /*
3431                          * We know that we have at least one dirty page.
3432                          * Now checking to see how many in a row we have
3433                          */
3434                         num_of_pages = 1;
3435                         xsize = isize - PAGE_SIZE;
3436
3437                         while (xsize) {
3438                                 if ( !upl_dirty_page(pl, pg_index + num_of_pages))
3439                                         break;
3440                                 num_of_pages++;
3441                                 xsize -= PAGE_SIZE;
3442                         }
3443                         xsize = num_of_pages * PAGE_SIZE;
3444
3445                         if (!vnode_isswap(vp)) {
3446                                 off_t end_of_range;
3447                                 int tooklock;
3448
3449                                 tooklock = 0;
3450
3451                                 if (cp->c_lockowner != current_thread()) {
3452                                         if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
3453                                                 /*
3454                                                  * we're in the v2 path, so we are the
3455                                                  * owner of the UPL... we may have already
3456                                                  * processed some of the UPL, so abort it
3457                                                  * from the current working offset to the
3458                                                  * end of the UPL
3459                                                  */
3460                                                 ubc_upl_abort_range(upl,
3461                                                                     offset,
3462                                                                     ap->a_size - offset,
3463                                                                     UPL_ABORT_FREE_ON_EMPTY);
3464                                                 goto pageout_done;
3465                                         }
3466                                         tooklock = 1;
3467                                 }
3468                                 end_of_range = f_offset + xsize - 1;
3469
3470                                 if (end_of_range >= filesize) {
3471                                         end_of_range = (off_t)(filesize - 1);
3472                                 }
3473                                 if (f_offset < filesize) {
3474                                         rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
3475                                         cp->c_flag |= C_MODIFIED;  /* leof is dirty */
3476                                 }
3477                                 if (tooklock) {
3478                                         hfs_unlock(cp);
3479                                 }
3480                         }
3481                         if ((error = cluster_pageout(vp, upl, offset, f_offset,
3482                                                         xsize, filesize, a_flags))) {
3483                                 if (error_ret == 0)
3484                                         error_ret = error;
3485                         }
3486                         f_offset += xsize;
3487                         offset   += xsize;
3488                         isize    -= xsize;
3489                         pg_index += num_of_pages;
3490                 }
3491                 /* capture errnos bubbled out of cluster_pageout if they occurred */
3492                 if (error_ret != 0) {
3493                         retval = error_ret;
3494                 }
3495         } /* end block for v2 pageout behavior */
3496         else {
3497                 if (!vnode_isswap(vp)) {
3498                         off_t end_of_range;
3499                         int tooklock = 0;
3500
3501                         if (cp->c_lockowner != current_thread()) {
3502                                 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
3503                                         if (!(a_flags & UPL_NOCOMMIT)) {
3504                                                 ubc_upl_abort_range(upl,
3505                                                                     a_pl_offset,
3506                                                                     ap->a_size,
3507                                                                     UPL_ABORT_FREE_ON_EMPTY);
3508                                         }
3509                                         goto pageout_done;
3510                                 }
3511                                 tooklock = 1;
3512                         }
3513                         end_of_range = ap->a_f_offset + ap->a_size - 1;
3514
3515                         if (end_of_range >= filesize) {
3516                                 end_of_range = (off_t)(filesize - 1);
3517                         }
3518                         if (ap->a_f_offset < filesize) {
3519                                 rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
3520                                 cp->c_flag |= C_MODIFIED;  /* leof is dirty */
3521                         }
3522
3523                         if (tooklock) {
3524                                 hfs_unlock(cp);
3525                         }
3526                 }
3527                 /*
3528                  * just call cluster_pageout for old pre-v2 behavior
3529                  */
3530                 retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
3531                                 ap->a_size, filesize, a_flags);
3532         }
3533
3534         /*
3535          * If data was written, update the modification time of the file.
3536          * If setuid or setgid bits are set and this process is not the
3537          * superuser then clear the setuid and setgid bits as a precaution
3538          * against tampering.
3539          */
3540         if (retval == 0) {
3541                 cp->c_touch_modtime = TRUE;
3542                 cp->c_touch_chgtime = TRUE;
3543                 if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
3544                     (vfs_context_suser(ap->a_context) != 0)) {
3545                         hfs_lock(cp, HFS_FORCE_LOCK);
3546                         cp->c_mode &= ~(S_ISUID | S_ISGID);
3547                         hfs_unlock(cp);
3548                 }
3549         }
3550
3551 pageout_done:
3552         if (is_pageoutv2) {
3553                 /* release truncate lock (shared) */
3554                 hfs_unlock_truncate(cp, 0);
3555         }
3556         return (retval);
3557 }
3558
3559 /*
3560  * Intercept B-Tree node writes to unswap them if necessary.
3561  */
3562 int
3563 hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
3564 {
3565         int retval = 0;
3566         register struct buf *bp = ap->a_bp;
3567         register struct vnode *vp = buf_vnode(bp);
3568         BlockDescriptor block;
3569
3570         /* Trap B-Tree writes */
3571         if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
3572             (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
3573             (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
3574             (vp == VTOHFS(vp)->hfc_filevp)) {
3575
3576                 /*
3577                  * Swap and validate the node if it is in native byte order.
3578                  * This is always be true on big endian, so we always validate
3579                  * before writing here.  On little endian, the node typically has
3580                  * been swapped and validated when it was written to the journal,
3581                  * so we won't do anything here.
3582                  */
3583                 if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
3584                         /* Prepare the block pointer */
3585                         block.blockHeader = bp;
3586                         block.buffer = (char *)buf_dataptr(bp);
3587                         block.blockNum = buf_lblkno(bp);
3588                         /* not found in cache ==> came from disk */
3589                         block.blockReadFromDisk = (buf_fromcache(bp) == 0);
3590                         block.blockSize = buf_count(bp);
3591
3592                         /* Endian un-swap B-Tree node */
3593                         retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
3594                         if (retval)
3595                                 panic("hfs_vnop_bwrite: about to write corrupt node!\n");
3596                 }
3597         }
3598
3599         /* This buffer shouldn't be locked anymore but if it is clear it */
3600         if ((buf_flags(bp) & B_LOCKED)) {
3601                 // XXXdbg
3602                 if (VTOHFS(vp)->jnl) {
3603                         panic("hfs: CLEARING the lock bit on bp %p\n", bp);
3604                 }
3605                 buf_clearflags(bp, B_LOCKED);
3606         }
3607         retval = vn_bwrite (ap);
3608
3609         return (retval);
3610 }
3611
3612 /*
3613  * Relocate a file to a new location on disk
3614  *  cnode must be locked on entry
3615  *
3616  * Relocation occurs by cloning the file's data from its
3617  * current set of blocks to a new set of blocks. During
3618  * the relocation all of the blocks (old and new) are
3619  * owned by the file.
3620  *
3621  * -----------------
3622  * |///////////////|
3623  * -----------------
3624  * 0               N (file offset)
3625  *
3626  * -----------------     -----------------
3627  * |///////////////|     |               |     STEP 1 (acquire new blocks)
3628  * -----------------     -----------------
3629  * 0               N     N+1             2N
3630  *
3631  * -----------------     -----------------
3632  * |///////////////|     |///////////////|     STEP 2 (clone data)
3633  * -----------------     -----------------
3634  * 0               N     N+1             2N
3635  *
3636  *                       -----------------
3637  *                       |///////////////|     STEP 3 (head truncate blocks)
3638  *                       -----------------
3639  *                       0               N
3640  *
3641  * During steps 2 and 3 page-outs to file offsets less
3642  * than or equal to N are suspended.
3643  *
3644  * During step 3 page-ins to the file get suspended.
3645  */
3646 __private_extern__
3647 int
3648 hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
3649         struct  proc *p)
3650 {
3651         struct  cnode *cp;
3652         struct  filefork *fp;
3653         struct  hfsmount *hfsmp;
3654         u_int32_t  headblks;
3655         u_int32_t  datablks;
3656         u_int32_t  blksize;
3657         u_int32_t  growsize;
3658         u_int32_t  nextallocsave;
3659         daddr64_t  sector_a,  sector_b;
3660         int eflags;
3661         off_t  newbytes;
3662         int  retval;
3663         int lockflags = 0;
3664         int took_trunc_lock = 0;
3665         int started_tr = 0;
3666         enum vtype vnodetype;
3667
3668         vnodetype = vnode_vtype(vp);
3669         if (vnodetype != VREG && vnodetype != VLNK) {
3670                 return (EPERM);
3671         }
3672
3673         hfsmp = VTOHFS(vp);
3674         if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
3675                 return (ENOSPC);
3676         }
3677
3678         cp = VTOC(vp);
3679         fp = VTOF(vp);
3680         if (fp->ff_unallocblocks)
3681                 return (EINVAL);
3682         blksize = hfsmp->blockSize;
3683         if (blockHint == 0)
3684                 blockHint = hfsmp->nextAllocation;
3685
3686         if ((fp->ff_size > 0x7fffffff) ||
3687             ((fp->ff_size > blksize) && vnodetype == VLNK)) {
3688                 return (EFBIG);
3689         }
3690
3691         //
3692         // We do not believe that this call to hfs_fsync() is
3693         // necessary and it causes a journal transaction
3694         // deadlock so we are removing it.
3695         //
3696         //if (vnodetype == VREG && !vnode_issystem(vp)) {
3697         //      retval = hfs_fsync(vp, MNT_WAIT, 0, p);
3698         //      if (retval)
3699         //              return (retval);
3700         //}
3701
3702         if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
3703                 hfs_unlock(cp);
3704                 hfs_lock_truncate(cp, TRUE);
3705                 /* Force lock since callers expects lock to be held. */
3706                 if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
3707                         hfs_unlock_truncate(cp, TRUE);
3708                         return (retval);
3709                 }
3710                 /* No need to continue if file was removed. */
3711                 if (cp->c_flag & C_NOEXISTS) {
3712                         hfs_unlock_truncate(cp, TRUE);
3713                         return (ENOENT);
3714                 }
3715                 took_trunc_lock = 1;
3716         }
3717         headblks = fp->ff_blocks;
3718         datablks = howmany(fp->ff_size, blksize);
3719         growsize = datablks * blksize;
3720         eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
3721         if (blockHint >= hfsmp->hfs_metazone_start &&
3722             blockHint <= hfsmp->hfs_metazone_end)
3723                 eflags |= kEFMetadataMask;
3724
3725         if (hfs_start_transaction(hfsmp) != 0) {
3726                 if (took_trunc_lock)
3727                         hfs_unlock_truncate(cp, TRUE);
3728             return (EINVAL);
3729         }
3730         started_tr = 1;
3731         /*
3732          * Protect the extents b-tree and the allocation bitmap
3733          * during MapFileBlockC and ExtendFileC operations.
3734          */
3735         lockflags = SFL_BITMAP;
3736         if (overflow_extents(fp))
3737                 lockflags |= SFL_EXTENTS;
3738         lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3739
3740         retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
3741         if (retval) {
3742                 retval = MacToVFSError(retval);
3743                 goto out;
3744         }
3745
3746         /*
3747          * STEP 1 - acquire new allocation blocks.
3748          */
3749         nextallocsave = hfsmp->nextAllocation;
3750         retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
3751         if (eflags & kEFMetadataMask) {
3752                 HFS_MOUNT_LOCK(hfsmp, TRUE);
3753                 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
3754                 MarkVCBDirty(hfsmp);
3755                 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
3756         }
3757
3758         retval = MacToVFSError(retval);
3759         if (retval == 0) {
3760                 cp->c_flag |= C_MODIFIED;
3761                 if (newbytes < growsize) {
3762                         retval = ENOSPC;
3763                         goto restore;
3764                 } else if (fp->ff_blocks < (headblks + datablks)) {
3765                         printf("hfs_relocate: allocation failed");
3766                         retval = ENOSPC;
3767                         goto restore;
3768                 }
3769
3770                 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
3771                 if (retval) {
3772                         retval = MacToVFSError(retval);
3773                 } else if ((sector_a + 1) == sector_b) {
3774                         retval = ENOSPC;
3775                         goto restore;
3776                 } else if ((eflags & kEFMetadataMask) &&
3777                            ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
3778                               hfsmp->hfs_metazone_end)) {
3779 #if 0
3780                         const char * filestr;
3781                         char emptystr = '\0';
3782
3783                         if (cp->c_desc.cd_nameptr != NULL) {
3784                                 filestr = (const char *)&cp->c_desc.cd_nameptr[0];
3785                         } else if (vnode_name(vp) != NULL) {
3786                                 filestr = vnode_name(vp);
3787                         } else {
3788                                 filestr = &emptystr;
3789                         }
3790 #endif
3791                         retval = ENOSPC;
3792                         goto restore;
3793                 }
3794         }
3795         /* Done with system locks and journal for now. */
3796         hfs_systemfile_unlock(hfsmp, lockflags);
3797         lockflags = 0;
3798         hfs_end_transaction(hfsmp);
3799         started_tr = 0;
3800
3801         if (retval) {
3802                 /*
3803                  * Check to see if failure is due to excessive fragmentation.
3804                  */
3805                 if ((retval == ENOSPC) &&
3806                     (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
3807                         hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
3808                 }
3809                 goto out;
3810         }
3811         /*
3812          * STEP 2 - clone file data into the new allocation blocks.
3813          */
3814
3815         if (vnodetype == VLNK)
3816                 retval = hfs_clonelink(vp, blksize, cred, p);
3817         else if (vnode_issystem(vp))
3818                 retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
3819         else
3820                 retval = hfs_clonefile(vp, headblks, datablks, blksize);
3821
3822         /* Start transaction for step 3 or for a restore. */
3823         if (hfs_start_transaction(hfsmp) != 0) {
3824                 retval = EINVAL;
3825                 goto out;
3826         }
3827         started_tr = 1;
3828         if (retval)
3829                 goto restore;
3830
3831         /*
3832          * STEP 3 - switch to cloned data and remove old blocks.
3833          */
3834         lockflags = SFL_BITMAP;
3835         if (overflow_extents(fp))
3836                 lockflags |= SFL_EXTENTS;
3837         lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3838
3839         retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
3840
3841         hfs_systemfile_unlock(hfsmp, lockflags);
3842         lockflags = 0;
3843         if (retval)
3844                 goto restore;
3845 out:
3846         if (took_trunc_lock)
3847                 hfs_unlock_truncate(cp, TRUE);
3848
3849         if (lockflags) {
3850                 hfs_systemfile_unlock(hfsmp, lockflags);
3851                 lockflags = 0;
3852         }
3853
3854         /* Push cnode's new extent data to disk. */
3855         if (retval == 0) {
3856                 (void) hfs_update(vp, MNT_WAIT);
3857         }
3858         if (hfsmp->jnl) {
3859                 if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
3860                         (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
3861                 else
3862                         (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
3863         }
3864 exit:
3865         if (started_tr)
3866                 hfs_end_transaction(hfsmp);
3867
3868         return (retval);
3869
3870 restore:
3871         if (fp->ff_blocks == headblks) {
3872                 if (took_trunc_lock)
3873                         hfs_unlock_truncate(cp, TRUE);
3874                 goto exit;
3875         }
3876         /*
3877          * Give back any newly allocated space.
3878          */
3879         if (lockflags == 0) {
3880                 lockflags = SFL_BITMAP;
3881                 if (overflow_extents(fp))
3882                         lockflags |= SFL_EXTENTS;
3883                 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3884         }
3885
3886         (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false);
3887
3888         hfs_systemfile_unlock(hfsmp, lockflags);
3889         lockflags = 0;
3890
3891         if (took_trunc_lock)
3892                 hfs_unlock_truncate(cp, TRUE);
3893         goto exit;
3894 }
3895
3896
3897 /*
3898  * Clone a symlink.
3899  *
3900  */
3901 static int
3902 hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
3903 {
3904         struct buf *head_bp = NULL;
3905         struct buf *tail_bp = NULL;
3906         int error;
3907
3908
3909         error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);
3910         if (error)
3911                 goto out;
3912
3913         tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
3914         if (tail_bp == NULL) {
3915                 error = EIO;
3916                 goto out;
3917         }
3918         bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
3919         error = (int)buf_bwrite(tail_bp);
3920 out:
3921         if (head_bp) {
3922                 buf_markinvalid(head_bp);
3923                 buf_brelse(head_bp);
3924         }
3925         (void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
3926
3927         return (error);
3928 }
3929
3930 /*
3931  * Clone a file's data within the file.
3932  *
3933  */
3934 static int
3935 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
3936 {
3937         caddr_t  bufp;
3938         size_t  bufsize;
3939         size_t  copysize;
3940         size_t  iosize;
3941         size_t  offset;
3942         off_t   writebase;
3943         uio_t auio;
3944         int  error = 0;
3945
3946         writebase = blkstart * blksize;
3947         copysize = blkcnt * blksize;
3948         iosize = bufsize = MIN(copysize, 128 * 1024);
3949         offset = 0;
3950
3951         if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
3952                 return (ENOMEM);
3953         }
3954         hfs_unlock(VTOC(vp));
3955
3956         auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
3957
3958         while (offset < copysize) {
3959                 iosize = MIN(copysize - offset, iosize);
3960
3961                 uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
3962                 uio_addiov(auio, (uintptr_t)bufp, iosize);
3963
3964                 error = cluster_read(vp, auio, copysize, IO_NOCACHE);
3965                 if (error) {
3966                         printf("hfs_clonefile: cluster_read failed - %d\n", error);
3967                         break;
3968                 }
3969                 if (uio_resid(auio) != 0) {
3970                         printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio));
3971                         error = EIO;
3972                         break;
3973                 }
3974
3975                 uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
3976                 uio_addiov(auio, (uintptr_t)bufp, iosize);
3977
3978                 error = cluster_write(vp, auio, writebase + offset,
3979                                       writebase + offset + iosize,
3980                                       uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
3981                 if (error) {
3982                         printf("hfs_clonefile: cluster_write failed - %d\n", error);
3983                         break;
3984                 }
3985                 if (uio_resid(auio) != 0) {
3986                         printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
3987                         error = EIO;
3988                         break;
3989                 }
3990                 offset += iosize;
3991         }
3992         uio_free(auio);
3993
3994         if ((blksize & PAGE_MASK)) {
3995                 /*
3996                  * since the copy may not have started on a PAGE
3997                  * boundary (or may not have ended on one), we
3998                  * may have pages left in the cache since NOCACHE
3999                  * will let partially written pages linger...
4000                  * lets just flush the entire range to make sure
4001                  * we don't have any pages left that are beyond
4002                  * (or intersect) the real LEOF of this file
4003                  */
4004                 ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
4005         } else {
4006                 /*
4007                  * No need to call ubc_sync_range or hfs_invalbuf
4008                  * since the file was copied using IO_NOCACHE and
4009                  * the copy was done starting and ending on a page
4010                  * boundary in the file.
4011                  */
4012         }
4013         kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
4014
4015         hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
4016         return (error);
4017 }
4018
4019 /*
4020  * Clone a system (metadata) file.
4021  *
4022  */
4023 static int
4024 hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
4025                  kauth_cred_t cred, struct proc *p)
4026 {
4027         caddr_t  bufp;
4028         char * offset;
4029         size_t  bufsize;
4030         size_t  iosize;
4031         struct buf *bp = NULL;
4032         daddr64_t  blkno;
4033         daddr64_t  blk;
4034         daddr64_t  start_blk;
4035         daddr64_t  last_blk;
4036         int  breadcnt;
4037         int  i;
4038         int  error = 0;
4039
4040
4041         iosize = GetLogicalBlockSize(vp);
4042         bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
4043         breadcnt = bufsize / iosize;
4044
4045         if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
4046                 return (ENOMEM);
4047         }
4048         start_blk = ((daddr64_t)blkstart * blksize) / iosize;
4049         last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;
4050         blkno = 0;
4051
4052         while (blkno < last_blk) {
4053                 /*
4054                  * Read up to a megabyte
4055                  */
4056                 offset = bufp;
4057                 for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
4058                         error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
4059                         if (error) {
4060                                 printf("hfs_clonesysfile: meta_bread error %d\n", error);
4061                                 goto out;
4062                         }
4063                         if (buf_count(bp) != iosize) {
4064                                 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
4065                                 goto out;
4066                         }
4067                         bcopy((char *)buf_dataptr(bp), offset, iosize);
4068
4069                         buf_markinvalid(bp);
4070                         buf_brelse(bp);
4071                         bp = NULL;
4072
4073                         offset += iosize;
4074                 }
4075
4076                 /*
4077                  * Write up to a megabyte
4078                  */
4079                 offset = bufp;
4080                 for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
4081                         bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
4082                         if (bp == NULL) {
4083                                 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
4084                                 error = EIO;
4085                                 goto out;
4086                         }
4087                         bcopy(offset, (char *)buf_dataptr(bp), iosize);
4088                         error = (int)buf_bwrite(bp);
4089                         bp = NULL;
4090                         if (error)
4091                                 goto out;
4092                         offset += iosize;
4093                 }
4094         }
4095 out:
4096         if (bp) {
4097                 buf_brelse(bp);
4098         }
4099
4100         kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
4101
4102         error = hfs_fsync(vp, MNT_WAIT, 0, p);
4103
4104         return (error);
4105 }