2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* @(#)hfs_readwrite.c 1.0
30 * (c) 1998-2001 Apple Computer, Inc. All Rights Reserved
32 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/resourcevar.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/filedesc.h>
45 #include <sys/kauth.h>
46 #include <sys/vnode.h>
47 #include <sys/vnode_internal.h>
49 #include <sys/vfs_context.h>
50 #include <sys/fsevents.h>
51 #include <kern/kalloc.h>
53 #include <sys/sysctl.h>
55 #include <miscfs/specfs/specdev.h>
58 #include <sys/ubc_internal.h>
60 #include <vm/vm_pageout.h>
61 #include <vm/vm_kern.h>
63 #include <sys/kdebug.h>
66 #include "hfs_attrlist.h"
67 #include "hfs_endian.h"
68 #include "hfs_fsctl.h"
69 #include "hfs_quota.h"
70 #include "hfscommon/headers/FileMgrInternal.h"
71 #include "hfscommon/headers/BTreesInternal.h"
72 #include "hfs_cnode.h"
75 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
78 MAXHFSFILESIZE
= 0x7FFFFFFF /* this needs to go in the mount structure */
81 /* from bsd/vfs/vfs_cluster.c */
82 extern int is_file_clean(vnode_t vp
, off_t filesize
);
83 /* from bsd/hfs/hfs_vfsops.c */
84 extern int hfs_vfs_vget(struct mount
*mp
, ino64_t ino
, struct vnode
**vpp
, vfs_context_t context
);
86 static int hfs_clonelink(struct vnode
*, int, kauth_cred_t
, struct proc
*);
87 static int hfs_clonefile(struct vnode
*, int, int, int);
88 static int hfs_clonesysfile(struct vnode
*, int, int, int, kauth_cred_t
, struct proc
*);
90 int flush_cache_on_write
= 0;
91 SYSCTL_INT (_kern
, OID_AUTO
, flush_cache_on_write
, CTLFLAG_RW
, &flush_cache_on_write
, 0, "always flush the drive cache on writes to uncached files");
95 * Read data from a file.
98 hfs_vnop_read(struct vnop_read_args
*ap
)
100 uio_t uio
= ap
->a_uio
;
101 struct vnode
*vp
= ap
->a_vp
;
104 struct hfsmount
*hfsmp
;
107 off_t start_resid
= uio_resid(uio
);
108 off_t offset
= uio_offset(uio
);
112 /* Preflight checks */
113 if (!vnode_isreg(vp
)) {
114 /* can only read regular files */
120 if (start_resid
== 0)
121 return (0); /* Nothing left to do */
123 return (EINVAL
); /* cant read from a negative offset */
129 /* Protect against a size change. */
130 hfs_lock_truncate(cp
, 0);
132 filesize
= fp
->ff_size
;
133 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
134 if (offset
> filesize
) {
135 if ((hfsmp
->hfs_flags
& HFS_STANDARD
) &&
136 (offset
> (off_t
)MAXHFSFILESIZE
)) {
142 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 12)) | DBG_FUNC_START
,
143 (int)uio_offset(uio
), uio_resid(uio
), (int)filesize
, (int)filebytes
, 0);
145 retval
= cluster_read(vp
, uio
, filesize
, ap
->a_ioflag
);
147 cp
->c_touch_acctime
= TRUE
;
149 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 12)) | DBG_FUNC_END
,
150 (int)uio_offset(uio
), uio_resid(uio
), (int)filesize
, (int)filebytes
, 0);
153 * Keep track blocks read
155 if (hfsmp
->hfc_stage
== HFC_RECORDING
&& retval
== 0) {
156 int took_cnode_lock
= 0;
159 bytesread
= start_resid
- uio_resid(uio
);
161 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
162 if ((fp
->ff_bytesread
+ bytesread
) > 0x00000000ffffffff) {
163 hfs_lock(cp
, HFS_FORCE_LOCK
);
167 * If this file hasn't been seen since the start of
168 * the current sampling period then start over.
170 if (cp
->c_atime
< hfsmp
->hfc_timebase
) {
173 fp
->ff_bytesread
= bytesread
;
175 cp
->c_atime
= tv
.tv_sec
;
177 fp
->ff_bytesread
+= bytesread
;
183 hfs_unlock_truncate(cp
, 0);
188 * Write data to a file.
191 hfs_vnop_write(struct vnop_write_args
*ap
)
193 uio_t uio
= ap
->a_uio
;
194 struct vnode
*vp
= ap
->a_vp
;
197 struct hfsmount
*hfsmp
;
198 kauth_cred_t cred
= NULL
;
201 off_t bytesToAdd
= 0;
202 off_t actualBytesAdded
;
207 int ioflag
= ap
->a_ioflag
;
210 int cnode_locked
= 0;
211 int partialwrite
= 0;
212 int exclusive_lock
= 0;
214 // LP64todo - fix this! uio_resid may be 64-bit value
215 resid
= uio_resid(uio
);
216 offset
= uio_offset(uio
);
218 if (ioflag
& IO_APPEND
) {
226 if (!vnode_isreg(vp
))
227 return (EPERM
); /* Can only write regular files */
233 eflags
= kEFDeferMask
; /* defer file block allocations */
234 #ifdef HFS_SPARSE_DEV
236 * When the underlying device is sparse and space
237 * is low (< 8MB), stop doing delayed allocations
238 * and begin doing synchronous I/O.
240 if ((hfsmp
->hfs_flags
& HFS_HAS_SPARSE_DEVICE
) &&
241 (hfs_freeblks(hfsmp
, 0) < 2048)) {
242 eflags
&= ~kEFDeferMask
;
245 #endif /* HFS_SPARSE_DEV */
248 /* Protect against a size change. */
249 hfs_lock_truncate(cp
, exclusive_lock
);
251 if (ioflag
& IO_APPEND
) {
252 uio_setoffset(uio
, fp
->ff_size
);
253 offset
= fp
->ff_size
;
255 if ((cp
->c_flags
& APPEND
) && offset
!= fp
->ff_size
) {
260 origFileSize
= fp
->ff_size
;
261 writelimit
= offset
+ resid
;
262 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
264 /* If the truncate lock is shared, and if we either have virtual
265 * blocks or will need to extend the file, upgrade the truncate
266 * to exclusive lock. If upgrade fails, we lose the lock and
267 * have to get exclusive lock again
269 if ((exclusive_lock
== 0) &&
270 ((fp
->ff_unallocblocks
!= 0) || (writelimit
> filebytes
))) {
272 /* Lock upgrade failed and we lost our shared lock, try again */
273 if (lck_rw_lock_shared_to_exclusive(&cp
->c_truncatelock
) == FALSE
) {
278 if ( (retval
= hfs_lock(VTOC(vp
), HFS_EXCLUSIVE_LOCK
))) {
283 if (!exclusive_lock
) {
284 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 0)) | DBG_FUNC_START
,
285 (int)offset
, uio_resid(uio
), (int)fp
->ff_size
,
289 /* Check if we do not need to extend the file */
290 if (writelimit
<= filebytes
) {
294 cred
= vfs_context_ucred(ap
->a_context
);
295 bytesToAdd
= writelimit
- filebytes
;
298 retval
= hfs_chkdq(cp
, (int64_t)(roundup(bytesToAdd
, hfsmp
->blockSize
)),
304 if (hfs_start_transaction(hfsmp
) != 0) {
309 while (writelimit
> filebytes
) {
310 bytesToAdd
= writelimit
- filebytes
;
311 if (cred
&& suser(cred
, NULL
) != 0)
312 eflags
|= kEFReserveMask
;
314 /* Protect extents b-tree and allocation bitmap */
315 lockflags
= SFL_BITMAP
;
316 if (overflow_extents(fp
))
317 lockflags
|= SFL_EXTENTS
;
318 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
320 /* Files that are changing size are not hot file candidates. */
321 if (hfsmp
->hfc_stage
== HFC_RECORDING
) {
322 fp
->ff_bytesread
= 0;
324 retval
= MacToVFSError(ExtendFileC (hfsmp
, (FCB
*)fp
, bytesToAdd
,
325 0, eflags
, &actualBytesAdded
));
327 hfs_systemfile_unlock(hfsmp
, lockflags
);
329 if ((actualBytesAdded
== 0) && (retval
== E_NONE
))
331 if (retval
!= E_NONE
)
333 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
334 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 0)) | DBG_FUNC_NONE
,
335 (int)offset
, uio_resid(uio
), (int)fp
->ff_size
, (int)filebytes
, 0);
337 (void) hfs_update(vp
, TRUE
);
338 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
339 (void) hfs_end_transaction(hfsmp
);
342 * If we didn't grow the file enough try a partial write.
343 * POSIX expects this behavior.
345 if ((retval
== ENOSPC
) && (filebytes
> offset
)) {
348 uio_setresid(uio
, (uio_resid(uio
) - bytesToAdd
));
350 writelimit
= filebytes
;
353 if (retval
== E_NONE
) {
361 struct rl_entry
*invalid_range
;
363 if (writelimit
> fp
->ff_size
)
364 filesize
= writelimit
;
366 filesize
= fp
->ff_size
;
368 lflag
= ioflag
& ~(IO_TAILZEROFILL
| IO_HEADZEROFILL
| IO_NOZEROVALID
| IO_NOZERODIRTY
);
370 if (offset
<= fp
->ff_size
) {
371 zero_off
= offset
& ~PAGE_MASK_64
;
373 /* Check to see whether the area between the zero_offset and the start
374 of the transfer to see whether is invalid and should be zero-filled
375 as part of the transfer:
377 if (offset
> zero_off
) {
378 if (rl_scan(&fp
->ff_invalidranges
, zero_off
, offset
- 1, &invalid_range
) != RL_NOOVERLAP
)
379 lflag
|= IO_HEADZEROFILL
;
382 off_t eof_page_base
= fp
->ff_size
& ~PAGE_MASK_64
;
384 /* The bytes between fp->ff_size and uio->uio_offset must never be
385 read without being zeroed. The current last block is filled with zeroes
386 if it holds valid data but in all cases merely do a little bookkeeping
387 to track the area from the end of the current last page to the start of
388 the area actually written. For the same reason only the bytes up to the
389 start of the page where this write will start is invalidated; any remainder
390 before uio->uio_offset is explicitly zeroed as part of the cluster_write.
392 Note that inval_start, the start of the page after the current EOF,
393 may be past the start of the write, in which case the zeroing
394 will be handled by the cluser_write of the actual data.
396 inval_start
= (fp
->ff_size
+ (PAGE_SIZE_64
- 1)) & ~PAGE_MASK_64
;
397 inval_end
= offset
& ~PAGE_MASK_64
;
398 zero_off
= fp
->ff_size
;
400 if ((fp
->ff_size
& PAGE_MASK_64
) &&
401 (rl_scan(&fp
->ff_invalidranges
,
404 &invalid_range
) != RL_NOOVERLAP
)) {
405 /* The page containing the EOF is not valid, so the
406 entire page must be made inaccessible now. If the write
407 starts on a page beyond the page containing the eof
408 (inval_end > eof_page_base), add the
409 whole page to the range to be invalidated. Otherwise
410 (i.e. if the write starts on the same page), zero-fill
411 the entire page explicitly now:
413 if (inval_end
> eof_page_base
) {
414 inval_start
= eof_page_base
;
416 zero_off
= eof_page_base
;
420 if (inval_start
< inval_end
) {
422 /* There's some range of data that's going to be marked invalid */
424 if (zero_off
< inval_start
) {
425 /* The pages between inval_start and inval_end are going to be invalidated,
426 and the actual write will start on a page past inval_end. Now's the last
427 chance to zero-fill the page containing the EOF:
431 retval
= cluster_write(vp
, (uio_t
) 0,
432 fp
->ff_size
, inval_start
,
434 lflag
| IO_HEADZEROFILL
| IO_NOZERODIRTY
);
435 hfs_lock(cp
, HFS_FORCE_LOCK
);
437 if (retval
) goto ioerr_exit
;
438 offset
= uio_offset(uio
);
441 /* Mark the remaining area of the newly allocated space as invalid: */
442 rl_add(inval_start
, inval_end
- 1 , &fp
->ff_invalidranges
);
444 cp
->c_zftimeout
= tv
.tv_sec
+ ZFTIMELIMIT
;
445 zero_off
= fp
->ff_size
= inval_end
;
448 if (offset
> zero_off
) lflag
|= IO_HEADZEROFILL
;
451 /* Check to see whether the area between the end of the write and the end of
452 the page it falls in is invalid and should be zero-filled as part of the transfer:
454 tail_off
= (writelimit
+ (PAGE_SIZE_64
- 1)) & ~PAGE_MASK_64
;
455 if (tail_off
> filesize
) tail_off
= filesize
;
456 if (tail_off
> writelimit
) {
457 if (rl_scan(&fp
->ff_invalidranges
, writelimit
, tail_off
- 1, &invalid_range
) != RL_NOOVERLAP
) {
458 lflag
|= IO_TAILZEROFILL
;
463 * if the write starts beyond the current EOF (possibly advanced in the
464 * zeroing of the last block, above), then we'll zero fill from the current EOF
465 * to where the write begins:
467 * NOTE: If (and ONLY if) the portion of the file about to be written is
468 * before the current EOF it might be marked as invalid now and must be
469 * made readable (removed from the invalid ranges) before cluster_write
472 io_start
= (lflag
& IO_HEADZEROFILL
) ? zero_off
: offset
;
473 if (io_start
< fp
->ff_size
) {
476 io_end
= (lflag
& IO_TAILZEROFILL
) ? tail_off
: writelimit
;
477 rl_remove(io_start
, io_end
- 1, &fp
->ff_invalidranges
);
484 * We need to tell UBC the fork's new size BEFORE calling
485 * cluster_write, in case any of the new pages need to be
486 * paged out before cluster_write completes (which does happen
487 * in embedded systems due to extreme memory pressure).
488 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
489 * will be, so that it can pass that on to cluster_pageout, and
490 * allow those pageouts.
492 * We don't update ff_size yet since we don't want pageins to
493 * be able to see uninitialized data between the old and new
494 * EOF, until cluster_write has completed and initialized that
497 * The vnode pager relies on the file size last given to UBC via
498 * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or
499 * ff_size (whichever is larger). NOTE: ff_new_size is always
500 * zero, unless we are extending the file via write.
502 if (filesize
> fp
->ff_size
) {
503 fp
->ff_new_size
= filesize
;
504 ubc_setsize(vp
, filesize
);
506 retval
= cluster_write(vp
, uio
, fp
->ff_size
, filesize
, zero_off
,
507 tail_off
, lflag
| IO_NOZERODIRTY
);
509 fp
->ff_new_size
= 0; /* no longer extending; use ff_size */
510 if (filesize
> origFileSize
) {
511 ubc_setsize(vp
, origFileSize
);
516 if (filesize
> origFileSize
) {
517 fp
->ff_size
= filesize
;
519 /* Files that are changing size are not hot file candidates. */
520 if (hfsmp
->hfc_stage
== HFC_RECORDING
) {
521 fp
->ff_bytesread
= 0;
524 fp
->ff_new_size
= 0; /* ff_size now has the correct size */
526 /* If we wrote some bytes, then touch the change and mod times */
527 if (resid
> uio_resid(uio
)) {
528 cp
->c_touch_chgtime
= TRUE
;
529 cp
->c_touch_modtime
= TRUE
;
533 uio_setresid(uio
, (uio_resid(uio
) + bytesToAdd
));
537 // XXXdbg - see radar 4871353 for more info
539 if (flush_cache_on_write
&& ((ioflag
& IO_NOCACHE
) || vnode_isnocache(vp
))) {
540 VNOP_IOCTL(hfsmp
->hfs_devvp
, DKIOCSYNCHRONIZECACHE
, NULL
, FWRITE
, NULL
);
543 HFS_KNOTE(vp
, NOTE_WRITE
);
547 * If we successfully wrote any data, and we are not the superuser
548 * we clear the setuid and setgid bits as a precaution against
551 if (cp
->c_mode
& (S_ISUID
| S_ISGID
)) {
552 cred
= vfs_context_ucred(ap
->a_context
);
553 if (resid
> uio_resid(uio
) && cred
&& suser(cred
, NULL
)) {
555 hfs_lock(cp
, HFS_FORCE_LOCK
);
558 cp
->c_mode
&= ~(S_ISUID
| S_ISGID
);
562 if (ioflag
& IO_UNIT
) {
564 hfs_lock(cp
, HFS_FORCE_LOCK
);
567 (void)hfs_truncate(vp
, origFileSize
, ioflag
& IO_SYNC
,
569 // LP64todo - fix this! resid needs to by user_ssize_t
570 uio_setoffset(uio
, (uio_offset(uio
) - (resid
- uio_resid(uio
))));
571 uio_setresid(uio
, resid
);
572 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
574 } else if ((ioflag
& IO_SYNC
) && (resid
> uio_resid(uio
))) {
576 hfs_lock(cp
, HFS_FORCE_LOCK
);
579 retval
= hfs_update(vp
, TRUE
);
581 /* Updating vcbWrCnt doesn't need to be atomic. */
584 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 0)) | DBG_FUNC_END
,
585 (int)uio_offset(uio
), uio_resid(uio
), (int)fp
->ff_size
, (int)filebytes
, 0);
589 hfs_unlock_truncate(cp
, exclusive_lock
);
593 /* support for the "bulk-access" fcntl */
595 #define CACHE_LEVELS 16
596 #define NUM_CACHE_ENTRIES (64*16)
597 #define PARENT_IDS_FLAG 0x100
599 struct access_cache
{
601 int cachehits
; /* these two for statistics gathering */
603 unsigned int *acache
;
604 unsigned char *haveaccess
;
608 uid_t uid
; /* IN: effective user id */
609 short flags
; /* IN: access requested (i.e. R_OK) */
610 short num_groups
; /* IN: number of groups user belongs to */
611 int num_files
; /* IN: number of files to process */
612 int *file_ids
; /* IN: array of file ids */
613 gid_t
*groups
; /* IN: array of groups */
614 short *access
; /* OUT: access info for each file (0 for 'has access') */
617 struct user_access_t
{
618 uid_t uid
; /* IN: effective user id */
619 short flags
; /* IN: access requested (i.e. R_OK) */
620 short num_groups
; /* IN: number of groups user belongs to */
621 int num_files
; /* IN: number of files to process */
622 user_addr_t file_ids
; /* IN: array of file ids */
623 user_addr_t groups
; /* IN: array of groups */
624 user_addr_t access
; /* OUT: access info for each file (0 for 'has access') */
628 // these are the "extended" versions of the above structures
629 // note that it is crucial that they be different sized than
630 // the regular version
631 struct ext_access_t
{
632 uint32_t flags
; /* IN: access requested (i.e. R_OK) */
633 uint32_t num_files
; /* IN: number of files to process */
634 uint32_t map_size
; /* IN: size of the bit map */
635 uint32_t *file_ids
; /* IN: Array of file ids */
636 char *bitmap
; /* OUT: hash-bitmap of interesting directory ids */
637 short *access
; /* OUT: access info for each file (0 for 'has access') */
638 uint32_t num_parents
; /* future use */
639 cnid_t
*parents
; /* future use */
642 struct ext_user_access_t
{
643 uint32_t flags
; /* IN: access requested (i.e. R_OK) */
644 uint32_t num_files
; /* IN: number of files to process */
645 uint32_t map_size
; /* IN: size of the bit map */
646 user_addr_t file_ids
; /* IN: array of file ids */
647 user_addr_t bitmap
; /* IN: array of groups */
648 user_addr_t access
; /* OUT: access info for each file (0 for 'has access') */
649 uint32_t num_parents
;/* future use */
650 user_addr_t parents
;/* future use */
655 * Perform a binary search for the given parent_id. Return value is
656 * the index if there is a match. If no_match_indexp is non-NULL it
657 * will be assigned with the index to insert the item (even if it was
660 static int cache_binSearch(cnid_t
*array
, unsigned int hi
, cnid_t parent_id
, int *no_match_indexp
)
666 unsigned int mid
= ((hi
- lo
)/2) + lo
;
667 unsigned int this_id
= array
[mid
];
669 if (parent_id
== this_id
) {
674 if (parent_id
< this_id
) {
679 if (parent_id
> this_id
) {
685 /* check if lo and hi converged on the match */
686 if (parent_id
== array
[hi
]) {
690 if (no_match_indexp
) {
691 *no_match_indexp
= hi
;
699 lookup_bucket(struct access_cache
*cache
, int *indexp
, cnid_t parent_id
)
703 int index
, no_match_index
;
705 if (cache
->numcached
== 0) {
707 return 0; // table is empty, so insert at index=0 and report no match
710 if (cache
->numcached
> NUM_CACHE_ENTRIES
) {
711 /*printf("EGAD! numcached is %d... cut our losses and trim to %d\n",
712 cache->numcached, NUM_CACHE_ENTRIES);*/
713 cache
->numcached
= NUM_CACHE_ENTRIES
;
716 hi
= cache
->numcached
- 1;
718 index
= cache_binSearch(cache
->acache
, hi
, parent_id
, &no_match_index
);
720 /* if no existing entry found, find index for new one */
722 index
= no_match_index
;
733 * Add a node to the access_cache at the given index (or do a lookup first
734 * to find the index if -1 is passed in). We currently do a replace rather
735 * than an insert if the cache is full.
738 add_node(struct access_cache
*cache
, int index
, cnid_t nodeID
, int access
)
740 int lookup_index
= -1;
742 /* need to do a lookup first if -1 passed for index */
744 if (lookup_bucket(cache
, &lookup_index
, nodeID
)) {
745 if (cache
->haveaccess
[lookup_index
] != access
&& cache
->haveaccess
[lookup_index
] == ESRCH
) {
746 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
747 cache
->haveaccess
[lookup_index
] = access
;
750 /* mission accomplished */
753 index
= lookup_index
;
758 /* if the cache is full, do a replace rather than an insert */
759 if (cache
->numcached
>= NUM_CACHE_ENTRIES
) {
760 //printf("cache is full (%d). replace at index %d\n", cache->numcached, index);
761 cache
->numcached
= NUM_CACHE_ENTRIES
-1;
763 if (index
> cache
->numcached
) {
764 // printf("index %d pinned to %d\n", index, cache->numcached);
765 index
= cache
->numcached
;
769 if (index
< cache
->numcached
&& index
< NUM_CACHE_ENTRIES
&& nodeID
> cache
->acache
[index
]) {
773 if (index
>= 0 && index
< cache
->numcached
) {
774 /* only do bcopy if we're inserting */
775 bcopy( cache
->acache
+index
, cache
->acache
+(index
+1), (cache
->numcached
- index
)*sizeof(int) );
776 bcopy( cache
->haveaccess
+index
, cache
->haveaccess
+(index
+1), (cache
->numcached
- index
)*sizeof(unsigned char) );
779 cache
->acache
[index
] = nodeID
;
780 cache
->haveaccess
[index
] = access
;
794 snoop_callback(const struct cat_desc
*descp
, const struct cat_attr
*attrp
, void * arg
)
796 struct cinfo
*cip
= (struct cinfo
*)arg
;
798 cip
->uid
= attrp
->ca_uid
;
799 cip
->gid
= attrp
->ca_gid
;
800 cip
->mode
= attrp
->ca_mode
;
801 cip
->parentcnid
= descp
->cd_parentcnid
;
802 cip
->recflags
= attrp
->ca_recflags
;
808 * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
809 * isn't incore, then go to the catalog.
812 do_attr_lookup(struct hfsmount
*hfsmp
, struct access_cache
*cache
, dev_t dev
, cnid_t cnid
,
813 struct cnode
*skip_cp
, CatalogKey
*keyp
, struct cat_attr
*cnattrp
)
817 /* if this id matches the one the fsctl was called with, skip the lookup */
818 if (cnid
== skip_cp
->c_cnid
) {
819 cnattrp
->ca_uid
= skip_cp
->c_uid
;
820 cnattrp
->ca_gid
= skip_cp
->c_gid
;
821 cnattrp
->ca_mode
= skip_cp
->c_mode
;
822 keyp
->hfsPlus
.parentID
= skip_cp
->c_parentcnid
;
826 /* otherwise, check the cnode hash incase the file/dir is incore */
827 if (hfs_chash_snoop(dev
, cnid
, snoop_callback
, &c_info
) == 0) {
828 cnattrp
->ca_uid
= c_info
.uid
;
829 cnattrp
->ca_gid
= c_info
.gid
;
830 cnattrp
->ca_mode
= c_info
.mode
;
831 cnattrp
->ca_recflags
= c_info
.recflags
;
832 keyp
->hfsPlus
.parentID
= c_info
.parentcnid
;
836 lockflags
= hfs_systemfile_lock(hfsmp
, SFL_CATALOG
, HFS_SHARED_LOCK
);
838 /* lookup this cnid in the catalog */
839 error
= cat_getkeyplusattr(hfsmp
, cnid
, keyp
, cnattrp
);
841 hfs_systemfile_unlock(hfsmp
, lockflags
);
852 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
853 * up to CACHE_LEVELS as we progress towards the root.
856 do_access_check(struct hfsmount
*hfsmp
, int *err
, struct access_cache
*cache
, HFSCatalogNodeID nodeID
,
857 struct cnode
*skip_cp
, struct proc
*theProcPtr
, kauth_cred_t myp_ucred
, dev_t dev
,
858 struct vfs_context
*my_context
,
862 uint32_t num_parents
)
866 HFSCatalogNodeID thisNodeID
;
867 unsigned int myPerms
;
868 struct cat_attr cnattr
;
869 int cache_index
= -1, scope_index
= -1, scope_idx_start
= -1;
872 int i
= 0, ids_to_cache
= 0;
873 int parent_ids
[CACHE_LEVELS
];
876 while (thisNodeID
>= kRootDirID
) {
877 myResult
= 0; /* default to "no access" */
879 /* check the cache before resorting to hitting the catalog */
881 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
882 * to look any further after hitting cached dir */
884 if (lookup_bucket(cache
, &cache_index
, thisNodeID
)) {
886 myErr
= cache
->haveaccess
[cache_index
];
887 if (scope_index
!= -1) {
888 if (myErr
== ESRCH
) {
892 scope_index
= 0; // so we'll just use the cache result
893 scope_idx_start
= ids_to_cache
;
895 myResult
= (myErr
== 0) ? 1 : 0;
896 goto ExitThisRoutine
;
902 tmp
= cache_binSearch(parents
, num_parents
-1, thisNodeID
, NULL
);
903 if (scope_index
== -1)
905 if (tmp
!= -1 && scope_idx_start
== -1 && ids_to_cache
< CACHE_LEVELS
) {
906 scope_idx_start
= ids_to_cache
;
910 /* remember which parents we want to cache */
911 if (ids_to_cache
< CACHE_LEVELS
) {
912 parent_ids
[ids_to_cache
] = thisNodeID
;
915 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
916 if (bitmap
&& map_size
) {
917 bitmap
[(thisNodeID
/8)%(map_size
)]|=(1<<(thisNodeID
&7));
921 /* do the lookup (checks the cnode hash, then the catalog) */
922 myErr
= do_attr_lookup(hfsmp
, cache
, dev
, thisNodeID
, skip_cp
, &catkey
, &cnattr
);
924 goto ExitThisRoutine
; /* no access */
927 /* Root always gets access. */
928 if (suser(myp_ucred
, NULL
) == 0) {
929 thisNodeID
= catkey
.hfsPlus
.parentID
;
934 // if the thing has acl's, do the full permission check
935 if ((cnattr
.ca_recflags
& kHFSHasSecurityMask
) != 0) {
938 /* get the vnode for this cnid */
939 myErr
= hfs_vget(hfsmp
, thisNodeID
, &vp
, 0);
942 goto ExitThisRoutine
;
945 thisNodeID
= VTOC(vp
)->c_parentcnid
;
947 hfs_unlock(VTOC(vp
));
949 if (vnode_vtype(vp
) == VDIR
) {
950 myErr
= vnode_authorize(vp
, NULL
, (KAUTH_VNODE_SEARCH
| KAUTH_VNODE_LIST_DIRECTORY
), my_context
);
952 myErr
= vnode_authorize(vp
, NULL
, KAUTH_VNODE_READ_DATA
, my_context
);
958 goto ExitThisRoutine
;
963 myPerms
= DerivePermissionSummary(cnattr
.ca_uid
, cnattr
.ca_gid
,
964 cnattr
.ca_mode
, hfsmp
->hfs_mp
,
965 myp_ucred
, theProcPtr
);
967 if (cnattr
.ca_mode
& S_IFDIR
) {
972 if ( (myPerms
& flags
) != flags
) {
975 goto ExitThisRoutine
; /* no access */
978 /* up the hierarchy we go */
979 thisNodeID
= catkey
.hfsPlus
.parentID
;
983 /* if here, we have access to this node */
987 if (parents
&& myErr
== 0 && scope_index
== -1) {
996 /* cache the parent directory(ies) */
997 for (i
= 0; i
< ids_to_cache
; i
++) {
998 if (myErr
== 0 && parents
&& (scope_idx_start
== -1 || i
> scope_idx_start
)) {
999 add_node(cache
, -1, parent_ids
[i
], ESRCH
);
1001 add_node(cache
, -1, parent_ids
[i
], myErr
);
1009 do_bulk_access_check(struct hfsmount
*hfsmp
, struct vnode
*vp
,
1010 struct vnop_ioctl_args
*ap
, int arg_size
, vfs_context_t context
)
1015 * NOTE: on entry, the vnode is locked. Incase this vnode
1016 * happens to be in our list of file_ids, we'll note it
1017 * avoid calling hfs_chashget_nowait() on that id as that
1018 * will cause a "locking against myself" panic.
1020 Boolean check_leaf
= true;
1022 struct ext_user_access_t
*user_access_structp
;
1023 struct ext_user_access_t tmp_user_access
;
1024 struct access_cache cache
;
1029 dev_t dev
= VTOC(vp
)->c_dev
;
1032 unsigned int num_files
= 0;
1034 int num_parents
= 0;
1038 cnid_t
*parents
=NULL
;
1042 cnid_t prevParent_cnid
= 0;
1043 unsigned int myPerms
;
1045 struct cat_attr cnattr
;
1047 struct cnode
*skip_cp
= VTOC(vp
);
1048 kauth_cred_t cred
= vfs_context_ucred(context
);
1049 proc_t p
= vfs_context_proc(context
);
1051 is64bit
= proc_is64bit(p
);
1053 /* initialize the local cache and buffers */
1054 cache
.numcached
= 0;
1055 cache
.cachehits
= 0;
1057 cache
.acache
= NULL
;
1058 cache
.haveaccess
= NULL
;
1060 /* struct copyin done during dispatch... need to copy file_id array separately */
1061 if (ap
->a_data
== NULL
) {
1063 goto err_exit_bulk_access
;
1067 if (arg_size
!= sizeof(struct ext_user_access_t
)) {
1069 goto err_exit_bulk_access
;
1072 user_access_structp
= (struct ext_user_access_t
*)ap
->a_data
;
1074 } else if (arg_size
== sizeof(struct access_t
)) {
1075 struct access_t
*accessp
= (struct access_t
*)ap
->a_data
;
1077 // convert an old style bulk-access struct to the new style
1078 tmp_user_access
.flags
= accessp
->flags
;
1079 tmp_user_access
.num_files
= accessp
->num_files
;
1080 tmp_user_access
.map_size
= 0;
1081 tmp_user_access
.file_ids
= CAST_USER_ADDR_T(accessp
->file_ids
);
1082 tmp_user_access
.bitmap
= USER_ADDR_NULL
;
1083 tmp_user_access
.access
= CAST_USER_ADDR_T(accessp
->access
);
1084 tmp_user_access
.num_parents
= 0;
1085 user_access_structp
= &tmp_user_access
;
1087 } else if (arg_size
== sizeof(struct ext_access_t
)) {
1088 struct ext_access_t
*accessp
= (struct ext_access_t
*)ap
->a_data
;
1090 // up-cast from a 32-bit version of the struct
1091 tmp_user_access
.flags
= accessp
->flags
;
1092 tmp_user_access
.num_files
= accessp
->num_files
;
1093 tmp_user_access
.map_size
= accessp
->map_size
;
1094 tmp_user_access
.num_parents
= accessp
->num_parents
;
1096 tmp_user_access
.file_ids
= CAST_USER_ADDR_T(accessp
->file_ids
);
1097 tmp_user_access
.bitmap
= CAST_USER_ADDR_T(accessp
->bitmap
);
1098 tmp_user_access
.access
= CAST_USER_ADDR_T(accessp
->access
);
1099 tmp_user_access
.parents
= CAST_USER_ADDR_T(accessp
->parents
);
1101 user_access_structp
= &tmp_user_access
;
1104 goto err_exit_bulk_access
;
1107 map_size
= user_access_structp
->map_size
;
1109 num_files
= user_access_structp
->num_files
;
1111 num_parents
= user_access_structp
->num_parents
;
1113 if (num_files
< 1) {
1114 goto err_exit_bulk_access
;
1116 if (num_files
> 1024) {
1118 goto err_exit_bulk_access
;
1121 if (num_parents
> 1024) {
1123 goto err_exit_bulk_access
;
1126 file_ids
= (int *) kalloc(sizeof(int) * num_files
);
1127 access
= (short *) kalloc(sizeof(short) * num_files
);
1129 bitmap
= (char *) kalloc(sizeof(char) * map_size
);
1133 parents
= (cnid_t
*) kalloc(sizeof(cnid_t
) * num_parents
);
1136 cache
.acache
= (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES
);
1137 cache
.haveaccess
= (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1139 if (file_ids
== NULL
|| access
== NULL
|| (map_size
!= 0 && bitmap
== NULL
) || cache
.acache
== NULL
|| cache
.haveaccess
== NULL
) {
1141 kfree(file_ids
, sizeof(int) * num_files
);
1144 kfree(bitmap
, sizeof(char) * map_size
);
1147 kfree(access
, sizeof(short) * num_files
);
1150 kfree(cache
.acache
, sizeof(int) * NUM_CACHE_ENTRIES
);
1152 if (cache
.haveaccess
) {
1153 kfree(cache
.haveaccess
, sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1156 kfree(parents
, sizeof(cnid_t
) * num_parents
);
1161 // make sure the bitmap is zero'ed out...
1163 bzero(bitmap
, (sizeof(char) * map_size
));
1166 if ((error
= copyin(user_access_structp
->file_ids
, (caddr_t
)file_ids
,
1167 num_files
* sizeof(int)))) {
1168 goto err_exit_bulk_access
;
1172 if ((error
= copyin(user_access_structp
->parents
, (caddr_t
)parents
,
1173 num_parents
* sizeof(cnid_t
)))) {
1174 goto err_exit_bulk_access
;
1178 flags
= user_access_structp
->flags
;
1179 if ((flags
& (F_OK
| R_OK
| W_OK
| X_OK
)) == 0) {
1183 /* check if we've been passed leaf node ids or parent ids */
1184 if (flags
& PARENT_IDS_FLAG
) {
1188 /* Check access to each file_id passed in */
1189 for (i
= 0; i
< num_files
; i
++) {
1191 cnid
= (cnid_t
) file_ids
[i
];
1193 /* root always has access */
1194 if ((!parents
) && (!suser(cred
, NULL
))) {
1200 /* do the lookup (checks the cnode hash, then the catalog) */
1201 error
= do_attr_lookup(hfsmp
, &cache
, dev
, cnid
, skip_cp
, &catkey
, &cnattr
);
1203 access
[i
] = (short) error
;
1208 // Check if the leaf matches one of the parent scopes
1209 leaf_index
= cache_binSearch(parents
, num_parents
-1, cnid
, NULL
);
1212 // if the thing has acl's, do the full permission check
1213 if ((cnattr
.ca_recflags
& kHFSHasSecurityMask
) != 0) {
1216 /* get the vnode for this cnid */
1217 myErr
= hfs_vget(hfsmp
, cnid
, &cvp
, 0);
1223 hfs_unlock(VTOC(cvp
));
1225 if (vnode_vtype(cvp
) == VDIR
) {
1226 myErr
= vnode_authorize(cvp
, NULL
, (KAUTH_VNODE_SEARCH
| KAUTH_VNODE_LIST_DIRECTORY
), context
);
1228 myErr
= vnode_authorize(cvp
, NULL
, KAUTH_VNODE_READ_DATA
, context
);
1237 /* before calling CheckAccess(), check the target file for read access */
1238 myPerms
= DerivePermissionSummary(cnattr
.ca_uid
, cnattr
.ca_gid
,
1239 cnattr
.ca_mode
, hfsmp
->hfs_mp
, cred
, p
);
1241 /* fail fast if no access */
1242 if ((myPerms
& flags
) == 0) {
1248 /* we were passed an array of parent ids */
1249 catkey
.hfsPlus
.parentID
= cnid
;
1252 /* if the last guy had the same parent and had access, we're done */
1253 if (i
> 0 && catkey
.hfsPlus
.parentID
== prevParent_cnid
&& access
[i
-1] == 0) {
1259 myaccess
= do_access_check(hfsmp
, &error
, &cache
, catkey
.hfsPlus
.parentID
,
1260 skip_cp
, p
, cred
, dev
, context
,bitmap
, map_size
, parents
, num_parents
);
1262 if (myaccess
|| (error
== ESRCH
&& leaf_index
!= -1)) {
1263 access
[i
] = 0; // have access.. no errors to report
1265 access
[i
] = (error
!= 0 ? (short) error
: EACCES
);
1268 prevParent_cnid
= catkey
.hfsPlus
.parentID
;
1271 /* copyout the access array */
1272 if ((error
= copyout((caddr_t
)access
, user_access_structp
->access
,
1273 num_files
* sizeof (short)))) {
1274 goto err_exit_bulk_access
;
1276 if (map_size
&& bitmap
) {
1277 if ((error
= copyout((caddr_t
)bitmap
, user_access_structp
->bitmap
,
1278 map_size
* sizeof (char)))) {
1279 goto err_exit_bulk_access
;
1284 err_exit_bulk_access
:
1286 //printf("on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
1289 kfree(file_ids
, sizeof(int) * num_files
);
1291 kfree(parents
, sizeof(cnid_t
) * num_parents
);
1293 kfree(bitmap
, sizeof(char) * map_size
);
1295 kfree(access
, sizeof(short) * num_files
);
1297 kfree(cache
.acache
, sizeof(int) * NUM_CACHE_ENTRIES
);
1298 if (cache
.haveaccess
)
1299 kfree(cache
.haveaccess
, sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1305 /* end "bulk-access" support */
1309 * Callback for use with freeze ioctl.
1312 hfs_freezewrite_callback(struct vnode
*vp
, __unused
void *cargs
)
1314 vnode_waitforwrites(vp
, 0, 0, 0, "hfs freeze");
1320 * Control filesystem operating characteristics.
1323 hfs_vnop_ioctl( struct vnop_ioctl_args
/* {
1328 vfs_context_t a_context;
1331 struct vnode
* vp
= ap
->a_vp
;
1332 struct hfsmount
*hfsmp
= VTOHFS(vp
);
1333 vfs_context_t context
= ap
->a_context
;
1334 kauth_cred_t cred
= vfs_context_ucred(context
);
1335 proc_t p
= vfs_context_proc(context
);
1336 struct vfsstatfs
*vfsp
;
1339 is64bit
= proc_is64bit(p
);
1341 switch (ap
->a_command
) {
1345 struct vnode
*file_vp
;
1351 /* Caller must be owner of file system. */
1352 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1353 if (suser(cred
, NULL
) &&
1354 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1357 /* Target vnode must be file system's root. */
1358 if (!vnode_isvroot(vp
)) {
1361 bufptr
= (char *)ap
->a_data
;
1362 cnid
= strtoul(bufptr
, NULL
, 10);
1364 /* We need to call hfs_vfs_vget to leverage the code that will fix the
1365 * origin list for us if needed, as opposed to calling hfs_vget, since
1366 * we will need it for the subsequent build_path call.
1368 if ((error
= hfs_vfs_vget(HFSTOVFS(hfsmp
), cnid
, &file_vp
, context
))) {
1371 error
= build_path(file_vp
, bufptr
, sizeof(pathname_t
), &outlen
, 0, context
);
1385 /* Caller must be owner of file system. */
1386 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1387 if (suser(cred
, NULL
) &&
1388 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1391 /* Target vnode must be file system's root. */
1392 if (!vnode_isvroot(vp
)) {
1395 linkfileid
= *(cnid_t
*)ap
->a_data
;
1396 if (linkfileid
< kHFSFirstUserCatalogNodeID
) {
1399 if ((error
= hfs_lookuplink(hfsmp
, linkfileid
, &prevlinkid
, &nextlinkid
))) {
1402 if (ap
->a_command
== HFS_NEXT_LINK
) {
1403 *(cnid_t
*)ap
->a_data
= nextlinkid
;
1405 *(cnid_t
*)ap
->a_data
= prevlinkid
;
1410 case HFS_RESIZE_PROGRESS
: {
1412 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1413 if (suser(cred
, NULL
) &&
1414 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1415 return (EACCES
); /* must be owner of file system */
1417 if (!vnode_isvroot(vp
)) {
1420 return hfs_resize_progress(hfsmp
, (u_int32_t
*)ap
->a_data
);
1423 case HFS_RESIZE_VOLUME
: {
1427 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1428 if (suser(cred
, NULL
) &&
1429 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1430 return (EACCES
); /* must be owner of file system */
1432 if (!vnode_isvroot(vp
)) {
1435 newsize
= *(u_int64_t
*)ap
->a_data
;
1436 cursize
= (u_int64_t
)hfsmp
->totalBlocks
* (u_int64_t
)hfsmp
->blockSize
;
1438 if (newsize
> cursize
) {
1439 return hfs_extendfs(hfsmp
, *(u_int64_t
*)ap
->a_data
, context
);
1440 } else if (newsize
< cursize
) {
1441 return hfs_truncatefs(hfsmp
, *(u_int64_t
*)ap
->a_data
, context
);
1446 case HFS_CHANGE_NEXT_ALLOCATION
: {
1447 int error
= 0; /* Assume success */
1450 if (vnode_vfsisrdonly(vp
)) {
1453 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1454 if (suser(cred
, NULL
) &&
1455 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1456 return (EACCES
); /* must be owner of file system */
1458 if (!vnode_isvroot(vp
)) {
1461 HFS_MOUNT_LOCK(hfsmp
, TRUE
);
1462 location
= *(u_int32_t
*)ap
->a_data
;
1463 if ((location
>= hfsmp
->allocLimit
) &&
1464 (location
!= HFS_NO_UPDATE_NEXT_ALLOCATION
)) {
1466 goto fail_change_next_allocation
;
1468 /* Return previous value. */
1469 *(u_int32_t
*)ap
->a_data
= hfsmp
->nextAllocation
;
1470 if (location
== HFS_NO_UPDATE_NEXT_ALLOCATION
) {
1471 /* On magic value for location, set nextAllocation to next block
1472 * after metadata zone and set flag in mount structure to indicate
1473 * that nextAllocation should not be updated again.
1475 HFS_UPDATE_NEXT_ALLOCATION(hfsmp
, hfsmp
->hfs_metazone_end
+ 1);
1476 hfsmp
->hfs_flags
|= HFS_SKIP_UPDATE_NEXT_ALLOCATION
;
1478 hfsmp
->hfs_flags
&= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION
;
1479 HFS_UPDATE_NEXT_ALLOCATION(hfsmp
, location
);
1481 MarkVCBDirty(hfsmp
);
1482 fail_change_next_allocation
:
1483 HFS_MOUNT_UNLOCK(hfsmp
, TRUE
);
1487 #ifdef HFS_SPARSE_DEV
1488 case HFS_SETBACKINGSTOREINFO
: {
1489 struct vnode
* bsfs_rootvp
;
1490 struct vnode
* di_vp
;
1491 struct hfs_backingstoreinfo
*bsdata
;
1494 if (hfsmp
->hfs_flags
& HFS_HAS_SPARSE_DEVICE
) {
1497 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1498 if (suser(cred
, NULL
) &&
1499 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1500 return (EACCES
); /* must be owner of file system */
1502 bsdata
= (struct hfs_backingstoreinfo
*)ap
->a_data
;
1503 if (bsdata
== NULL
) {
1506 if ((error
= file_vnode(bsdata
->backingfd
, &di_vp
))) {
1509 if ((error
= vnode_getwithref(di_vp
))) {
1510 file_drop(bsdata
->backingfd
);
1514 if (vnode_mount(vp
) == vnode_mount(di_vp
)) {
1515 (void)vnode_put(di_vp
);
1516 file_drop(bsdata
->backingfd
);
1521 * Obtain the backing fs root vnode and keep a reference
1522 * on it. This reference will be dropped in hfs_unmount.
1524 error
= VFS_ROOT(vnode_mount(di_vp
), &bsfs_rootvp
, NULL
); /* XXX use context! */
1526 (void)vnode_put(di_vp
);
1527 file_drop(bsdata
->backingfd
);
1530 vnode_ref(bsfs_rootvp
);
1531 vnode_put(bsfs_rootvp
);
1533 hfsmp
->hfs_backingfs_rootvp
= bsfs_rootvp
;
1534 hfsmp
->hfs_flags
|= HFS_HAS_SPARSE_DEVICE
;
1535 hfsmp
->hfs_sparsebandblks
= bsdata
->bandsize
/ HFSTOVCB(hfsmp
)->blockSize
;
1536 hfsmp
->hfs_sparsebandblks
*= 4;
1538 vfs_markdependency(hfsmp
->hfs_mp
);
1540 (void)vnode_put(di_vp
);
1541 file_drop(bsdata
->backingfd
);
1544 case HFS_CLRBACKINGSTOREINFO
: {
1545 struct vnode
* tmpvp
;
1547 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1548 if (suser(cred
, NULL
) &&
1549 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1550 return (EACCES
); /* must be owner of file system */
1552 if ((hfsmp
->hfs_flags
& HFS_HAS_SPARSE_DEVICE
) &&
1553 hfsmp
->hfs_backingfs_rootvp
) {
1555 hfsmp
->hfs_flags
&= ~HFS_HAS_SPARSE_DEVICE
;
1556 tmpvp
= hfsmp
->hfs_backingfs_rootvp
;
1557 hfsmp
->hfs_backingfs_rootvp
= NULLVP
;
1558 hfsmp
->hfs_sparsebandblks
= 0;
1563 #endif /* HFS_SPARSE_DEV */
1571 mp
= vnode_mount(vp
);
1572 hfsmp
= VFSTOHFS(mp
);
1577 lck_rw_lock_exclusive(&hfsmp
->hfs_insync
);
1579 // flush things before we get started to try and prevent
1580 // dirty data from being paged out while we're frozen.
1581 // note: can't do this after taking the lock as it will
1582 // deadlock against ourselves.
1583 vnode_iterate(mp
, 0, hfs_freezewrite_callback
, NULL
);
1584 hfs_global_exclusive_lock_acquire(hfsmp
);
1585 journal_flush(hfsmp
->jnl
);
1587 // don't need to iterate on all vnodes, we just need to
1588 // wait for writes to the system files and the device vnode
1589 if (HFSTOVCB(hfsmp
)->extentsRefNum
)
1590 vnode_waitforwrites(HFSTOVCB(hfsmp
)->extentsRefNum
, 0, 0, 0, "hfs freeze");
1591 if (HFSTOVCB(hfsmp
)->catalogRefNum
)
1592 vnode_waitforwrites(HFSTOVCB(hfsmp
)->catalogRefNum
, 0, 0, 0, "hfs freeze");
1593 if (HFSTOVCB(hfsmp
)->allocationsRefNum
)
1594 vnode_waitforwrites(HFSTOVCB(hfsmp
)->allocationsRefNum
, 0, 0, 0, "hfs freeze");
1595 if (hfsmp
->hfs_attribute_vp
)
1596 vnode_waitforwrites(hfsmp
->hfs_attribute_vp
, 0, 0, 0, "hfs freeze");
1597 vnode_waitforwrites(hfsmp
->hfs_devvp
, 0, 0, 0, "hfs freeze");
1599 hfsmp
->hfs_freezing_proc
= current_proc();
1608 // if we're not the one who froze the fs then we
1610 if (hfsmp
->hfs_freezing_proc
!= current_proc()) {
1614 // NOTE: if you add code here, also go check the
1615 // code that "thaws" the fs in hfs_vnop_close()
1617 hfsmp
->hfs_freezing_proc
= NULL
;
1618 hfs_global_exclusive_lock_release(hfsmp
);
1619 lck_rw_unlock_exclusive(&hfsmp
->hfs_insync
);
1624 case HFS_BULKACCESS_FSCTL
: {
1627 if (hfsmp
->hfs_flags
& HFS_STANDARD
) {
1632 size
= sizeof(struct user_access_t
);
1634 size
= sizeof(struct access_t
);
1637 return do_bulk_access_check(hfsmp
, vp
, ap
, size
, context
);
1640 case HFS_EXT_BULKACCESS_FSCTL
: {
1643 if (hfsmp
->hfs_flags
& HFS_STANDARD
) {
1648 size
= sizeof(struct ext_user_access_t
);
1650 size
= sizeof(struct ext_access_t
);
1653 return do_bulk_access_check(hfsmp
, vp
, ap
, size
, context
);
1656 case HFS_SETACLSTATE
: {
1659 if (ap
->a_data
== NULL
) {
1663 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1664 state
= *(int *)ap
->a_data
;
1666 // super-user can enable or disable acl's on a volume.
1667 // the volume owner can only enable acl's
1668 if (!is_suser() && (state
== 0 || kauth_cred_getuid(cred
) != vfsp
->f_owner
)) {
1671 if (state
== 0 || state
== 1)
1672 return hfs_set_volxattr(hfsmp
, HFS_SETACLSTATE
, state
);
1677 case HFS_SET_XATTREXTENTS_STATE
: {
1680 if (ap
->a_data
== NULL
) {
1684 state
= *(int *)ap
->a_data
;
1686 /* Super-user can enable or disable extent-based extended
1687 * attribute support on a volume
1692 if (state
== 0 || state
== 1)
1693 return hfs_set_volxattr(hfsmp
, HFS_SET_XATTREXTENTS_STATE
, state
);
1701 error
= hfs_lock(VTOC(vp
), HFS_EXCLUSIVE_LOCK
);
1703 error
= hfs_fsync(vp
, MNT_WAIT
, TRUE
, p
);
1704 hfs_unlock(VTOC(vp
));
1711 register struct cnode
*cp
;
1714 if (!vnode_isreg(vp
))
1717 error
= hfs_lock(VTOC(vp
), HFS_EXCLUSIVE_LOCK
);
1721 * used by regression test to determine if
1722 * all the dirty pages (via write) have been cleaned
1723 * after a call to 'fsysnc'.
1725 error
= is_file_clean(vp
, VTOF(vp
)->ff_size
);
1732 register struct radvisory
*ra
;
1733 struct filefork
*fp
;
1736 if (!vnode_isreg(vp
))
1739 ra
= (struct radvisory
*)(ap
->a_data
);
1742 /* Protect against a size change. */
1743 hfs_lock_truncate(VTOC(vp
), TRUE
);
1745 if (ra
->ra_offset
>= fp
->ff_size
) {
1748 error
= advisory_read(vp
, fp
->ff_size
, ra
->ra_offset
, ra
->ra_count
);
1751 hfs_unlock_truncate(VTOC(vp
), TRUE
);
1755 case F_READBOOTSTRAP
:
1756 case F_WRITEBOOTSTRAP
:
1758 struct vnode
*devvp
= NULL
;
1759 user_fbootstraptransfer_t
*user_bootstrapp
;
1763 daddr64_t blockNumber
;
1767 user_fbootstraptransfer_t user_bootstrap
;
1769 if (!vnode_isvroot(vp
))
1771 /* LP64 - when caller is a 64 bit process then we are passed a pointer
1772 * to a user_fbootstraptransfer_t else we get a pointer to a
1773 * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t
1776 user_bootstrapp
= (user_fbootstraptransfer_t
*)ap
->a_data
;
1779 fbootstraptransfer_t
*bootstrapp
= (fbootstraptransfer_t
*)ap
->a_data
;
1780 user_bootstrapp
= &user_bootstrap
;
1781 user_bootstrap
.fbt_offset
= bootstrapp
->fbt_offset
;
1782 user_bootstrap
.fbt_length
= bootstrapp
->fbt_length
;
1783 user_bootstrap
.fbt_buffer
= CAST_USER_ADDR_T(bootstrapp
->fbt_buffer
);
1785 if (user_bootstrapp
->fbt_offset
+ user_bootstrapp
->fbt_length
> 1024)
1788 devvp
= VTOHFS(vp
)->hfs_devvp
;
1789 auio
= uio_create(1, user_bootstrapp
->fbt_offset
,
1790 is64bit
? UIO_USERSPACE64
: UIO_USERSPACE32
,
1791 (ap
->a_command
== F_WRITEBOOTSTRAP
) ? UIO_WRITE
: UIO_READ
);
1792 uio_addiov(auio
, user_bootstrapp
->fbt_buffer
, user_bootstrapp
->fbt_length
);
1794 devBlockSize
= vfs_devblocksize(vnode_mount(vp
));
1796 while (uio_resid(auio
) > 0) {
1797 blockNumber
= uio_offset(auio
) / devBlockSize
;
1798 error
= (int)buf_bread(devvp
, blockNumber
, devBlockSize
, cred
, &bp
);
1800 if (bp
) buf_brelse(bp
);
1805 blockOffset
= uio_offset(auio
) % devBlockSize
;
1806 xfersize
= devBlockSize
- blockOffset
;
1807 error
= uiomove((caddr_t
)buf_dataptr(bp
) + blockOffset
, (int)xfersize
, auio
);
1813 if (uio_rw(auio
) == UIO_WRITE
) {
1814 error
= VNOP_BWRITE(bp
);
1827 case _IOC(IOC_OUT
,'h', 4, 0): /* Create date in local time */
1830 *(user_time_t
*)(ap
->a_data
) = (user_time_t
) (to_bsd_time(VTOVCB(vp
)->localCreateDate
));
1833 *(time_t *)(ap
->a_data
) = to_bsd_time(VTOVCB(vp
)->localCreateDate
);
1838 case HFS_GET_MOUNT_TIME
:
1839 return copyout(&hfsmp
->hfs_mount_time
, CAST_USER_ADDR_T(ap
->a_data
), sizeof(hfsmp
->hfs_mount_time
));
1842 case HFS_GET_LAST_MTIME
:
1843 return copyout(&hfsmp
->hfs_last_mounted_mtime
, CAST_USER_ADDR_T(ap
->a_data
), sizeof(hfsmp
->hfs_last_mounted_mtime
));
1846 case HFS_SET_BOOT_INFO
:
1847 if (!vnode_isvroot(vp
))
1849 if (!kauth_cred_issuser(cred
) && (kauth_cred_getuid(cred
) != vfs_statfs(HFSTOVFS(hfsmp
))->f_owner
))
1850 return(EACCES
); /* must be superuser or owner of filesystem */
1851 HFS_MOUNT_LOCK(hfsmp
, TRUE
);
1852 bcopy(ap
->a_data
, &hfsmp
->vcbFndrInfo
, sizeof(hfsmp
->vcbFndrInfo
));
1853 HFS_MOUNT_UNLOCK(hfsmp
, TRUE
);
1854 (void) hfs_flushvolumeheader(hfsmp
, MNT_WAIT
, 0);
1857 case HFS_GET_BOOT_INFO
:
1858 if (!vnode_isvroot(vp
))
1860 HFS_MOUNT_LOCK(hfsmp
, TRUE
);
1861 bcopy(&hfsmp
->vcbFndrInfo
, ap
->a_data
, sizeof(hfsmp
->vcbFndrInfo
));
1862 HFS_MOUNT_UNLOCK(hfsmp
, TRUE
);
1865 case HFS_MARK_BOOT_CORRUPT
:
1866 /* Mark the boot volume corrupt by setting
1867 * kHFSVolumeInconsistentBit in the volume header. This will
1868 * force fsck_hfs on next mount.
1874 /* Allowed only on the root vnode of the boot volume */
1875 if (!(vfs_flags(HFSTOVFS(hfsmp
)) & MNT_ROOTFS
) ||
1876 !vnode_isvroot(vp
)) {
1880 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
1881 hfs_mark_volume_inconsistent(hfsmp
);
1888 /* Should never get here */
1896 hfs_vnop_select(__unused
struct vnop_select_args
*ap
)
1898 struct vnop_select_args {
1903 vfs_context_t a_context;
1908 * We should really check to see if I/O is possible.
1914 * Converts a logical block number to a physical block, and optionally returns
1915 * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
1916 * The physical block number is based on the device block size, currently its 512.
1917 * The block run is returned in logical blocks, and is the REMAINING amount of blocks
1920 hfs_bmap(struct vnode
*vp
, daddr_t bn
, struct vnode
**vpp
, daddr64_t
*bnp
, unsigned int *runp
)
1922 struct filefork
*fp
= VTOF(vp
);
1923 struct hfsmount
*hfsmp
= VTOHFS(vp
);
1924 int retval
= E_NONE
;
1925 u_int32_t logBlockSize
;
1926 size_t bytesContAvail
= 0;
1927 off_t blockposition
;
1932 * Check for underlying vnode requests and ensure that logical
1933 * to physical mapping is requested.
1936 *vpp
= hfsmp
->hfs_devvp
;
1940 logBlockSize
= GetLogicalBlockSize(vp
);
1941 blockposition
= (off_t
)bn
* logBlockSize
;
1943 lockExtBtree
= overflow_extents(fp
);
1946 lockflags
= hfs_systemfile_lock(hfsmp
, SFL_EXTENTS
, HFS_EXCLUSIVE_LOCK
);
1948 retval
= MacToVFSError(
1949 MapFileBlockC (HFSTOVCB(hfsmp
),
1957 hfs_systemfile_unlock(hfsmp
, lockflags
);
1959 if (retval
== E_NONE
) {
1960 /* Figure out how many read ahead blocks there are */
1962 if (can_cluster(logBlockSize
)) {
1963 /* Make sure this result never goes negative: */
1964 *runp
= (bytesContAvail
< logBlockSize
) ? 0 : (bytesContAvail
/ logBlockSize
) - 1;
1974 * Convert logical block number to file offset.
1977 hfs_vnop_blktooff(struct vnop_blktooff_args
*ap
)
1979 struct vnop_blktooff_args {
1986 if (ap
->a_vp
== NULL
)
1988 *ap
->a_offset
= (off_t
)ap
->a_lblkno
* (off_t
)GetLogicalBlockSize(ap
->a_vp
);
1994 * Convert file offset to logical block number.
1997 hfs_vnop_offtoblk(struct vnop_offtoblk_args
*ap
)
1999 struct vnop_offtoblk_args {
2002 daddr64_t *a_lblkno;
2006 if (ap
->a_vp
== NULL
)
2008 *ap
->a_lblkno
= (daddr64_t
)(ap
->a_offset
/ (off_t
)GetLogicalBlockSize(ap
->a_vp
));
2014 * Map file offset to physical block number.
2016 * If this function is called for write operation, and if the file
2017 * had virtual blocks allocated (delayed allocation), real blocks
2018 * are allocated by calling ExtendFileC().
2020 * If this function is called for read operation, and if the file
2021 * had virtual blocks allocated (delayed allocation), no change
2022 * to the size of file is done, and if required, rangelist is
2023 * searched for mapping.
2025 * System file cnodes are expected to be locked (shared or exclusive).
2028 hfs_vnop_blockmap(struct vnop_blockmap_args
*ap
)
2030 struct vnop_blockmap_args {
2038 vfs_context_t a_context;
2042 struct vnode
*vp
= ap
->a_vp
;
2044 struct filefork
*fp
;
2045 struct hfsmount
*hfsmp
;
2046 size_t bytesContAvail
= 0;
2047 int retval
= E_NONE
;
2050 struct rl_entry
*invalid_range
;
2051 enum rl_overlaptype overlaptype
;
2055 /* Do not allow blockmap operation on a directory */
2056 if (vnode_isdir(vp
)) {
2061 * Check for underlying vnode requests and ensure that logical
2062 * to physical mapping is requested.
2064 if (ap
->a_bpn
== NULL
)
2067 if ( !vnode_issystem(vp
) && !vnode_islnk(vp
) && !vnode_isswap(vp
)) {
2068 if (VTOC(vp
)->c_lockowner
!= current_thread()) {
2069 hfs_lock(VTOC(vp
), HFS_FORCE_LOCK
);
2078 /* Check virtual blocks only when performing write operation */
2079 if ((ap
->a_flags
& VNODE_WRITE
) && (fp
->ff_unallocblocks
!= 0)) {
2080 if (hfs_start_transaction(hfsmp
) != 0) {
2086 syslocks
= SFL_EXTENTS
| SFL_BITMAP
;
2088 } else if (overflow_extents(fp
)) {
2089 syslocks
= SFL_EXTENTS
;
2093 lockflags
= hfs_systemfile_lock(hfsmp
, syslocks
, HFS_EXCLUSIVE_LOCK
);
2096 * Check for any delayed allocations.
2098 if ((ap
->a_flags
& VNODE_WRITE
) && (fp
->ff_unallocblocks
!= 0)) {
2100 u_int32_t loanedBlocks
;
2103 // Make sure we have a transaction. It's possible
2104 // that we came in and fp->ff_unallocblocks was zero
2105 // but during the time we blocked acquiring the extents
2106 // btree, ff_unallocblocks became non-zero and so we
2107 // will need to start a transaction.
2109 if (started_tr
== 0) {
2111 hfs_systemfile_unlock(hfsmp
, lockflags
);
2118 * Note: ExtendFileC will Release any blocks on loan and
2119 * aquire real blocks. So we ask to extend by zero bytes
2120 * since ExtendFileC will account for the virtual blocks.
2123 loanedBlocks
= fp
->ff_unallocblocks
;
2124 retval
= ExtendFileC(hfsmp
, (FCB
*)fp
, 0, 0,
2125 kEFAllMask
| kEFNoClumpMask
, &actbytes
);
2128 fp
->ff_unallocblocks
= loanedBlocks
;
2129 cp
->c_blocks
+= loanedBlocks
;
2130 fp
->ff_blocks
+= loanedBlocks
;
2132 HFS_MOUNT_LOCK(hfsmp
, TRUE
);
2133 hfsmp
->loanedBlocks
+= loanedBlocks
;
2134 HFS_MOUNT_UNLOCK(hfsmp
, TRUE
);
2136 hfs_systemfile_unlock(hfsmp
, lockflags
);
2137 cp
->c_flag
|= C_MODIFIED
;
2139 (void) hfs_update(vp
, TRUE
);
2140 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
2142 hfs_end_transaction(hfsmp
);
2149 retval
= MapFileBlockC(hfsmp
, (FCB
*)fp
, ap
->a_size
, ap
->a_foffset
,
2150 ap
->a_bpn
, &bytesContAvail
);
2152 hfs_systemfile_unlock(hfsmp
, lockflags
);
2157 (void) hfs_update(vp
, TRUE
);
2158 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
2159 hfs_end_transaction(hfsmp
);
2163 /* On write, always return error because virtual blocks, if any,
2164 * should have been allocated in ExtendFileC(). We do not
2165 * allocate virtual blocks on read, therefore return error
2166 * only if no virtual blocks are allocated. Otherwise we search
2167 * rangelist for zero-fills
2169 if ((MacToVFSError(retval
) != ERANGE
) ||
2170 (ap
->a_flags
& VNODE_WRITE
) ||
2171 ((ap
->a_flags
& VNODE_READ
) && (fp
->ff_unallocblocks
== 0))) {
2175 /* Validate if the start offset is within logical file size */
2176 if (ap
->a_foffset
> fp
->ff_size
) {
2180 /* Searching file extents has failed for read operation, therefore
2181 * search rangelist for any uncommitted holes in the file.
2183 overlaptype
= rl_scan(&fp
->ff_invalidranges
, ap
->a_foffset
,
2184 ap
->a_foffset
+ (off_t
)(ap
->a_size
- 1),
2186 switch(overlaptype
) {
2187 case RL_OVERLAPISCONTAINED
:
2188 /* start_offset <= rl_start, end_offset >= rl_end */
2189 if (ap
->a_foffset
!= invalid_range
->rl_start
) {
2192 case RL_MATCHINGOVERLAP
:
2193 /* start_offset = rl_start, end_offset = rl_end */
2194 case RL_OVERLAPCONTAINSRANGE
:
2195 /* start_offset >= rl_start, end_offset <= rl_end */
2196 case RL_OVERLAPSTARTSBEFORE
:
2197 /* start_offset > rl_start, end_offset >= rl_start */
2198 if ((off_t
)fp
->ff_size
> (invalid_range
->rl_end
+ 1)) {
2199 bytesContAvail
= (invalid_range
->rl_end
+ 1) - ap
->a_foffset
;
2201 bytesContAvail
= fp
->ff_size
- ap
->a_foffset
;
2203 if (bytesContAvail
> ap
->a_size
) {
2204 bytesContAvail
= ap
->a_size
;
2206 *ap
->a_bpn
= (daddr64_t
)-1;
2209 case RL_OVERLAPENDSAFTER
:
2210 /* start_offset < rl_start, end_offset < rl_end */
2217 /* MapFileC() found a valid extent in the filefork. Search the
2218 * mapping information further for invalid file ranges
2220 overlaptype
= rl_scan(&fp
->ff_invalidranges
, ap
->a_foffset
,
2221 ap
->a_foffset
+ (off_t
)bytesContAvail
- 1,
2223 if (overlaptype
!= RL_NOOVERLAP
) {
2224 switch(overlaptype
) {
2225 case RL_MATCHINGOVERLAP
:
2226 case RL_OVERLAPCONTAINSRANGE
:
2227 case RL_OVERLAPSTARTSBEFORE
:
2228 /* There's no valid block for this byte offset */
2229 *ap
->a_bpn
= (daddr64_t
)-1;
2230 /* There's no point limiting the amount to be returned
2231 * if the invalid range that was hit extends all the way
2232 * to the EOF (i.e. there's no valid bytes between the
2233 * end of this range and the file's EOF):
2235 if (((off_t
)fp
->ff_size
> (invalid_range
->rl_end
+ 1)) &&
2236 (invalid_range
->rl_end
+ 1 - ap
->a_foffset
< bytesContAvail
)) {
2237 bytesContAvail
= invalid_range
->rl_end
+ 1 - ap
->a_foffset
;
2241 case RL_OVERLAPISCONTAINED
:
2242 case RL_OVERLAPENDSAFTER
:
2243 /* The range of interest hits an invalid block before the end: */
2244 if (invalid_range
->rl_start
== ap
->a_foffset
) {
2245 /* There's actually no valid information to be had starting here: */
2246 *ap
->a_bpn
= (daddr64_t
)-1;
2247 if (((off_t
)fp
->ff_size
> (invalid_range
->rl_end
+ 1)) &&
2248 (invalid_range
->rl_end
+ 1 - ap
->a_foffset
< bytesContAvail
)) {
2249 bytesContAvail
= invalid_range
->rl_end
+ 1 - ap
->a_foffset
;
2252 bytesContAvail
= invalid_range
->rl_start
- ap
->a_foffset
;
2259 if (bytesContAvail
> ap
->a_size
)
2260 bytesContAvail
= ap
->a_size
;
2266 *ap
->a_run
= bytesContAvail
;
2269 *(int *)ap
->a_poff
= 0;
2275 return (MacToVFSError(retval
));
2280 * prepare and issue the I/O
2281 * buf_strategy knows how to deal
2282 * with requests that require
2286 hfs_vnop_strategy(struct vnop_strategy_args
*ap
)
2288 buf_t bp
= ap
->a_bp
;
2289 vnode_t vp
= buf_vnode(bp
);
2291 return (buf_strategy(VTOHFS(vp
)->hfs_devvp
, ap
));
/*
 * Truncate a cnode to at most length size, freeing (or adding) the
 * disk blocks.
 */
static int
do_hfs_truncate(struct vnode *vp, off_t length, int flags, vfs_context_t context)
{
	register struct cnode *cp = VTOC(vp);
	struct filefork *fp = VTOF(vp);
	struct proc *p = vfs_context_proc(context);
	kauth_cred_t cred = vfs_context_ucred(context);
	int retval;
	off_t bytesToAdd;
	off_t actualBytesAdded;
	off_t filebytes;
	u_long fileblocks;
	int blksize;
	struct hfsmount *hfsmp;
	int lockflags;

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
		 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);

	if (length < 0)
		return (EINVAL);

	/* This should only happen with a corrupt filesystem */
	if ((off_t)fp->ff_size < 0)
		return (EINVAL);

	if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
		return (EFBIG);

	hfsmp = VTOHFS(vp);

	retval = E_NONE;

	/* Files that are changing size are not hot file candidates. */
	if (hfsmp->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;
	}

	/*
	 * We cannot just check if fp->ff_size == length (as an optimization)
	 * since there may be extra physical blocks that also need truncation.
	 */
#if QUOTA
	if ((retval = hfs_getinoquota(cp)))
		return (retval);
#endif /* QUOTA */

	/*
	 * Lengthen the size of the file. We must ensure that the
	 * last byte of the file is allocated. Since the smallest
	 * value of ff_size is 0, length will be at least 1.
	 */
	if (length > (off_t)fp->ff_size) {
#if QUOTA
		retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
				   cred, 0);
		if (retval)
			goto Err_Exit;
#endif /* QUOTA */
		/*
		 * If we don't have enough physical space then
		 * we need to extend the physical size.
		 */
		if (length > filebytes) {
			int eflags;
			u_long blockHint = 0;

			/* All or nothing and don't round up to clumpsize. */
			eflags = kEFAllMask | kEFNoClumpMask;

			if (cred && suser(cred, NULL) != 0)
				eflags |= kEFReserveMask;  /* keep a reserve */

			/*
			 * Allocate Journal and Quota files in metadata zone.
			 */
			if (filebytes == 0 &&
			    hfsmp->hfs_flags & HFS_METADATA_ZONE &&
			    hfs_virtualmetafile(cp)) {
				eflags |= kEFMetadataMask;
				blockHint = hfsmp->hfs_metazone_start;
			}
			if (hfs_start_transaction(hfsmp) != 0) {
				retval = EINVAL;
				goto Err_Exit;
			}

			/* Protect extents b-tree and allocation bitmap */
			lockflags = SFL_BITMAP;
			if (overflow_extents(fp))
				lockflags |= SFL_EXTENTS;
			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

			while ((length > filebytes) && (retval == E_NONE)) {
				bytesToAdd = length - filebytes;
				retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
								   (FCB*)fp,
								   bytesToAdd,
								   blockHint,
								   eflags,
								   &actualBytesAdded));

				filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
				if (actualBytesAdded == 0 && retval == E_NONE) {
					if (length > filebytes)
						length = filebytes;
					break;
				}
			} /* endwhile */

			hfs_systemfile_unlock(hfsmp, lockflags);

			if (hfsmp->jnl) {
				(void) hfs_update(vp, TRUE);
				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
			}

			hfs_end_transaction(hfsmp);

			if (retval)
				goto Err_Exit;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
				(int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
		}
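		/*
		 * Growing the file leaves a gap with no on-disk contents between
		 * the old and the new EOF.  The block below either zero-fills the
		 * tail of the current last page right away (cluster_write with
		 * IO_HEADZEROFILL) or records the region in ff_invalidranges so it
		 * is zeroed lazily before it can ever be read back.
		 */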
		if (!(flags & IO_NOZEROFILL)) {
			if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) {
				struct rl_entry *invalid_range;
				off_t zero_limit;

				zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
				if (length < zero_limit) zero_limit = length;

				if (length > (off_t)fp->ff_size) {
					struct timeval tv;

					/* Extending the file: time to fill out the current last page w. zeroes? */
					if ((fp->ff_size & PAGE_MASK_64) &&
					    (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
					    fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {

						/* There's some valid data at the start of the (current) last page
						   of the file, so zero out the remainder of that page to ensure the
						   entire page contains valid data.  Since there is no invalid range
						   possible past the (current) eof, there's no need to remove anything
						   from the invalid range list before calling cluster_write(): */
						hfs_unlock(cp);
						retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
								fp->ff_size, (off_t)0,
								(flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
						hfs_lock(cp, HFS_FORCE_LOCK);
						if (retval) goto Err_Exit;

						/* Merely invalidate the remaining area, if necessary: */
						if (length > zero_limit) {
							microuptime(&tv);
							rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
							cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
						}
					} else {
						/* The page containing the (current) eof is invalid: just add the
						   remainder of the page to the invalid list, along with the area
						   being newly allocated:
						 */
						microuptime(&tv);
						rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
						cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
					}
				}
			} else {
				panic("hfs_truncate: invoked on non-UBC object?!");
			}
		}
		cp->c_touch_modtime = TRUE;
		fp->ff_size = length;
	} else { /* Shorten the size of the file */

		if ((off_t)fp->ff_size > length) {
			/* Any space previously marked as invalid is now irrelevant: */
			rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
		}
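		/*
		 * Note: ff_unallocblocks counts "loaned" blocks, blocks charged to
		 * the file (and to hfsmp->loanedBlocks) for delayed allocation but
		 * not yet mapped to on-disk extents.  Shrinking the fork returns
		 * the loan and, if the new length still needs them, borrows back
		 * just enough blocks to cover it.
		 */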
		/*
		 * Account for any unmapped blocks. Note that the new
		 * file length can still end up with unmapped blocks.
		 */
		if (fp->ff_unallocblocks > 0) {
			u_int32_t finalblks;
			u_int32_t loanedBlocks;

			HFS_MOUNT_LOCK(hfsmp, TRUE);

			loanedBlocks = fp->ff_unallocblocks;
			cp->c_blocks -= loanedBlocks;
			fp->ff_blocks -= loanedBlocks;
			fp->ff_unallocblocks = 0;

			hfsmp->loanedBlocks -= loanedBlocks;

			finalblks = (length + blksize - 1) / blksize;
			if (finalblks > fp->ff_blocks) {
				/* calculate required unmapped blocks */
				loanedBlocks = finalblks - fp->ff_blocks;
				hfsmp->loanedBlocks += loanedBlocks;

				fp->ff_unallocblocks = loanedBlocks;
				cp->c_blocks += loanedBlocks;
				fp->ff_blocks += loanedBlocks;
			}
			HFS_MOUNT_UNLOCK(hfsmp, TRUE);
		}

		/*
		 * For a TBE process the deallocation of the file blocks is
		 * delayed until the file is closed. And hfs_close calls
		 * truncate with the IO_NDELAY flag set. So when IO_NDELAY
		 * isn't set, we make sure this isn't a TBE process.
		 */
		if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
#if QUOTA
			off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
#endif /* QUOTA */
			if (hfs_start_transaction(hfsmp) != 0) {
				retval = EINVAL;
				goto Err_Exit;
			}

			if (fp->ff_unallocblocks == 0) {
				/* Protect extents b-tree and allocation bitmap */
				lockflags = SFL_BITMAP;
				if (overflow_extents(fp))
					lockflags |= SFL_EXTENTS;
				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

				retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
						(FCB*)fp, length, false));

				hfs_systemfile_unlock(hfsmp, lockflags);
			}
			if (hfsmp->jnl) {
				if (retval == 0) {
					fp->ff_size = length;
				}
				(void) hfs_update(vp, TRUE);
				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
			}

			hfs_end_transaction(hfsmp);

			filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
			if (retval)
				goto Err_Exit;
#if QUOTA
			/* These are bytesreleased */
			(void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
#endif /* QUOTA */
		}
		/* Only set update flag if the logical length changes */
		if ((off_t)fp->ff_size != length)
			cp->c_touch_modtime = TRUE;
		fp->ff_size = length;
	}
	cp->c_touch_chgtime = TRUE;	/* status changed */
	cp->c_touch_modtime = TRUE;	/* file data was modified */
	retval = hfs_update(vp, MNT_WAIT);
	if (retval) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
			-1, -1, -1, retval, 0);
	}

Err_Exit:

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
		 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);

	return (retval);
}
/*
 * Truncate a cnode to at most length size, freeing (or adding) the
 * disk blocks.
 */
__private_extern__
int
hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
             vfs_context_t context)
{
	struct filefork *fp = VTOF(vp);
	off_t filebytes;
	u_long fileblocks;
	int blksize, error = 0;
	struct cnode *cp = VTOC(vp);

	/* Cannot truncate an HFS directory! */
	if (vnode_isdir(vp)) {
		return (EISDIR);
	}
	/* A swap file cannot change size. */
	if (vnode_isswap(vp) && (length != 0)) {
		return (EPERM);
	}

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	//
	// Have to do this here so that we don't wind up with
	// i/o pending for blocks that are about to be released
	// if we truncate the file.
	//
	// If skipsetsize is set, then the caller is responsible
	// for the ubc_setsize.
	//
	if (!skipsetsize)
		ubc_setsize(vp, length);

	// have to loop truncating or growing files that are
	// really big because otherwise transactions can get
	// enormous and consume too many kernel resources.

	if (length < filebytes) {
		while (filebytes > length) {
			if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
				filebytes -= HFS_BIGFILE_SIZE;
			} else {
				filebytes = length;
			}
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, context);
			if (error)
				break;
		}
	} else if (length > filebytes) {
		while (filebytes < length) {
			if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
				filebytes += HFS_BIGFILE_SIZE;
			} else {
				filebytes = length;
			}
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, context);
			if (error)
				break;
		}
	} else /* Same logical size */ {

		error = do_hfs_truncate(vp, length, flags, context);
	}
	/* Files that are changing size are not hot file candidates. */
	if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;
	}

	return (error);
}
/*
 * Preallocate file storage space.
 */
int
hfs_vnop_allocate(struct vnop_allocate_args /* {
		vnode_t a_vp;
		off_t a_length;
		u_int32_t  a_flags;
		off_t *a_bytesallocated;
		off_t a_offset;
		vfs_context_t a_context;
	} */ *ap)
{
	struct vnode *vp = ap->a_vp;
	struct cnode *cp;
	struct filefork *fp;
	ExtendedVCB *vcb;
	off_t length = ap->a_length;
	off_t startingPEOF;
	off_t moreBytesRequested;
	off_t actualBytesAdded;
	off_t filebytes;
	u_long fileblocks;
	int retval, retval2;
	u_int32_t blockHint;
	u_int32_t extendFlags;   /* For call to ExtendFileC */
	struct hfsmount *hfsmp;
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	int lockflags;

	*(ap->a_bytesallocated) = 0;

	if (!vnode_isreg(vp))
		return (EISDIR);
	if (length < (off_t)0)
		return (EINVAL);

	cp = VTOC(vp);

	hfs_lock_truncate(cp, TRUE);

	if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
		goto Err_Exit;
	}

	fp = VTOF(vp);
	hfsmp = VTOHFS(vp);
	vcb = VTOVCB(vp);

	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;

	if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
		retval = EINVAL;
		goto Err_Exit;
	}
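	/*
	 * The a_flags bits come from the VNOP_ALLOCATE interface:
	 * ALLOCATECONTIG asks for contiguous space, ALLOCATEALL makes the
	 * request all-or-nothing, ALLOCATEFROMPEOF treats a_length as an
	 * amount beyond the current physical EOF, and ALLOCATEFROMVOL uses
	 * a_offset as a volume-relative allocation hint.
	 */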
	/* Fill in the flags word for the call to Extend the file */

	extendFlags = kEFNoClumpMask;
	if (ap->a_flags & ALLOCATECONTIG)
		extendFlags |= kEFContigMask;
	if (ap->a_flags & ALLOCATEALL)
		extendFlags |= kEFAllMask;
	if (cred && suser(cred, NULL) != 0)
		extendFlags |= kEFReserveMask;

	retval = E_NONE;
	blockHint = 0;
	startingPEOF = filebytes;

	if (ap->a_flags & ALLOCATEFROMPEOF)
		length += filebytes;
	else if (ap->a_flags & ALLOCATEFROMVOL)
		blockHint = ap->a_offset / VTOVCB(vp)->blockSize;

	/* If no changes are necessary, then we're done */
	if (filebytes == length)
		goto Std_Exit;

	/*
	 * Lengthen the size of the file. We must ensure that the
	 * last byte of the file is allocated. Since the smallest
	 * value of filebytes is 0, length will be at least 1.
	 */
	if (length > filebytes) {
		off_t total_bytes_added = 0, orig_request_size;

		orig_request_size = moreBytesRequested = length - filebytes;

#if QUOTA
		retval = hfs_chkdq(cp,
				(int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
				cred, 0);
		if (retval)
			goto Err_Exit;

#endif /* QUOTA */
		/*
		 * Metadata zone checks.
		 */
		if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
			/*
			 * Allocate Journal and Quota files in metadata zone.
			 */
			if (hfs_virtualmetafile(cp)) {
				extendFlags |= kEFMetadataMask;
				blockHint = hfsmp->hfs_metazone_start;
			} else if ((blockHint >= hfsmp->hfs_metazone_start) &&
				   (blockHint <= hfsmp->hfs_metazone_end)) {
				/*
				 * Move blockHint outside metadata zone.
				 */
				blockHint = hfsmp->hfs_metazone_end + 1;
			}
		}

		while ((length > filebytes) && (retval == E_NONE)) {
			off_t bytesRequested;

			if (hfs_start_transaction(hfsmp) != 0) {
				retval = EINVAL;
				goto Err_Exit;
			}

			/* Protect extents b-tree and allocation bitmap */
			lockflags = SFL_BITMAP;
			if (overflow_extents(fp))
				lockflags |= SFL_EXTENTS;
			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

			if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
				bytesRequested = HFS_BIGFILE_SIZE;
			} else {
				bytesRequested = moreBytesRequested;
			}

			retval = MacToVFSError(ExtendFileC(vcb,
							   (FCB*)fp,
							   bytesRequested,
							   blockHint,
							   extendFlags,
							   &actualBytesAdded));

			if (retval == E_NONE) {
				*(ap->a_bytesallocated) += actualBytesAdded;
				total_bytes_added += actualBytesAdded;
				moreBytesRequested -= actualBytesAdded;
				if (blockHint != 0) {
					blockHint += actualBytesAdded / vcb->blockSize;
				}
			}
			filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;

			hfs_systemfile_unlock(hfsmp, lockflags);

			if (hfsmp->jnl) {
				(void) hfs_update(vp, TRUE);
				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
			}

			hfs_end_transaction(hfsmp);
		}

		/*
		 * if we get an error and no changes were made then exit
		 * otherwise we must do the hfs_update to reflect the changes
		 */
		if (retval && (startingPEOF == filebytes))
			goto Err_Exit;

		/*
		 * Adjust actualBytesAdded to be allocation block aligned, not
		 * clump size aligned.
		 * NOTE: So what we are reporting does not affect reality
		 * until the file is closed, when we truncate the file to allocation
		 * block size.
		 */
		if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
			*(ap->a_bytesallocated) =
				roundup(orig_request_size, (off_t)vcb->blockSize);

	} else { /* Shorten the size of the file */

		if (fp->ff_size > length) {
			/*
			 * Any buffers that are past the truncation point need to be
			 * invalidated (to maintain buffer cache consistency).
			 */
		}

		retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
		filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;

		/*
		 * if we get an error and no changes were made then exit
		 * otherwise we must do the hfs_update to reflect the changes
		 */
		if (retval && (startingPEOF == filebytes)) goto Err_Exit;
#if QUOTA
		/* These are bytesreleased */
		(void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED, 0);
#endif /* QUOTA */

		if (fp->ff_size > filebytes) {
			fp->ff_size = filebytes;

			hfs_unlock(cp);
			ubc_setsize(vp, fp->ff_size);
			hfs_lock(cp, HFS_FORCE_LOCK);
		}
	}

Std_Exit:
	cp->c_touch_chgtime = TRUE;
	cp->c_touch_modtime = TRUE;
	retval2 = hfs_update(vp, MNT_WAIT);

	if (retval == 0)
		retval = retval2;
Err_Exit:
	hfs_unlock_truncate(cp, TRUE);
	hfs_unlock(cp);
	return (MacToVFSError(retval));
}
/*
 * Pagein for HFS filesystem
 */
int
hfs_vnop_pagein(struct vnop_pagein_args *ap)
/*
	struct vnop_pagein_args {
		vnode_t       a_vp,
		upl_t         a_pl,
		vm_offset_t   a_pl_offset,
		off_t         a_f_offset,
		size_t        a_size,
		int           a_flags
		vfs_context_t a_context;
	};
*/
{
	vnode_t vp = ap->a_vp;
	int error;

	error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
	                       ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags);
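	/*
	 * While the hot-file recording stage is active, the bytes brought in by
	 * each pagein are accumulated in ff_bytesread; hot file clustering later
	 * uses these per-file totals when deciding which files to treat as hot.
	 */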
	/*
	 * Keep track of blocks read.
	 */
	if (!vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
		struct cnode *cp;
		struct filefork *fp;
		int bytesread;
		int took_cnode_lock = 0;

		cp = VTOC(vp);
		fp = VTOF(vp);

		if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
			bytesread = fp->ff_size;
		else
			bytesread = ap->a_size;

		/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
		if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
			hfs_lock(cp, HFS_FORCE_LOCK);
			took_cnode_lock = 1;
		}
		/*
		 * If this file hasn't been seen since the start of
		 * the current sampling period then start over.
		 */
		if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
			struct timeval tv;

			fp->ff_bytesread = bytesread;
			microtime(&tv);
			cp->c_atime = tv.tv_sec;
		} else {
			fp->ff_bytesread += bytesread;
		}
		cp->c_touch_acctime = TRUE;
		if (took_cnode_lock)
			hfs_unlock(cp);
	}
	return (error);
}
/*
 * Pageout for HFS filesystem.
 */
int
hfs_vnop_pageout(struct vnop_pageout_args *ap)
/*
	struct vnop_pageout_args {
		vnode_t       a_vp,
		upl_t         a_pl,
		vm_offset_t   a_pl_offset,
		off_t         a_f_offset,
		size_t        a_size,
		int           a_flags
		vfs_context_t a_context;
	};
*/
{
	vnode_t vp = ap->a_vp;
	struct cnode *cp;
	struct filefork *fp;
	int retval = 0;
	off_t filesize;

	cp = VTOC(vp);
	fp = VTOF(vp);

	/*
	 * Figure out where the file ends, for pageout purposes. If
	 * ff_new_size > ff_size, then we're in the middle of extending the
	 * file via a write, so it is safe (and necessary) that we be able
	 * to pageout up to that point.
	 */
	filesize = fp->ff_size;
	if (fp->ff_new_size > filesize)
		filesize = fp->ff_new_size;

	if (!vnode_isswap(vp)) {
		off_t end_of_range;
		int tooklock = 0;

		if (cp->c_lockowner != current_thread()) {
			if ( (retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
				if (!(ap->a_flags & UPL_NOCOMMIT)) {
					ubc_upl_abort_range(ap->a_pl,
					                    ap->a_pl_offset,
					                    ap->a_size,
					                    UPL_ABORT_FREE_ON_EMPTY);
				}
				return (retval);
			}
			tooklock = 1;
		}
		end_of_range = ap->a_f_offset + ap->a_size - 1;

		if (end_of_range >= filesize) {
			end_of_range = (off_t)(filesize - 1);
		}
		if (ap->a_f_offset < filesize) {
			rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
			cp->c_flag |= C_MODIFIED;  /* leof is dirty */
		}

		if (tooklock) {
			hfs_unlock(cp);
		}
	}

	retval = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
	                         ap->a_size, filesize, ap->a_flags);

	/*
	 * If data was written, and setuid or setgid bits are set and
	 * this process is not the superuser then clear the setuid and
	 * setgid bits as a precaution against tampering.
	 */
	if ((retval == 0) &&
	    (cp->c_mode & (S_ISUID | S_ISGID)) &&
	    (vfs_context_suser(ap->a_context) != 0)) {
		hfs_lock(cp, HFS_FORCE_LOCK);
		cp->c_mode &= ~(S_ISUID | S_ISGID);
		cp->c_touch_chgtime = TRUE;
		hfs_unlock(cp);
	}
	return (retval);
}
/*
 * Intercept B-Tree node writes to unswap them if necessary.
 */
int
hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
{
	int retval = 0;
	register struct buf *bp = ap->a_bp;
	register struct vnode *vp = buf_vnode(bp);
	BlockDescriptor block;

	/* Trap B-Tree writes */
	if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
	    (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
	    (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
	    (vp == VTOHFS(vp)->hfc_filevp)) {

		/*
		 * Swap and validate the node if it is in native byte order.
		 * This is always true on big endian, so we always validate
		 * before writing here. On little endian, the node typically has
		 * been swapped and validated when it was written to the journal,
		 * so we won't do anything here.
		 */
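		/*
		 * The last two bytes of a B-tree node hold the offset of the node's
		 * first record, which is always 14 (0x000e) since records start
		 * right after the node descriptor.  Finding that value in host byte
		 * order identifies a node that has not been swapped yet.
		 */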
		if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
			/* Prepare the block pointer */
			block.blockHeader = bp;
			block.buffer = (char *)buf_dataptr(bp);
			block.blockNum = buf_lblkno(bp);
			/* not found in cache ==> came from disk */
			block.blockReadFromDisk = (buf_fromcache(bp) == 0);
			block.blockSize = buf_count(bp);

			/* Endian un-swap B-Tree node */
			retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
			if (retval)
				panic("hfs_vnop_bwrite: about to write corrupt node!\n");
		}
	}

	/* This buffer shouldn't be locked anymore but if it is clear it */
	if ((buf_flags(bp) & B_LOCKED)) {
		// XXXdbg
		if (VTOHFS(vp)->jnl) {
			panic("hfs: CLEARING the lock bit on bp %p\n", bp);
		}
		buf_clearflags(bp, B_LOCKED);
	}
	retval = vn_bwrite (ap);

	return (retval);
}
/*
 * Relocate a file to a new location on disk
 *  cnode must be locked on entry
 *
 * Relocation occurs by cloning the file's data from its
 * current set of blocks to a new set of blocks. During
 * the relocation all of the blocks (old and new) are
 * owned by the file.
 *
 * -----------------
 * |///////////////|
 * -----------------
 * 0               N (file offset)
 *
 * -----------------     -----------------
 * |///////////////|     |               |     STEP 1 (acquire new blocks)
 * -----------------     -----------------
 * 0               N     N+1              2N
 *
 * -----------------     -----------------
 * |///////////////|     |///////////////|     STEP 2 (clone data)
 * -----------------     -----------------
 * 0               N     N+1              2N
 *
 *                       -----------------
 *                       |///////////////|     STEP 3 (head truncate blocks)
 *                       -----------------
 *                       0               N (new file offset)
 *
 * During steps 2 and 3 page-outs to file offsets less
 * than or equal to N are suspended.
 *
 * During step 3 page-ins to the file get suspended.
 */
__private_extern__
int
hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
             struct proc *p)
{
	struct cnode *cp;
	struct filefork *fp;
	struct hfsmount *hfsmp;
	u_int32_t headblks;
	u_int32_t datablks;
	u_int32_t blksize;
	u_int32_t growsize;
	u_int32_t nextallocsave;
	daddr64_t sector_a, sector_b;
	int eflags;
	off_t newbytes;
	int retval;
	int lockflags = 0;
	int took_trunc_lock = 0;
	int started_tr = 0;
	enum vtype vnodetype;

	vnodetype = vnode_vtype(vp);
	if (vnodetype != VREG && vnodetype != VLNK) {
		return (EPERM);
	}

	hfsmp = VTOHFS(vp);
	if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
		return (ENOSPC);
	}

	cp = VTOC(vp);
	fp = VTOF(vp);
	if (fp->ff_unallocblocks)
		return (EINVAL);
	blksize = hfsmp->blockSize;
	if (blockHint == 0)
		blockHint = hfsmp->nextAllocation;

	if ((fp->ff_size > 0x7fffffff) ||
	    ((fp->ff_size > blksize) && vnodetype == VLNK)) {
		return (EFBIG);
	}

	//
	// We do not believe that this call to hfs_fsync() is
	// necessary and it causes a journal transaction
	// deadlock so we are removing it.
	//
	//if (vnodetype == VREG && !vnode_issystem(vp)) {
	//	retval = hfs_fsync(vp, MNT_WAIT, 0, p);
	//	if (retval)
	//		return (retval);
	//}

	if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
		hfs_unlock(cp);
		hfs_lock_truncate(cp, TRUE);
		/* Force lock since caller expects lock to be held. */
		if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
			hfs_unlock_truncate(cp, TRUE);
			return (retval);
		}
		/* No need to continue if file was removed. */
		if (cp->c_flag & C_NOEXISTS) {
			hfs_unlock_truncate(cp, TRUE);
			return (ENOENT);
		}
		took_trunc_lock = 1;
	}
	headblks = fp->ff_blocks;
	datablks = howmany(fp->ff_size, blksize);
	growsize = datablks * blksize;
	eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
	if (blockHint >= hfsmp->hfs_metazone_start &&
	    blockHint <= hfsmp->hfs_metazone_end)
		eflags |= kEFMetadataMask;

	if (hfs_start_transaction(hfsmp) != 0) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, TRUE);
		return (EINVAL);
	}

	/*
	 * Protect the extents b-tree and the allocation bitmap
	 * during MapFileBlockC and ExtendFileC operations.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

	retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
	if (retval) {
		retval = MacToVFSError(retval);
		goto out;
	}

	/*
	 * STEP 1 - acquire new allocation blocks.
	 */
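	/*
	 * The volume's rolling allocation pointer is saved here; for a
	 * metadata-zone allocation it is put back below so that ordinary
	 * allocations do not resume scanning from inside the metadata zone.
	 */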
	nextallocsave = hfsmp->nextAllocation;
	retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
	if (eflags & kEFMetadataMask) {
		HFS_MOUNT_LOCK(hfsmp, TRUE);
		HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
		MarkVCBDirty(hfsmp);
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
	}

	retval = MacToVFSError(retval);
	if (retval == 0) {
		cp->c_flag |= C_MODIFIED;
		if (newbytes < growsize) {
			retval = ENOSPC;
			goto restore;
		} else if (fp->ff_blocks < (headblks + datablks)) {
			printf("hfs_relocate: allocation failed");
			retval = ENOSPC;
			goto restore;
		}

		retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
		if (retval) {
			retval = MacToVFSError(retval);
		} else if ((sector_a + 1) == sector_b) {
			retval = ENOSPC;
			goto restore;
		} else if ((eflags & kEFMetadataMask) &&
		           ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
		              hfsmp->hfs_metazone_end)) {
			const char * filestr;
			char emptystr = '\0';

			if (cp->c_desc.cd_nameptr != NULL) {
				filestr = (const char *)&cp->c_desc.cd_nameptr[0];
			} else if (vnode_name(vp) != NULL) {
				filestr = vnode_name(vp);
			} else {
				filestr = &emptystr;
			}
			printf("hfs_relocate: %s didn't move into MDZ (%d blks)\n", filestr, fp->ff_blocks);
			retval = ENOSPC;
			goto restore;
		}
	}
	/* Done with system locks and journal for now. */
	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;
	hfs_end_transaction(hfsmp);
	started_tr = 0;

	if (retval) {
		/*
		 * Check to see if failure is due to excessive fragmentation.
		 */
		if ((retval == ENOSPC) &&
		    (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
			hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
		}
		goto out;
	}
	/*
	 * STEP 2 - clone file data into the new allocation blocks.
	 */

	if (vnodetype == VLNK)
		retval = hfs_clonelink(vp, blksize, cred, p);
	else if (vnode_issystem(vp))
		retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
	else
		retval = hfs_clonefile(vp, headblks, datablks, blksize);

	/* Start transaction for step 3 or for a restore. */
	if (hfs_start_transaction(hfsmp) != 0) {
		retval = EINVAL;
		goto out;
	}
	started_tr = 1;

	/*
	 * STEP 3 - switch to cloned data and remove old blocks.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

	retval = HeadTruncateFile(hfsmp, (FCB *)fp, headblks);

	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;
	if (retval)
		goto restore;
out:
	if (took_trunc_lock)
		hfs_unlock_truncate(cp, TRUE);

	if (lockflags) {
		hfs_systemfile_unlock(hfsmp, lockflags);
		lockflags = 0;
	}

	/* Push cnode's new extent data to disk. */
	if (retval == 0) {
		(void) hfs_update(vp, MNT_WAIT);
	}
	if (hfsmp->jnl) {
		if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
			(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
		else
			(void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
	}
exit:
	if (started_tr)
		hfs_end_transaction(hfsmp);

	return (retval);

restore:
	if (fp->ff_blocks == headblks) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, TRUE);
		goto exit;
	}
	/*
	 * Give back any newly allocated space.
	 */
	if (lockflags == 0) {
		lockflags = SFL_BITMAP;
		if (overflow_extents(fp))
			lockflags |= SFL_EXTENTS;
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
	}

	(void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false);

	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;

	if (took_trunc_lock)
		hfs_unlock_truncate(cp, TRUE);
	goto exit;
}
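/*
 * Clone a symlink's data.
 *
 * hfs_relocate() only relocates symlinks whose data fits in one allocation
 * block, so the clone is simply: read block 0 (the old copy), write the same
 * data into block 1 (the block just added by ExtendFileC), then invalidate
 * the stale buffers.
 */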
static int
hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
{
	struct buf *head_bp = NULL;
	struct buf *tail_bp = NULL;
	int error;

	error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);
	if (error)
		goto out;

	tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
	if (tail_bp == NULL) {
		error = EIO;
		goto out;
	}
	bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
	error = (int)buf_bwrite(tail_bp);
out:
	if (head_bp) {
		buf_markinvalid(head_bp);
		buf_brelse(head_bp);
	}
	(void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);

	return (error);
}
/*
 * Clone a file's data within the file.
 *
 */
static int
hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
{
	caddr_t bufp;
	size_t bufsize;
	size_t copysize;
	size_t iosize;
	size_t offset;
	off_t writebase;
	off_t filesize;
	uio_t auio;
	int error = 0;

	filesize = VTOF(vp)->ff_blocks * blksize;  /* virtual file size */
	writebase = blkstart * blksize;
	copysize = blkcnt * blksize;
	iosize = bufsize = MIN(copysize, 128 * 1024);
	offset = 0;

	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
		return (ENOMEM);
	}
	hfs_unlock(VTOC(vp));

	auio = uio_create(1, 0, UIO_SYSSPACE32, UIO_READ);

	while (offset < copysize) {
		iosize = MIN(copysize - offset, iosize);

		uio_reset(auio, offset, UIO_SYSSPACE32, UIO_READ);
		uio_addiov(auio, (uintptr_t)bufp, iosize);

		error = cluster_read(vp, auio, copysize, IO_NOCACHE);
		if (error) {
			printf("hfs_clonefile: cluster_read failed - %d\n", error);
			break;
		}
		if (uio_resid(auio) != 0) {
			printf("clonedata: cluster_read: uio_resid = %lld\n", uio_resid(auio));
			error = EIO;
			break;
		}

		uio_reset(auio, writebase + offset, UIO_SYSSPACE32, UIO_WRITE);
		uio_addiov(auio, (uintptr_t)bufp, iosize);

		error = cluster_write(vp, auio, filesize + offset,
		                      filesize + offset + iosize,
		                      uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
		if (error) {
			printf("hfs_clonefile: cluster_write failed - %d\n", error);
			break;
		}
		if (uio_resid(auio) != 0) {
			printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
			error = EIO;
			break;
		}
		offset += iosize;
	}
	uio_free(auio);

	/*
	 * No need to call ubc_sync_range or hfs_invalbuf
	 * since the file was copied using IO_NOCACHE.
	 */

	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

	hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
	return (error);
}
/*
 * Clone a system (metadata) file.
 *
 */
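/*
 * Unlike hfs_clonefile(), which copies through the cluster layer, this copy
 * uses buffer-cache reads and writes (buf_meta_bread/buf_bwrite), matching
 * how HFS accesses its metadata files.
 */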
static int
hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
                 kauth_cred_t cred, struct proc *p)
{
	caddr_t bufp;
	char *offset;
	size_t bufsize;
	size_t iosize;
	struct buf *bp = NULL;
	daddr64_t blkno;
	daddr64_t blk;
	daddr64_t start_blk;
	daddr64_t last_blk;
	int breadcnt;
	int i;
	int error = 0;

	iosize = GetLogicalBlockSize(vp);
	bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
	breadcnt = bufsize / iosize;

	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
		return (ENOMEM);
	}
	start_blk = ((daddr64_t)blkstart * blksize) / iosize;
	last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;
	blkno = 0;

	while (blkno < last_blk) {
		/*
		 * Read up to a megabyte
		 */
		offset = bufp;
		for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
			error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
			if (error) {
				printf("hfs_clonesysfile: meta_bread error %d\n", error);
				goto out;
			}
			if (buf_count(bp) != iosize) {
				printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
				goto out;
			}
			bcopy((char *)buf_dataptr(bp), offset, iosize);

			buf_markinvalid(bp);
			buf_brelse(bp);
			bp = NULL;

			offset += iosize;
		}

		/*
		 * Write up to a megabyte
		 */
		offset = bufp;
		for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
			bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
			if (bp == NULL) {
				printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
				error = EIO;
				goto out;
			}
			bcopy(offset, (char *)buf_dataptr(bp), iosize);
			error = (int)buf_bwrite(bp);
			bp = NULL;
			if (error)
				goto out;
			offset += iosize;
		}
	}
out:
	if (bp) {
		buf_brelse(bp);
	}

	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

	error = hfs_fsync(vp, MNT_WAIT, 0, p);

	return (error);
}