1 /*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* @(#)hfs_readwrite.c 1.0
29 *
30 * (c) 1998-2001 Apple Computer, Inc. All Rights Reserved
31 *
32 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
33 *
34 */
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/resourcevar.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/filedesc.h>
42 #include <sys/stat.h>
43 #include <sys/buf.h>
44 #include <sys/proc.h>
45 #include <sys/kauth.h>
46 #include <sys/vnode.h>
47 #include <sys/vnode_internal.h>
48 #include <sys/uio.h>
49 #include <sys/vfs_context.h>
50 #include <sys/fsevents.h>
51 #include <kern/kalloc.h>
52 #include <sys/disk.h>
53 #include <sys/sysctl.h>
54
55 #include <miscfs/specfs/specdev.h>
56
57 #include <sys/ubc.h>
58 #include <sys/ubc_internal.h>
59
60 #include <vm/vm_pageout.h>
61 #include <vm/vm_kern.h>
62
63 #include <sys/kdebug.h>
64
65 #include "hfs.h"
66 #include "hfs_attrlist.h"
67 #include "hfs_endian.h"
68 #include "hfs_fsctl.h"
69 #include "hfs_quota.h"
70 #include "hfscommon/headers/FileMgrInternal.h"
71 #include "hfscommon/headers/BTreesInternal.h"
72 #include "hfs_cnode.h"
73 #include "hfs_dbg.h"
74
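/* A transfer can use the cluster layer only when its size is a multiple of
 * 4 KB and no larger than half of MAXPHYSIO. */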
75 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
76
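/* 0x7FFFFFFF is 2 GB - 1, the file size limit on HFS standard volumes
 * (see the EFBIG check in hfs_vnop_read below). */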
77 enum {
78 MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */
79 };
80
81 /* from bsd/vfs/vfs_cluster.c */
82 extern int is_file_clean(vnode_t vp, off_t filesize);
83 /* from bsd/hfs/hfs_vfsops.c */
84 extern int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
85
86 static int hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
87 static int hfs_clonefile(struct vnode *, int, int, int);
88 static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
89
90 int flush_cache_on_write = 0;
91 SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
92
93
94 /*
95 * Read data from a file.
96 */
97 int
98 hfs_vnop_read(struct vnop_read_args *ap)
99 {
100 uio_t uio = ap->a_uio;
101 struct vnode *vp = ap->a_vp;
102 struct cnode *cp;
103 struct filefork *fp;
104 struct hfsmount *hfsmp;
105 off_t filesize;
106 off_t filebytes;
107 off_t start_resid = uio_resid(uio);
108 off_t offset = uio_offset(uio);
109 int retval = 0;
110
111
112 /* Preflight checks */
113 if (!vnode_isreg(vp)) {
114 /* can only read regular files */
115 if (vnode_isdir(vp))
116 return (EISDIR);
117 else
118 return (EPERM);
119 }
120 if (start_resid == 0)
121 return (0); /* Nothing left to do */
122 if (offset < 0)
123 return (EINVAL); /* can't read from a negative offset */
124
125 cp = VTOC(vp);
126 fp = VTOF(vp);
127 hfsmp = VTOHFS(vp);
128
129 /* Protect against a size change. */
130 hfs_lock_truncate(cp, 0);
131
132 filesize = fp->ff_size;
133 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
134 if (offset > filesize) {
135 if ((hfsmp->hfs_flags & HFS_STANDARD) &&
136 (offset > (off_t)MAXHFSFILESIZE)) {
137 retval = EFBIG;
138 }
139 goto exit;
140 }
141
142 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
143 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
144
145 retval = cluster_read(vp, uio, filesize, ap->a_ioflag);
146
147 cp->c_touch_acctime = TRUE;
148
149 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
150 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
151
152 /*
153 * Keep track of blocks read
154 */
155 if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
156 int took_cnode_lock = 0;
157 off_t bytesread;
158
159 bytesread = start_resid - uio_resid(uio);
160
161 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
162 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
163 hfs_lock(cp, HFS_FORCE_LOCK);
164 took_cnode_lock = 1;
165 }
166 /*
167 * If this file hasn't been seen since the start of
168 * the current sampling period then start over.
169 */
170 if (cp->c_atime < hfsmp->hfc_timebase) {
171 struct timeval tv;
172
173 fp->ff_bytesread = bytesread;
174 microtime(&tv);
175 cp->c_atime = tv.tv_sec;
176 } else {
177 fp->ff_bytesread += bytesread;
178 }
179 if (took_cnode_lock)
180 hfs_unlock(cp);
181 }
182 exit:
183 hfs_unlock_truncate(cp, 0);
184 return (retval);
185 }
186
187 /*
188 * Write data to a file.
189 */
190 int
191 hfs_vnop_write(struct vnop_write_args *ap)
192 {
193 uio_t uio = ap->a_uio;
194 struct vnode *vp = ap->a_vp;
195 struct cnode *cp;
196 struct filefork *fp;
197 struct hfsmount *hfsmp;
198 kauth_cred_t cred = NULL;
199 off_t origFileSize;
200 off_t writelimit;
201 off_t bytesToAdd = 0;
202 off_t actualBytesAdded;
203 off_t filebytes;
204 off_t offset;
205 size_t resid;
206 int eflags;
207 int ioflag = ap->a_ioflag;
208 int retval = 0;
209 int lockflags;
210 int cnode_locked = 0;
211 int partialwrite = 0;
212 int exclusive_lock = 0;
213
214 // LP64todo - fix this! uio_resid may be a 64-bit value
215 resid = uio_resid(uio);
216 offset = uio_offset(uio);
217
218 if (ioflag & IO_APPEND) {
219 exclusive_lock = 1;
220 }
221
222 if (offset < 0)
223 return (EINVAL);
224 if (resid == 0)
225 return (E_NONE);
226 if (!vnode_isreg(vp))
227 return (EPERM); /* Can only write regular files */
228
229 cp = VTOC(vp);
230 fp = VTOF(vp);
231 hfsmp = VTOHFS(vp);
232
233 eflags = kEFDeferMask; /* defer file block allocations */
234 #ifdef HFS_SPARSE_DEV
235 /*
236 * When the underlying device is sparse and space
237 * is low (< 8MB), stop doing delayed allocations
238 * and begin doing synchronous I/O.
239 */
240 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
241 (hfs_freeblks(hfsmp, 0) < 2048)) {
242 eflags &= ~kEFDeferMask;
243 ioflag |= IO_SYNC;
244 }
245 #endif /* HFS_SPARSE_DEV */
246
247 again:
248 /* Protect against a size change. */
249 hfs_lock_truncate(cp, exclusive_lock);
250
251 if (ioflag & IO_APPEND) {
252 uio_setoffset(uio, fp->ff_size);
253 offset = fp->ff_size;
254 }
255 if ((cp->c_flags & APPEND) && offset != fp->ff_size) {
256 retval = EPERM;
257 goto exit;
258 }
259
260 origFileSize = fp->ff_size;
261 writelimit = offset + resid;
262 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
263
264 /* If the truncate lock is shared, and if we either have virtual
265 * blocks or will need to extend the file, upgrade the truncate
266 * lock to exclusive. If the upgrade fails, we lose the lock and
267 * have to acquire the exclusive lock again.
268 */
269 if ((exclusive_lock == 0) &&
270 ((fp->ff_unallocblocks != 0) || (writelimit > filebytes))) {
271 exclusive_lock = 1;
272 /* Lock upgrade failed and we lost our shared lock, try again */
273 if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
274 goto again;
275 }
276 }
277
278 if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
279 goto exit;
280 }
281 cnode_locked = 1;
282
283 if (!exclusive_lock) {
284 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
285 (int)offset, uio_resid(uio), (int)fp->ff_size,
286 (int)filebytes, 0);
287 }
288
289 /* Check if we do not need to extend the file */
290 if (writelimit <= filebytes) {
291 goto sizeok;
292 }
293
294 cred = vfs_context_ucred(ap->a_context);
295 bytesToAdd = writelimit - filebytes;
296
297 #if QUOTA
298 retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
299 cred, 0);
300 if (retval)
301 goto exit;
302 #endif /* QUOTA */
303
304 if (hfs_start_transaction(hfsmp) != 0) {
305 retval = EINVAL;
306 goto exit;
307 }
308
309 while (writelimit > filebytes) {
310 bytesToAdd = writelimit - filebytes;
311 if (cred && suser(cred, NULL) != 0)
312 eflags |= kEFReserveMask;
313
314 /* Protect extents b-tree and allocation bitmap */
315 lockflags = SFL_BITMAP;
316 if (overflow_extents(fp))
317 lockflags |= SFL_EXTENTS;
318 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
319
320 /* Files that are changing size are not hot file candidates. */
321 if (hfsmp->hfc_stage == HFC_RECORDING) {
322 fp->ff_bytesread = 0;
323 }
324 retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
325 0, eflags, &actualBytesAdded));
326
327 hfs_systemfile_unlock(hfsmp, lockflags);
328
329 if ((actualBytesAdded == 0) && (retval == E_NONE))
330 retval = ENOSPC;
331 if (retval != E_NONE)
332 break;
333 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
334 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
335 (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
336 }
337 (void) hfs_update(vp, TRUE);
338 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
339 (void) hfs_end_transaction(hfsmp);
340
341 /*
342 * If we didn't grow the file enough, try a partial write.
343 * POSIX expects this behavior.
344 */
345 if ((retval == ENOSPC) && (filebytes > offset)) {
346 retval = 0;
347 partialwrite = 1;
348 uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
349 resid -= bytesToAdd;
350 writelimit = filebytes;
351 }
352 sizeok:
353 if (retval == E_NONE) {
354 off_t filesize;
355 off_t zero_off;
356 off_t tail_off;
357 off_t inval_start;
358 off_t inval_end;
359 off_t io_start;
360 int lflag;
361 struct rl_entry *invalid_range;
362
363 if (writelimit > fp->ff_size)
364 filesize = writelimit;
365 else
366 filesize = fp->ff_size;
367
368 lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
369
370 if (offset <= fp->ff_size) {
371 zero_off = offset & ~PAGE_MASK_64;
372
373 /* Check whether the area between zero_off and the start
374 of the transfer is invalid and should be zero-filled
375 as part of the transfer:
376 */
377 if (offset > zero_off) {
378 if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
379 lflag |= IO_HEADZEROFILL;
380 }
381 } else {
382 off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;
383
384 /* The bytes between fp->ff_size and uio->uio_offset must never be
385 read without being zeroed. The current last block is filled with zeroes
386 if it holds valid data but in all cases merely do a little bookkeeping
387 to track the area from the end of the current last page to the start of
388 the area actually written. For the same reason only the bytes up to the
389 start of the page where this write will start are invalidated; any remainder
390 before uio->uio_offset is explicitly zeroed as part of the cluster_write.
391
392 Note that inval_start, the start of the page after the current EOF,
393 may be past the start of the write, in which case the zeroing
394 will be handled by the cluster_write of the actual data.
395 */
396 inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
397 inval_end = offset & ~PAGE_MASK_64;
398 zero_off = fp->ff_size;
399
400 if ((fp->ff_size & PAGE_MASK_64) &&
401 (rl_scan(&fp->ff_invalidranges,
402 eof_page_base,
403 fp->ff_size - 1,
404 &invalid_range) != RL_NOOVERLAP)) {
405 /* The page containing the EOF is not valid, so the
406 entire page must be made inaccessible now. If the write
407 starts on a page beyond the page containing the eof
408 (inval_end > eof_page_base), add the
409 whole page to the range to be invalidated. Otherwise
410 (i.e. if the write starts on the same page), zero-fill
411 the entire page explicitly now:
412 */
413 if (inval_end > eof_page_base) {
414 inval_start = eof_page_base;
415 } else {
416 zero_off = eof_page_base;
417 };
418 };
419
420 if (inval_start < inval_end) {
421 struct timeval tv;
422 /* There's some range of data that's going to be marked invalid */
423
424 if (zero_off < inval_start) {
425 /* The pages between inval_start and inval_end are going to be invalidated,
426 and the actual write will start on a page past inval_end. Now's the last
427 chance to zero-fill the page containing the EOF:
428 */
429 hfs_unlock(cp);
430 cnode_locked = 0;
431 retval = cluster_write(vp, (uio_t) 0,
432 fp->ff_size, inval_start,
433 zero_off, (off_t)0,
434 lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
435 hfs_lock(cp, HFS_FORCE_LOCK);
436 cnode_locked = 1;
437 if (retval) goto ioerr_exit;
438 offset = uio_offset(uio);
439 };
440
441 /* Mark the remaining area of the newly allocated space as invalid: */
442 rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
443 microuptime(&tv);
444 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
445 zero_off = fp->ff_size = inval_end;
446 };
447
448 if (offset > zero_off) lflag |= IO_HEADZEROFILL;
449 };
450
451 /* Check to see whether the area between the end of the write and the end of
452 the page it falls in is invalid and should be zero-filled as part of the transfer:
453 */
454 tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
455 if (tail_off > filesize) tail_off = filesize;
456 if (tail_off > writelimit) {
457 if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
458 lflag |= IO_TAILZEROFILL;
459 };
460 };
461
462 /*
463 * if the write starts beyond the current EOF (possibly advanced in the
464 * zeroing of the last block, above), then we'll zero fill from the current EOF
465 * to where the write begins:
466 *
467 * NOTE: If (and ONLY if) the portion of the file about to be written is
468 * before the current EOF it might be marked as invalid now and must be
469 * made readable (removed from the invalid ranges) before cluster_write
470 * tries to write it:
471 */
472 io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
473 if (io_start < fp->ff_size) {
474 off_t io_end;
475
476 io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
477 rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
478 };
479
480 hfs_unlock(cp);
481 cnode_locked = 0;
482
483 /*
484 * We need to tell UBC the fork's new size BEFORE calling
485 * cluster_write, in case any of the new pages need to be
486 * paged out before cluster_write completes (which does happen
487 * in embedded systems due to extreme memory pressure).
488 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
489 * will be, so that it can pass that on to cluster_pageout, and
490 * allow those pageouts.
491 *
492 * We don't update ff_size yet since we don't want pageins to
493 * be able to see uninitialized data between the old and new
494 * EOF, until cluster_write has completed and initialized that
495 * part of the file.
496 *
497 * The vnode pager relies on the file size last given to UBC via
498 * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or
499 * ff_size (whichever is larger). NOTE: ff_new_size is always
500 * zero, unless we are extending the file via write.
501 */
502 if (filesize > fp->ff_size) {
503 fp->ff_new_size = filesize;
504 ubc_setsize(vp, filesize);
505 }
506 retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
507 tail_off, lflag | IO_NOZERODIRTY);
508 if (retval) {
509 fp->ff_new_size = 0; /* no longer extending; use ff_size */
510 if (filesize > origFileSize) {
511 ubc_setsize(vp, origFileSize);
512 }
513 goto ioerr_exit;
514 }
515
516 if (filesize > origFileSize) {
517 fp->ff_size = filesize;
518
519 /* Files that are changing size are not hot file candidates. */
520 if (hfsmp->hfc_stage == HFC_RECORDING) {
521 fp->ff_bytesread = 0;
522 }
523 }
524 fp->ff_new_size = 0; /* ff_size now has the correct size */
525
526 /* If we wrote some bytes, then touch the change and mod times */
527 if (resid > uio_resid(uio)) {
528 cp->c_touch_chgtime = TRUE;
529 cp->c_touch_modtime = TRUE;
530 }
531 }
532 if (partialwrite) {
533 uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
534 resid += bytesToAdd;
535 }
536
537 // XXXdbg - see radar 4871353 for more info
538 {
539 if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
540 VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
541 }
542 }
543 HFS_KNOTE(vp, NOTE_WRITE);
544
545 ioerr_exit:
546 /*
547 * If we successfully wrote any data, and we are not the superuser
548 * we clear the setuid and setgid bits as a precaution against
549 * tampering.
550 */
551 if (cp->c_mode & (S_ISUID | S_ISGID)) {
552 cred = vfs_context_ucred(ap->a_context);
553 if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
554 if (!cnode_locked) {
555 hfs_lock(cp, HFS_FORCE_LOCK);
556 cnode_locked = 1;
557 }
558 cp->c_mode &= ~(S_ISUID | S_ISGID);
559 }
560 }
561 if (retval) {
562 if (ioflag & IO_UNIT) {
563 if (!cnode_locked) {
564 hfs_lock(cp, HFS_FORCE_LOCK);
565 cnode_locked = 1;
566 }
567 (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
568 0, ap->a_context);
569 // LP64todo - fix this! resid needs to be user_ssize_t
570 uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
571 uio_setresid(uio, resid);
572 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
573 }
574 } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
575 if (!cnode_locked) {
576 hfs_lock(cp, HFS_FORCE_LOCK);
577 cnode_locked = 1;
578 }
579 retval = hfs_update(vp, TRUE);
580 }
581 /* Updating vcbWrCnt doesn't need to be atomic. */
582 hfsmp->vcbWrCnt++;
583
584 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
585 (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
586 exit:
587 if (cnode_locked)
588 hfs_unlock(cp);
589 hfs_unlock_truncate(cp, exclusive_lock);
590 return (retval);
591 }
592
593 /* support for the "bulk-access" fcntl */
594
595 #define CACHE_LEVELS 16
596 #define NUM_CACHE_ENTRIES (64*16)
597 #define PARENT_IDS_FLAG 0x100
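/*
 * CACHE_LEVELS bounds how many ancestor directories one access check will
 * remember per chain, NUM_CACHE_ENTRIES sizes the per-call access cache,
 * and PARENT_IDS_FLAG, when set in the request's flags word, means the
 * file_ids array holds parent directory ids rather than leaf ids (see the
 * check_leaf handling in do_bulk_access_check below).
 */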
598
599 struct access_cache {
600 int numcached;
601 int cachehits; /* these two for statistics gathering */
602 int lookups;
603 unsigned int *acache;
604 unsigned char *haveaccess;
605 };
606
607 struct access_t {
608 uid_t uid; /* IN: effective user id */
609 short flags; /* IN: access requested (i.e. R_OK) */
610 short num_groups; /* IN: number of groups user belongs to */
611 int num_files; /* IN: number of files to process */
612 int *file_ids; /* IN: array of file ids */
613 gid_t *groups; /* IN: array of groups */
614 short *access; /* OUT: access info for each file (0 for 'has access') */
615 };
616
617 struct user_access_t {
618 uid_t uid; /* IN: effective user id */
619 short flags; /* IN: access requested (i.e. R_OK) */
620 short num_groups; /* IN: number of groups user belongs to */
621 int num_files; /* IN: number of files to process */
622 user_addr_t file_ids; /* IN: array of file ids */
623 user_addr_t groups; /* IN: array of groups */
624 user_addr_t access; /* OUT: access info for each file (0 for 'has access') */
625 };
626
627
628 // these are the "extended" versions of the above structures
629 // note that it is crucial that they be a different size than
630 // the regular version
631 struct ext_access_t {
632 uint32_t flags; /* IN: access requested (i.e. R_OK) */
633 uint32_t num_files; /* IN: number of files to process */
634 uint32_t map_size; /* IN: size of the bit map */
635 uint32_t *file_ids; /* IN: Array of file ids */
636 char *bitmap; /* OUT: hash-bitmap of interesting directory ids */
637 short *access; /* OUT: access info for each file (0 for 'has access') */
638 uint32_t num_parents; /* future use */
639 cnid_t *parents; /* future use */
640 };
641
642 struct ext_user_access_t {
643 uint32_t flags; /* IN: access requested (i.e. R_OK) */
644 uint32_t num_files; /* IN: number of files to process */
645 uint32_t map_size; /* IN: size of the bit map */
646 user_addr_t file_ids; /* IN: array of file ids */
647 user_addr_t bitmap; /* IN: array of groups */
648 user_addr_t access; /* OUT: access info for each file (0 for 'has access') */
649 uint32_t num_parents;/* future use */
650 user_addr_t parents;/* future use */
651 };
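
/*
 * Illustrative sketch (not part of the original file): a user-space caller
 * would fill one of the structures above and hand it to the volume through
 * fsctl(2), which dispatches to do_bulk_access_check() via the
 * HFS_BULKACCESS_FSCTL / HFS_EXT_BULKACCESS_FSCTL cases in hfs_vnop_ioctl().
 * The volume path and catalog node ids below are hypothetical, and the exact
 * user-space selector/header is an assumption; consult hfs_fsctl.h.
 *
 *     struct access_t args;
 *     int ids[2] = { 1234, 5678 };      // hypothetical catalog node ids
 *     short results[2];
 *
 *     memset(&args, 0, sizeof(args));
 *     args.flags = R_OK;
 *     args.num_files = 2;
 *     args.file_ids = ids;
 *     args.access = results;
 *     if (fsctl("/Volumes/MyHFS", HFS_BULKACCESS_FSCTL, &args, 0) == 0) {
 *         // results[i] == 0 means access to file_ids[i] was granted
 *     }
 */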
652
653
654 /*
655 * Perform a binary search for the given parent_id. Return value is
656 * the index if there is a match. If no_match_indexp is non-NULL it
657 * will be assigned the index at which to insert the item (even if it
658 * was not found).
659 */
660 static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
661 {
662 int index=-1;
663 unsigned int lo=0;
664
665 do {
666 unsigned int mid = ((hi - lo)/2) + lo;
667 unsigned int this_id = array[mid];
668
669 if (parent_id == this_id) {
670 hi = mid;
671 break;
672 }
673
674 if (parent_id < this_id) {
675 hi = mid;
676 continue;
677 }
678
679 if (parent_id > this_id) {
680 lo = mid + 1;
681 continue;
682 }
683 } while(lo < hi);
684
685 /* check if lo and hi converged on the match */
686 if (parent_id == array[hi]) {
687 index = hi;
688 }
689
690 if (no_match_indexp) {
691 *no_match_indexp = hi;
692 }
693
694 return index;
695 }
696
697
698 static int
699 lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
700 {
701 unsigned int hi;
702 int matches = 0;
703 int index, no_match_index;
704
705 if (cache->numcached == 0) {
706 *indexp = 0;
707 return 0; // table is empty, so insert at index=0 and report no match
708 }
709
710 if (cache->numcached > NUM_CACHE_ENTRIES) {
711 /*printf("EGAD! numcached is %d... cut our losses and trim to %d\n",
712 cache->numcached, NUM_CACHE_ENTRIES);*/
713 cache->numcached = NUM_CACHE_ENTRIES;
714 }
715
716 hi = cache->numcached - 1;
717
718 index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
719
720 /* if no existing entry found, find index for new one */
721 if (index == -1) {
722 index = no_match_index;
723 matches = 0;
724 } else {
725 matches = 1;
726 }
727
728 *indexp = index;
729 return matches;
730 }
731
732 /*
733 * Add a node to the access_cache at the given index (or do a lookup first
734 * to find the index if -1 is passed in). We currently do a replace rather
735 * than an insert if the cache is full.
736 */
737 static void
738 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
739 {
740 int lookup_index = -1;
741
742 /* need to do a lookup first if -1 passed for index */
743 if (index == -1) {
744 if (lookup_bucket(cache, &lookup_index, nodeID)) {
745 if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
746 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
747 cache->haveaccess[lookup_index] = access;
748 }
749
750 /* mission accomplished */
751 return;
752 } else {
753 index = lookup_index;
754 }
755
756 }
757
758 /* if the cache is full, do a replace rather than an insert */
759 if (cache->numcached >= NUM_CACHE_ENTRIES) {
760 //printf("cache is full (%d). replace at index %d\n", cache->numcached, index);
761 cache->numcached = NUM_CACHE_ENTRIES-1;
762
763 if (index > cache->numcached) {
764 // printf("index %d pinned to %d\n", index, cache->numcached);
765 index = cache->numcached;
766 }
767 }
768
769 if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
770 index++;
771 }
772
773 if (index >= 0 && index < cache->numcached) {
774 /* only do bcopy if we're inserting */
775 bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
776 bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
777 }
778
779 cache->acache[index] = nodeID;
780 cache->haveaccess[index] = access;
781 cache->numcached++;
782 }
783
784
785 struct cinfo {
786 uid_t uid;
787 gid_t gid;
788 mode_t mode;
789 cnid_t parentcnid;
790 u_int16_t recflags;
791 };
792
793 static int
794 snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
795 {
796 struct cinfo *cip = (struct cinfo *)arg;
797
798 cip->uid = attrp->ca_uid;
799 cip->gid = attrp->ca_gid;
800 cip->mode = attrp->ca_mode;
801 cip->parentcnid = descp->cd_parentcnid;
802 cip->recflags = attrp->ca_recflags;
803
804 return (0);
805 }
806
807 /*
808 * Look up the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
809 * isn't incore, then go to the catalog.
810 */
811 static int
812 do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, dev_t dev, cnid_t cnid,
813 struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
814 {
815 int error = 0;
816
817 /* if this id matches the one the fsctl was called with, skip the lookup */
818 if (cnid == skip_cp->c_cnid) {
819 cnattrp->ca_uid = skip_cp->c_uid;
820 cnattrp->ca_gid = skip_cp->c_gid;
821 cnattrp->ca_mode = skip_cp->c_mode;
822 keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
823 } else {
824 struct cinfo c_info;
825
826 /* otherwise, check the cnode hash in case the file/dir is incore */
827 if (hfs_chash_snoop(dev, cnid, snoop_callback, &c_info) == 0) {
828 cnattrp->ca_uid = c_info.uid;
829 cnattrp->ca_gid = c_info.gid;
830 cnattrp->ca_mode = c_info.mode;
831 cnattrp->ca_recflags = c_info.recflags;
832 keyp->hfsPlus.parentID = c_info.parentcnid;
833 } else {
834 int lockflags;
835
836 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
837
838 /* lookup this cnid in the catalog */
839 error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
840
841 hfs_systemfile_unlock(hfsmp, lockflags);
842
843 cache->lookups++;
844 }
845 }
846
847 return (error);
848 }
849
850
851 /*
852 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
853 * up to CACHE_LEVELS as we progress towards the root.
854 */
855 static int
856 do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
857 struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred, dev_t dev,
858 struct vfs_context *my_context,
859 char *bitmap,
860 uint32_t map_size,
861 cnid_t* parents,
862 uint32_t num_parents)
863 {
864 int myErr = 0;
865 int myResult;
866 HFSCatalogNodeID thisNodeID;
867 unsigned int myPerms;
868 struct cat_attr cnattr;
869 int cache_index = -1, scope_index = -1, scope_idx_start = -1;
870 CatalogKey catkey;
871
872 int i = 0, ids_to_cache = 0;
873 int parent_ids[CACHE_LEVELS];
874
875 thisNodeID = nodeID;
876 while (thisNodeID >= kRootDirID) {
877 myResult = 0; /* default to "no access" */
878
879 /* check the cache before resorting to hitting the catalog */
880
881 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
882 * to look any further after hitting cached dir */
883
884 if (lookup_bucket(cache, &cache_index, thisNodeID)) {
885 cache->cachehits++;
886 myErr = cache->haveaccess[cache_index];
887 if (scope_index != -1) {
888 if (myErr == ESRCH) {
889 myErr = 0;
890 }
891 } else {
892 scope_index = 0; // so we'll just use the cache result
893 scope_idx_start = ids_to_cache;
894 }
895 myResult = (myErr == 0) ? 1 : 0;
896 goto ExitThisRoutine;
897 }
898
899
900 if (parents) {
901 int tmp;
902 tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
903 if (scope_index == -1)
904 scope_index = tmp;
905 if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
906 scope_idx_start = ids_to_cache;
907 }
908 }
909
910 /* remember which parents we want to cache */
911 if (ids_to_cache < CACHE_LEVELS) {
912 parent_ids[ids_to_cache] = thisNodeID;
913 ids_to_cache++;
914 }
915 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
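// i.e. directory id N sets bit (N & 7) in byte ((N / 8) % map_size) of the caller-supplied bitmap.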
916 if (bitmap && map_size) {
917 bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
918 }
919
920
921 /* do the lookup (checks the cnode hash, then the catalog) */
922 myErr = do_attr_lookup(hfsmp, cache, dev, thisNodeID, skip_cp, &catkey, &cnattr);
923 if (myErr) {
924 goto ExitThisRoutine; /* no access */
925 }
926
927 /* Root always gets access. */
928 if (suser(myp_ucred, NULL) == 0) {
929 thisNodeID = catkey.hfsPlus.parentID;
930 myResult = 1;
931 continue;
932 }
933
934 // if the thing has acl's, do the full permission check
935 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
936 struct vnode *vp;
937
938 /* get the vnode for this cnid */
939 myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0);
940 if ( myErr ) {
941 myResult = 0;
942 goto ExitThisRoutine;
943 }
944
945 thisNodeID = VTOC(vp)->c_parentcnid;
946
947 hfs_unlock(VTOC(vp));
948
949 if (vnode_vtype(vp) == VDIR) {
950 myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
951 } else {
952 myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
953 }
954
955 vnode_put(vp);
956 if (myErr) {
957 myResult = 0;
958 goto ExitThisRoutine;
959 }
960 } else {
961 unsigned int flags;
962
963 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
964 cnattr.ca_mode, hfsmp->hfs_mp,
965 myp_ucred, theProcPtr);
966
967 if (cnattr.ca_mode & S_IFDIR) {
968 flags = R_OK | X_OK;
969 } else {
970 flags = R_OK;
971 }
972 if ( (myPerms & flags) != flags) {
973 myResult = 0;
974 myErr = EACCES;
975 goto ExitThisRoutine; /* no access */
976 }
977
978 /* up the hierarchy we go */
979 thisNodeID = catkey.hfsPlus.parentID;
980 }
981 }
982
983 /* if here, we have access to this node */
984 myResult = 1;
985
986 ExitThisRoutine:
987 if (parents && myErr == 0 && scope_index == -1) {
988 myErr = ESRCH;
989 }
990
991 if (myErr) {
992 myResult = 0;
993 }
994 *err = myErr;
995
996 /* cache the parent directory(ies) */
997 for (i = 0; i < ids_to_cache; i++) {
998 if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
999 add_node(cache, -1, parent_ids[i], ESRCH);
1000 } else {
1001 add_node(cache, -1, parent_ids[i], myErr);
1002 }
1003 }
1004
1005 return (myResult);
1006 }
1007
1008 static int
1009 do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
1010 struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
1011 {
1012 boolean_t is64bit;
1013
1014 /*
1015 * NOTE: on entry, the vnode is locked. In case this vnode
1016 * happens to be in our list of file_ids, we'll note it so we
1017 * avoid calling hfs_chashget_nowait() on that id as that
1018 * will cause a "locking against myself" panic.
1019 */
1020 Boolean check_leaf = true;
1021
1022 struct ext_user_access_t *user_access_structp;
1023 struct ext_user_access_t tmp_user_access;
1024 struct access_cache cache;
1025
1026 int error = 0;
1027 unsigned int i;
1028
1029 dev_t dev = VTOC(vp)->c_dev;
1030
1031 short flags;
1032 unsigned int num_files = 0;
1033 int map_size = 0;
1034 int num_parents = 0;
1035 int *file_ids=NULL;
1036 short *access=NULL;
1037 char *bitmap=NULL;
1038 cnid_t *parents=NULL;
1039 int leaf_index;
1040
1041 cnid_t cnid;
1042 cnid_t prevParent_cnid = 0;
1043 unsigned int myPerms;
1044 short myaccess = 0;
1045 struct cat_attr cnattr;
1046 CatalogKey catkey;
1047 struct cnode *skip_cp = VTOC(vp);
1048 kauth_cred_t cred = vfs_context_ucred(context);
1049 proc_t p = vfs_context_proc(context);
1050
1051 is64bit = proc_is64bit(p);
1052
1053 /* initialize the local cache and buffers */
1054 cache.numcached = 0;
1055 cache.cachehits = 0;
1056 cache.lookups = 0;
1057 cache.acache = NULL;
1058 cache.haveaccess = NULL;
1059
1060 /* struct copyin done during dispatch... need to copy file_id array separately */
1061 if (ap->a_data == NULL) {
1062 error = EINVAL;
1063 goto err_exit_bulk_access;
1064 }
1065
1066 if (is64bit) {
1067 if (arg_size != sizeof(struct ext_user_access_t)) {
1068 error = EINVAL;
1069 goto err_exit_bulk_access;
1070 }
1071
1072 user_access_structp = (struct ext_user_access_t *)ap->a_data;
1073
1074 } else if (arg_size == sizeof(struct access_t)) {
1075 struct access_t *accessp = (struct access_t *)ap->a_data;
1076
1077 // convert an old style bulk-access struct to the new style
1078 tmp_user_access.flags = accessp->flags;
1079 tmp_user_access.num_files = accessp->num_files;
1080 tmp_user_access.map_size = 0;
1081 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1082 tmp_user_access.bitmap = USER_ADDR_NULL;
1083 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1084 tmp_user_access.num_parents = 0;
1085 user_access_structp = &tmp_user_access;
1086
1087 } else if (arg_size == sizeof(struct ext_access_t)) {
1088 struct ext_access_t *accessp = (struct ext_access_t *)ap->a_data;
1089
1090 // up-cast from a 32-bit version of the struct
1091 tmp_user_access.flags = accessp->flags;
1092 tmp_user_access.num_files = accessp->num_files;
1093 tmp_user_access.map_size = accessp->map_size;
1094 tmp_user_access.num_parents = accessp->num_parents;
1095
1096 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1097 tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
1098 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1099 tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);
1100
1101 user_access_structp = &tmp_user_access;
1102 } else {
1103 error = EINVAL;
1104 goto err_exit_bulk_access;
1105 }
1106
1107 map_size = user_access_structp->map_size;
1108
1109 num_files = user_access_structp->num_files;
1110
1111 num_parents= user_access_structp->num_parents;
1112
1113 if (num_files < 1) {
1114 goto err_exit_bulk_access;
1115 }
1116 if (num_files > 1024) {
1117 error = EINVAL;
1118 goto err_exit_bulk_access;
1119 }
1120
1121 if (num_parents > 1024) {
1122 error = EINVAL;
1123 goto err_exit_bulk_access;
1124 }
1125
1126 file_ids = (int *) kalloc(sizeof(int) * num_files);
1127 access = (short *) kalloc(sizeof(short) * num_files);
1128 if (map_size) {
1129 bitmap = (char *) kalloc(sizeof(char) * map_size);
1130 }
1131
1132 if (num_parents) {
1133 parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
1134 }
1135
1136 cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
1137 cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1138
1139 if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
1140 if (file_ids) {
1141 kfree(file_ids, sizeof(int) * num_files);
1142 }
1143 if (bitmap) {
1144 kfree(bitmap, sizeof(char) * map_size);
1145 }
1146 if (access) {
1147 kfree(access, sizeof(short) * num_files);
1148 }
1149 if (cache.acache) {
1150 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1151 }
1152 if (cache.haveaccess) {
1153 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1154 }
1155 if (parents) {
1156 kfree(parents, sizeof(cnid_t) * num_parents);
1157 }
1158 return ENOMEM;
1159 }
1160
1161 // make sure the bitmap is zeroed out...
1162 if (bitmap) {
1163 bzero(bitmap, (sizeof(char) * map_size));
1164 }
1165
1166 if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1167 num_files * sizeof(int)))) {
1168 goto err_exit_bulk_access;
1169 }
1170
1171 if (num_parents) {
1172 if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1173 num_parents * sizeof(cnid_t)))) {
1174 goto err_exit_bulk_access;
1175 }
1176 }
1177
1178 flags = user_access_structp->flags;
1179 if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1180 flags = R_OK;
1181 }
1182
1183 /* check if we've been passed leaf node ids or parent ids */
1184 if (flags & PARENT_IDS_FLAG) {
1185 check_leaf = false;
1186 }
1187
1188 /* Check access to each file_id passed in */
1189 for (i = 0; i < num_files; i++) {
1190 leaf_index=-1;
1191 cnid = (cnid_t) file_ids[i];
1192
1193 /* root always has access */
1194 if ((!parents) && (!suser(cred, NULL))) {
1195 access[i] = 0;
1196 continue;
1197 }
1198
1199 if (check_leaf) {
1200 /* do the lookup (checks the cnode hash, then the catalog) */
1201 error = do_attr_lookup(hfsmp, &cache, dev, cnid, skip_cp, &catkey, &cnattr);
1202 if (error) {
1203 access[i] = (short) error;
1204 continue;
1205 }
1206
1207 if (parents) {
1208 // Check if the leaf matches one of the parent scopes
1209 leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1210 }
1211
1212 // if the thing has acl's, do the full permission check
1213 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1214 struct vnode *cvp;
1215 int myErr = 0;
1216 /* get the vnode for this cnid */
1217 myErr = hfs_vget(hfsmp, cnid, &cvp, 0);
1218 if ( myErr ) {
1219 access[i] = myErr;
1220 continue;
1221 }
1222
1223 hfs_unlock(VTOC(cvp));
1224
1225 if (vnode_vtype(cvp) == VDIR) {
1226 myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1227 } else {
1228 myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1229 }
1230
1231 vnode_put(cvp);
1232 if (myErr) {
1233 access[i] = myErr;
1234 continue;
1235 }
1236 } else {
1237 /* before calling CheckAccess(), check the target file for read access */
1238 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1239 cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1240
1241 /* fail fast if no access */
1242 if ((myPerms & flags) == 0) {
1243 access[i] = EACCES;
1244 continue;
1245 }
1246 }
1247 } else {
1248 /* we were passed an array of parent ids */
1249 catkey.hfsPlus.parentID = cnid;
1250 }
1251
1252 /* if the last guy had the same parent and had access, we're done */
1253 if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0) {
1254 cache.cachehits++;
1255 access[i] = 0;
1256 continue;
1257 }
1258
1259 myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1260 skip_cp, p, cred, dev, context,bitmap, map_size, parents, num_parents);
1261
1262 if (myaccess || (error == ESRCH && leaf_index != -1)) {
1263 access[i] = 0; // have access.. no errors to report
1264 } else {
1265 access[i] = (error != 0 ? (short) error : EACCES);
1266 }
1267
1268 prevParent_cnid = catkey.hfsPlus.parentID;
1269 }
1270
1271 /* copyout the access array */
1272 if ((error = copyout((caddr_t)access, user_access_structp->access,
1273 num_files * sizeof (short)))) {
1274 goto err_exit_bulk_access;
1275 }
1276 if (map_size && bitmap) {
1277 if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1278 map_size * sizeof (char)))) {
1279 goto err_exit_bulk_access;
1280 }
1281 }
1282
1283
1284 err_exit_bulk_access:
1285
1286 //printf("on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
1287
1288 if (file_ids)
1289 kfree(file_ids, sizeof(int) * num_files);
1290 if (parents)
1291 kfree(parents, sizeof(cnid_t) * num_parents);
1292 if (bitmap)
1293 kfree(bitmap, sizeof(char) * map_size);
1294 if (access)
1295 kfree(access, sizeof(short) * num_files);
1296 if (cache.acache)
1297 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1298 if (cache.haveaccess)
1299 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1300
1301 return (error);
1302 }
1303
1304
1305 /* end "bulk-access" support */
1306
1307
1308 /*
1309 * Callback for use with freeze ioctl.
1310 */
1311 static int
1312 hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
1313 {
1314 vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
1315
1316 return 0;
1317 }
1318
1319 /*
1320 * Control filesystem operating characteristics.
1321 */
1322 int
1323 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1324 vnode_t a_vp;
1325 int a_command;
1326 caddr_t a_data;
1327 int a_fflag;
1328 vfs_context_t a_context;
1329 } */ *ap)
1330 {
1331 struct vnode * vp = ap->a_vp;
1332 struct hfsmount *hfsmp = VTOHFS(vp);
1333 vfs_context_t context = ap->a_context;
1334 kauth_cred_t cred = vfs_context_ucred(context);
1335 proc_t p = vfs_context_proc(context);
1336 struct vfsstatfs *vfsp;
1337 boolean_t is64bit;
1338
1339 is64bit = proc_is64bit(p);
1340
1341 switch (ap->a_command) {
1342
1343 case HFS_GETPATH:
1344 {
1345 struct vnode *file_vp;
1346 cnid_t cnid;
1347 int outlen;
1348 char *bufptr;
1349 int error;
1350
1351 /* Caller must be owner of file system. */
1352 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1353 if (suser(cred, NULL) &&
1354 kauth_cred_getuid(cred) != vfsp->f_owner) {
1355 return (EACCES);
1356 }
1357 /* Target vnode must be file system's root. */
1358 if (!vnode_isvroot(vp)) {
1359 return (EINVAL);
1360 }
1361 bufptr = (char *)ap->a_data;
1362 cnid = strtoul(bufptr, NULL, 10);
1363
1364 /* We need to call hfs_vfs_vget to leverage the code that will fix the
1365 * origin list for us if needed, as opposed to calling hfs_vget, since
1366 * we will need it for the subsequent build_path call.
1367 */
1368 if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1369 return (error);
1370 }
1371 error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context);
1372 vnode_put(file_vp);
1373
1374 return (error);
1375 }
1376
1377 case HFS_PREV_LINK:
1378 case HFS_NEXT_LINK:
1379 {
1380 cnid_t linkfileid;
1381 cnid_t nextlinkid;
1382 cnid_t prevlinkid;
1383 int error;
1384
1385 /* Caller must be owner of file system. */
1386 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1387 if (suser(cred, NULL) &&
1388 kauth_cred_getuid(cred) != vfsp->f_owner) {
1389 return (EACCES);
1390 }
1391 /* Target vnode must be file system's root. */
1392 if (!vnode_isvroot(vp)) {
1393 return (EINVAL);
1394 }
1395 linkfileid = *(cnid_t *)ap->a_data;
1396 if (linkfileid < kHFSFirstUserCatalogNodeID) {
1397 return (EINVAL);
1398 }
1399 if ((error = hfs_lookuplink(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1400 return (error);
1401 }
1402 if (ap->a_command == HFS_NEXT_LINK) {
1403 *(cnid_t *)ap->a_data = nextlinkid;
1404 } else {
1405 *(cnid_t *)ap->a_data = prevlinkid;
1406 }
1407 return (0);
1408 }
1409
1410 case HFS_RESIZE_PROGRESS: {
1411
1412 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1413 if (suser(cred, NULL) &&
1414 kauth_cred_getuid(cred) != vfsp->f_owner) {
1415 return (EACCES); /* must be owner of file system */
1416 }
1417 if (!vnode_isvroot(vp)) {
1418 return (EINVAL);
1419 }
1420 return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1421 }
1422
1423 case HFS_RESIZE_VOLUME: {
1424 u_int64_t newsize;
1425 u_int64_t cursize;
1426
1427 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1428 if (suser(cred, NULL) &&
1429 kauth_cred_getuid(cred) != vfsp->f_owner) {
1430 return (EACCES); /* must be owner of file system */
1431 }
1432 if (!vnode_isvroot(vp)) {
1433 return (EINVAL);
1434 }
1435 newsize = *(u_int64_t *)ap->a_data;
1436 cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1437
1438 if (newsize > cursize) {
1439 return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1440 } else if (newsize < cursize) {
1441 return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1442 } else {
1443 return (0);
1444 }
1445 }
1446 case HFS_CHANGE_NEXT_ALLOCATION: {
1447 int error = 0; /* Assume success */
1448 u_int32_t location;
1449
1450 if (vnode_vfsisrdonly(vp)) {
1451 return (EROFS);
1452 }
1453 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1454 if (suser(cred, NULL) &&
1455 kauth_cred_getuid(cred) != vfsp->f_owner) {
1456 return (EACCES); /* must be owner of file system */
1457 }
1458 if (!vnode_isvroot(vp)) {
1459 return (EINVAL);
1460 }
1461 HFS_MOUNT_LOCK(hfsmp, TRUE);
1462 location = *(u_int32_t *)ap->a_data;
1463 if ((location >= hfsmp->allocLimit) &&
1464 (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1465 error = EINVAL;
1466 goto fail_change_next_allocation;
1467 }
1468 /* Return previous value. */
1469 *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1470 if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1471 /* On magic value for location, set nextAllocation to next block
1472 * after metadata zone and set flag in mount structure to indicate
1473 * that nextAllocation should not be updated again.
1474 */
1475 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1476 hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1477 } else {
1478 hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1479 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1480 }
1481 MarkVCBDirty(hfsmp);
1482 fail_change_next_allocation:
1483 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
1484 return (error);
1485 }
1486
1487 #ifdef HFS_SPARSE_DEV
1488 case HFS_SETBACKINGSTOREINFO: {
1489 struct vnode * bsfs_rootvp;
1490 struct vnode * di_vp;
1491 struct hfs_backingstoreinfo *bsdata;
1492 int error = 0;
1493
1494 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1495 return (EALREADY);
1496 }
1497 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1498 if (suser(cred, NULL) &&
1499 kauth_cred_getuid(cred) != vfsp->f_owner) {
1500 return (EACCES); /* must be owner of file system */
1501 }
1502 bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1503 if (bsdata == NULL) {
1504 return (EINVAL);
1505 }
1506 if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1507 return (error);
1508 }
1509 if ((error = vnode_getwithref(di_vp))) {
1510 file_drop(bsdata->backingfd);
1511 return(error);
1512 }
1513
1514 if (vnode_mount(vp) == vnode_mount(di_vp)) {
1515 (void)vnode_put(di_vp);
1516 file_drop(bsdata->backingfd);
1517 return (EINVAL);
1518 }
1519
1520 /*
1521 * Obtain the backing fs root vnode and keep a reference
1522 * on it. This reference will be dropped in hfs_unmount.
1523 */
1524 error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
1525 if (error) {
1526 (void)vnode_put(di_vp);
1527 file_drop(bsdata->backingfd);
1528 return (error);
1529 }
1530 vnode_ref(bsfs_rootvp);
1531 vnode_put(bsfs_rootvp);
1532
1533 hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
1534 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1535 hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
1536 hfsmp->hfs_sparsebandblks *= 4;
1537
1538 vfs_markdependency(hfsmp->hfs_mp);
1539
1540 (void)vnode_put(di_vp);
1541 file_drop(bsdata->backingfd);
1542 return (0);
1543 }
1544 case HFS_CLRBACKINGSTOREINFO: {
1545 struct vnode * tmpvp;
1546
1547 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1548 if (suser(cred, NULL) &&
1549 kauth_cred_getuid(cred) != vfsp->f_owner) {
1550 return (EACCES); /* must be owner of file system */
1551 }
1552 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
1553 hfsmp->hfs_backingfs_rootvp) {
1554
1555 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
1556 tmpvp = hfsmp->hfs_backingfs_rootvp;
1557 hfsmp->hfs_backingfs_rootvp = NULLVP;
1558 hfsmp->hfs_sparsebandblks = 0;
1559 vnode_rele(tmpvp);
1560 }
1561 return (0);
1562 }
1563 #endif /* HFS_SPARSE_DEV */
1564
1565 case F_FREEZE_FS: {
1566 struct mount *mp;
1567
1568 if (!is_suser())
1569 return (EACCES);
1570
1571 mp = vnode_mount(vp);
1572 hfsmp = VFSTOHFS(mp);
1573
1574 if (!(hfsmp->jnl))
1575 return (ENOTSUP);
1576
1577 lck_rw_lock_exclusive(&hfsmp->hfs_insync);
1578
1579 // flush things before we get started to try and prevent
1580 // dirty data from being paged out while we're frozen.
1581 // note: can't do this after taking the lock as it will
1582 // deadlock against ourselves.
1583 vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
1584 hfs_global_exclusive_lock_acquire(hfsmp);
1585 journal_flush(hfsmp->jnl);
1586
1587 // don't need to iterate on all vnodes, we just need to
1588 // wait for writes to the system files and the device vnode
1589 if (HFSTOVCB(hfsmp)->extentsRefNum)
1590 vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
1591 if (HFSTOVCB(hfsmp)->catalogRefNum)
1592 vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
1593 if (HFSTOVCB(hfsmp)->allocationsRefNum)
1594 vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
1595 if (hfsmp->hfs_attribute_vp)
1596 vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
1597 vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");
1598
1599 hfsmp->hfs_freezing_proc = current_proc();
1600
1601 return (0);
1602 }
1603
1604 case F_THAW_FS: {
1605 if (!is_suser())
1606 return (EACCES);
1607
1608 // if we're not the one who froze the fs then we
1609 // can't thaw it.
1610 if (hfsmp->hfs_freezing_proc != current_proc()) {
1611 return EPERM;
1612 }
1613
1614 // NOTE: if you add code here, also go check the
1615 // code that "thaws" the fs in hfs_vnop_close()
1616 //
1617 hfsmp->hfs_freezing_proc = NULL;
1618 hfs_global_exclusive_lock_release(hfsmp);
1619 lck_rw_unlock_exclusive(&hfsmp->hfs_insync);
1620
1621 return (0);
1622 }
1623
1624 case HFS_BULKACCESS_FSCTL: {
1625 int size;
1626
1627 if (hfsmp->hfs_flags & HFS_STANDARD) {
1628 return EINVAL;
1629 }
1630
1631 if (is64bit) {
1632 size = sizeof(struct user_access_t);
1633 } else {
1634 size = sizeof(struct access_t);
1635 }
1636
1637 return do_bulk_access_check(hfsmp, vp, ap, size, context);
1638 }
1639
1640 case HFS_EXT_BULKACCESS_FSCTL: {
1641 int size;
1642
1643 if (hfsmp->hfs_flags & HFS_STANDARD) {
1644 return EINVAL;
1645 }
1646
1647 if (is64bit) {
1648 size = sizeof(struct ext_user_access_t);
1649 } else {
1650 size = sizeof(struct ext_access_t);
1651 }
1652
1653 return do_bulk_access_check(hfsmp, vp, ap, size, context);
1654 }
1655
1656 case HFS_SETACLSTATE: {
1657 int state;
1658
1659 if (ap->a_data == NULL) {
1660 return (EINVAL);
1661 }
1662
1663 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1664 state = *(int *)ap->a_data;
1665
1666 // super-user can enable or disable acl's on a volume.
1667 // the volume owner can only enable acl's
1668 if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) {
1669 return (EPERM);
1670 }
1671 if (state == 0 || state == 1)
1672 return hfs_set_volxattr(hfsmp, HFS_SETACLSTATE, state);
1673 else
1674 return (EINVAL);
1675 }
1676
1677 case HFS_SET_XATTREXTENTS_STATE: {
1678 int state;
1679
1680 if (ap->a_data == NULL) {
1681 return (EINVAL);
1682 }
1683
1684 state = *(int *)ap->a_data;
1685
1686 /* Super-user can enable or disable extent-based extended
1687 * attribute support on a volume
1688 */
1689 if (!is_suser()) {
1690 return (EPERM);
1691 }
1692 if (state == 0 || state == 1)
1693 return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
1694 else
1695 return (EINVAL);
1696 }
1697
1698 case F_FULLFSYNC: {
1699 int error;
1700
1701 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
1702 if (error == 0) {
1703 error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
1704 hfs_unlock(VTOC(vp));
1705 }
1706
1707 return error;
1708 }
1709
1710 case F_CHKCLEAN: {
1711 register struct cnode *cp;
1712 int error;
1713
1714 if (!vnode_isreg(vp))
1715 return EINVAL;
1716
1717 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
1718 if (error == 0) {
1719 cp = VTOC(vp);
1720 /*
1721 * used by a regression test to determine if
1722 * all the dirty pages (via write) have been cleaned
1723 * after a call to 'fsync'.
1724 */
1725 error = is_file_clean(vp, VTOF(vp)->ff_size);
1726 hfs_unlock(cp);
1727 }
1728 return (error);
1729 }
1730
1731 case F_RDADVISE: {
1732 register struct radvisory *ra;
1733 struct filefork *fp;
1734 int error;
1735
1736 if (!vnode_isreg(vp))
1737 return EINVAL;
1738
1739 ra = (struct radvisory *)(ap->a_data);
1740 fp = VTOF(vp);
1741
1742 /* Protect against a size change. */
1743 hfs_lock_truncate(VTOC(vp), TRUE);
1744
1745 if (ra->ra_offset >= fp->ff_size) {
1746 error = EFBIG;
1747 } else {
1748 error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
1749 }
1750
1751 hfs_unlock_truncate(VTOC(vp), TRUE);
1752 return (error);
1753 }
1754
1755 case F_READBOOTSTRAP:
1756 case F_WRITEBOOTSTRAP:
1757 {
1758 struct vnode *devvp = NULL;
1759 user_fbootstraptransfer_t *user_bootstrapp;
1760 int devBlockSize;
1761 int error;
1762 uio_t auio;
1763 daddr64_t blockNumber;
1764 u_long blockOffset;
1765 u_long xfersize;
1766 struct buf *bp;
1767 user_fbootstraptransfer_t user_bootstrap;
1768
1769 if (!vnode_isvroot(vp))
1770 return (EINVAL);
1771 /* LP64 - when the caller is a 64-bit process we are passed a pointer
1772 * to a user_fbootstraptransfer_t; otherwise we get a pointer to a
1773 * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t
1774 */
1775 if (is64bit) {
1776 user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data;
1777 }
1778 else {
1779 fbootstraptransfer_t *bootstrapp = (fbootstraptransfer_t *)ap->a_data;
1780 user_bootstrapp = &user_bootstrap;
1781 user_bootstrap.fbt_offset = bootstrapp->fbt_offset;
1782 user_bootstrap.fbt_length = bootstrapp->fbt_length;
1783 user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer);
1784 }
1785 if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024)
1786 return EINVAL;
1787
1788 devvp = VTOHFS(vp)->hfs_devvp;
1789 auio = uio_create(1, user_bootstrapp->fbt_offset,
1790 is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32,
1791 (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ);
1792 uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length);
1793
1794 devBlockSize = vfs_devblocksize(vnode_mount(vp));
1795
1796 while (uio_resid(auio) > 0) {
1797 blockNumber = uio_offset(auio) / devBlockSize;
1798 error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp);
1799 if (error) {
1800 if (bp) buf_brelse(bp);
1801 uio_free(auio);
1802 return error;
1803 };
1804
1805 blockOffset = uio_offset(auio) % devBlockSize;
1806 xfersize = devBlockSize - blockOffset;
1807 error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio);
1808 if (error) {
1809 buf_brelse(bp);
1810 uio_free(auio);
1811 return error;
1812 };
1813 if (uio_rw(auio) == UIO_WRITE) {
1814 error = VNOP_BWRITE(bp);
1815 if (error) {
1816 uio_free(auio);
1817 return error;
1818 }
1819 } else {
1820 buf_brelse(bp);
1821 };
1822 };
1823 uio_free(auio);
1824 };
1825 return 0;
1826
1827 case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */
1828 {
1829 if (is64bit) {
1830 *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
1831 }
1832 else {
1833 *(time_t *)(ap->a_data) = to_bsd_time(VTOVCB(vp)->localCreateDate);
1834 }
1835 return 0;
1836 }
1837
1838 case HFS_GET_MOUNT_TIME:
1839 if (is64bit) {
1840 *(user_time_t *)(ap->a_data) = (user_time_t) hfsmp->hfs_mount_time;
1841 } else {
1842 *(time_t *)(ap->a_data) = (time_t) hfsmp->hfs_mount_time;
1843 }
1844 return 0;
1845
1846 case HFS_GET_LAST_MTIME:
1847 if (is64bit) {
1848 *(user_time_t *)(ap->a_data) = (user_time_t) hfsmp->hfs_last_mounted_mtime;
1849 } else {
1850 *(time_t *)(ap->a_data) = (time_t) hfsmp->hfs_last_mounted_mtime;
1851 }
1852 return 0;
1853
1854 case HFS_SET_BOOT_INFO:
1855 if (!vnode_isvroot(vp))
1856 return(EINVAL);
1857 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
1858 return(EACCES); /* must be superuser or owner of filesystem */
1859 HFS_MOUNT_LOCK(hfsmp, TRUE);
1860 bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
1861 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
1862 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
1863 break;
1864
1865 case HFS_GET_BOOT_INFO:
1866 if (!vnode_isvroot(vp))
1867 return(EINVAL);
1868 HFS_MOUNT_LOCK(hfsmp, TRUE);
1869 bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
1870 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
1871 break;
1872
1873 case HFS_MARK_BOOT_CORRUPT:
1874 /* Mark the boot volume corrupt by setting
1875 * kHFSVolumeInconsistentBit in the volume header. This will
1876 * force fsck_hfs on next mount.
1877 */
1878 if (!is_suser()) {
1879 return EACCES;
1880 }
1881
1882 /* Allowed only on the root vnode of the boot volume */
1883 if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
1884 !vnode_isvroot(vp)) {
1885 return EINVAL;
1886 }
1887
1888 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
1889 hfs_mark_volume_inconsistent(hfsmp);
1890 break;
1891
1892 default:
1893 return (ENOTTY);
1894 }
1895
1896 /* Should never get here */
1897 return 0;
1898 }
1899
1900 /*
1901 * select
1902 */
1903 int
1904 hfs_vnop_select(__unused struct vnop_select_args *ap)
1905 /*
1906 struct vnop_select_args {
1907 vnode_t a_vp;
1908 int a_which;
1909 int a_fflags;
1910 void *a_wql;
1911 vfs_context_t a_context;
1912 };
1913 */
1914 {
1915 /*
1916 * We should really check to see if I/O is possible.
1917 */
1918 return (1);
1919 }
1920
1921 /*
1922 * Converts a logical block number to a physical block, and optionally returns
1923 * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
1924 * The physical block number is based on the device block size, currently 512 bytes.
1925 * The block run is returned in logical blocks, and is the REMAINING number of blocks in the run.
1926 */
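/*
 * For example, assuming a 4 KiB logical block size: logical block 10 is
 * byte offset 40960, and if MapFileBlockC() reports 32768 contiguous
 * bytes available there, *runp is set to (32768 / 4096) - 1 = 7, the
 * number of additional logical blocks following the one just mapped.
 */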
1927 int
1928 hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
1929 {
1930 struct filefork *fp = VTOF(vp);
1931 struct hfsmount *hfsmp = VTOHFS(vp);
1932 int retval = E_NONE;
1933 u_int32_t logBlockSize;
1934 size_t bytesContAvail = 0;
1935 off_t blockposition;
1936 int lockExtBtree;
1937 int lockflags = 0;
1938
1939 /*
1940 * Check for underlying vnode requests and ensure that logical
1941 * to physical mapping is requested.
1942 */
1943 if (vpp != NULL)
1944 *vpp = hfsmp->hfs_devvp;
1945 if (bnp == NULL)
1946 return (0);
1947
1948 logBlockSize = GetLogicalBlockSize(vp);
1949 blockposition = (off_t)bn * logBlockSize;
1950
1951 lockExtBtree = overflow_extents(fp);
1952
1953 if (lockExtBtree)
1954 lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
1955
1956 retval = MacToVFSError(
1957 MapFileBlockC (HFSTOVCB(hfsmp),
1958 (FCB*)fp,
1959 MAXPHYSIO,
1960 blockposition,
1961 bnp,
1962 &bytesContAvail));
1963
1964 if (lockExtBtree)
1965 hfs_systemfile_unlock(hfsmp, lockflags);
1966
1967 if (retval == E_NONE) {
1968 /* Figure out how many read ahead blocks there are */
1969 if (runp != NULL) {
1970 if (can_cluster(logBlockSize)) {
1971 /* Make sure this result never goes negative: */
1972 *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
1973 } else {
1974 *runp = 0;
1975 }
1976 }
1977 }
1978 return (retval);
1979 }
1980
1981 /*
1982 * Convert logical block number to file offset.
1983 */
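/*
 * For example, with a 4 KiB logical block size, logical block 3 maps to
 * file offset 3 * 4096 = 12288; hfs_vnop_offtoblk() below performs the
 * inverse division.
 */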
1984 int
1985 hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
1986 /*
1987 struct vnop_blktooff_args {
1988 vnode_t a_vp;
1989 daddr64_t a_lblkno;
1990 off_t *a_offset;
1991 };
1992 */
1993 {
1994 if (ap->a_vp == NULL)
1995 return (EINVAL);
1996 *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
1997
1998 return(0);
1999 }
2000
2001 /*
2002 * Convert file offset to logical block number.
2003 */
2004 int
2005 hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
2006 /*
2007 struct vnop_offtoblk_args {
2008 vnode_t a_vp;
2009 off_t a_offset;
2010 daddr64_t *a_lblkno;
2011 };
2012 */
2013 {
2014 if (ap->a_vp == NULL)
2015 return (EINVAL);
2016 *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
2017
2018 return(0);
2019 }
2020
2021 /*
2022 * Map file offset to physical block number.
2023 *
2024 * If this function is called for write operation, and if the file
2025 * had virtual blocks allocated (delayed allocation), real blocks
2026 * are allocated by calling ExtendFileC().
2027 *
2028 * If this function is called for read operation, and if the file
2029 * had virtual blocks allocated (delayed allocation), no change
2030 * to the size of file is done, and if required, rangelist is
2031 * searched for mapping.
2032 *
2033 * System file cnodes are expected to be locked (shared or exclusive).
2034 */
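/*
 * Note that when a read maps into an invalid (not yet zero-filled)
 * range, *a_bpn is set to -1 and *a_run is trimmed so that the caller
 * zero-fills the pages rather than issuing device I/O for them.
 */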
2035 int
2036 hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
2037 /*
2038 struct vnop_blockmap_args {
2039 vnode_t a_vp;
2040 off_t a_foffset;
2041 size_t a_size;
2042 daddr64_t *a_bpn;
2043 size_t *a_run;
2044 void *a_poff;
2045 int a_flags;
2046 vfs_context_t a_context;
2047 };
2048 */
2049 {
2050 struct vnode *vp = ap->a_vp;
2051 struct cnode *cp;
2052 struct filefork *fp;
2053 struct hfsmount *hfsmp;
2054 size_t bytesContAvail = 0;
2055 int retval = E_NONE;
2056 int syslocks = 0;
2057 int lockflags = 0;
2058 struct rl_entry *invalid_range;
2059 enum rl_overlaptype overlaptype;
2060 int started_tr = 0;
2061 int tooklock = 0;
2062
2063 /* Do not allow blockmap operation on a directory */
2064 if (vnode_isdir(vp)) {
2065 return (ENOTSUP);
2066 }
2067
2068 /*
2069 * Check for underlying vnode requests and ensure that logical
2070 * to physical mapping is requested.
2071 */
2072 if (ap->a_bpn == NULL)
2073 return (0);
2074
2075 if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
2076 if (VTOC(vp)->c_lockowner != current_thread()) {
2077 hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
2078 tooklock = 1;
2079 }
2080 }
2081 hfsmp = VTOHFS(vp);
2082 cp = VTOC(vp);
2083 fp = VTOF(vp);
2084
2085 retry:
2086 /* Check virtual blocks only when performing write operation */
2087 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2088 if (hfs_start_transaction(hfsmp) != 0) {
2089 retval = EINVAL;
2090 goto exit;
2091 } else {
2092 started_tr = 1;
2093 }
2094 syslocks = SFL_EXTENTS | SFL_BITMAP;
2095
2096 } else if (overflow_extents(fp)) {
2097 syslocks = SFL_EXTENTS;
2098 }
2099
2100 if (syslocks)
2101 lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
2102
2103 /*
2104 * Check for any delayed allocations.
2105 */
2106 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2107 int64_t actbytes;
2108 u_int32_t loanedBlocks;
2109
2110 //
2111 // Make sure we have a transaction. It's possible
2112 // that we came in and fp->ff_unallocblocks was zero
2113 // but during the time we blocked acquiring the extents
2114 // btree, ff_unallocblocks became non-zero and so we
2115 // will need to start a transaction.
2116 //
2117 if (started_tr == 0) {
2118 if (syslocks) {
2119 hfs_systemfile_unlock(hfsmp, lockflags);
2120 syslocks = 0;
2121 }
2122 goto retry;
2123 }
2124
2125 /*
2126 * Note: ExtendFileC will release any blocks on loan and
2127 * acquire real blocks. So we ask to extend by zero bytes
2128 * since ExtendFileC will account for the virtual blocks.
2129 */
2130
2131 loanedBlocks = fp->ff_unallocblocks;
2132 retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
2133 kEFAllMask | kEFNoClumpMask, &actbytes);
2134
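/*
 * If the conversion to real blocks failed, restore the loaned-block
 * accounting on the fork, the cnode and the mount, push out the
 * updated state, and bail with the error.
 */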
2135 if (retval) {
2136 fp->ff_unallocblocks = loanedBlocks;
2137 cp->c_blocks += loanedBlocks;
2138 fp->ff_blocks += loanedBlocks;
2139
2140 HFS_MOUNT_LOCK(hfsmp, TRUE);
2141 hfsmp->loanedBlocks += loanedBlocks;
2142 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
2143
2144 hfs_systemfile_unlock(hfsmp, lockflags);
2145 cp->c_flag |= C_MODIFIED;
2146 if (started_tr) {
2147 (void) hfs_update(vp, TRUE);
2148 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2149
2150 hfs_end_transaction(hfsmp);
2151 started_tr = 0;
2152 }
2153 goto exit;
2154 }
2155 }
2156
2157 retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
2158 ap->a_bpn, &bytesContAvail);
2159 if (syslocks) {
2160 hfs_systemfile_unlock(hfsmp, lockflags);
2161 syslocks = 0;
2162 }
2163
2164 if (started_tr) {
2165 (void) hfs_update(vp, TRUE);
2166 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2167 hfs_end_transaction(hfsmp);
2168 started_tr = 0;
2169 }
2170 if (retval) {
2171 /* On a write, always return the error because any virtual blocks
2172 * should already have been converted to real blocks by ExtendFileC().
2173 * We do not allocate virtual blocks on a read, so return the error
2174 * only if no virtual blocks are allocated. Otherwise, search the
2175 * rangelist for zero-fill (invalid) ranges.
2176 */
2177 if ((MacToVFSError(retval) != ERANGE) ||
2178 (ap->a_flags & VNODE_WRITE) ||
2179 ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
2180 goto exit;
2181 }
2182
2183 /* Validate if the start offset is within logical file size */
2184 if (ap->a_foffset > fp->ff_size) {
2185 goto exit;
2186 }
2187
2188 /* Searching file extents has failed for read operation, therefore
2189 * search rangelist for any uncommitted holes in the file.
2190 */
2191 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
2192 ap->a_foffset + (off_t)(ap->a_size - 1),
2193 &invalid_range);
2194 switch(overlaptype) {
2195 case RL_OVERLAPISCONTAINED:
2196 /* start_offset <= rl_start, end_offset >= rl_end */
2197 if (ap->a_foffset != invalid_range->rl_start) {
2198 break;
2199 }
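/* else fall through: the invalid range starts exactly at a_foffset */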
2200 case RL_MATCHINGOVERLAP:
2201 /* start_offset = rl_start, end_offset = rl_end */
2202 case RL_OVERLAPCONTAINSRANGE:
2203 /* start_offset >= rl_start, end_offset <= rl_end */
2204 case RL_OVERLAPSTARTSBEFORE:
2205 /* start_offset > rl_start, end_offset >= rl_start */
2206 if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) {
2207 bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset;
2208 } else {
2209 bytesContAvail = fp->ff_size - ap->a_foffset;
2210 }
2211 if (bytesContAvail > ap->a_size) {
2212 bytesContAvail = ap->a_size;
2213 }
2214 *ap->a_bpn = (daddr64_t)-1;
2215 retval = 0;
2216 break;
2217 case RL_OVERLAPENDSAFTER:
2218 /* start_offset < rl_start, end_offset < rl_end */
2219 case RL_NOOVERLAP:
2220 break;
2221 }
2222 goto exit;
2223 }
2224
2225 /* MapFileBlockC() found a valid extent in the filefork. Check that
2226 * mapping against the file's invalid (not yet zero-filled) ranges.
2227 */
2228 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
2229 ap->a_foffset + (off_t)bytesContAvail - 1,
2230 &invalid_range);
2231 if (overlaptype != RL_NOOVERLAP) {
2232 switch(overlaptype) {
2233 case RL_MATCHINGOVERLAP:
2234 case RL_OVERLAPCONTAINSRANGE:
2235 case RL_OVERLAPSTARTSBEFORE:
2236 /* There's no valid block for this byte offset */
2237 *ap->a_bpn = (daddr64_t)-1;
2238 /* There's no point limiting the amount to be returned
2239 * if the invalid range that was hit extends all the way
2240 * to the EOF (i.e. there's no valid bytes between the
2241 * end of this range and the file's EOF):
2242 */
2243 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
2244 (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) {
2245 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
2246 }
2247 break;
2248
2249 case RL_OVERLAPISCONTAINED:
2250 case RL_OVERLAPENDSAFTER:
2251 /* The range of interest hits an invalid block before the end: */
2252 if (invalid_range->rl_start == ap->a_foffset) {
2253 /* There's actually no valid information to be had starting here: */
2254 *ap->a_bpn = (daddr64_t)-1;
2255 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
2256 (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) {
2257 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
2258 }
2259 } else {
2260 bytesContAvail = invalid_range->rl_start - ap->a_foffset;
2261 }
2262 break;
2263
2264 case RL_NOOVERLAP:
2265 break;
2266 } /* end switch */
2267 if (bytesContAvail > ap->a_size)
2268 bytesContAvail = ap->a_size;
2269 }
2270
2271 exit:
2272 if (retval == 0) {
2273 if (ap->a_run)
2274 *ap->a_run = bytesContAvail;
2275
2276 if (ap->a_poff)
2277 *(int *)ap->a_poff = 0;
2278 }
2279
2280 if (tooklock)
2281 hfs_unlock(cp);
2282
2283 return (MacToVFSError(retval));
2284 }
2285
2286
2287 /*
2288 * prepare and issue the I/O
2289 * buf_strategy knows how to deal
2290 * with requests that require
2291 * fragmented I/Os
2292 */
2293 int
2294 hfs_vnop_strategy(struct vnop_strategy_args *ap)
2295 {
2296 buf_t bp = ap->a_bp;
2297 vnode_t vp = buf_vnode(bp);
2298
2299 return (buf_strategy(VTOHFS(vp)->hfs_devvp, ap));
2300 }
2301
2302
2303 static int
2304 do_hfs_truncate(struct vnode *vp, off_t length, int flags, vfs_context_t context)
2305 {
2306 register struct cnode *cp = VTOC(vp);
2307 struct filefork *fp = VTOF(vp);
2308 struct proc *p = vfs_context_proc(context);
2309 kauth_cred_t cred = vfs_context_ucred(context);
2310 int retval;
2311 off_t bytesToAdd;
2312 off_t actualBytesAdded;
2313 off_t filebytes;
2314 u_long fileblocks;
2315 int blksize;
2316 struct hfsmount *hfsmp;
2317 int lockflags;
2318
2319 blksize = VTOVCB(vp)->blockSize;
2320 fileblocks = fp->ff_blocks;
2321 filebytes = (off_t)fileblocks * (off_t)blksize;
2322
2323 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
2324 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
2325
2326 if (length < 0)
2327 return (EINVAL);
2328
2329 /* This should only happen with a corrupt filesystem */
2330 if ((off_t)fp->ff_size < 0)
2331 return (EINVAL);
2332
2333 if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
2334 return (EFBIG);
2335
2336 hfsmp = VTOHFS(vp);
2337
2338 retval = E_NONE;
2339
2340 /* Files that are changing size are not hot file candidates. */
2341 if (hfsmp->hfc_stage == HFC_RECORDING) {
2342 fp->ff_bytesread = 0;
2343 }
2344
2345 /*
2346 * We cannot just check if fp->ff_size == length (as an optimization)
2347 * since there may be extra physical blocks that also need truncation.
2348 */
2349 #if QUOTA
2350 if ((retval = hfs_getinoquota(cp)))
2351 return(retval);
2352 #endif /* QUOTA */
2353
2354 /*
2355 * Lengthen the size of the file. We must ensure that the
2356 * last byte of the file is allocated. Since the smallest
2357 * value of ff_size is 0, length will be at least 1.
2358 */
2359 if (length > (off_t)fp->ff_size) {
2360 #if QUOTA
2361 retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
2362 cred, 0);
2363 if (retval)
2364 goto Err_Exit;
2365 #endif /* QUOTA */
2366 /*
2367 * If we don't have enough physical space then
2368 * we need to extend the physical size.
2369 */
2370 if (length > filebytes) {
2371 int eflags;
2372 u_long blockHint = 0;
2373
2374 /* All or nothing and don't round up to clumpsize. */
2375 eflags = kEFAllMask | kEFNoClumpMask;
2376
2377 if (cred && suser(cred, NULL) != 0)
2378 eflags |= kEFReserveMask; /* keep a reserve */
2379
2380 /*
2381 * Allocate Journal and Quota files in metadata zone.
2382 */
2383 if (filebytes == 0 &&
2384 hfsmp->hfs_flags & HFS_METADATA_ZONE &&
2385 hfs_virtualmetafile(cp)) {
2386 eflags |= kEFMetadataMask;
2387 blockHint = hfsmp->hfs_metazone_start;
2388 }
2389 if (hfs_start_transaction(hfsmp) != 0) {
2390 retval = EINVAL;
2391 goto Err_Exit;
2392 }
2393
2394 /* Protect extents b-tree and allocation bitmap */
2395 lockflags = SFL_BITMAP;
2396 if (overflow_extents(fp))
2397 lockflags |= SFL_EXTENTS;
2398 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
2399
2400 while ((length > filebytes) && (retval == E_NONE)) {
2401 bytesToAdd = length - filebytes;
2402 retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
2403 (FCB*)fp,
2404 bytesToAdd,
2405 blockHint,
2406 eflags,
2407 &actualBytesAdded));
2408
2409 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
2410 if (actualBytesAdded == 0 && retval == E_NONE) {
2411 if (length > filebytes)
2412 length = filebytes;
2413 break;
2414 }
2415 } /* endwhile */
2416
2417 hfs_systemfile_unlock(hfsmp, lockflags);
2418
2419 if (hfsmp->jnl) {
2420 (void) hfs_update(vp, TRUE);
2421 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2422 }
2423
2424 hfs_end_transaction(hfsmp);
2425
2426 if (retval)
2427 goto Err_Exit;
2428
2429 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
2430 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
2431 }
2432
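/*
 * Zero-fill bookkeeping for the newly exposed bytes. For example, with
 * 4 KiB pages, growing a file from ff_size 5000 to length 20000:
 * zero_limit rounds 5000 up to 8192, bytes 5000-8191 (the rest of the
 * current last page) are zeroed through cluster_write(), and bytes
 * 8192-19999 are recorded as an invalid range to be zeroed later.
 */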
2433 if (!(flags & IO_NOZEROFILL)) {
2434 if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) {
2435 struct rl_entry *invalid_range;
2436 off_t zero_limit;
2437
2438 zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
2439 if (length < zero_limit) zero_limit = length;
2440
2441 if (length > (off_t)fp->ff_size) {
2442 struct timeval tv;
2443
2444 /* Extending the file: time to fill out the current last page w. zeroes? */
2445 if ((fp->ff_size & PAGE_MASK_64) &&
2446 (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
2447 fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {
2448
2449 /* There's some valid data at the start of the (current) last page
2450 of the file, so zero out the remainder of that page to ensure the
2451 entire page contains valid data. Since there is no invalid range
2452 possible past the (current) eof, there's no need to remove anything
2453 from the invalid range list before calling cluster_write(): */
2454 hfs_unlock(cp);
2455 retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
2456 fp->ff_size, (off_t)0,
2457 (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
2458 hfs_lock(cp, HFS_FORCE_LOCK);
2459 if (retval) goto Err_Exit;
2460
2461 /* Merely invalidate the remaining area, if necessary: */
2462 if (length > zero_limit) {
2463 microuptime(&tv);
2464 rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
2465 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
2466 }
2467 } else {
2468 /* The page containing the (current) eof is invalid: just add the
2469 remainder of the page to the invalid list, along with the area
2470 being newly allocated:
2471 */
2472 microuptime(&tv);
2473 rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
2474 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
2475 };
2476 }
2477 } else {
2478 panic("hfs_truncate: invoked on non-UBC object?!");
2479 };
2480 }
2481 cp->c_touch_modtime = TRUE;
2482 fp->ff_size = length;
2483
2484 } else { /* Shorten the size of the file */
2485
2486 if ((off_t)fp->ff_size > length) {
2487 /* Any space previously marked as invalid is now irrelevant: */
2488 rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
2489 }
2490
2491 /*
2492 * Account for any unmapped blocks. Note that the new
2493 * file length can still end up with unmapped blocks.
2494 */
2495 if (fp->ff_unallocblocks > 0) {
2496 u_int32_t finalblks;
2497 u_int32_t loanedBlocks;
2498
2499 HFS_MOUNT_LOCK(hfsmp, TRUE);
2500
2501 loanedBlocks = fp->ff_unallocblocks;
2502 cp->c_blocks -= loanedBlocks;
2503 fp->ff_blocks -= loanedBlocks;
2504 fp->ff_unallocblocks = 0;
2505
2506 hfsmp->loanedBlocks -= loanedBlocks;
2507
2508 finalblks = (length + blksize - 1) / blksize;
2509 if (finalblks > fp->ff_blocks) {
2510 /* calculate required unmapped blocks */
2511 loanedBlocks = finalblks - fp->ff_blocks;
2512 hfsmp->loanedBlocks += loanedBlocks;
2513
2514 fp->ff_unallocblocks = loanedBlocks;
2515 cp->c_blocks += loanedBlocks;
2516 fp->ff_blocks += loanedBlocks;
2517 }
2518 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
2519 }
2520
2521 /*
2522 * For a TBE process the deallocation of the file blocks is
2523 * delayed until the file is closed. And hfs_close calls
2524 * truncate with the IO_NDELAY flag set. So when IO_NDELAY
2525 * isn't set, we make sure this isn't a TBE process.
2526 */
2527 if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
2528 #if QUOTA
2529 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
2530 #endif /* QUOTA */
2531 if (hfs_start_transaction(hfsmp) != 0) {
2532 retval = EINVAL;
2533 goto Err_Exit;
2534 }
2535
2536 if (fp->ff_unallocblocks == 0) {
2537 /* Protect extents b-tree and allocation bitmap */
2538 lockflags = SFL_BITMAP;
2539 if (overflow_extents(fp))
2540 lockflags |= SFL_EXTENTS;
2541 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
2542
2543 retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
2544 (FCB*)fp, length, false));
2545
2546 hfs_systemfile_unlock(hfsmp, lockflags);
2547 }
2548 if (hfsmp->jnl) {
2549 if (retval == 0) {
2550 fp->ff_size = length;
2551 }
2552 (void) hfs_update(vp, TRUE);
2553 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2554 }
2555
2556 hfs_end_transaction(hfsmp);
2557
2558 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
2559 if (retval)
2560 goto Err_Exit;
2561 #if QUOTA
2562 /* These are bytesreleased */
2563 (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
2564 #endif /* QUOTA */
2565 }
2566 /* Only set update flag if the logical length changes */
2567 if ((off_t)fp->ff_size != length)
2568 cp->c_touch_modtime = TRUE;
2569 fp->ff_size = length;
2570 }
2571 cp->c_touch_chgtime = TRUE; /* status changed */
2572 cp->c_touch_modtime = TRUE; /* file data was modified */
2573 retval = hfs_update(vp, MNT_WAIT);
2574 if (retval) {
2575 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
2576 -1, -1, -1, retval, 0);
2577 }
2578
2579 Err_Exit:
2580
2581 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
2582 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
2583
2584 return (retval);
2585 }
2586
2587
2588
2589 /*
2590 * Truncate a cnode to at most length size, freeing (or adding) the
2591 * disk blocks.
2592 */
2593 __private_extern__
2594 int
2595 hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
2596 vfs_context_t context)
2597 {
2598 struct filefork *fp = VTOF(vp);
2599 off_t filebytes;
2600 u_long fileblocks;
2601 int blksize, error = 0;
2602 struct cnode *cp = VTOC(vp);
2603
2604 /* Cannot truncate an HFS directory! */
2605 if (vnode_isdir(vp)) {
2606 return (EISDIR);
2607 }
2608 /* A swap file cannot change size. */
2609 if (vnode_isswap(vp) && (length != 0)) {
2610 return (EPERM);
2611 }
2612
2613 blksize = VTOVCB(vp)->blockSize;
2614 fileblocks = fp->ff_blocks;
2615 filebytes = (off_t)fileblocks * (off_t)blksize;
2616
2617 //
2618 // Have to do this here so that we don't wind up with
2619 // i/o pending for blocks that are about to be released
2620 // if we truncate the file.
2621 //
2622 // If skipsetsize is set, then the caller is responsible
2623 // for the ubc_setsize.
2624 //
2625 if (!skipsetsize)
2626 ubc_setsize(vp, length);
2627
2628 // have to loop truncating or growing files that are
2629 // really big because otherwise transactions can get
2630 // enormous and consume too many kernel resources.
2631
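// For example, shrinking a huge file with overflow extents proceeds in
// chunks of at most HFS_BIGFILE_SIZE bytes, each chunk handled by its
// own do_hfs_truncate() call (and thus its own transaction), until
// filebytes reaches the requested length.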
2632 if (length < filebytes) {
2633 while (filebytes > length) {
2634 if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
2635 filebytes -= HFS_BIGFILE_SIZE;
2636 } else {
2637 filebytes = length;
2638 }
2639 cp->c_flag |= C_FORCEUPDATE;
2640 error = do_hfs_truncate(vp, filebytes, flags, context);
2641 if (error)
2642 break;
2643 }
2644 } else if (length > filebytes) {
2645 while (filebytes < length) {
2646 if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
2647 filebytes += HFS_BIGFILE_SIZE;
2648 } else {
2649 filebytes = length;
2650 }
2651 cp->c_flag |= C_FORCEUPDATE;
2652 error = do_hfs_truncate(vp, filebytes, flags, context);
2653 if (error)
2654 break;
2655 }
2656 } else /* Same logical size */ {
2657
2658 error = do_hfs_truncate(vp, length, flags, context);
2659 }
2660 /* Files that are changing size are not hot file candidates. */
2661 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
2662 fp->ff_bytesread = 0;
2663 }
2664
2665 return (error);
2666 }
2667
2668
2669
2670 /*
2671 * Preallocate file storage space.
2672 */
2673 int
2674 hfs_vnop_allocate(struct vnop_allocate_args /* {
2675 vnode_t a_vp;
2676 off_t a_length;
2677 u_int32_t a_flags;
2678 off_t *a_bytesallocated;
2679 off_t a_offset;
2680 vfs_context_t a_context;
2681 } */ *ap)
2682 {
2683 struct vnode *vp = ap->a_vp;
2684 struct cnode *cp;
2685 struct filefork *fp;
2686 ExtendedVCB *vcb;
2687 off_t length = ap->a_length;
2688 off_t startingPEOF;
2689 off_t moreBytesRequested;
2690 off_t actualBytesAdded;
2691 off_t filebytes;
2692 u_long fileblocks;
2693 int retval, retval2;
2694 u_int32_t blockHint;
2695 u_int32_t extendFlags; /* For call to ExtendFileC */
2696 struct hfsmount *hfsmp;
2697 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
2698 int lockflags;
2699
2700 *(ap->a_bytesallocated) = 0;
2701
2702 if (!vnode_isreg(vp))
2703 return (EISDIR);
2704 if (length < (off_t)0)
2705 return (EINVAL);
2706
2707 cp = VTOC(vp);
2708
2709 hfs_lock_truncate(cp, TRUE);
2710
2711 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
2712 goto Err_Exit;
2713 }
2714
2715 fp = VTOF(vp);
2716 hfsmp = VTOHFS(vp);
2717 vcb = VTOVCB(vp);
2718
2719 fileblocks = fp->ff_blocks;
2720 filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
2721
2722 if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
2723 retval = EINVAL;
2724 goto Err_Exit;
2725 }
2726
2727 /* Fill in the flags word for the call to Extend the file */
2728
2729 extendFlags = kEFNoClumpMask;
2730 if (ap->a_flags & ALLOCATECONTIG)
2731 extendFlags |= kEFContigMask;
2732 if (ap->a_flags & ALLOCATEALL)
2733 extendFlags |= kEFAllMask;
2734 if (cred && suser(cred, NULL) != 0)
2735 extendFlags |= kEFReserveMask;
2736
2737 retval = E_NONE;
2738 blockHint = 0;
2739 startingPEOF = filebytes;
2740
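/*
 * ALLOCATEFROMPEOF makes a_length relative to the current physical EOF,
 * so it is converted to an absolute length here, while ALLOCATEFROMVOL
 * turns a_offset into an allocation-block hint. For example, requesting
 * ALLOCATEFROMPEOF with a_length of 1 MiB on a file that already holds
 * 4 MiB of blocks asks for a total physical size of 5 MiB.
 */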
2741 if (ap->a_flags & ALLOCATEFROMPEOF)
2742 length += filebytes;
2743 else if (ap->a_flags & ALLOCATEFROMVOL)
2744 blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
2745
2746 /* If no changes are necessary, then we're done */
2747 if (filebytes == length)
2748 goto Std_Exit;
2749
2750 /*
2751 * Lengthen the size of the file. We must ensure that the
2752 * last byte of the file is allocated. Since the smallest
2753 * value of filebytes is 0, length will be at least 1.
2754 */
2755 if (length > filebytes) {
2756 off_t total_bytes_added = 0, orig_request_size;
2757
2758 orig_request_size = moreBytesRequested = length - filebytes;
2759
2760 #if QUOTA
2761 retval = hfs_chkdq(cp,
2762 (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
2763 cred, 0);
2764 if (retval)
2765 goto Err_Exit;
2766
2767 #endif /* QUOTA */
2768 /*
2769 * Metadata zone checks.
2770 */
2771 if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
2772 /*
2773 * Allocate Journal and Quota files in metadata zone.
2774 */
2775 if (hfs_virtualmetafile(cp)) {
2776 extendFlags |= kEFMetadataMask;
2777 blockHint = hfsmp->hfs_metazone_start;
2778 } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
2779 (blockHint <= hfsmp->hfs_metazone_end)) {
2780 /*
2781 * Move blockHint outside metadata zone.
2782 */
2783 blockHint = hfsmp->hfs_metazone_end + 1;
2784 }
2785 }
2786
2787
2788 while ((length > filebytes) && (retval == E_NONE)) {
2789 off_t bytesRequested;
2790
2791 if (hfs_start_transaction(hfsmp) != 0) {
2792 retval = EINVAL;
2793 goto Err_Exit;
2794 }
2795
2796 /* Protect extents b-tree and allocation bitmap */
2797 lockflags = SFL_BITMAP;
2798 if (overflow_extents(fp))
2799 lockflags |= SFL_EXTENTS;
2800 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
2801
2802 if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
2803 bytesRequested = HFS_BIGFILE_SIZE;
2804 } else {
2805 bytesRequested = moreBytesRequested;
2806 }
2807
2808 retval = MacToVFSError(ExtendFileC(vcb,
2809 (FCB*)fp,
2810 bytesRequested,
2811 blockHint,
2812 extendFlags,
2813 &actualBytesAdded));
2814
2815 if (retval == E_NONE) {
2816 *(ap->a_bytesallocated) += actualBytesAdded;
2817 total_bytes_added += actualBytesAdded;
2818 moreBytesRequested -= actualBytesAdded;
2819 if (blockHint != 0) {
2820 blockHint += actualBytesAdded / vcb->blockSize;
2821 }
2822 }
2823 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
2824
2825 hfs_systemfile_unlock(hfsmp, lockflags);
2826
2827 if (hfsmp->jnl) {
2828 (void) hfs_update(vp, TRUE);
2829 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2830 }
2831
2832 hfs_end_transaction(hfsmp);
2833 }
2834
2835
2836 /*
2837 * if we get an error and no changes were made then exit
2838 * otherwise we must do the hfs_update to reflect the changes
2839 */
2840 if (retval && (startingPEOF == filebytes))
2841 goto Err_Exit;
2842
2843 /*
2844 * Adjust the byte count we report so it is allocation block aligned,
2845 * not clump size aligned.
2846 * NOTE: What we report here does not match what is actually allocated
2847 * until the file is closed, when the file is truncated back down to
2848 * allocation block size.
2849 */
2850 if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
2851 *(ap->a_bytesallocated) =
2852 roundup(orig_request_size, (off_t)vcb->blockSize);
2853
2854 } else { /* Shorten the size of the file */
2855
2856 if (fp->ff_size > length) {
2857 /*
2858 * Any buffers that are past the truncation point need to be
2859 * invalidated (to maintain buffer cache consistency).
2860 */
2861 }
2862
2863 retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
2864 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
2865
2866 /*
2867 * if we get an error and no changes were made then exit
2868 * otherwise we must do the hfs_update to reflect the changes
2869 */
2870 if (retval && (startingPEOF == filebytes)) goto Err_Exit;
2871 #if QUOTA
2872 /* These are bytesreleased */
2873 (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
2874 #endif /* QUOTA */
2875
2876 if (fp->ff_size > filebytes) {
2877 fp->ff_size = filebytes;
2878
2879 hfs_unlock(cp);
2880 ubc_setsize(vp, fp->ff_size);
2881 hfs_lock(cp, HFS_FORCE_LOCK);
2882 }
2883 }
2884
2885 Std_Exit:
2886 cp->c_touch_chgtime = TRUE;
2887 cp->c_touch_modtime = TRUE;
2888 retval2 = hfs_update(vp, MNT_WAIT);
2889
2890 if (retval == 0)
2891 retval = retval2;
2892 Err_Exit:
2893 hfs_unlock_truncate(cp, TRUE);
2894 hfs_unlock(cp);
2895 return (retval);
2896 }
2897
2898
2899 /*
2900 * Pagein for HFS filesystem
2901 */
2902 int
2903 hfs_vnop_pagein(struct vnop_pagein_args *ap)
2904 /*
2905 struct vnop_pagein_args {
2906 vnode_t a_vp,
2907 upl_t a_pl,
2908 vm_offset_t a_pl_offset,
2909 off_t a_f_offset,
2910 size_t a_size,
2911 int a_flags
2912 vfs_context_t a_context;
2913 };
2914 */
2915 {
2916 vnode_t vp = ap->a_vp;
2917 int error;
2918
2919 error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
2920 ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags);
2921 /*
2922 * Keep track of blocks read.
2923 */
2924 if (!vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
2925 struct cnode *cp;
2926 struct filefork *fp;
2927 int bytesread;
2928 int took_cnode_lock = 0;
2929
2930 cp = VTOC(vp);
2931 fp = VTOF(vp);
2932
2933 if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
2934 bytesread = fp->ff_size;
2935 else
2936 bytesread = ap->a_size;
2937
2938 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
2939 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
2940 hfs_lock(cp, HFS_FORCE_LOCK);
2941 took_cnode_lock = 1;
2942 }
2943 /*
2944 * If this file hasn't been seen since the start of
2945 * the current sampling period then start over.
2946 */
2947 if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
2948 struct timeval tv;
2949
2950 fp->ff_bytesread = bytesread;
2951 microtime(&tv);
2952 cp->c_atime = tv.tv_sec;
2953 } else {
2954 fp->ff_bytesread += bytesread;
2955 }
2956 cp->c_touch_acctime = TRUE;
2957 if (took_cnode_lock)
2958 hfs_unlock(cp);
2959 }
2960 return (error);
2961 }
2962
2963 /*
2964 * Pageout for HFS filesystem.
2965 */
2966 int
2967 hfs_vnop_pageout(struct vnop_pageout_args *ap)
2968 /*
2969 struct vnop_pageout_args {
2970 vnode_t a_vp,
2971 upl_t a_pl,
2972 vm_offset_t a_pl_offset,
2973 off_t a_f_offset,
2974 size_t a_size,
2975 int a_flags
2976 vfs_context_t a_context;
2977 };
2978 */
2979 {
2980 vnode_t vp = ap->a_vp;
2981 struct cnode *cp;
2982 struct filefork *fp;
2983 int retval;
2984 off_t filesize;
2985
2986 cp = VTOC(vp);
2987 fp = VTOF(vp);
2988
2989 /*
2990 * Figure out where the file ends, for pageout purposes. If
2991 * ff_new_size > ff_size, then we're in the middle of extending the
2992 * file via a write, so it is safe (and necessary) that we be able
2993 * to pageout up to that point.
2994 */
2995 filesize = fp->ff_size;
2996 if (fp->ff_new_size > filesize)
2997 filesize = fp->ff_new_size;
2998
2999 if (!vnode_isswap(vp)) {
3000 off_t end_of_range;
3001 int tooklock = 0;
3002
3003 if (cp->c_lockowner != current_thread()) {
3004 if ( (retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
3005 if (!(ap->a_flags & UPL_NOCOMMIT)) {
3006 ubc_upl_abort_range(ap->a_pl,
3007 ap->a_pl_offset,
3008 ap->a_size,
3009 UPL_ABORT_FREE_ON_EMPTY);
3010 }
3011 return (retval);
3012 }
3013 tooklock = 1;
3014 }
3015
3016 end_of_range = ap->a_f_offset + ap->a_size - 1;
3017
3018 if (end_of_range >= filesize) {
3019 end_of_range = (off_t)(filesize - 1);
3020 }
3021 if (ap->a_f_offset < filesize) {
3022 rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
3023 cp->c_flag |= C_MODIFIED; /* leof is dirty */
3024 }
3025
3026 if (tooklock) {
3027 hfs_unlock(cp);
3028 }
3029 }
3030
3031 retval = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
3032 ap->a_size, filesize, ap->a_flags);
3033
3034 /*
3035 * If data was written, and setuid or setgid bits are set and
3036 * this process is not the superuser then clear the setuid and
3037 * setgid bits as a precaution against tampering.
3038 */
3039 if ((retval == 0) &&
3040 (cp->c_mode & (S_ISUID | S_ISGID)) &&
3041 (vfs_context_suser(ap->a_context) != 0)) {
3042 hfs_lock(cp, HFS_FORCE_LOCK);
3043 cp->c_mode &= ~(S_ISUID | S_ISGID);
3044 cp->c_touch_chgtime = TRUE;
3045 hfs_unlock(cp);
3046 }
3047 return (retval);
3048 }
3049
3050 /*
3051 * Intercept B-Tree node writes to unswap them if necessary.
3052 */
3053 int
3054 hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
3055 {
3056 int retval = 0;
3057 register struct buf *bp = ap->a_bp;
3058 register struct vnode *vp = buf_vnode(bp);
3059 BlockDescriptor block;
3060
3061 /* Trap B-Tree writes */
3062 if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
3063 (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
3064 (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
3065 (vp == VTOHFS(vp)->hfc_filevp)) {
3066
3067 /*
3068 * Swap and validate the node if it is in native byte order.
3069 * This is always true on big endian, so we always validate
3070 * before writing here. On little endian, the node typically has
3071 * been swapped and validated when it was written to the journal,
3072 * so we won't do anything here.
3073 */
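/*
 * The last u_int16_t of a B-tree node holds the offset of record 0,
 * which is always sizeof(BTNodeDescriptor), i.e. 14 (0x000e). On a
 * little-endian host that field only reads back as 0x000e while the
 * node is still in host byte order, which is what this test detects.
 */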
3074 if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
3075 /* Prepare the block pointer */
3076 block.blockHeader = bp;
3077 block.buffer = (char *)buf_dataptr(bp);
3078 block.blockNum = buf_lblkno(bp);
3079 /* not found in cache ==> came from disk */
3080 block.blockReadFromDisk = (buf_fromcache(bp) == 0);
3081 block.blockSize = buf_count(bp);
3082
3083 /* Endian un-swap B-Tree node */
3084 retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
3085 if (retval)
3086 panic("hfs_vnop_bwrite: about to write corrupt node!\n");
3087 }
3088 }
3089
3090 /* This buffer shouldn't be locked anymore but if it is clear it */
3091 if ((buf_flags(bp) & B_LOCKED)) {
3092 // XXXdbg
3093 if (VTOHFS(vp)->jnl) {
3094 panic("hfs: CLEARING the lock bit on bp %p\n", bp);
3095 }
3096 buf_clearflags(bp, B_LOCKED);
3097 }
3098 retval = vn_bwrite (ap);
3099
3100 return (retval);
3101 }
3102
3103 /*
3104 * Relocate a file to a new location on disk
3105 * cnode must be locked on entry
3106 *
3107 * Relocation occurs by cloning the file's data from its
3108 * current set of blocks to a new set of blocks. During
3109 * the relocation all of the blocks (old and new) are
3110 * owned by the file.
3111 *
3112 *  -----------------
3113 *  |///////////////|
3114 *  -----------------
3115 *  0               N (file offset)
3116 *
3117 *  -----------------     -----------------
3118 *  |///////////////|     |               |  STEP 1 (acquire new blocks)
3119 *  -----------------     -----------------
3120 *  0               N     N+1             2N
3121 *
3122 *  -----------------     -----------------
3123 *  |///////////////|     |///////////////|  STEP 2 (clone data)
3124 *  -----------------     -----------------
3125 *  0               N     N+1             2N
3126 *
3127 *  -----------------
3128 *  |///////////////|  STEP 3 (head truncate blocks)
3129 *  -----------------
3130 *  0               N
3131 *
3132 * During steps 2 and 3 page-outs to file offsets less
3133 * than or equal to N are suspended.
3134 *
3135 * During step 3 page-ins to the file get suspended.
3136 */
3137 __private_extern__
3138 int
3139 hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
3140 struct proc *p)
3141 {
3142 struct cnode *cp;
3143 struct filefork *fp;
3144 struct hfsmount *hfsmp;
3145 u_int32_t headblks;
3146 u_int32_t datablks;
3147 u_int32_t blksize;
3148 u_int32_t growsize;
3149 u_int32_t nextallocsave;
3150 daddr64_t sector_a, sector_b;
3151 int eflags;
3152 off_t newbytes;
3153 int retval;
3154 int lockflags = 0;
3155 int took_trunc_lock = 0;
3156 int started_tr = 0;
3157 enum vtype vnodetype;
3158
3159 vnodetype = vnode_vtype(vp);
3160 if (vnodetype != VREG && vnodetype != VLNK) {
3161 return (EPERM);
3162 }
3163
3164 hfsmp = VTOHFS(vp);
3165 if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
3166 return (ENOSPC);
3167 }
3168
3169 cp = VTOC(vp);
3170 fp = VTOF(vp);
3171 if (fp->ff_unallocblocks)
3172 return (EINVAL);
3173 blksize = hfsmp->blockSize;
3174 if (blockHint == 0)
3175 blockHint = hfsmp->nextAllocation;
3176
3177 if ((fp->ff_size > 0x7fffffff) ||
3178 ((fp->ff_size > blksize) && vnodetype == VLNK)) {
3179 return (EFBIG);
3180 }
3181
3182 //
3183 // We do not believe that this call to hfs_fsync() is
3184 // necessary and it causes a journal transaction
3185 // deadlock so we are removing it.
3186 //
3187 //if (vnodetype == VREG && !vnode_issystem(vp)) {
3188 // retval = hfs_fsync(vp, MNT_WAIT, 0, p);
3189 // if (retval)
3190 // return (retval);
3191 //}
3192
3193 if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
3194 hfs_unlock(cp);
3195 hfs_lock_truncate(cp, TRUE);
3196 /* Force the lock since callers expect the cnode lock to be held. */
3197 if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
3198 hfs_unlock_truncate(cp, TRUE);
3199 return (retval);
3200 }
3201 /* No need to continue if file was removed. */
3202 if (cp->c_flag & C_NOEXISTS) {
3203 hfs_unlock_truncate(cp, TRUE);
3204 return (ENOENT);
3205 }
3206 took_trunc_lock = 1;
3207 }
3208 headblks = fp->ff_blocks;
3209 datablks = howmany(fp->ff_size, blksize);
3210 growsize = datablks * blksize;
3211 eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
3212 if (blockHint >= hfsmp->hfs_metazone_start &&
3213 blockHint <= hfsmp->hfs_metazone_end)
3214 eflags |= kEFMetadataMask;
3215
3216 if (hfs_start_transaction(hfsmp) != 0) {
3217 if (took_trunc_lock)
3218 hfs_unlock_truncate(cp, TRUE);
3219 return (EINVAL);
3220 }
3221 started_tr = 1;
3222 /*
3223 * Protect the extents b-tree and the allocation bitmap
3224 * during MapFileBlockC and ExtendFileC operations.
3225 */
3226 lockflags = SFL_BITMAP;
3227 if (overflow_extents(fp))
3228 lockflags |= SFL_EXTENTS;
3229 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3230
3231 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
3232 if (retval) {
3233 retval = MacToVFSError(retval);
3234 goto out;
3235 }
3236
3237 /*
3238 * STEP 1 - acquire new allocation blocks.
3239 */
3240 nextallocsave = hfsmp->nextAllocation;
3241 retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
3242 if (eflags & kEFMetadataMask) {
3243 HFS_MOUNT_LOCK(hfsmp, TRUE);
3244 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
3245 MarkVCBDirty(hfsmp);
3246 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
3247 }
3248
3249 retval = MacToVFSError(retval);
3250 if (retval == 0) {
3251 cp->c_flag |= C_MODIFIED;
3252 if (newbytes < growsize) {
3253 retval = ENOSPC;
3254 goto restore;
3255 } else if (fp->ff_blocks < (headblks + datablks)) {
3256 printf("hfs_relocate: allocation failed\n");
3257 retval = ENOSPC;
3258 goto restore;
3259 }
3260
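/*
 * sector_a holds the device sector of the last byte of the original
 * allocation and sector_b the sector of the first newly allocated byte;
 * if they are adjacent, the "new" space merely extends the old extent,
 * the file would not actually move, and ENOSPC is returned.
 */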
3261 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
3262 if (retval) {
3263 retval = MacToVFSError(retval);
3264 } else if ((sector_a + 1) == sector_b) {
3265 retval = ENOSPC;
3266 goto restore;
3267 } else if ((eflags & kEFMetadataMask) &&
3268 ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
3269 hfsmp->hfs_metazone_end)) {
3270 const char * filestr;
3271 char emptystr = '\0';
3272
3273 if (cp->c_desc.cd_nameptr != NULL) {
3274 filestr = (const char *)&cp->c_desc.cd_nameptr[0];
3275 } else if (vnode_name(vp) != NULL) {
3276 filestr = vnode_name(vp);
3277 } else {
3278 filestr = &emptystr;
3279 }
3280 printf("hfs_relocate: %s didn't move into MDZ (%d blks)\n", filestr, fp->ff_blocks);
3281 retval = ENOSPC;
3282 goto restore;
3283 }
3284 }
3285 /* Done with system locks and journal for now. */
3286 hfs_systemfile_unlock(hfsmp, lockflags);
3287 lockflags = 0;
3288 hfs_end_transaction(hfsmp);
3289 started_tr = 0;
3290
3291 if (retval) {
3292 /*
3293 * Check to see if failure is due to excessive fragmentation.
3294 */
3295 if ((retval == ENOSPC) &&
3296 (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
3297 hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
3298 }
3299 goto out;
3300 }
3301 /*
3302 * STEP 2 - clone file data into the new allocation blocks.
3303 */
3304
3305 if (vnodetype == VLNK)
3306 retval = hfs_clonelink(vp, blksize, cred, p);
3307 else if (vnode_issystem(vp))
3308 retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
3309 else
3310 retval = hfs_clonefile(vp, headblks, datablks, blksize);
3311
3312 /* Start transaction for step 3 or for a restore. */
3313 if (hfs_start_transaction(hfsmp) != 0) {
3314 retval = EINVAL;
3315 goto out;
3316 }
3317 started_tr = 1;
3318 if (retval)
3319 goto restore;
3320
3321 /*
3322 * STEP 3 - switch to cloned data and remove old blocks.
3323 */
3324 lockflags = SFL_BITMAP;
3325 if (overflow_extents(fp))
3326 lockflags |= SFL_EXTENTS;
3327 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3328
3329 retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
3330
3331 hfs_systemfile_unlock(hfsmp, lockflags);
3332 lockflags = 0;
3333 if (retval)
3334 goto restore;
3335 out:
3336 if (took_trunc_lock)
3337 hfs_unlock_truncate(cp, TRUE);
3338
3339 if (lockflags) {
3340 hfs_systemfile_unlock(hfsmp, lockflags);
3341 lockflags = 0;
3342 }
3343
3344 /* Push cnode's new extent data to disk. */
3345 if (retval == 0) {
3346 (void) hfs_update(vp, MNT_WAIT);
3347 }
3348 if (hfsmp->jnl) {
3349 if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
3350 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
3351 else
3352 (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
3353 }
3354 exit:
3355 if (started_tr)
3356 hfs_end_transaction(hfsmp);
3357
3358 return (retval);
3359
3360 restore:
3361 if (fp->ff_blocks == headblks) {
3362 if (took_trunc_lock)
3363 hfs_unlock_truncate(cp, TRUE);
3364 goto exit;
3365 }
3366 /*
3367 * Give back any newly allocated space.
3368 */
3369 if (lockflags == 0) {
3370 lockflags = SFL_BITMAP;
3371 if (overflow_extents(fp))
3372 lockflags |= SFL_EXTENTS;
3373 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3374 }
3375
3376 (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false);
3377
3378 hfs_systemfile_unlock(hfsmp, lockflags);
3379 lockflags = 0;
3380
3381 if (took_trunc_lock)
3382 hfs_unlock_truncate(cp, TRUE);
3383 goto exit;
3384 }
3385
3386
3387 /*
3388 * Clone a symlink.
3389 *
3390 */
3391 static int
3392 hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
3393 {
3394 struct buf *head_bp = NULL;
3395 struct buf *tail_bp = NULL;
3396 int error;
3397
3398
3399 error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);
3400 if (error)
3401 goto out;
3402
3403 tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
3404 if (tail_bp == NULL) {
3405 error = EIO;
3406 goto out;
3407 }
3408 bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
3409 error = (int)buf_bwrite(tail_bp);
3410 out:
3411 if (head_bp) {
3412 buf_markinvalid(head_bp);
3413 buf_brelse(head_bp);
3414 }
3415 (void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
3416
3417 return (error);
3418 }
3419
3420 /*
3421 * Clone a file's data within the file.
3422 *
3423 */
3424 static int
3425 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
3426 {
3427 caddr_t bufp;
3428 size_t writebase;
3429 size_t bufsize;
3430 size_t copysize;
3431 size_t iosize;
3432 off_t filesize;
3433 size_t offset;
3434 uio_t auio;
3435 int error = 0;
3436
3437 filesize = VTOF(vp)->ff_blocks * blksize; /* virtual file size */
3438 writebase = blkstart * blksize;
3439 copysize = blkcnt * blksize;
3440 iosize = bufsize = MIN(copysize, 128 * 1024);
3441 offset = 0;
3442
3443 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
3444 return (ENOMEM);
3445 }
3446 hfs_unlock(VTOC(vp));
3447
3448 auio = uio_create(1, 0, UIO_SYSSPACE32, UIO_READ);
3449
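/*
 * Copy the data in chunks of up to 128 KiB: each pass reads iosize
 * bytes from logical offset 'offset' (the original blocks) and rewrites
 * them at 'writebase + offset' (the newly allocated blocks), using
 * IO_NOCACHE so the copy does not pollute the buffer cache.
 */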
3450 while (offset < copysize) {
3451 iosize = MIN(copysize - offset, iosize);
3452
3453 uio_reset(auio, offset, UIO_SYSSPACE32, UIO_READ);
3454 uio_addiov(auio, (uintptr_t)bufp, iosize);
3455
3456 error = cluster_read(vp, auio, copysize, IO_NOCACHE);
3457 if (error) {
3458 printf("hfs_clonefile: cluster_read failed - %d\n", error);
3459 break;
3460 }
3461 if (uio_resid(auio) != 0) {
3462 printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio));
3463 error = EIO;
3464 break;
3465 }
3466
3467 uio_reset(auio, writebase + offset, UIO_SYSSPACE32, UIO_WRITE);
3468 uio_addiov(auio, (uintptr_t)bufp, iosize);
3469
3470 error = cluster_write(vp, auio, filesize + offset,
3471 filesize + offset + iosize,
3472 uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
3473 if (error) {
3474 printf("hfs_clonefile: cluster_write failed - %d\n", error);
3475 break;
3476 }
3477 if (uio_resid(auio) != 0) {
3478 printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
3479 error = EIO;
3480 break;
3481 }
3482 offset += iosize;
3483 }
3484 uio_free(auio);
3485
3486 /*
3487 * No need to call ubc_sync_range or hfs_invalbuf
3488 * since the file was copied using IO_NOCACHE.
3489 */
3490
3491 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
3492
3493 hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
3494 return (error);
3495 }
3496
3497 /*
3498 * Clone a system (metadata) file.
3499 *
3500 */
3501 static int
3502 hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
3503 kauth_cred_t cred, struct proc *p)
3504 {
3505 caddr_t bufp;
3506 char * offset;
3507 size_t bufsize;
3508 size_t iosize;
3509 struct buf *bp = NULL;
3510 daddr64_t blkno;
3511 daddr64_t blk;
3512 daddr64_t start_blk;
3513 daddr64_t last_blk;
3514 int breadcnt;
3515 int i;
3516 int error = 0;
3517
3518
3519 iosize = GetLogicalBlockSize(vp);
3520 bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
3521 breadcnt = bufsize / iosize;
3522
3523 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
3524 return (ENOMEM);
3525 }
3526 start_blk = ((daddr64_t)blkstart * blksize) / iosize;
3527 last_blk = ((daddr64_t)blkcnt * blksize) / iosize;
3528 blkno = 0;
3529
3530 while (blkno < last_blk) {
3531 /*
3532 * Read up to a megabyte
3533 */
3534 offset = bufp;
3535 for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
3536 error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
3537 if (error) {
3538 printf("hfs_clonesysfile: meta_bread error %d\n", error);
3539 goto out;
3540 }
3541 if (buf_count(bp) != iosize) {
3542 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
3543 goto out;
3544 }
3545 bcopy((char *)buf_dataptr(bp), offset, iosize);
3546
3547 buf_markinvalid(bp);
3548 buf_brelse(bp);
3549 bp = NULL;
3550
3551 offset += iosize;
3552 }
3553
3554 /*
3555 * Write up to a megabyte
3556 */
3557 offset = bufp;
3558 for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
3559 bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
3560 if (bp == NULL) {
3561 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
3562 error = EIO;
3563 goto out;
3564 }
3565 bcopy(offset, (char *)buf_dataptr(bp), iosize);
3566 error = (int)buf_bwrite(bp);
3567 bp = NULL;
3568 if (error)
3569 goto out;
3570 offset += iosize;
3571 }
3572 }
3573 out:
3574 if (bp) {
3575 buf_brelse(bp);
3576 }
3577
3578 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
3579
3580 error = hfs_fsync(vp, MNT_WAIT, 0, p);
3581
3582 return (error);
3583 }