1 /*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* @(#)hfs_readwrite.c 1.0
29 *
30 * (c) 1998-2001 Apple Computer, Inc. All Rights Reserved
31 *
32 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
33 *
34 */
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/resourcevar.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/filedesc.h>
42 #include <sys/stat.h>
43 #include <sys/buf.h>
44 #include <sys/proc.h>
45 #include <sys/kauth.h>
46 #include <sys/vnode.h>
47 #include <sys/vnode_internal.h>
48 #include <sys/uio.h>
49 #include <sys/vfs_context.h>
50 #include <sys/fsevents.h>
51 #include <kern/kalloc.h>
52 #include <sys/disk.h>
53 #include <sys/sysctl.h>
54
55 #include <miscfs/specfs/specdev.h>
56
57 #include <sys/ubc.h>
58 #include <sys/ubc_internal.h>
59
60 #include <vm/vm_pageout.h>
61 #include <vm/vm_kern.h>
62
63 #include <sys/kdebug.h>
64
65 #include "hfs.h"
66 #include "hfs_attrlist.h"
67 #include "hfs_endian.h"
68 #include "hfs_fsctl.h"
69 #include "hfs_quota.h"
70 #include "hfscommon/headers/FileMgrInternal.h"
71 #include "hfscommon/headers/BTreesInternal.h"
72 #include "hfs_cnode.h"
73 #include "hfs_dbg.h"
74
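/*
 * A transfer qualifies for the cluster layer when its size is a multiple of
 * 4 KiB and no larger than half of MAXPHYSIO.
 */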
75 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
76
77 enum {
78 MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */
79 };
80
81 /* from bsd/vfs/vfs_cluster.c */
82 extern int is_file_clean(vnode_t vp, off_t filesize);
83 /* from bsd/hfs/hfs_vfsops.c */
84 extern int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
85
86 static int hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
87 static int hfs_clonefile(struct vnode *, int, int, int);
88 static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
89
90 int flush_cache_on_write = 0;
91 SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
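/*
 * The knob above is exported as kern.flush_cache_on_write; for example,
 * `sysctl -w kern.flush_cache_on_write=1` makes hfs_vnop_write (below) issue a
 * DKIOCSYNCHRONIZECACHE to the device after writes to uncached files.
 */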
92
93
94 /*
95 * Read data from a file.
96 */
97 int
98 hfs_vnop_read(struct vnop_read_args *ap)
99 {
100 uio_t uio = ap->a_uio;
101 struct vnode *vp = ap->a_vp;
102 struct cnode *cp;
103 struct filefork *fp;
104 struct hfsmount *hfsmp;
105 off_t filesize;
106 off_t filebytes;
107 off_t start_resid = uio_resid(uio);
108 off_t offset = uio_offset(uio);
109 int retval = 0;
110
111
112 /* Preflight checks */
113 if (!vnode_isreg(vp)) {
114 /* can only read regular files */
115 if (vnode_isdir(vp))
116 return (EISDIR);
117 else
118 return (EPERM);
119 }
120 if (start_resid == 0)
121 return (0); /* Nothing left to do */
122 if (offset < 0)
123 return (EINVAL); /* can't read from a negative offset */
124
125 cp = VTOC(vp);
126 fp = VTOF(vp);
127 hfsmp = VTOHFS(vp);
128
129 /* Protect against a size change. */
130 hfs_lock_truncate(cp, 0);
131
132 filesize = fp->ff_size;
133 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
134 if (offset > filesize) {
135 if ((hfsmp->hfs_flags & HFS_STANDARD) &&
136 (offset > (off_t)MAXHFSFILESIZE)) {
137 retval = EFBIG;
138 }
139 goto exit;
140 }
141
142 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
143 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
144
145 retval = cluster_read(vp, uio, filesize, ap->a_ioflag);
146
147 cp->c_touch_acctime = TRUE;
148
149 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
150 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
151
152 /*
153 * Keep track of blocks read
154 */
155 if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
156 int took_cnode_lock = 0;
157 off_t bytesread;
158
159 bytesread = start_resid - uio_resid(uio);
160
161 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
162 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
163 hfs_lock(cp, HFS_FORCE_LOCK);
164 took_cnode_lock = 1;
165 }
166 /*
167 * If this file hasn't been seen since the start of
168 * the current sampling period then start over.
169 */
170 if (cp->c_atime < hfsmp->hfc_timebase) {
171 struct timeval tv;
172
173 fp->ff_bytesread = bytesread;
174 microtime(&tv);
175 cp->c_atime = tv.tv_sec;
176 } else {
177 fp->ff_bytesread += bytesread;
178 }
179 if (took_cnode_lock)
180 hfs_unlock(cp);
181 }
182 exit:
183 hfs_unlock_truncate(cp, 0);
184 return (retval);
185 }
186
187 /*
188 * Write data to a file.
189 */
190 int
191 hfs_vnop_write(struct vnop_write_args *ap)
192 {
193 uio_t uio = ap->a_uio;
194 struct vnode *vp = ap->a_vp;
195 struct cnode *cp;
196 struct filefork *fp;
197 struct hfsmount *hfsmp;
198 kauth_cred_t cred = NULL;
199 off_t origFileSize;
200 off_t writelimit;
201 off_t bytesToAdd = 0;
202 off_t actualBytesAdded;
203 off_t filebytes;
204 off_t offset;
205 size_t resid;
206 int eflags;
207 int ioflag = ap->a_ioflag;
208 int retval = 0;
209 int lockflags;
210 int cnode_locked = 0;
211 int partialwrite = 0;
212 int exclusive_lock = 0;
213
214 // LP64todo - fix this! uio_resid may be 64-bit value
215 resid = uio_resid(uio);
216 offset = uio_offset(uio);
217
218 if (ioflag & IO_APPEND) {
219 exclusive_lock = 1;
220 }
221
222 if (offset < 0)
223 return (EINVAL);
224 if (resid == 0)
225 return (E_NONE);
226 if (!vnode_isreg(vp))
227 return (EPERM); /* Can only write regular files */
228
229 cp = VTOC(vp);
230 fp = VTOF(vp);
231 hfsmp = VTOHFS(vp);
232
233 eflags = kEFDeferMask; /* defer file block allocations */
234 #ifdef HFS_SPARSE_DEV
235 /*
236 * When the underlying device is sparse and space
237 * is low (< 8MB), stop doing delayed allocations
238 * and begin doing synchronous I/O.
239 */
240 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
241 (hfs_freeblks(hfsmp, 0) < 2048)) {
242 eflags &= ~kEFDeferMask;
243 ioflag |= IO_SYNC;
244 }
245 #endif /* HFS_SPARSE_DEV */
246
247 again:
248 /* Protect against a size change. */
249 hfs_lock_truncate(cp, exclusive_lock);
250
251 if (ioflag & IO_APPEND) {
252 uio_setoffset(uio, fp->ff_size);
253 offset = fp->ff_size;
254 }
255 if ((cp->c_flags & APPEND) && offset != fp->ff_size) {
256 retval = EPERM;
257 goto exit;
258 }
259
260 origFileSize = fp->ff_size;
261 writelimit = offset + resid;
262 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
263
264 /* If the truncate lock is shared, and if we either have virtual
265 * blocks or will need to extend the file, upgrade the truncate
266 * lock to exclusive. If the upgrade fails, we lose the lock and
267 * have to acquire the exclusive lock again.
268 */
269 if ((exclusive_lock == 0) &&
270 ((fp->ff_unallocblocks != 0) || (writelimit > filebytes))) {
271 exclusive_lock = 1;
272 /* Lock upgrade failed and we lost our shared lock, try again */
273 if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
274 goto again;
275 }
276 }
277
278 if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
279 goto exit;
280 }
281 cnode_locked = 1;
282
283 if (!exclusive_lock) {
284 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
285 (int)offset, uio_resid(uio), (int)fp->ff_size,
286 (int)filebytes, 0);
287 }
288
289 /* If the file does not need to be extended, skip the allocation below. */
290 if (writelimit <= filebytes) {
291 goto sizeok;
292 }
293
294 cred = vfs_context_ucred(ap->a_context);
295 bytesToAdd = writelimit - filebytes;
296
297 #if QUOTA
298 retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
299 cred, 0);
300 if (retval)
301 goto exit;
302 #endif /* QUOTA */
303
304 if (hfs_start_transaction(hfsmp) != 0) {
305 retval = EINVAL;
306 goto exit;
307 }
308
309 while (writelimit > filebytes) {
310 bytesToAdd = writelimit - filebytes;
311 if (cred && suser(cred, NULL) != 0)
312 eflags |= kEFReserveMask;
313
314 /* Protect extents b-tree and allocation bitmap */
315 lockflags = SFL_BITMAP;
316 if (overflow_extents(fp))
317 lockflags |= SFL_EXTENTS;
318 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
319
320 /* Files that are changing size are not hot file candidates. */
321 if (hfsmp->hfc_stage == HFC_RECORDING) {
322 fp->ff_bytesread = 0;
323 }
324 retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
325 0, eflags, &actualBytesAdded));
326
327 hfs_systemfile_unlock(hfsmp, lockflags);
328
329 if ((actualBytesAdded == 0) && (retval == E_NONE))
330 retval = ENOSPC;
331 if (retval != E_NONE)
332 break;
333 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
334 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
335 (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
336 }
337 (void) hfs_update(vp, TRUE);
338 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
339 (void) hfs_end_transaction(hfsmp);
340
341 /*
342 * If we didn't grow the file enough, try a partial write.
343 * POSIX expects this behavior.
344 */
345 if ((retval == ENOSPC) && (filebytes > offset)) {
346 retval = 0;
347 partialwrite = 1;
348 uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
349 resid -= bytesToAdd;
350 writelimit = filebytes;
351 }
352 sizeok:
353 if (retval == E_NONE) {
354 off_t filesize;
355 off_t zero_off;
356 off_t tail_off;
357 off_t inval_start;
358 off_t inval_end;
359 off_t io_start;
360 int lflag;
361 struct rl_entry *invalid_range;
362
363 if (writelimit > fp->ff_size)
364 filesize = writelimit;
365 else
366 filesize = fp->ff_size;
367
368 lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
369
370 if (offset <= fp->ff_size) {
371 zero_off = offset & ~PAGE_MASK_64;
372
373 /* Check whether the area between zero_off and the start
374 of the transfer is invalid and should be zero-filled
375 as part of the transfer:
376 */
377 if (offset > zero_off) {
378 if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
379 lflag |= IO_HEADZEROFILL;
380 }
381 } else {
382 off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;
383
384 /* The bytes between fp->ff_size and uio->uio_offset must never be
385 read without being zeroed. The current last block is filled with zeroes
386 if it holds valid data; in all cases we merely do a little bookkeeping
387 to track the area from the end of the current last page to the start of
388 the area actually written. For the same reason only the bytes up to the
389 start of the page where this write will start are invalidated; any remainder
390 before uio->uio_offset is explicitly zeroed as part of the cluster_write.
391
392 Note that inval_start, the start of the page after the current EOF,
393 may be past the start of the write, in which case the zeroing
394 will be handled by the cluster_write of the actual data.
395 */
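/* A worked example of the arithmetic above, assuming 4 KiB pages: with
   ff_size == 0x1800 and a write starting at offset == 0x5200, inval_start
   rounds up to 0x2000, inval_end rounds down to 0x5000 and zero_off starts
   at 0x1800. If the EOF page is valid, bytes 0x1800-0x1FFF are zeroed
   explicitly, 0x2000-0x4FFF are only marked invalid, and the head zero-fill
   of the main cluster_write covers 0x5000-0x51FF. */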
396 inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
397 inval_end = offset & ~PAGE_MASK_64;
398 zero_off = fp->ff_size;
399
400 if ((fp->ff_size & PAGE_MASK_64) &&
401 (rl_scan(&fp->ff_invalidranges,
402 eof_page_base,
403 fp->ff_size - 1,
404 &invalid_range) != RL_NOOVERLAP)) {
405 /* The page containing the EOF is not valid, so the
406 entire page must be made inaccessible now. If the write
407 starts on a page beyond the page containing the eof
408 (inval_end > eof_page_base), add the
409 whole page to the range to be invalidated. Otherwise
410 (i.e. if the write starts on the same page), zero-fill
411 the entire page explicitly now:
412 */
413 if (inval_end > eof_page_base) {
414 inval_start = eof_page_base;
415 } else {
416 zero_off = eof_page_base;
417 };
418 };
419
420 if (inval_start < inval_end) {
421 struct timeval tv;
422 /* There's some range of data that's going to be marked invalid */
423
424 if (zero_off < inval_start) {
425 /* The pages between inval_start and inval_end are going to be invalidated,
426 and the actual write will start on a page past inval_end. Now's the last
427 chance to zero-fill the page containing the EOF:
428 */
429 hfs_unlock(cp);
430 cnode_locked = 0;
431 retval = cluster_write(vp, (uio_t) 0,
432 fp->ff_size, inval_start,
433 zero_off, (off_t)0,
434 lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
435 hfs_lock(cp, HFS_FORCE_LOCK);
436 cnode_locked = 1;
437 if (retval) goto ioerr_exit;
438 offset = uio_offset(uio);
439 };
440
441 /* Mark the remaining area of the newly allocated space as invalid: */
442 rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
443 microuptime(&tv);
444 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
445 zero_off = fp->ff_size = inval_end;
446 };
447
448 if (offset > zero_off) lflag |= IO_HEADZEROFILL;
449 };
450
451 /* Check to see whether the area between the end of the write and the end of
452 the page it falls in is invalid and should be zero-filled as part of the transfer:
453 */
454 tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
455 if (tail_off > filesize) tail_off = filesize;
456 if (tail_off > writelimit) {
457 if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
458 lflag |= IO_TAILZEROFILL;
459 };
460 };
461
462 /*
463 * if the write starts beyond the current EOF (possibly advanced in the
464 * zeroing of the last block, above), then we'll zero fill from the current EOF
465 * to where the write begins:
466 *
467 * NOTE: If (and ONLY if) the portion of the file about to be written is
468 * before the current EOF it might be marked as invalid now and must be
469 * made readable (removed from the invalid ranges) before cluster_write
470 * tries to write it:
471 */
472 io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
473 if (io_start < fp->ff_size) {
474 off_t io_end;
475
476 io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
477 rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
478 };
479
480 hfs_unlock(cp);
481 cnode_locked = 0;
482 retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
483 tail_off, lflag | IO_NOZERODIRTY);
484 if (retval) {
485 goto ioerr_exit;
486 }
487 offset = uio_offset(uio);
488 if (offset > fp->ff_size) {
489 fp->ff_size = offset;
490
491 ubc_setsize(vp, fp->ff_size); /* XXX check errors */
492 /* Files that are changing size are not hot file candidates. */
493 if (hfsmp->hfc_stage == HFC_RECORDING)
494 fp->ff_bytesread = 0;
495 }
496 if (resid > uio_resid(uio)) {
497 cp->c_touch_chgtime = TRUE;
498 cp->c_touch_modtime = TRUE;
499 }
500 }
501 if (partialwrite) {
502 uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
503 resid += bytesToAdd;
504 }
505
506 // XXXdbg - see radar 4871353 for more info
507 {
508 if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
509 VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
510 }
511 }
512 HFS_KNOTE(vp, NOTE_WRITE);
513
514 ioerr_exit:
515 /*
516 * If we successfully wrote any data and we are not the superuser,
517 * we clear the setuid and setgid bits as a precaution against
518 * tampering.
519 */
520 if (cp->c_mode & (S_ISUID | S_ISGID)) {
521 cred = vfs_context_ucred(ap->a_context);
522 if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
523 if (!cnode_locked) {
524 hfs_lock(cp, HFS_FORCE_LOCK);
525 cnode_locked = 1;
526 }
527 cp->c_mode &= ~(S_ISUID | S_ISGID);
528 }
529 }
530 if (retval) {
531 if (ioflag & IO_UNIT) {
532 if (!cnode_locked) {
533 hfs_lock(cp, HFS_FORCE_LOCK);
534 cnode_locked = 1;
535 }
536 (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
537 0, ap->a_context);
538 // LP64todo - fix this! resid needs to be user_ssize_t
539 uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
540 uio_setresid(uio, resid);
541 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
542 }
543 } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
544 if (!cnode_locked) {
545 hfs_lock(cp, HFS_FORCE_LOCK);
546 cnode_locked = 1;
547 }
548 retval = hfs_update(vp, TRUE);
549 }
550 /* Updating vcbWrCnt doesn't need to be atomic. */
551 hfsmp->vcbWrCnt++;
552
553 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
554 (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
555 exit:
556 if (cnode_locked)
557 hfs_unlock(cp);
558 hfs_unlock_truncate(cp, exclusive_lock);
559 return (retval);
560 }
561
562 /* support for the "bulk-access" fcntl */
563
564 #define CACHE_LEVELS 16
565 #define NUM_CACHE_ENTRIES (64*16)
566 #define PARENT_IDS_FLAG 0x100
567
568 struct access_cache {
569 int numcached;
570 int cachehits; /* these two for statistics gathering */
571 int lookups;
572 unsigned int *acache;
573 unsigned char *haveaccess;
574 };
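/*
 * acache[] holds the cached directory ids in ascending order so that
 * cache_binSearch() below can binary-search it; haveaccess[] is a parallel
 * array holding the cached result for each id (0 for access granted, an errno
 * such as EACCES otherwise, or ESRCH for a scope-only entry).
 */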
575
576 struct access_t {
577 uid_t uid; /* IN: effective user id */
578 short flags; /* IN: access requested (i.e. R_OK) */
579 short num_groups; /* IN: number of groups user belongs to */
580 int num_files; /* IN: number of files to process */
581 int *file_ids; /* IN: array of file ids */
582 gid_t *groups; /* IN: array of groups */
583 short *access; /* OUT: access info for each file (0 for 'has access') */
584 };
585
586 struct user_access_t {
587 uid_t uid; /* IN: effective user id */
588 short flags; /* IN: access requested (i.e. R_OK) */
589 short num_groups; /* IN: number of groups user belongs to */
590 int num_files; /* IN: number of files to process */
591 user_addr_t file_ids; /* IN: array of file ids */
592 user_addr_t groups; /* IN: array of groups */
593 user_addr_t access; /* OUT: access info for each file (0 for 'has access') */
594 };
595
596
597 // these are the "extended" versions of the above structures
598 // note that it is crucial that they be a different size than
599 // the regular versions
600 struct ext_access_t {
601 uint32_t flags; /* IN: access requested (i.e. R_OK) */
602 uint32_t num_files; /* IN: number of files to process */
603 uint32_t map_size; /* IN: size of the bit map */
604 uint32_t *file_ids; /* IN: Array of file ids */
605 char *bitmap; /* OUT: hash-bitmap of interesting directory ids */
606 short *access; /* OUT: access info for each file (0 for 'has access') */
607 uint32_t num_parents; /* future use */
608 cnid_t *parents; /* future use */
609 };
610
611 struct ext_user_access_t {
612 uint32_t flags; /* IN: access requested (i.e. R_OK) */
613 uint32_t num_files; /* IN: number of files to process */
614 uint32_t map_size; /* IN: size of the bit map */
615 user_addr_t file_ids; /* IN: array of file ids */
616 user_addr_t bitmap; /* IN: array of groups */
617 user_addr_t access; /* OUT: access info for each file (0 for 'has access') */
618 uint32_t num_parents;/* future use */
619 user_addr_t parents;/* future use */
620 };
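/*
 * hfs_vnop_ioctl() passes the sizeof() of the structure it was handed as the
 * arg_size parameter of do_bulk_access_check(), which is how the old and
 * extended layouts are told apart -- hence the requirement above that the two
 * flavors never share a size.
 */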
621
622
623 /*
624 * Perform a binary search for the given parent_id. Return value is
625 * the index if there is a match. If no_match_indexp is non-NULL it
626 * will be assigned with the index to insert the item (even if it was
627 * not found).
628 */
629 static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
630 {
631 int index=-1;
632 unsigned int lo=0;
633
634 do {
635 unsigned int mid = ((hi - lo)/2) + lo;
636 unsigned int this_id = array[mid];
637
638 if (parent_id == this_id) {
639 hi = mid;
640 break;
641 }
642
643 if (parent_id < this_id) {
644 hi = mid;
645 continue;
646 }
647
648 if (parent_id > this_id) {
649 lo = mid + 1;
650 continue;
651 }
652 } while(lo < hi);
653
654 /* check if lo and hi converged on the match */
655 if (parent_id == array[hi]) {
656 index = hi;
657 }
658
659 if (no_match_indexp) {
660 *no_match_indexp = hi;
661 }
662
663 return index;
664 }
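/*
 * Worked example: with array == { 10, 25, 60 } and hi == 2, searching for 25
 * returns index 1; searching for 30 returns -1 and sets *no_match_indexp to 2,
 * the slot where 30 would have to be inserted to keep the array sorted.
 */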
665
666
667 static int
668 lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
669 {
670 unsigned int hi;
671 int matches = 0;
672 int index, no_match_index;
673
674 if (cache->numcached == 0) {
675 *indexp = 0;
676 return 0; // table is empty, so insert at index=0 and report no match
677 }
678
679 if (cache->numcached > NUM_CACHE_ENTRIES) {
680 /*printf("EGAD! numcached is %d... cut our losses and trim to %d\n",
681 cache->numcached, NUM_CACHE_ENTRIES);*/
682 cache->numcached = NUM_CACHE_ENTRIES;
683 }
684
685 hi = cache->numcached - 1;
686
687 index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
688
689 /* if no existing entry found, find index for new one */
690 if (index == -1) {
691 index = no_match_index;
692 matches = 0;
693 } else {
694 matches = 1;
695 }
696
697 *indexp = index;
698 return matches;
699 }
700
701 /*
702 * Add a node to the access_cache at the given index (or do a lookup first
703 * to find the index if -1 is passed in). We currently do a replace rather
704 * than an insert if the cache is full.
705 */
706 static void
707 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
708 {
709 int lookup_index = -1;
710
711 /* need to do a lookup first if -1 passed for index */
712 if (index == -1) {
713 if (lookup_bucket(cache, &lookup_index, nodeID)) {
714 if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
715 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
716 cache->haveaccess[lookup_index] = access;
717 }
718
719 /* mission accomplished */
720 return;
721 } else {
722 index = lookup_index;
723 }
724
725 }
726
727 /* if the cache is full, do a replace rather than an insert */
728 if (cache->numcached >= NUM_CACHE_ENTRIES) {
729 //printf("cache is full (%d). replace at index %d\n", cache->numcached, index);
730 cache->numcached = NUM_CACHE_ENTRIES-1;
731
732 if (index > cache->numcached) {
733 // printf("index %d pinned to %d\n", index, cache->numcached);
734 index = cache->numcached;
735 }
736 }
737
738 if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
739 index++;
740 }
741
742 if (index >= 0 && index < cache->numcached) {
743 /* only do bcopy if we're inserting */
744 bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
745 bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
746 }
747
748 cache->acache[index] = nodeID;
749 cache->haveaccess[index] = access;
750 cache->numcached++;
751 }
752
753
754 struct cinfo {
755 uid_t uid;
756 gid_t gid;
757 mode_t mode;
758 cnid_t parentcnid;
759 u_int16_t recflags;
760 };
761
762 static int
763 snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
764 {
765 struct cinfo *cip = (struct cinfo *)arg;
766
767 cip->uid = attrp->ca_uid;
768 cip->gid = attrp->ca_gid;
769 cip->mode = attrp->ca_mode;
770 cip->parentcnid = descp->cd_parentcnid;
771 cip->recflags = attrp->ca_recflags;
772
773 return (0);
774 }
775
776 /*
777 * Look up the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
778 * isn't in-core, then go to the catalog.
779 */
780 static int
781 do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, dev_t dev, cnid_t cnid,
782 struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
783 {
784 int error = 0;
785
786 /* if this id matches the one the fsctl was called with, skip the lookup */
787 if (cnid == skip_cp->c_cnid) {
788 cnattrp->ca_uid = skip_cp->c_uid;
789 cnattrp->ca_gid = skip_cp->c_gid;
790 cnattrp->ca_mode = skip_cp->c_mode;
791 keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
792 } else {
793 struct cinfo c_info;
794
795 /* otherwise, check the cnode hash in case the file/dir is in-core */
796 if (hfs_chash_snoop(dev, cnid, snoop_callback, &c_info) == 0) {
797 cnattrp->ca_uid = c_info.uid;
798 cnattrp->ca_gid = c_info.gid;
799 cnattrp->ca_mode = c_info.mode;
800 cnattrp->ca_recflags = c_info.recflags;
801 keyp->hfsPlus.parentID = c_info.parentcnid;
802 } else {
803 int lockflags;
804
805 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
806
807 /* lookup this cnid in the catalog */
808 error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
809
810 hfs_systemfile_unlock(hfsmp, lockflags);
811
812 cache->lookups++;
813 }
814 }
815
816 return (error);
817 }
818
819
820 /*
821 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
822 * up to CACHE_LEVELS as we progress towards the root.
823 */
824 static int
825 do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
826 struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred, dev_t dev,
827 struct vfs_context *my_context,
828 char *bitmap,
829 uint32_t map_size,
830 cnid_t* parents,
831 uint32_t num_parents)
832 {
833 int myErr = 0;
834 int myResult;
835 HFSCatalogNodeID thisNodeID;
836 unsigned int myPerms;
837 struct cat_attr cnattr;
838 int cache_index = -1, scope_index = -1, scope_idx_start = -1;
839 CatalogKey catkey;
840
841 int i = 0, ids_to_cache = 0;
842 int parent_ids[CACHE_LEVELS];
843
844 thisNodeID = nodeID;
845 while (thisNodeID >= kRootDirID) {
846 myResult = 0; /* default to "no access" */
847
848 /* check the cache before resorting to hitting the catalog */
849
850 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
851 * to look any further after hitting cached dir */
852
853 if (lookup_bucket(cache, &cache_index, thisNodeID)) {
854 cache->cachehits++;
855 myErr = cache->haveaccess[cache_index];
856 if (scope_index != -1) {
857 if (myErr == ESRCH) {
858 myErr = 0;
859 }
860 } else {
861 scope_index = 0; // so we'll just use the cache result
862 scope_idx_start = ids_to_cache;
863 }
864 myResult = (myErr == 0) ? 1 : 0;
865 goto ExitThisRoutine;
866 }
867
868
869 if (parents) {
870 int tmp;
871 tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
872 if (scope_index == -1)
873 scope_index = tmp;
874 if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
875 scope_idx_start = ids_to_cache;
876 }
877 }
878
879 /* remember which parents we want to cache */
880 if (ids_to_cache < CACHE_LEVELS) {
881 parent_ids[ids_to_cache] = thisNodeID;
882 ids_to_cache++;
883 }
884 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
885 if (bitmap && map_size) {
886 bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
887 }
888
889
890 /* do the lookup (checks the cnode hash, then the catalog) */
891 myErr = do_attr_lookup(hfsmp, cache, dev, thisNodeID, skip_cp, &catkey, &cnattr);
892 if (myErr) {
893 goto ExitThisRoutine; /* no access */
894 }
895
896 /* Root always gets access. */
897 if (suser(myp_ucred, NULL) == 0) {
898 thisNodeID = catkey.hfsPlus.parentID;
899 myResult = 1;
900 continue;
901 }
902
903 // if the thing has acl's, do the full permission check
904 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
905 struct vnode *vp;
906
907 /* get the vnode for this cnid */
908 myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0);
909 if ( myErr ) {
910 myResult = 0;
911 goto ExitThisRoutine;
912 }
913
914 thisNodeID = VTOC(vp)->c_parentcnid;
915
916 hfs_unlock(VTOC(vp));
917
918 if (vnode_vtype(vp) == VDIR) {
919 myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
920 } else {
921 myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
922 }
923
924 vnode_put(vp);
925 if (myErr) {
926 myResult = 0;
927 goto ExitThisRoutine;
928 }
929 } else {
930 unsigned int flags;
931
932 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
933 cnattr.ca_mode, hfsmp->hfs_mp,
934 myp_ucred, theProcPtr);
935
936 if (cnattr.ca_mode & S_IFDIR) {
937 flags = R_OK | X_OK;
938 } else {
939 flags = R_OK;
940 }
941 if ( (myPerms & flags) != flags) {
942 myResult = 0;
943 myErr = EACCES;
944 goto ExitThisRoutine; /* no access */
945 }
946
947 /* up the hierarchy we go */
948 thisNodeID = catkey.hfsPlus.parentID;
949 }
950 }
951
952 /* if here, we have access to this node */
953 myResult = 1;
954
955 ExitThisRoutine:
956 if (parents && myErr == 0 && scope_index == -1) {
957 myErr = ESRCH;
958 }
959
960 if (myErr) {
961 myResult = 0;
962 }
963 *err = myErr;
964
965 /* cache the parent directory(ies) */
966 for (i = 0; i < ids_to_cache; i++) {
967 if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
968 add_node(cache, -1, parent_ids[i], ESRCH);
969 } else {
970 add_node(cache, -1, parent_ids[i], myErr);
971 }
972 }
973
974 return (myResult);
975 }
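/*
 * Illustrative sketch (not part of this file's logic): a userspace caller that
 * supplied a bitmap through the extended bulk-access structure can test
 * whether a directory id was visited using the same hash the loop above uses:
 *
 *     seen = (bitmap[(dir_id / 8) % map_size] & (1 << (dir_id & 7))) != 0;
 *
 * Since multiple ids can map to the same bit, a set bit only means "possibly
 * visited"; a clear bit means the id was definitely not walked.
 */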
976
977 static int
978 do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
979 struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
980 {
981 boolean_t is64bit;
982
983 /*
984 * NOTE: on entry, the vnode is locked. In case this vnode
985 * happens to be in our list of file_ids, we note it so we
986 * avoid calling hfs_chashget_nowait() on that id, as that
987 * will cause a "locking against myself" panic.
988 */
989 Boolean check_leaf = true;
990
991 struct ext_user_access_t *user_access_structp;
992 struct ext_user_access_t tmp_user_access;
993 struct access_cache cache;
994
995 int error = 0;
996 unsigned int i;
997
998 dev_t dev = VTOC(vp)->c_dev;
999
1000 short flags;
1001 unsigned int num_files = 0;
1002 int map_size = 0;
1003 int num_parents = 0;
1004 int *file_ids=NULL;
1005 short *access=NULL;
1006 char *bitmap=NULL;
1007 cnid_t *parents=NULL;
1008 int leaf_index;
1009
1010 cnid_t cnid;
1011 cnid_t prevParent_cnid = 0;
1012 unsigned int myPerms;
1013 short myaccess = 0;
1014 struct cat_attr cnattr;
1015 CatalogKey catkey;
1016 struct cnode *skip_cp = VTOC(vp);
1017 kauth_cred_t cred = vfs_context_ucred(context);
1018 proc_t p = vfs_context_proc(context);
1019
1020 is64bit = proc_is64bit(p);
1021
1022 /* initialize the local cache and buffers */
1023 cache.numcached = 0;
1024 cache.cachehits = 0;
1025 cache.lookups = 0;
1026 cache.acache = NULL;
1027 cache.haveaccess = NULL;
1028
1029 /* struct copyin done during dispatch... need to copy file_id array separately */
1030 if (ap->a_data == NULL) {
1031 error = EINVAL;
1032 goto err_exit_bulk_access;
1033 }
1034
1035 if (is64bit) {
1036 if (arg_size != sizeof(struct ext_user_access_t)) {
1037 error = EINVAL;
1038 goto err_exit_bulk_access;
1039 }
1040
1041 user_access_structp = (struct ext_user_access_t *)ap->a_data;
1042
1043 } else if (arg_size == sizeof(struct access_t)) {
1044 struct access_t *accessp = (struct access_t *)ap->a_data;
1045
1046 // convert an old style bulk-access struct to the new style
1047 tmp_user_access.flags = accessp->flags;
1048 tmp_user_access.num_files = accessp->num_files;
1049 tmp_user_access.map_size = 0;
1050 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1051 tmp_user_access.bitmap = USER_ADDR_NULL;
1052 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1053 tmp_user_access.num_parents = 0;
1054 user_access_structp = &tmp_user_access;
1055
1056 } else if (arg_size == sizeof(struct ext_access_t)) {
1057 struct ext_access_t *accessp = (struct ext_access_t *)ap->a_data;
1058
1059 // up-cast from a 32-bit version of the struct
1060 tmp_user_access.flags = accessp->flags;
1061 tmp_user_access.num_files = accessp->num_files;
1062 tmp_user_access.map_size = accessp->map_size;
1063 tmp_user_access.num_parents = accessp->num_parents;
1064
1065 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1066 tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
1067 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1068 tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);
1069
1070 user_access_structp = &tmp_user_access;
1071 } else {
1072 error = EINVAL;
1073 goto err_exit_bulk_access;
1074 }
1075
1076 map_size = user_access_structp->map_size;
1077
1078 num_files = user_access_structp->num_files;
1079
1080 num_parents= user_access_structp->num_parents;
1081
1082 if (num_files < 1) {
1083 goto err_exit_bulk_access;
1084 }
1085 if (num_files > 1024) {
1086 error = EINVAL;
1087 goto err_exit_bulk_access;
1088 }
1089
1090 if (num_parents > 1024) {
1091 error = EINVAL;
1092 goto err_exit_bulk_access;
1093 }
1094
1095 file_ids = (int *) kalloc(sizeof(int) * num_files);
1096 access = (short *) kalloc(sizeof(short) * num_files);
1097 if (map_size) {
1098 bitmap = (char *) kalloc(sizeof(char) * map_size);
1099 }
1100
1101 if (num_parents) {
1102 parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
1103 }
1104
1105 cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
1106 cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1107
1108 if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
1109 if (file_ids) {
1110 kfree(file_ids, sizeof(int) * num_files);
1111 }
1112 if (bitmap) {
1113 kfree(bitmap, sizeof(char) * map_size);
1114 }
1115 if (access) {
1116 kfree(access, sizeof(short) * num_files);
1117 }
1118 if (cache.acache) {
1119 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1120 }
1121 if (cache.haveaccess) {
1122 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1123 }
1124 if (parents) {
1125 kfree(parents, sizeof(cnid_t) * num_parents);
1126 }
1127 return ENOMEM;
1128 }
1129
1130 // make sure the bitmap is zeroed out...
1131 if (bitmap) {
1132 bzero(bitmap, (sizeof(char) * map_size));
1133 }
1134
1135 if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1136 num_files * sizeof(int)))) {
1137 goto err_exit_bulk_access;
1138 }
1139
1140 if (num_parents) {
1141 if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1142 num_parents * sizeof(cnid_t)))) {
1143 goto err_exit_bulk_access;
1144 }
1145 }
1146
1147 flags = user_access_structp->flags;
1148 if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1149 flags = R_OK;
1150 }
1151
1152 /* check if we've been passed leaf node ids or parent ids */
1153 if (flags & PARENT_IDS_FLAG) {
1154 check_leaf = false;
1155 }
1156
1157 /* Check access to each file_id passed in */
1158 for (i = 0; i < num_files; i++) {
1159 leaf_index=-1;
1160 cnid = (cnid_t) file_ids[i];
1161
1162 /* root always has access */
1163 if ((!parents) && (!suser(cred, NULL))) {
1164 access[i] = 0;
1165 continue;
1166 }
1167
1168 if (check_leaf) {
1169 /* do the lookup (checks the cnode hash, then the catalog) */
1170 error = do_attr_lookup(hfsmp, &cache, dev, cnid, skip_cp, &catkey, &cnattr);
1171 if (error) {
1172 access[i] = (short) error;
1173 continue;
1174 }
1175
1176 if (parents) {
1177 // Check if the leaf matches one of the parent scopes
1178 leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1179 }
1180
1181 // if the thing has acl's, do the full permission check
1182 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1183 struct vnode *cvp;
1184 int myErr = 0;
1185 /* get the vnode for this cnid */
1186 myErr = hfs_vget(hfsmp, cnid, &cvp, 0);
1187 if ( myErr ) {
1188 access[i] = myErr;
1189 continue;
1190 }
1191
1192 hfs_unlock(VTOC(cvp));
1193
1194 if (vnode_vtype(cvp) == VDIR) {
1195 myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1196 } else {
1197 myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1198 }
1199
1200 vnode_put(cvp);
1201 if (myErr) {
1202 access[i] = myErr;
1203 continue;
1204 }
1205 } else {
1206 /* before calling do_access_check(), check the target file for read access */
1207 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1208 cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1209
1210 /* fail fast if no access */
1211 if ((myPerms & flags) == 0) {
1212 access[i] = EACCES;
1213 continue;
1214 }
1215 }
1216 } else {
1217 /* we were passed an array of parent ids */
1218 catkey.hfsPlus.parentID = cnid;
1219 }
1220
1221 /* if the last guy had the same parent and had access, we're done */
1222 if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0) {
1223 cache.cachehits++;
1224 access[i] = 0;
1225 continue;
1226 }
1227
1228 myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1229 skip_cp, p, cred, dev, context,bitmap, map_size, parents, num_parents);
1230
1231 if (myaccess || (error == ESRCH && leaf_index != -1)) {
1232 access[i] = 0; // have access.. no errors to report
1233 } else {
1234 access[i] = (error != 0 ? (short) error : EACCES);
1235 }
1236
1237 prevParent_cnid = catkey.hfsPlus.parentID;
1238 }
1239
1240 /* copyout the access array */
1241 if ((error = copyout((caddr_t)access, user_access_structp->access,
1242 num_files * sizeof (short)))) {
1243 goto err_exit_bulk_access;
1244 }
1245 if (map_size && bitmap) {
1246 if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1247 map_size * sizeof (char)))) {
1248 goto err_exit_bulk_access;
1249 }
1250 }
1251
1252
1253 err_exit_bulk_access:
1254
1255 //printf("on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
1256
1257 if (file_ids)
1258 kfree(file_ids, sizeof(int) * num_files);
1259 if (parents)
1260 kfree(parents, sizeof(cnid_t) * num_parents);
1261 if (bitmap)
1262 kfree(bitmap, sizeof(char) * map_size);
1263 if (access)
1264 kfree(access, sizeof(short) * num_files);
1265 if (cache.acache)
1266 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1267 if (cache.haveaccess)
1268 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1269
1270 return (error);
1271 }
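/*
 * A minimal userspace sketch of driving the bulk-access check above. The
 * struct access_t layout mirrors the one declared earlier in this file; the
 * fsctl(2) command macro spelled HFS_BULKACCESS below is an assumption --
 * consult hfs_fsctl.h for the constant actually exported to user programs.
 */
#if 0 /* illustrative only, never compiled into the kernel */
#include <sys/fsctl.h>
#include <unistd.h>

static int
check_read_access(const char *volpath, int *file_ids, int nfiles, short *results)
{
	struct access_t args;

	args.uid = geteuid();        /* IN: effective user id, per the declaration above */
	args.flags = R_OK;           /* IN: access being requested                       */
	args.num_groups = 0;         /* 0 => use the caller's own group list             */
	args.groups = NULL;
	args.num_files = nfiles;     /* at most 1024 per call (see the check above)      */
	args.file_ids = file_ids;    /* catalog node ids to test                         */
	args.access = results;       /* OUT: 0 == has access, otherwise an errno value   */

	/* command macro is assumed; see hfs_fsctl.h */
	return fsctl(volpath, HFS_BULKACCESS, &args, 0);
}
#endif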
1272
1273
1274 /* end "bulk-access" support */
1275
1276
1277 /*
1278 * Callback for use with freeze ioctl.
1279 */
1280 static int
1281 hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
1282 {
1283 vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
1284
1285 return 0;
1286 }
1287
1288 /*
1289 * Control filesystem operating characteristics.
1290 */
1291 int
1292 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1293 vnode_t a_vp;
1294 int a_command;
1295 caddr_t a_data;
1296 int a_fflag;
1297 vfs_context_t a_context;
1298 } */ *ap)
1299 {
1300 struct vnode * vp = ap->a_vp;
1301 struct hfsmount *hfsmp = VTOHFS(vp);
1302 vfs_context_t context = ap->a_context;
1303 kauth_cred_t cred = vfs_context_ucred(context);
1304 proc_t p = vfs_context_proc(context);
1305 struct vfsstatfs *vfsp;
1306 boolean_t is64bit;
1307
1308 is64bit = proc_is64bit(p);
1309
1310 switch (ap->a_command) {
1311
1312 case HFS_GETPATH:
1313 {
1314 struct vnode *file_vp;
1315 cnid_t cnid;
1316 int outlen;
1317 char *bufptr;
1318 int error;
1319
1320 /* Caller must be owner of file system. */
1321 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1322 if (suser(cred, NULL) &&
1323 kauth_cred_getuid(cred) != vfsp->f_owner) {
1324 return (EACCES);
1325 }
1326 /* Target vnode must be file system's root. */
1327 if (!vnode_isvroot(vp)) {
1328 return (EINVAL);
1329 }
1330 bufptr = (char *)ap->a_data;
1331 cnid = strtoul(bufptr, NULL, 10);
1332
1333 /* We need to call hfs_vfs_vget to leverage the code that will fix the
1334 * origin list for us if needed, as opposed to calling hfs_vget, since
1335 * we will need it for the subsequent build_path call.
1336 */
1337 if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1338 return (error);
1339 }
1340 error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context);
1341 vnode_put(file_vp);
1342
1343 return (error);
1344 }
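/* Illustrative use from userspace: write the decimal cnid into a
 * MAXPATHLEN-sized buffer and call fsctl(volume_root_path, HFS_GETPATH, buf, 0)
 * against the volume root; on success the same buffer comes back holding the
 * item's full path. The exact command constant exported to user programs
 * lives in hfs_fsctl.h.
 */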
1345
1346 case HFS_PREV_LINK:
1347 case HFS_NEXT_LINK:
1348 {
1349 cnid_t linkfileid;
1350 cnid_t nextlinkid;
1351 cnid_t prevlinkid;
1352 int error;
1353
1354 /* Caller must be owner of file system. */
1355 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1356 if (suser(cred, NULL) &&
1357 kauth_cred_getuid(cred) != vfsp->f_owner) {
1358 return (EACCES);
1359 }
1360 /* Target vnode must be file system's root. */
1361 if (!vnode_isvroot(vp)) {
1362 return (EINVAL);
1363 }
1364 linkfileid = *(cnid_t *)ap->a_data;
1365 if (linkfileid < kHFSFirstUserCatalogNodeID) {
1366 return (EINVAL);
1367 }
1368 if ((error = hfs_lookuplink(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1369 return (error);
1370 }
1371 if (ap->a_command == HFS_NEXT_LINK) {
1372 *(cnid_t *)ap->a_data = nextlinkid;
1373 } else {
1374 *(cnid_t *)ap->a_data = prevlinkid;
1375 }
1376 return (0);
1377 }
1378
1379 case HFS_RESIZE_PROGRESS: {
1380
1381 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1382 if (suser(cred, NULL) &&
1383 kauth_cred_getuid(cred) != vfsp->f_owner) {
1384 return (EACCES); /* must be owner of file system */
1385 }
1386 if (!vnode_isvroot(vp)) {
1387 return (EINVAL);
1388 }
1389 return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1390 }
1391
1392 case HFS_RESIZE_VOLUME: {
1393 u_int64_t newsize;
1394 u_int64_t cursize;
1395
1396 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1397 if (suser(cred, NULL) &&
1398 kauth_cred_getuid(cred) != vfsp->f_owner) {
1399 return (EACCES); /* must be owner of file system */
1400 }
1401 if (!vnode_isvroot(vp)) {
1402 return (EINVAL);
1403 }
1404 newsize = *(u_int64_t *)ap->a_data;
1405 cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1406
1407 if (newsize > cursize) {
1408 return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1409 } else if (newsize < cursize) {
1410 return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1411 } else {
1412 return (0);
1413 }
1414 }
1415 case HFS_CHANGE_NEXT_ALLOCATION: {
1416 int error = 0; /* Assume success */
1417 u_int32_t location;
1418
1419 if (vnode_vfsisrdonly(vp)) {
1420 return (EROFS);
1421 }
1422 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1423 if (suser(cred, NULL) &&
1424 kauth_cred_getuid(cred) != vfsp->f_owner) {
1425 return (EACCES); /* must be owner of file system */
1426 }
1427 if (!vnode_isvroot(vp)) {
1428 return (EINVAL);
1429 }
1430 HFS_MOUNT_LOCK(hfsmp, TRUE);
1431 location = *(u_int32_t *)ap->a_data;
1432 if ((location >= hfsmp->allocLimit) &&
1433 (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1434 error = EINVAL;
1435 goto fail_change_next_allocation;
1436 }
1437 /* Return previous value. */
1438 *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1439 if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1440 /* On magic value for location, set nextAllocation to next block
1441 * after metadata zone and set flag in mount structure to indicate
1442 * that nextAllocation should not be updated again.
1443 */
1444 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1445 hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1446 } else {
1447 hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1448 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1449 }
1450 MarkVCBDirty(hfsmp);
1451 fail_change_next_allocation:
1452 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
1453 return (error);
1454 }
1455
1456 #ifdef HFS_SPARSE_DEV
1457 case HFS_SETBACKINGSTOREINFO: {
1458 struct vnode * bsfs_rootvp;
1459 struct vnode * di_vp;
1460 struct hfs_backingstoreinfo *bsdata;
1461 int error = 0;
1462
1463 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1464 return (EALREADY);
1465 }
1466 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1467 if (suser(cred, NULL) &&
1468 kauth_cred_getuid(cred) != vfsp->f_owner) {
1469 return (EACCES); /* must be owner of file system */
1470 }
1471 bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1472 if (bsdata == NULL) {
1473 return (EINVAL);
1474 }
1475 if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1476 return (error);
1477 }
1478 if ((error = vnode_getwithref(di_vp))) {
1479 file_drop(bsdata->backingfd);
1480 return(error);
1481 }
1482
1483 if (vnode_mount(vp) == vnode_mount(di_vp)) {
1484 (void)vnode_put(di_vp);
1485 file_drop(bsdata->backingfd);
1486 return (EINVAL);
1487 }
1488
1489 /*
1490 * Obtain the backing fs root vnode and keep a reference
1491 * on it. This reference will be dropped in hfs_unmount.
1492 */
1493 error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
1494 if (error) {
1495 (void)vnode_put(di_vp);
1496 file_drop(bsdata->backingfd);
1497 return (error);
1498 }
1499 vnode_ref(bsfs_rootvp);
1500 vnode_put(bsfs_rootvp);
1501
1502 hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
1503 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1504 hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
1505 hfsmp->hfs_sparsebandblks *= 4;
1506
1507 vfs_markdependency(hfsmp->hfs_mp);
1508
1509 (void)vnode_put(di_vp);
1510 file_drop(bsdata->backingfd);
1511 return (0);
1512 }
1513 case HFS_CLRBACKINGSTOREINFO: {
1514 struct vnode * tmpvp;
1515
1516 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1517 if (suser(cred, NULL) &&
1518 kauth_cred_getuid(cred) != vfsp->f_owner) {
1519 return (EACCES); /* must be owner of file system */
1520 }
1521 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
1522 hfsmp->hfs_backingfs_rootvp) {
1523
1524 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
1525 tmpvp = hfsmp->hfs_backingfs_rootvp;
1526 hfsmp->hfs_backingfs_rootvp = NULLVP;
1527 hfsmp->hfs_sparsebandblks = 0;
1528 vnode_rele(tmpvp);
1529 }
1530 return (0);
1531 }
1532 #endif /* HFS_SPARSE_DEV */
1533
1534 case F_FREEZE_FS: {
1535 struct mount *mp;
1536
1537 if (!is_suser())
1538 return (EACCES);
1539
1540 mp = vnode_mount(vp);
1541 hfsmp = VFSTOHFS(mp);
1542
1543 if (!(hfsmp->jnl))
1544 return (ENOTSUP);
1545
1546 lck_rw_lock_exclusive(&hfsmp->hfs_insync);
1547
1548 // flush things before we get started to try and prevent
1549 // dirty data from being paged out while we're frozen.
1550 // note: can't do this after taking the lock as it will
1551 // deadlock against ourselves.
1552 vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
1553 hfs_global_exclusive_lock_acquire(hfsmp);
1554 journal_flush(hfsmp->jnl);
1555
1556 // don't need to iterate on all vnodes, we just need to
1557 // wait for writes to the system files and the device vnode
1558 if (HFSTOVCB(hfsmp)->extentsRefNum)
1559 vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
1560 if (HFSTOVCB(hfsmp)->catalogRefNum)
1561 vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
1562 if (HFSTOVCB(hfsmp)->allocationsRefNum)
1563 vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
1564 if (hfsmp->hfs_attribute_vp)
1565 vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
1566 vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");
1567
1568 hfsmp->hfs_freezing_proc = current_proc();
1569
1570 return (0);
1571 }
1572
1573 case F_THAW_FS: {
1574 if (!is_suser())
1575 return (EACCES);
1576
1577 // if we're not the one who froze the fs then we
1578 // can't thaw it.
1579 if (hfsmp->hfs_freezing_proc != current_proc()) {
1580 return EPERM;
1581 }
1582
1583 // NOTE: if you add code here, also go check the
1584 // code that "thaws" the fs in hfs_vnop_close()
1585 //
1586 hfsmp->hfs_freezing_proc = NULL;
1587 hfs_global_exclusive_lock_release(hfsmp);
1588 lck_rw_unlock_exclusive(&hfsmp->hfs_insync);
1589
1590 return (0);
1591 }
1592
1593 case HFS_BULKACCESS_FSCTL: {
1594 int size;
1595
1596 if (hfsmp->hfs_flags & HFS_STANDARD) {
1597 return EINVAL;
1598 }
1599
1600 if (is64bit) {
1601 size = sizeof(struct user_access_t);
1602 } else {
1603 size = sizeof(struct access_t);
1604 }
1605
1606 return do_bulk_access_check(hfsmp, vp, ap, size, context);
1607 }
1608
1609 case HFS_EXT_BULKACCESS_FSCTL: {
1610 int size;
1611
1612 if (hfsmp->hfs_flags & HFS_STANDARD) {
1613 return EINVAL;
1614 }
1615
1616 if (is64bit) {
1617 size = sizeof(struct ext_user_access_t);
1618 } else {
1619 size = sizeof(struct ext_access_t);
1620 }
1621
1622 return do_bulk_access_check(hfsmp, vp, ap, size, context);
1623 }
1624
1625 case HFS_SETACLSTATE: {
1626 int state;
1627
1628 if (ap->a_data == NULL) {
1629 return (EINVAL);
1630 }
1631
1632 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1633 state = *(int *)ap->a_data;
1634
1635 // super-user can enable or disable acl's on a volume.
1636 // the volume owner can only enable acl's
1637 if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) {
1638 return (EPERM);
1639 }
1640 if (state == 0 || state == 1)
1641 return hfs_set_volxattr(hfsmp, HFS_SETACLSTATE, state);
1642 else
1643 return (EINVAL);
1644 }
1645
1646 case HFS_SET_XATTREXTENTS_STATE: {
1647 int state;
1648
1649 if (ap->a_data == NULL) {
1650 return (EINVAL);
1651 }
1652
1653 state = *(int *)ap->a_data;
1654
1655 /* Super-user can enable or disable extent-based extended
1656 * attribute support on a volume
1657 */
1658 if (!is_suser()) {
1659 return (EPERM);
1660 }
1661 if (state == 0 || state == 1)
1662 return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
1663 else
1664 return (EINVAL);
1665 }
1666
1667 case F_FULLFSYNC: {
1668 int error;
1669
1670 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
1671 if (error == 0) {
1672 error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
1673 hfs_unlock(VTOC(vp));
1674 }
1675
1676 return error;
1677 }
1678
1679 case F_CHKCLEAN: {
1680 register struct cnode *cp;
1681 int error;
1682
1683 if (!vnode_isreg(vp))
1684 return EINVAL;
1685
1686 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
1687 if (error == 0) {
1688 cp = VTOC(vp);
1689 /*
1690 * used by a regression test to determine if
1691 * all the dirty pages (via write) have been cleaned
1692 * after a call to 'fsync'.
1693 */
1694 error = is_file_clean(vp, VTOF(vp)->ff_size);
1695 hfs_unlock(cp);
1696 }
1697 return (error);
1698 }
1699
1700 case F_RDADVISE: {
1701 register struct radvisory *ra;
1702 struct filefork *fp;
1703 int error;
1704
1705 if (!vnode_isreg(vp))
1706 return EINVAL;
1707
1708 ra = (struct radvisory *)(ap->a_data);
1709 fp = VTOF(vp);
1710
1711 /* Protect against a size change. */
1712 hfs_lock_truncate(VTOC(vp), TRUE);
1713
1714 if (ra->ra_offset >= fp->ff_size) {
1715 error = EFBIG;
1716 } else {
1717 error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
1718 }
1719
1720 hfs_unlock_truncate(VTOC(vp), TRUE);
1721 return (error);
1722 }
1723
1724 case F_READBOOTSTRAP:
1725 case F_WRITEBOOTSTRAP:
1726 {
1727 struct vnode *devvp = NULL;
1728 user_fbootstraptransfer_t *user_bootstrapp;
1729 int devBlockSize;
1730 int error;
1731 uio_t auio;
1732 daddr64_t blockNumber;
1733 u_long blockOffset;
1734 u_long xfersize;
1735 struct buf *bp;
1736 user_fbootstraptransfer_t user_bootstrap;
1737
1738 if (!vnode_isvroot(vp))
1739 return (EINVAL);
1740 /* LP64 - when the caller is a 64-bit process we are passed a pointer
1741 * to a user_fbootstraptransfer_t; otherwise we get a pointer to a
1742 * fbootstraptransfer_t, which we munge into a user_fbootstraptransfer_t
1743 */
1744 if (is64bit) {
1745 user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data;
1746 }
1747 else {
1748 fbootstraptransfer_t *bootstrapp = (fbootstraptransfer_t *)ap->a_data;
1749 user_bootstrapp = &user_bootstrap;
1750 user_bootstrap.fbt_offset = bootstrapp->fbt_offset;
1751 user_bootstrap.fbt_length = bootstrapp->fbt_length;
1752 user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer);
1753 }
1754 if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024)
1755 return EINVAL;
1756
1757 devvp = VTOHFS(vp)->hfs_devvp;
1758 auio = uio_create(1, user_bootstrapp->fbt_offset,
1759 is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32,
1760 (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ);
1761 uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length);
1762
1763 devBlockSize = vfs_devblocksize(vnode_mount(vp));
1764
1765 while (uio_resid(auio) > 0) {
1766 blockNumber = uio_offset(auio) / devBlockSize;
1767 error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp);
1768 if (error) {
1769 if (bp) buf_brelse(bp);
1770 uio_free(auio);
1771 return error;
1772 };
1773
1774 blockOffset = uio_offset(auio) % devBlockSize;
1775 xfersize = devBlockSize - blockOffset;
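/* e.g. with devBlockSize 512 and uio_offset(auio) 700: blockNumber is 1,
   blockOffset is 188 and xfersize is 324, so only the tail of block 1 is
   copied on this pass. */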
1776 error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio);
1777 if (error) {
1778 buf_brelse(bp);
1779 uio_free(auio);
1780 return error;
1781 };
1782 if (uio_rw(auio) == UIO_WRITE) {
1783 error = VNOP_BWRITE(bp);
1784 if (error) {
1785 uio_free(auio);
1786 return error;
1787 }
1788 } else {
1789 buf_brelse(bp);
1790 };
1791 };
1792 uio_free(auio);
1793 };
1794 return 0;
1795
1796 case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */
1797 {
1798 if (is64bit) {
1799 *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
1800 }
1801 else {
1802 *(time_t *)(ap->a_data) = to_bsd_time(VTOVCB(vp)->localCreateDate);
1803 }
1804 return 0;
1805 }
1806
1807 case HFS_GET_MOUNT_TIME:
1808 return copyout(&hfsmp->hfs_mount_time, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_mount_time));
1809 break;
1810
1811 case HFS_GET_LAST_MTIME:
1812 return copyout(&hfsmp->hfs_last_mounted_mtime, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_last_mounted_mtime));
1813 break;
1814
1815 case HFS_SET_BOOT_INFO:
1816 if (!vnode_isvroot(vp))
1817 return(EINVAL);
1818 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
1819 return(EACCES); /* must be superuser or owner of filesystem */
1820 HFS_MOUNT_LOCK(hfsmp, TRUE);
1821 bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
1822 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
1823 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
1824 break;
1825
1826 case HFS_GET_BOOT_INFO:
1827 if (!vnode_isvroot(vp))
1828 return(EINVAL);
1829 HFS_MOUNT_LOCK(hfsmp, TRUE);
1830 bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
1831 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
1832 break;
1833
1834 case HFS_MARK_BOOT_CORRUPT:
1835 /* Mark the boot volume corrupt by setting
1836 * kHFSVolumeInconsistentBit in the volume header. This will
1837 * force fsck_hfs on next mount.
1838 */
1839 if (!is_suser()) {
1840 return EACCES;
1841 }
1842
1843 /* Allowed only on the root vnode of the boot volume */
1844 if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
1845 !vnode_isvroot(vp)) {
1846 return EINVAL;
1847 }
1848
1849 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
1850 hfs_mark_volume_inconsistent(hfsmp);
1851 break;
1852
1853 default:
1854 return (ENOTTY);
1855 }
1856
1857 /* Reached by the cases above that break rather than return */
1858 return 0;
1859 }
1860
1861 /*
1862 * select
1863 */
1864 int
1865 hfs_vnop_select(__unused struct vnop_select_args *ap)
1866 /*
1867 struct vnop_select_args {
1868 vnode_t a_vp;
1869 int a_which;
1870 int a_fflags;
1871 void *a_wql;
1872 vfs_context_t a_context;
1873 };
1874 */
1875 {
1876 /*
1877 * We should really check to see if I/O is possible.
1878 */
1879 return (1);
1880 }
1881
1882 /*
1883 * Converts a logical block number to a physical block, and optionally returns
1884 * the number of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
1885 * The physical block number is based on the device block size, currently 512 bytes.
1886 * The block run is returned in logical blocks, and is the REMAINING number of blocks.
1887 */
1888 int
1889 hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
1890 {
1891 struct filefork *fp = VTOF(vp);
1892 struct hfsmount *hfsmp = VTOHFS(vp);
1893 int retval = E_NONE;
1894 u_int32_t logBlockSize;
1895 size_t bytesContAvail = 0;
1896 off_t blockposition;
1897 int lockExtBtree;
1898 int lockflags = 0;
1899
1900 /*
1901 * Check for underlying vnode requests and ensure that logical
1902 * to physical mapping is requested.
1903 */
1904 if (vpp != NULL)
1905 *vpp = hfsmp->hfs_devvp;
1906 if (bnp == NULL)
1907 return (0);
1908
1909 logBlockSize = GetLogicalBlockSize(vp);
1910 blockposition = (off_t)bn * logBlockSize;
1911
1912 lockExtBtree = overflow_extents(fp);
1913
1914 if (lockExtBtree)
1915 lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
1916
1917 retval = MacToVFSError(
1918 MapFileBlockC (HFSTOVCB(hfsmp),
1919 (FCB*)fp,
1920 MAXPHYSIO,
1921 blockposition,
1922 bnp,
1923 &bytesContAvail));
1924
1925 if (lockExtBtree)
1926 hfs_systemfile_unlock(hfsmp, lockflags);
1927
1928 if (retval == E_NONE) {
1929 /* Figure out how many read ahead blocks there are */
1930 if (runp != NULL) {
1931 if (can_cluster(logBlockSize)) {
1932 /* Make sure this result never goes negative: */
1933 *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
1934 } else {
1935 *runp = 0;
1936 }
1937 }
1938 }
1939 return (retval);
1940 }
1941
1942 /*
1943 * Convert logical block number to file offset.
1944 */
1945 int
1946 hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
1947 /*
1948 struct vnop_blktooff_args {
1949 vnode_t a_vp;
1950 daddr64_t a_lblkno;
1951 off_t *a_offset;
1952 };
1953 */
1954 {
1955 if (ap->a_vp == NULL)
1956 return (EINVAL);
1957 *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
1958
1959 return(0);
1960 }
1961
1962 /*
1963 * Convert file offset to logical block number.
1964 */
1965 int
1966 hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
1967 /*
1968 struct vnop_offtoblk_args {
1969 vnode_t a_vp;
1970 off_t a_offset;
1971 daddr64_t *a_lblkno;
1972 };
1973 */
1974 {
1975 if (ap->a_vp == NULL)
1976 return (EINVAL);
1977 *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
1978
1979 return(0);
1980 }
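/*
 * hfs_vnop_blktooff and hfs_vnop_offtoblk are inverses up to truncation:
 * for example, with a 4 KB logical block size, offset 10000 maps to
 * logical block 2, and logical block 2 maps back to offset 8192.
 */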
1981
1982 /*
1983 * Map file offset to physical block number.
1984 *
1985 * If this function is called for write operation, and if the file
1986 * had virtual blocks allocated (delayed allocation), real blocks
1987 * are allocated by calling ExtendFileC().
1988 *
1989 * If this function is called for read operation, and if the file
1990 * had virtual blocks allocated (delayed allocation), no change
1991 * to the size of file is done, and if required, rangelist is
1992 * searched for mapping.
1993 *
1994 * System file cnodes are expected to be locked (shared or exclusive).
1995 */
1996 int
1997 hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
1998 /*
1999 struct vnop_blockmap_args {
2000 vnode_t a_vp;
2001 off_t a_foffset;
2002 size_t a_size;
2003 daddr64_t *a_bpn;
2004 size_t *a_run;
2005 void *a_poff;
2006 int a_flags;
2007 vfs_context_t a_context;
2008 };
2009 */
2010 {
2011 struct vnode *vp = ap->a_vp;
2012 struct cnode *cp;
2013 struct filefork *fp;
2014 struct hfsmount *hfsmp;
2015 size_t bytesContAvail = 0;
2016 int retval = E_NONE;
2017 int syslocks = 0;
2018 int lockflags = 0;
2019 struct rl_entry *invalid_range;
2020 enum rl_overlaptype overlaptype;
2021 int started_tr = 0;
2022 int tooklock = 0;
2023
2024 /* Do not allow blockmap operation on a directory */
2025 if (vnode_isdir(vp)) {
2026 return (ENOTSUP);
2027 }
2028
2029 /*
2030 * Check for underlying vnode requests and ensure that logical
2031 * to physical mapping is requested.
2032 */
2033 if (ap->a_bpn == NULL)
2034 return (0);
2035
2036 if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
2037 if (VTOC(vp)->c_lockowner != current_thread()) {
2038 hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
2039 tooklock = 1;
2040 }
2041 }
2042 hfsmp = VTOHFS(vp);
2043 cp = VTOC(vp);
2044 fp = VTOF(vp);
2045
2046 retry:
2047 /* Check virtual blocks only when performing write operation */
2048 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2049 if (hfs_start_transaction(hfsmp) != 0) {
2050 retval = EINVAL;
2051 goto exit;
2052 } else {
2053 started_tr = 1;
2054 }
2055 syslocks = SFL_EXTENTS | SFL_BITMAP;
2056
2057 } else if (overflow_extents(fp)) {
2058 syslocks = SFL_EXTENTS;
2059 }
2060
2061 if (syslocks)
2062 lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
2063
2064 /*
2065 * Check for any delayed allocations.
2066 */
2067 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2068 int64_t actbytes;
2069 u_int32_t loanedBlocks;
2070
2071 //
2072 // Make sure we have a transaction. It's possible
2073 // that we came in and fp->ff_unallocblocks was zero
2074 // but during the time we blocked acquiring the extents
2075 // btree, ff_unallocblocks became non-zero and so we
2076 // will need to start a transaction.
2077 //
2078 if (started_tr == 0) {
2079 if (syslocks) {
2080 hfs_systemfile_unlock(hfsmp, lockflags);
2081 syslocks = 0;
2082 }
2083 goto retry;
2084 }
2085
2086 /*
2087 * Note: ExtendFileC will release any blocks on loan and
2088 * acquire real blocks. So we ask to extend by zero bytes
2089 * since ExtendFileC will account for the virtual blocks.
2090 */
2091
2092 loanedBlocks = fp->ff_unallocblocks;
2093 retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
2094 kEFAllMask | kEFNoClumpMask, &actbytes);
2095
2096 if (retval) {
2097 fp->ff_unallocblocks = loanedBlocks;
2098 cp->c_blocks += loanedBlocks;
2099 fp->ff_blocks += loanedBlocks;
2100
2101 HFS_MOUNT_LOCK(hfsmp, TRUE);
2102 hfsmp->loanedBlocks += loanedBlocks;
2103 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
2104
2105 hfs_systemfile_unlock(hfsmp, lockflags);
2106 cp->c_flag |= C_MODIFIED;
2107 if (started_tr) {
2108 (void) hfs_update(vp, TRUE);
2109 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2110
2111 hfs_end_transaction(hfsmp);
2112 started_tr = 0;
2113 }
2114 goto exit;
2115 }
2116 }
2117
2118 retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
2119 ap->a_bpn, &bytesContAvail);
2120 if (syslocks) {
2121 hfs_systemfile_unlock(hfsmp, lockflags);
2122 syslocks = 0;
2123 }
2124
2125 if (started_tr) {
2126 (void) hfs_update(vp, TRUE);
2127 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2128 hfs_end_transaction(hfsmp);
2129 started_tr = 0;
2130 }
2131 if (retval) {
2132 /* On write, always return error because virtual blocks, if any,
2133 * should have been allocated in ExtendFileC(). We do not
2134 * allocate virtual blocks on read, therefore return error
2135 * only if no virtual blocks are allocated. Otherwise we search
2136 * rangelist for zero-fills
2137 */
2138 if ((MacToVFSError(retval) != ERANGE) ||
2139 (ap->a_flags & VNODE_WRITE) ||
2140 ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
2141 goto exit;
2142 }
2143
2144 /* Validate if the start offset is within logical file size */
2145 if (ap->a_foffset > fp->ff_size) {
2146 goto exit;
2147 }
2148
2149 /* Searching file extents has failed for read operation, therefore
2150 * search rangelist for any uncommitted holes in the file.
2151 */
2152 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
2153 ap->a_foffset + (off_t)(ap->a_size - 1),
2154 &invalid_range);
2155 switch(overlaptype) {
2156 case RL_OVERLAPISCONTAINED:
2157 /* start_offset <= rl_start, end_offset >= rl_end */
2158 if (ap->a_foffset != invalid_range->rl_start) {
2159 break;
2160 }
2161 case RL_MATCHINGOVERLAP:
2162 /* start_offset = rl_start, end_offset = rl_end */
2163 case RL_OVERLAPCONTAINSRANGE:
2164 /* start_offset >= rl_start, end_offset <= rl_end */
2165 case RL_OVERLAPSTARTSBEFORE:
2166 /* start_offset > rl_start, end_offset >= rl_start */
2167 if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) {
2168 bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset;
2169 } else {
2170 bytesContAvail = fp->ff_size - ap->a_foffset;
2171 }
2172 if (bytesContAvail > ap->a_size) {
2173 bytesContAvail = ap->a_size;
2174 }
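/* Returning a physical block of -1 tells the caller (the cluster layer)
 * that this range has no backing storage and should read back as zeroes.
 */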
2175 *ap->a_bpn = (daddr64_t)-1;
2176 retval = 0;
2177 break;
2178 case RL_OVERLAPENDSAFTER:
2179 /* start_offset < rl_start, end_offset < rl_end */
2180 case RL_NOOVERLAP:
2181 break;
2182 }
2183 goto exit;
2184 }
2185
2186 /* MapFileBlockC() found a valid extent in the filefork. Search the
2187 * mapping information further for invalid file ranges
2188 */
2189 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
2190 ap->a_foffset + (off_t)bytesContAvail - 1,
2191 &invalid_range);
2192 if (overlaptype != RL_NOOVERLAP) {
2193 switch(overlaptype) {
2194 case RL_MATCHINGOVERLAP:
2195 case RL_OVERLAPCONTAINSRANGE:
2196 case RL_OVERLAPSTARTSBEFORE:
2197 /* There's no valid block for this byte offset */
2198 *ap->a_bpn = (daddr64_t)-1;
2199 /* There's no point limiting the amount to be returned
2200 * if the invalid range that was hit extends all the way
2201 * to the EOF (i.e. there are no valid bytes between the
2202 * end of this range and the file's EOF):
2203 */
2204 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
2205 (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) {
2206 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
2207 }
2208 break;
2209
2210 case RL_OVERLAPISCONTAINED:
2211 case RL_OVERLAPENDSAFTER:
2212 /* The range of interest hits an invalid block before the end: */
2213 if (invalid_range->rl_start == ap->a_foffset) {
2214 /* There's actually no valid information to be had starting here: */
2215 *ap->a_bpn = (daddr64_t)-1;
2216 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
2217 (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) {
2218 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
2219 }
2220 } else {
2221 bytesContAvail = invalid_range->rl_start - ap->a_foffset;
2222 }
2223 break;
2224
2225 case RL_NOOVERLAP:
2226 break;
2227 } /* end switch */
2228 if (bytesContAvail > ap->a_size)
2229 bytesContAvail = ap->a_size;
2230 }
2231
2232 exit:
2233 if (retval == 0) {
2234 if (ap->a_run)
2235 *ap->a_run = bytesContAvail;
2236
2237 if (ap->a_poff)
2238 *(int *)ap->a_poff = 0;
2239 }
2240
2241 if (tooklock)
2242 hfs_unlock(cp);
2243
2244 return (MacToVFSError(retval));
2245 }
2246
2247
2248 /*
2249 * Prepare and issue the I/O.
2250 * buf_strategy() knows how to deal
2251 * with requests that require
2252 * fragmented I/Os.
2253 */
2254 int
2255 hfs_vnop_strategy(struct vnop_strategy_args *ap)
2256 {
2257 buf_t bp = ap->a_bp;
2258 vnode_t vp = buf_vnode(bp);
2259
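/*
 * buf_strategy() resolves any remaining logical-to-physical mapping
 * (calling back into hfs_vnop_blockmap as needed), splits requests that
 * span discontiguous extents, and issues the I/O against the device vnode.
 */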
2260 return (buf_strategy(VTOHFS(vp)->hfs_devvp, ap));
2261 }
2262
2263
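/*
 * Grow or shrink a fork to 'length' bytes: growing allocates any missing
 * physical blocks (recording not-yet-zeroed areas in the invalid-range
 * list); shrinking releases blocks via TruncateFileC() and updates quotas.
 * Callers hold the cnode lock; it is dropped briefly around cluster_write()
 * below.
 */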
2264 static int
2265 do_hfs_truncate(struct vnode *vp, off_t length, int flags, vfs_context_t context)
2266 {
2267 register struct cnode *cp = VTOC(vp);
2268 struct filefork *fp = VTOF(vp);
2269 struct proc *p = vfs_context_proc(context);
2270 kauth_cred_t cred = vfs_context_ucred(context);
2271 int retval;
2272 off_t bytesToAdd;
2273 off_t actualBytesAdded;
2274 off_t filebytes;
2275 u_long fileblocks;
2276 int blksize;
2277 struct hfsmount *hfsmp;
2278 int lockflags;
2279
2280 blksize = VTOVCB(vp)->blockSize;
2281 fileblocks = fp->ff_blocks;
2282 filebytes = (off_t)fileblocks * (off_t)blksize;
2283
2284 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
2285 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
2286
2287 if (length < 0)
2288 return (EINVAL);
2289
2290 /* This should only happen with a corrupt filesystem */
2291 if ((off_t)fp->ff_size < 0)
2292 return (EINVAL);
2293
2294 if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
2295 return (EFBIG);
2296
2297 hfsmp = VTOHFS(vp);
2298
2299 retval = E_NONE;
2300
2301 /* Files that are changing size are not hot file candidates. */
2302 if (hfsmp->hfc_stage == HFC_RECORDING) {
2303 fp->ff_bytesread = 0;
2304 }
2305
2306 /*
2307 * We cannot just check if fp->ff_size == length (as an optimization)
2308 * since there may be extra physical blocks that also need truncation.
2309 */
2310 #if QUOTA
2311 if ((retval = hfs_getinoquota(cp)))
2312 return(retval);
2313 #endif /* QUOTA */
2314
2315 /*
2316 * Lengthen the size of the file. We must ensure that the
2317 * last byte of the file is allocated. Since the smallest
2318 * value of ff_size is 0, length will be at least 1.
2319 */
2320 if (length > (off_t)fp->ff_size) {
2321 #if QUOTA
2322 retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
2323 cred, 0);
2324 if (retval)
2325 goto Err_Exit;
2326 #endif /* QUOTA */
2327 /*
2328 * If we don't have enough physical space then
2329 * we need to extend the physical size.
2330 */
2331 if (length > filebytes) {
2332 int eflags;
2333 u_long blockHint = 0;
2334
2335 /* All or nothing and don't round up to clumpsize. */
2336 eflags = kEFAllMask | kEFNoClumpMask;
2337
2338 if (cred && suser(cred, NULL) != 0)
2339 eflags |= kEFReserveMask; /* keep a reserve */
2340
2341 /*
2342 * Allocate Journal and Quota files in metadata zone.
2343 */
2344 if (filebytes == 0 &&
2345 hfsmp->hfs_flags & HFS_METADATA_ZONE &&
2346 hfs_virtualmetafile(cp)) {
2347 eflags |= kEFMetadataMask;
2348 blockHint = hfsmp->hfs_metazone_start;
2349 }
2350 if (hfs_start_transaction(hfsmp) != 0) {
2351 retval = EINVAL;
2352 goto Err_Exit;
2353 }
2354
2355 /* Protect extents b-tree and allocation bitmap */
2356 lockflags = SFL_BITMAP;
2357 if (overflow_extents(fp))
2358 lockflags |= SFL_EXTENTS;
2359 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
2360
2361 while ((length > filebytes) && (retval == E_NONE)) {
2362 bytesToAdd = length - filebytes;
2363 retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
2364 (FCB*)fp,
2365 bytesToAdd,
2366 blockHint,
2367 eflags,
2368 &actualBytesAdded));
2369
2370 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
2371 if (actualBytesAdded == 0 && retval == E_NONE) {
2372 if (length > filebytes)
2373 length = filebytes;
2374 break;
2375 }
2376 } /* endwhile */
2377
2378 hfs_systemfile_unlock(hfsmp, lockflags);
2379
2380 if (hfsmp->jnl) {
2381 (void) hfs_update(vp, TRUE);
2382 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2383 }
2384
2385 hfs_end_transaction(hfsmp);
2386
2387 if (retval)
2388 goto Err_Exit;
2389
2390 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
2391 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
2392 }
2393
2394 if (!(flags & IO_NOZEROFILL)) {
2395 if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) {
2396 struct rl_entry *invalid_range;
2397 off_t zero_limit;
2398
2399 zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
2400 if (length < zero_limit) zero_limit = length;
2401
2402 if (length > (off_t)fp->ff_size) {
2403 struct timeval tv;
2404
2405 /* Extending the file: time to fill out the current last page w. zeroes? */
2406 if ((fp->ff_size & PAGE_MASK_64) &&
2407 (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
2408 fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {
2409
2410 /* There's some valid data at the start of the (current) last page
2411 of the file, so zero out the remainder of that page to ensure the
2412 entire page contains valid data. Since there is no invalid range
2413 possible past the (current) eof, there's no need to remove anything
2414 from the invalid range list before calling cluster_write(): */
2415 hfs_unlock(cp);
2416 retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
2417 fp->ff_size, (off_t)0,
2418 (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
2419 hfs_lock(cp, HFS_FORCE_LOCK);
2420 if (retval) goto Err_Exit;
2421
2422 /* Merely invalidate the remaining area, if necessary: */
2423 if (length > zero_limit) {
2424 microuptime(&tv);
2425 rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
2426 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
2427 }
2428 } else {
2429 /* The page containing the (current) eof is invalid: just add the
2430 remainder of the page to the invalid list, along with the area
2431 being newly allocated:
2432 */
2433 microuptime(&tv);
2434 rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
2435 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
2436 };
2437 }
2438 } else {
2439 panic("hfs_truncate: invoked on non-UBC object?!");
2440 };
2441 }
2442 cp->c_touch_modtime = TRUE;
2443 fp->ff_size = length;
2444
2445 } else { /* Shorten the size of the file */
2446
2447 if ((off_t)fp->ff_size > length) {
2448 /* Any space previously marked as invalid is now irrelevant: */
2449 rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
2450 }
2451
2452 /*
2453 * Account for any unmapped blocks. Note that the new
2454 * file length can still end up with unmapped blocks.
2455 */
2456 if (fp->ff_unallocblocks > 0) {
2457 u_int32_t finalblks;
2458 u_int32_t loanedBlocks;
2459
2460 HFS_MOUNT_LOCK(hfsmp, TRUE);
2461
2462 loanedBlocks = fp->ff_unallocblocks;
2463 cp->c_blocks -= loanedBlocks;
2464 fp->ff_blocks -= loanedBlocks;
2465 fp->ff_unallocblocks = 0;
2466
2467 hfsmp->loanedBlocks -= loanedBlocks;
2468
2469 finalblks = (length + blksize - 1) / blksize;
2470 if (finalblks > fp->ff_blocks) {
2471 /* calculate required unmapped blocks */
2472 loanedBlocks = finalblks - fp->ff_blocks;
2473 hfsmp->loanedBlocks += loanedBlocks;
2474
2475 fp->ff_unallocblocks = loanedBlocks;
2476 cp->c_blocks += loanedBlocks;
2477 fp->ff_blocks += loanedBlocks;
2478 }
2479 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
2480 }
2481
2482 /*
2483 * For a TBE process the deallocation of the file blocks is
2484 * delayed until the file is closed. And hfs_close calls
2485 * truncate with the IO_NDELAY flag set. So when IO_NDELAY
2486 * isn't set, we make sure this isn't a TBE process.
2487 */
2488 if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
2489 #if QUOTA
2490 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
2491 #endif /* QUOTA */
2492 if (hfs_start_transaction(hfsmp) != 0) {
2493 retval = EINVAL;
2494 goto Err_Exit;
2495 }
2496
2497 if (fp->ff_unallocblocks == 0) {
2498 /* Protect extents b-tree and allocation bitmap */
2499 lockflags = SFL_BITMAP;
2500 if (overflow_extents(fp))
2501 lockflags |= SFL_EXTENTS;
2502 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
2503
2504 retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
2505 (FCB*)fp, length, false));
2506
2507 hfs_systemfile_unlock(hfsmp, lockflags);
2508 }
2509 if (hfsmp->jnl) {
2510 if (retval == 0) {
2511 fp->ff_size = length;
2512 }
2513 (void) hfs_update(vp, TRUE);
2514 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2515 }
2516
2517 hfs_end_transaction(hfsmp);
2518
2519 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
2520 if (retval)
2521 goto Err_Exit;
2522 #if QUOTA
2523 /* These are bytesreleased */
2524 (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
2525 #endif /* QUOTA */
2526 }
2527 /* Only set update flag if the logical length changes */
2528 if ((off_t)fp->ff_size != length)
2529 cp->c_touch_modtime = TRUE;
2530 fp->ff_size = length;
2531 }
2532 cp->c_touch_chgtime = TRUE; /* status changed */
2533 cp->c_touch_modtime = TRUE; /* file data was modified */
2534 retval = hfs_update(vp, MNT_WAIT);
2535 if (retval) {
2536 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
2537 -1, -1, -1, retval, 0);
2538 }
2539
2540 Err_Exit:
2541
2542 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
2543 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
2544
2545 return (retval);
2546 }
2547
2548
2549
2550 /*
2551 * Truncate a cnode to at most length size, freeing (or adding) the
2552 * disk blocks.
2553 */
2554 __private_extern__
2555 int
2556 hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
2557 vfs_context_t context)
2558 {
2559 struct filefork *fp = VTOF(vp);
2560 off_t filebytes;
2561 u_long fileblocks;
2562 int blksize, error = 0;
2563 struct cnode *cp = VTOC(vp);
2564
2565 /* Cannot truncate an HFS directory! */
2566 if (vnode_isdir(vp)) {
2567 return (EISDIR);
2568 }
2569 /* A swap file cannot change size. */
2570 if (vnode_isswap(vp) && (length != 0)) {
2571 return (EPERM);
2572 }
2573
2574 blksize = VTOVCB(vp)->blockSize;
2575 fileblocks = fp->ff_blocks;
2576 filebytes = (off_t)fileblocks * (off_t)blksize;
2577
2578 //
2579 // Have to do this here so that we don't wind up with
2580 // i/o pending for blocks that are about to be released
2581 // if we truncate the file.
2582 //
2583 // If skipsetsize is set, then the caller is responsible
2584 // for the ubc_setsize.
2585 //
2586 if (!skipsetsize)
2587 ubc_setsize(vp, length);
2588
2589 // have to loop truncating or growing files that are
2590 // really big because otherwise transactions can get
2591 // enormous and consume too many kernel resources.
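// For example, shrinking a file with overflow extents by many gigabytes is
// done in HFS_BIGFILE_SIZE steps, each in its own do_hfs_truncate() call
// and journal transaction, rather than as one enormous transaction.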
2592
2593 if (length < filebytes) {
2594 while (filebytes > length) {
2595 if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
2596 filebytes -= HFS_BIGFILE_SIZE;
2597 } else {
2598 filebytes = length;
2599 }
2600 cp->c_flag |= C_FORCEUPDATE;
2601 error = do_hfs_truncate(vp, filebytes, flags, context);
2602 if (error)
2603 break;
2604 }
2605 } else if (length > filebytes) {
2606 while (filebytes < length) {
2607 if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
2608 filebytes += HFS_BIGFILE_SIZE;
2609 } else {
2610 filebytes = length;
2611 }
2612 cp->c_flag |= C_FORCEUPDATE;
2613 error = do_hfs_truncate(vp, filebytes, flags, context);
2614 if (error)
2615 break;
2616 }
2617 } else /* Same logical size */ {
2618
2619 error = do_hfs_truncate(vp, length, flags, context);
2620 }
2621 /* Files that are changing size are not hot file candidates. */
2622 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
2623 fp->ff_bytesread = 0;
2624 }
2625
2626 return (error);
2627 }
2628
2629
2630
2631 /*
2632 * Preallocate file storage space.
2633 */
2634 int
2635 hfs_vnop_allocate(struct vnop_allocate_args /* {
2636 vnode_t a_vp;
2637 off_t a_length;
2638 u_int32_t a_flags;
2639 off_t *a_bytesallocated;
2640 off_t a_offset;
2641 vfs_context_t a_context;
2642 } */ *ap)
2643 {
2644 struct vnode *vp = ap->a_vp;
2645 struct cnode *cp;
2646 struct filefork *fp;
2647 ExtendedVCB *vcb;
2648 off_t length = ap->a_length;
2649 off_t startingPEOF;
2650 off_t moreBytesRequested;
2651 off_t actualBytesAdded;
2652 off_t filebytes;
2653 u_long fileblocks;
2654 int retval, retval2;
2655 u_int32_t blockHint;
2656 u_int32_t extendFlags; /* For call to ExtendFileC */
2657 struct hfsmount *hfsmp;
2658 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
2659 int lockflags;
2660
2661 *(ap->a_bytesallocated) = 0;
2662
2663 if (!vnode_isreg(vp))
2664 return (EISDIR);
2665 if (length < (off_t)0)
2666 return (EINVAL);
2667
2668 cp = VTOC(vp);
2669
2670 hfs_lock_truncate(cp, TRUE);
2671
2672 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
2673 goto Err_Exit;
2674 }
2675
2676 fp = VTOF(vp);
2677 hfsmp = VTOHFS(vp);
2678 vcb = VTOVCB(vp);
2679
2680 fileblocks = fp->ff_blocks;
2681 filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
2682
2683 if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
2684 retval = EINVAL;
2685 goto Err_Exit;
2686 }
2687
2688 /* Fill in the flags word for the call to Extend the file */
2689
2690 extendFlags = kEFNoClumpMask;
2691 if (ap->a_flags & ALLOCATECONTIG)
2692 extendFlags |= kEFContigMask;
2693 if (ap->a_flags & ALLOCATEALL)
2694 extendFlags |= kEFAllMask;
2695 if (cred && suser(cred, NULL) != 0)
2696 extendFlags |= kEFReserveMask;
2697
2698 retval = E_NONE;
2699 blockHint = 0;
2700 startingPEOF = filebytes;
2701
2702 if (ap->a_flags & ALLOCATEFROMPEOF)
2703 length += filebytes;
2704 else if (ap->a_flags & ALLOCATEFROMVOL)
2705 blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
2706
2707 /* If no changes are necessary, then we're done */
2708 if (filebytes == length)
2709 goto Std_Exit;
2710
2711 /*
2712 * Lengthen the size of the file. We must ensure that the
2713 * last byte of the file is allocated. Since the smallest
2714 * value of filebytes is 0, length will be at least 1.
2715 */
2716 if (length > filebytes) {
2717 off_t total_bytes_added = 0, orig_request_size;
2718
2719 orig_request_size = moreBytesRequested = length - filebytes;
2720
2721 #if QUOTA
2722 retval = hfs_chkdq(cp,
2723 (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
2724 cred, 0);
2725 if (retval)
2726 goto Err_Exit;
2727
2728 #endif /* QUOTA */
2729 /*
2730 * Metadata zone checks.
2731 */
2732 if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
2733 /*
2734 * Allocate Journal and Quota files in metadata zone.
2735 */
2736 if (hfs_virtualmetafile(cp)) {
2737 extendFlags |= kEFMetadataMask;
2738 blockHint = hfsmp->hfs_metazone_start;
2739 } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
2740 (blockHint <= hfsmp->hfs_metazone_end)) {
2741 /*
2742 * Move blockHint outside metadata zone.
2743 */
2744 blockHint = hfsmp->hfs_metazone_end + 1;
2745 }
2746 }
2747
2748
2749 while ((length > filebytes) && (retval == E_NONE)) {
2750 off_t bytesRequested;
2751
2752 if (hfs_start_transaction(hfsmp) != 0) {
2753 retval = EINVAL;
2754 goto Err_Exit;
2755 }
2756
2757 /* Protect extents b-tree and allocation bitmap */
2758 lockflags = SFL_BITMAP;
2759 if (overflow_extents(fp))
2760 lockflags |= SFL_EXTENTS;
2761 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
2762
2763 if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
2764 bytesRequested = HFS_BIGFILE_SIZE;
2765 } else {
2766 bytesRequested = moreBytesRequested;
2767 }
2768
2769 retval = MacToVFSError(ExtendFileC(vcb,
2770 (FCB*)fp,
2771 bytesRequested,
2772 blockHint,
2773 extendFlags,
2774 &actualBytesAdded));
2775
2776 if (retval == E_NONE) {
2777 *(ap->a_bytesallocated) += actualBytesAdded;
2778 total_bytes_added += actualBytesAdded;
2779 moreBytesRequested -= actualBytesAdded;
2780 if (blockHint != 0) {
2781 blockHint += actualBytesAdded / vcb->blockSize;
2782 }
2783 }
2784 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
2785
2786 hfs_systemfile_unlock(hfsmp, lockflags);
2787
2788 if (hfsmp->jnl) {
2789 (void) hfs_update(vp, TRUE);
2790 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2791 }
2792
2793 hfs_end_transaction(hfsmp);
2794 }
2795
2796
2797 /*
2798 * if we get an error and no changes were made then exit
2799 * otherwise we must do the hfs_update to reflect the changes
2800 */
2801 if (retval && (startingPEOF == filebytes))
2802 goto Err_Exit;
2803
2804 /*
2805 * Adjust the bytes-allocated value reported to the caller to be allocation
2806 * block aligned, not clump size aligned.
2807 * NOTE: So what we are reporting does not affect reality
2808 * until the file is closed, when we truncate the file to allocation
2809 * block size.
2810 */
2811 if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
2812 *(ap->a_bytesallocated) =
2813 roundup(orig_request_size, (off_t)vcb->blockSize);
2814
2815 } else { /* Shorten the size of the file */
2816
2817 if (fp->ff_size > length) {
2818 /*
2819 * Any buffers that are past the truncation point need to be
2820 * invalidated (to maintain buffer cache consistency).
2821 */
2822 }
2823
2824 retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
2825 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
2826
2827 /*
2828 * if we get an error and no changes were made then exit
2829 * otherwise we must do the hfs_update to reflect the changes
2830 */
2831 if (retval && (startingPEOF == filebytes)) goto Err_Exit;
2832 #if QUOTA
2833 /* These are bytesreleased */
2834 (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
2835 #endif /* QUOTA */
2836
2837 if (fp->ff_size > filebytes) {
2838 fp->ff_size = filebytes;
2839
2840 hfs_unlock(cp);
2841 ubc_setsize(vp, fp->ff_size);
2842 hfs_lock(cp, HFS_FORCE_LOCK);
2843 }
2844 }
2845
2846 Std_Exit:
2847 cp->c_touch_chgtime = TRUE;
2848 cp->c_touch_modtime = TRUE;
2849 retval2 = hfs_update(vp, MNT_WAIT);
2850
2851 if (retval == 0)
2852 retval = retval2;
2853 Err_Exit:
2854 hfs_unlock_truncate(cp, TRUE);
2855 hfs_unlock(cp);
2856 return (retval);
2857 }
2858
2859
2860 /*
2861 * Pagein for HFS filesystem
2862 */
2863 int
2864 hfs_vnop_pagein(struct vnop_pagein_args *ap)
2865 /*
2866 struct vnop_pagein_args {
2867 vnode_t a_vp,
2868 upl_t a_pl,
2869 vm_offset_t a_pl_offset,
2870 off_t a_f_offset,
2871 size_t a_size,
2872 int a_flags
2873 vfs_context_t a_context;
2874 };
2875 */
2876 {
2877 vnode_t vp = ap->a_vp;
2878 int error;
2879
2880 error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
2881 ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags);
2882 /*
2883 * Keep track of blocks read.
2884 */
2885 if (!vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
2886 struct cnode *cp;
2887 struct filefork *fp;
2888 int bytesread;
2889 int took_cnode_lock = 0;
2890
2891 cp = VTOC(vp);
2892 fp = VTOF(vp);
2893
2894 if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
2895 bytesread = fp->ff_size;
2896 else
2897 bytesread = ap->a_size;
2898
2899 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
2900 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
2901 hfs_lock(cp, HFS_FORCE_LOCK);
2902 took_cnode_lock = 1;
2903 }
2904 /*
2905 * If this file hasn't been seen since the start of
2906 * the current sampling period then start over.
2907 */
2908 if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
2909 struct timeval tv;
2910
2911 fp->ff_bytesread = bytesread;
2912 microtime(&tv);
2913 cp->c_atime = tv.tv_sec;
2914 } else {
2915 fp->ff_bytesread += bytesread;
2916 }
2917 cp->c_touch_acctime = TRUE;
2918 if (took_cnode_lock)
2919 hfs_unlock(cp);
2920 }
2921 return (error);
2922 }
2923
2924 /*
2925 * Pageout for HFS filesystem.
2926 */
2927 int
2928 hfs_vnop_pageout(struct vnop_pageout_args *ap)
2929 /*
2930 struct vnop_pageout_args {
2931 vnode_t a_vp,
2932 upl_t a_pl,
2933 vm_offset_t a_pl_offset,
2934 off_t a_f_offset,
2935 size_t a_size,
2936 int a_flags
2937 vfs_context_t a_context;
2938 };
2939 */
2940 {
2941 vnode_t vp = ap->a_vp;
2942 struct cnode *cp;
2943 struct filefork *fp;
2944 int retval;
2945 off_t filesize;
2946
2947 cp = VTOC(vp);
2948 fp = VTOF(vp);
2949
2950 if (vnode_isswap(vp)) {
2951 filesize = fp->ff_size;
2952 } else {
2953 off_t end_of_range;
2954 int tooklock = 0;
2955
2956 if (cp->c_lockowner != current_thread()) {
2957 if ( (retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
2958 if (!(ap->a_flags & UPL_NOCOMMIT)) {
2959 ubc_upl_abort_range(ap->a_pl,
2960 ap->a_pl_offset,
2961 ap->a_size,
2962 UPL_ABORT_FREE_ON_EMPTY);
2963 }
2964 return (retval);
2965 }
2966 tooklock = 1;
2967 }
2968
2969 filesize = fp->ff_size;
2970 end_of_range = ap->a_f_offset + ap->a_size - 1;
2971
2972 if (end_of_range >= filesize) {
2973 end_of_range = (off_t)(filesize - 1);
2974 }
2975 if (ap->a_f_offset < filesize) {
2976 rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
2977 cp->c_flag |= C_MODIFIED; /* leof is dirty */
2978 }
2979
2980 if (tooklock) {
2981 hfs_unlock(cp);
2982 }
2983 }
2984
2985 retval = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
2986 ap->a_size, filesize, ap->a_flags);
2987
2988 /*
2989 * If data was written, and setuid or setgid bits are set and
2990 * this process is not the superuser then clear the setuid and
2991 * setgid bits as a precaution against tampering.
2992 */
2993 if ((retval == 0) &&
2994 (cp->c_mode & (S_ISUID | S_ISGID)) &&
2995 (vfs_context_suser(ap->a_context) != 0)) {
2996 hfs_lock(cp, HFS_FORCE_LOCK);
2997 cp->c_mode &= ~(S_ISUID | S_ISGID);
2998 cp->c_touch_chgtime = TRUE;
2999 hfs_unlock(cp);
3000 }
3001 return (retval);
3002 }
3003
3004 /*
3005 * Intercept B-Tree node writes to unswap them if necessary.
3006 */
3007 int
3008 hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
3009 {
3010 int retval = 0;
3011 register struct buf *bp = ap->a_bp;
3012 register struct vnode *vp = buf_vnode(bp);
3013 BlockDescriptor block;
3014
3015 /* Trap B-Tree writes */
3016 if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
3017 (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
3018 (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
3019 (vp == VTOHFS(vp)->hfc_filevp)) {
3020
3021 /*
3022 * Swap and validate the node if it is in native byte order.
3023 * This is always true on big endian, so we always validate
3024 * before writing here. On little endian, the node typically has
3025 * been swapped and validated when it was written to the journal,
3026 * so we won't do anything here.
3027 */
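/*
 * The last u_int16_t of a B-tree node holds the offset of its first
 * record, which is always sizeof(BTNodeDescriptor) == 14 (0x000e), so
 * finding that value in host byte order means the node has not yet been
 * swapped to big-endian (disk) order.
 */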
3028 if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
3029 /* Prepare the block pointer */
3030 block.blockHeader = bp;
3031 block.buffer = (char *)buf_dataptr(bp);
3032 block.blockNum = buf_lblkno(bp);
3033 /* not found in cache ==> came from disk */
3034 block.blockReadFromDisk = (buf_fromcache(bp) == 0);
3035 block.blockSize = buf_count(bp);
3036
3037 /* Endian un-swap B-Tree node */
3038 retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
3039 if (retval)
3040 panic("hfs_vnop_bwrite: about to write corrupt node!\n");
3041 }
3042 }
3043
3044 /* This buffer shouldn't be locked anymore but if it is clear it */
3045 if ((buf_flags(bp) & B_LOCKED)) {
3046 // XXXdbg
3047 if (VTOHFS(vp)->jnl) {
3048 panic("hfs: CLEARING the lock bit on bp %p\n", bp);
3049 }
3050 buf_clearflags(bp, B_LOCKED);
3051 }
3052 retval = vn_bwrite (ap);
3053
3054 return (retval);
3055 }
3056
3057 /*
3058 * Relocate a file to a new location on disk
3059 * cnode must be locked on entry
3060 *
3061 * Relocation occurs by cloning the file's data from its
3062 * current set of blocks to a new set of blocks. During
3063 * the relocation all of the blocks (old and new) are
3064 * owned by the file.
3065 *
3066 * -----------------
3067 * |///////////////|
3068 * -----------------
3069 * 0 N (file offset)
3070 *
3071 * ----------------- -----------------
3072 * |///////////////| | | STEP 1 (acquire new blocks)
3073 * ----------------- -----------------
3074 * 0 N N+1 2N
3075 *
3076 * ----------------- -----------------
3077 * |///////////////| |///////////////| STEP 2 (clone data)
3078 * ----------------- -----------------
3079 * 0 N N+1 2N
3080 *
3081 * -----------------
3082 * |///////////////| STEP 3 (head truncate blocks)
3083 * -----------------
3084 * 0 N
3085 *
3086 * During steps 2 and 3 page-outs to file offsets less
3087 * than or equal to N are suspended.
3088 *
3089 * During step 3 page-ins to the file get suspended.
3090 */
3091 __private_extern__
3092 int
3093 hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
3094 struct proc *p)
3095 {
3096 struct cnode *cp;
3097 struct filefork *fp;
3098 struct hfsmount *hfsmp;
3099 u_int32_t headblks;
3100 u_int32_t datablks;
3101 u_int32_t blksize;
3102 u_int32_t growsize;
3103 u_int32_t nextallocsave;
3104 daddr64_t sector_a, sector_b;
3105 int eflags;
3106 off_t newbytes;
3107 int retval;
3108 int lockflags = 0;
3109 int took_trunc_lock = 0;
3110 int started_tr = 0;
3111 enum vtype vnodetype;
3112
3113 vnodetype = vnode_vtype(vp);
3114 if (vnodetype != VREG && vnodetype != VLNK) {
3115 return (EPERM);
3116 }
3117
3118 hfsmp = VTOHFS(vp);
3119 if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
3120 return (ENOSPC);
3121 }
3122
3123 cp = VTOC(vp);
3124 fp = VTOF(vp);
3125 if (fp->ff_unallocblocks)
3126 return (EINVAL);
3127 blksize = hfsmp->blockSize;
3128 if (blockHint == 0)
3129 blockHint = hfsmp->nextAllocation;
3130
3131 if ((fp->ff_size > 0x7fffffff) ||
3132 ((fp->ff_size > blksize) && vnodetype == VLNK)) {
3133 return (EFBIG);
3134 }
3135
3136 //
3137 // We do not believe that this call to hfs_fsync() is
3138 // necessary and it causes a journal transaction
3139 // deadlock so we are removing it.
3140 //
3141 //if (vnodetype == VREG && !vnode_issystem(vp)) {
3142 // retval = hfs_fsync(vp, MNT_WAIT, 0, p);
3143 // if (retval)
3144 // return (retval);
3145 //}
3146
3147 if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
3148 hfs_unlock(cp);
3149 hfs_lock_truncate(cp, TRUE);
3150 /* Force lock since callers expect the lock to be held. */
3151 if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
3152 hfs_unlock_truncate(cp, TRUE);
3153 return (retval);
3154 }
3155 /* No need to continue if file was removed. */
3156 if (cp->c_flag & C_NOEXISTS) {
3157 hfs_unlock_truncate(cp, TRUE);
3158 return (ENOENT);
3159 }
3160 took_trunc_lock = 1;
3161 }
3162 headblks = fp->ff_blocks;
3163 datablks = howmany(fp->ff_size, blksize);
3164 growsize = datablks * blksize;
3165 eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
3166 if (blockHint >= hfsmp->hfs_metazone_start &&
3167 blockHint <= hfsmp->hfs_metazone_end)
3168 eflags |= kEFMetadataMask;
3169
3170 if (hfs_start_transaction(hfsmp) != 0) {
3171 if (took_trunc_lock)
3172 hfs_unlock_truncate(cp, TRUE);
3173 return (EINVAL);
3174 }
3175 started_tr = 1;
3176 /*
3177 * Protect the extents b-tree and the allocation bitmap
3178 * during MapFileBlockC and ExtendFileC operations.
3179 */
3180 lockflags = SFL_BITMAP;
3181 if (overflow_extents(fp))
3182 lockflags |= SFL_EXTENTS;
3183 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3184
3185 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
3186 if (retval) {
3187 retval = MacToVFSError(retval);
3188 goto out;
3189 }
3190
3191 /*
3192 * STEP 1 - acquire new allocation blocks.
3193 */
3194 nextallocsave = hfsmp->nextAllocation;
3195 retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
3196 if (eflags & kEFMetadataMask) {
3197 HFS_MOUNT_LOCK(hfsmp, TRUE);
3198 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
3199 MarkVCBDirty(hfsmp);
3200 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
3201 }
3202
3203 retval = MacToVFSError(retval);
3204 if (retval == 0) {
3205 cp->c_flag |= C_MODIFIED;
3206 if (newbytes < growsize) {
3207 retval = ENOSPC;
3208 goto restore;
3209 } else if (fp->ff_blocks < (headblks + datablks)) {
3210 printf("hfs_relocate: allocation failed\n");
3211 retval = ENOSPC;
3212 goto restore;
3213 }
3214
3215 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
3216 if (retval) {
3217 retval = MacToVFSError(retval);
3218 } else if ((sector_a + 1) == sector_b) {
3219 retval = ENOSPC;
3220 goto restore;
3221 } else if ((eflags & kEFMetadataMask) &&
3222 ((((u_int64_t)sector_b * hfsmp->hfs_phys_block_size) / blksize) >
3223 hfsmp->hfs_metazone_end)) {
3224 const char * filestr;
3225 char emptystr = '\0';
3226
3227 if (cp->c_desc.cd_nameptr != NULL) {
3228 filestr = (const char *)&cp->c_desc.cd_nameptr[0];
3229 } else if (vnode_name(vp) != NULL) {
3230 filestr = vnode_name(vp);
3231 } else {
3232 filestr = &emptystr;
3233 }
3234 printf("hfs_relocate: %s didn't move into MDZ (%d blks)\n", filestr, fp->ff_blocks);
3235 retval = ENOSPC;
3236 goto restore;
3237 }
3238 }
3239 /* Done with system locks and journal for now. */
3240 hfs_systemfile_unlock(hfsmp, lockflags);
3241 lockflags = 0;
3242 hfs_end_transaction(hfsmp);
3243 started_tr = 0;
3244
3245 if (retval) {
3246 /*
3247 * Check to see if failure is due to excessive fragmentation.
3248 */
3249 if ((retval == ENOSPC) &&
3250 (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
3251 hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
3252 }
3253 goto out;
3254 }
3255 /*
3256 * STEP 2 - clone file data into the new allocation blocks.
3257 */
3258
3259 if (vnodetype == VLNK)
3260 retval = hfs_clonelink(vp, blksize, cred, p);
3261 else if (vnode_issystem(vp))
3262 retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
3263 else
3264 retval = hfs_clonefile(vp, headblks, datablks, blksize);
3265
3266 /* Start transaction for step 3 or for a restore. */
3267 if (hfs_start_transaction(hfsmp) != 0) {
3268 retval = EINVAL;
3269 goto out;
3270 }
3271 started_tr = 1;
3272 if (retval)
3273 goto restore;
3274
3275 /*
3276 * STEP 3 - switch to cloned data and remove old blocks.
3277 */
3278 lockflags = SFL_BITMAP;
3279 if (overflow_extents(fp))
3280 lockflags |= SFL_EXTENTS;
3281 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3282
3283 retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
3284
3285 hfs_systemfile_unlock(hfsmp, lockflags);
3286 lockflags = 0;
3287 if (retval)
3288 goto restore;
3289 out:
3290 if (took_trunc_lock)
3291 hfs_unlock_truncate(cp, TRUE);
3292
3293 if (lockflags) {
3294 hfs_systemfile_unlock(hfsmp, lockflags);
3295 lockflags = 0;
3296 }
3297
3298 /* Push cnode's new extent data to disk. */
3299 if (retval == 0) {
3300 (void) hfs_update(vp, MNT_WAIT);
3301 }
3302 if (hfsmp->jnl) {
3303 if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
3304 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
3305 else
3306 (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
3307 }
3308 exit:
3309 if (started_tr)
3310 hfs_end_transaction(hfsmp);
3311
3312 return (retval);
3313
3314 restore:
3315 if (fp->ff_blocks == headblks) {
3316 if (took_trunc_lock)
3317 hfs_unlock_truncate(cp, TRUE);
3318 goto exit;
3319 }
3320 /*
3321 * Give back any newly allocated space.
3322 */
3323 if (lockflags == 0) {
3324 lockflags = SFL_BITMAP;
3325 if (overflow_extents(fp))
3326 lockflags |= SFL_EXTENTS;
3327 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3328 }
3329
3330 (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false);
3331
3332 hfs_systemfile_unlock(hfsmp, lockflags);
3333 lockflags = 0;
3334
3335 if (took_trunc_lock)
3336 hfs_unlock_truncate(cp, TRUE);
3337 goto exit;
3338 }
3339
3340
3341 /*
3342 * Clone a symlink.
3343 *
3344 */
3345 static int
3346 hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
3347 {
3348 struct buf *head_bp = NULL;
3349 struct buf *tail_bp = NULL;
3350 int error;
3351
3352
3353 error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);
3354 if (error)
3355 goto out;
3356
3357 tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
3358 if (tail_bp == NULL) {
3359 error = EIO;
3360 goto out;
3361 }
3362 bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
3363 error = (int)buf_bwrite(tail_bp);
3364 out:
3365 if (head_bp) {
3366 buf_markinvalid(head_bp);
3367 buf_brelse(head_bp);
3368 }
3369 (void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
3370
3371 return (error);
3372 }
3373
3374 /*
3375 * Clone a file's data within the file.
3376 *
3377 */
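/*
 * The data is read (IO_NOCACHE) from file offsets [0, blkcnt * blksize)
 * and rewritten at writebase = blkstart * blksize, i.e. into the newly
 * allocated N..2N region shown in the hfs_relocate() diagram above.
 */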
3378 static int
3379 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
3380 {
3381 caddr_t bufp;
3382 size_t writebase;
3383 size_t bufsize;
3384 size_t copysize;
3385 size_t iosize;
3386 off_t filesize;
3387 size_t offset;
3388 uio_t auio;
3389 int error = 0;
3390
3391 filesize = VTOF(vp)->ff_blocks * blksize; /* virtual file size */
3392 writebase = blkstart * blksize;
3393 copysize = blkcnt * blksize;
3394 iosize = bufsize = MIN(copysize, 128 * 1024);
3395 offset = 0;
3396
3397 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
3398 return (ENOMEM);
3399 }
3400 hfs_unlock(VTOC(vp));
3401
3402 auio = uio_create(1, 0, UIO_SYSSPACE32, UIO_READ);
3403
3404 while (offset < copysize) {
3405 iosize = MIN(copysize - offset, iosize);
3406
3407 uio_reset(auio, offset, UIO_SYSSPACE32, UIO_READ);
3408 uio_addiov(auio, (uintptr_t)bufp, iosize);
3409
3410 error = cluster_read(vp, auio, copysize, IO_NOCACHE);
3411 if (error) {
3412 printf("hfs_clonefile: cluster_read failed - %d\n", error);
3413 break;
3414 }
3415 if (uio_resid(auio) != 0) {
3416 printf("clonedata: cluster_read: uio_resid = %lld\n", uio_resid(auio));
3417 error = EIO;
3418 break;
3419 }
3420
3421 uio_reset(auio, writebase + offset, UIO_SYSSPACE32, UIO_WRITE);
3422 uio_addiov(auio, (uintptr_t)bufp, iosize);
3423
3424 error = cluster_write(vp, auio, filesize + offset,
3425 filesize + offset + iosize,
3426 uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
3427 if (error) {
3428 printf("hfs_clonefile: cluster_write failed - %d\n", error);
3429 break;
3430 }
3431 if (uio_resid(auio) != 0) {
3432 printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
3433 error = EIO;
3434 break;
3435 }
3436 offset += iosize;
3437 }
3438 uio_free(auio);
3439
3440 /*
3441 * No need to call ubc_sync_range or hfs_invalbuf
3442 * since the file was copied using IO_NOCACHE.
3443 */
3444
3445 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
3446
3447 hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
3448 return (error);
3449 }
3450
3451 /*
3452 * Clone a system (metadata) file.
3453 *
3454 */
3455 static int
3456 hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
3457 kauth_cred_t cred, struct proc *p)
3458 {
3459 caddr_t bufp;
3460 char * offset;
3461 size_t bufsize;
3462 size_t iosize;
3463 struct buf *bp = NULL;
3464 daddr64_t blkno;
3465 daddr64_t blk;
3466 daddr64_t start_blk;
3467 daddr64_t last_blk;
3468 int breadcnt;
3469 int i;
3470 int error = 0;
3471
3472
3473 iosize = GetLogicalBlockSize(vp);
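/* Stage up to 1 MB per pass, rounded down to a multiple of the device's
 * logical block size. */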
3474 bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
3475 breadcnt = bufsize / iosize;
3476
3477 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
3478 return (ENOMEM);
3479 }
3480 start_blk = ((daddr64_t)blkstart * blksize) / iosize;
3481 last_blk = ((daddr64_t)blkcnt * blksize) / iosize;
3482 blkno = 0;
3483
3484 while (blkno < last_blk) {
3485 /*
3486 * Read up to a megabyte
3487 */
3488 offset = bufp;
3489 for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
3490 error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
3491 if (error) {
3492 printf("hfs_clonesysfile: meta_bread error %d\n", error);
3493 goto out;
3494 }
3495 if (buf_count(bp) != iosize) {
3496 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
3497 goto out;
3498 }
3499 bcopy((char *)buf_dataptr(bp), offset, iosize);
3500
3501 buf_markinvalid(bp);
3502 buf_brelse(bp);
3503 bp = NULL;
3504
3505 offset += iosize;
3506 }
3507
3508 /*
3509 * Write up to a megabyte
3510 */
3511 offset = bufp;
3512 for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
3513 bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
3514 if (bp == NULL) {
3515 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
3516 error = EIO;
3517 goto out;
3518 }
3519 bcopy(offset, (char *)buf_dataptr(bp), iosize);
3520 error = (int)buf_bwrite(bp);
3521 bp = NULL;
3522 if (error)
3523 goto out;
3524 offset += iosize;
3525 }
3526 }
3527 out:
3528 if (bp) {
3529 buf_brelse(bp);
3530 }
3531
3532 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
3533
3534 error = hfs_fsync(vp, MNT_WAIT, 0, p);
3535
3536 return (error);
3537 }