X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/b4c24cb9d3df001f2892dc4ed451bc769ff28a9f..7e4a7d3939db04e70062ae6c7bf24b8c8b2f5a7c:/bsd/hfs/hfs_readwrite.c diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 6f0311411..6dc30afad 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @(#)hfs_readwrite.c 1.0 * @@ -32,381 +38,372 @@ #include #include #include +#include #include #include #include +#include #include +#include #include +#include +#include +#include +#include +#include +#include #include #include +#include + #include +#include #include #include "hfs.h" +#include "hfs_attrlist.h" #include "hfs_endian.h" +#include "hfs_fsctl.h" #include "hfs_quota.h" #include "hfscommon/headers/FileMgrInternal.h" #include "hfscommon/headers/BTreesInternal.h" #include "hfs_cnode.h" #include "hfs_dbg.h" -extern int overflow_extents(struct filefork *fp); - #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2))) enum { MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */ }; -extern u_int32_t GetLogicalBlockSize(struct vnode *vp); +/* from bsd/hfs/hfs_vfsops.c */ +extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context); +static int hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *); +static int hfs_clonefile(struct vnode *, int, int, int); +static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *); +static int hfs_minorupdate(struct vnode *vp); +static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int 
skip, vfs_context_t context); -/***************************************************************************** -* -* Operations on vnodes -* -*****************************************************************************/ -/* -#% read vp L L L -# - vop_read { - IN struct vnode *vp; - INOUT struct uio *uio; - IN int ioflag; - IN struct ucred *cred; +int flush_cache_on_write = 0; +SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files"); - */ +/* + * Read data from a file. + */ int -hfs_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +hfs_vnop_read(struct vnop_read_args *ap) { - register struct uio *uio = ap->a_uio; - register struct vnode *vp = ap->a_vp; + uio_t uio = ap->a_uio; + struct vnode *vp = ap->a_vp; struct cnode *cp; struct filefork *fp; - struct buf *bp; - daddr_t logBlockNo; - u_long fragSize, moveSize, startOffset, ioxfersize; - int devBlockSize = 0; - off_t bytesRemaining; + struct hfsmount *hfsmp; + off_t filesize; + off_t filebytes; + off_t start_resid = uio_resid(uio); + off_t offset = uio_offset(uio); int retval = 0; - off_t filesize; - off_t filebytes; /* Preflight checks */ - if (vp->v_type != VREG && vp->v_type != VLNK) - return (EISDIR); /* HFS can only read files */ - if (uio->uio_resid == 0) + if (!vnode_isreg(vp)) { + /* can only read regular files */ + if (vnode_isdir(vp)) + return (EISDIR); + else + return (EPERM); + } + if (start_resid == 0) return (0); /* Nothing left to do */ - if (uio->uio_offset < 0) + if (offset < 0) return (EINVAL); /* cant read from a negative offset */ + +#if HFS_COMPRESSION + if (VNODE_IS_RSRC(vp)) { + if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */ + return 0; + } + /* otherwise read the resource fork normally */ + } else { + int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the 
cnode lock */ + if (compressed) { + retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp)); + if (compressed) { + if (retval == 0) { + /* successful read, update the access time */ + VTOC(vp)->c_touch_acctime = TRUE; + + /* compressed files are not hot file candidates */ + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { + VTOF(vp)->ff_bytesread = 0; + } + } + return retval; + } + /* otherwise the file was converted back to a regular file while we were reading it */ + retval = 0; + } + } +#endif /* HFS_COMPRESSION */ cp = VTOC(vp); fp = VTOF(vp); + hfsmp = VTOHFS(vp); + + /* Protect against a size change. */ + hfs_lock_truncate(cp, 0); + filesize = fp->ff_size; - filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize; - if (uio->uio_offset > filesize) { - if ((!ISHFSPLUS(VTOVCB(vp))) && (uio->uio_offset > (off_t)MAXHFSFILESIZE)) - return (EFBIG); - else - return (0); + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; + if (offset > filesize) { + if ((hfsmp->hfs_flags & HFS_STANDARD) && + (offset > (off_t)MAXHFSFILESIZE)) { + retval = EFBIG; + } + goto exit; } - VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START, - (int)uio->uio_offset, uio->uio_resid, (int)filesize, (int)filebytes, 0); + (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); - if (UBCISVALID(vp)) { - retval = cluster_read(vp, uio, filesize, devBlockSize, 0); - } else { + retval = cluster_read(vp, uio, filesize, ap->a_ioflag); - for (retval = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { - - if ((bytesRemaining = (filesize - uio->uio_offset)) <= 0) - break; - - logBlockNo = (daddr_t)(uio->uio_offset / PAGE_SIZE_64); - startOffset = (u_long) (uio->uio_offset & PAGE_MASK_64); - fragSize = PAGE_SIZE; - - if (((logBlockNo * PAGE_SIZE) + fragSize) < filesize) - ioxfersize = fragSize; - else { - ioxfersize = filesize - (logBlockNo * PAGE_SIZE); - ioxfersize = (ioxfersize + (devBlockSize - 1)) & ~(devBlockSize 
- 1); - } - moveSize = ioxfersize; - moveSize -= startOffset; - - if (bytesRemaining < moveSize) - moveSize = bytesRemaining; - - if (uio->uio_resid < moveSize) { - moveSize = uio->uio_resid; - }; - if (moveSize == 0) { - break; - }; - - if (( uio->uio_offset + fragSize) >= filesize) { - retval = bread(vp, logBlockNo, ioxfersize, NOCRED, &bp); - - } else if (logBlockNo - 1 == vp->v_lastr && !(vp->v_flag & VRAOFF)) { - daddr_t nextLogBlockNo = logBlockNo + 1; - int nextsize; - - if (((nextLogBlockNo * PAGE_SIZE) + - (daddr_t)fragSize) < filesize) - nextsize = fragSize; - else { - nextsize = filesize - (nextLogBlockNo * PAGE_SIZE); - nextsize = (nextsize + (devBlockSize - 1)) & ~(devBlockSize - 1); - } - retval = breadn(vp, logBlockNo, ioxfersize, &nextLogBlockNo, &nextsize, 1, NOCRED, &bp); - } else { - retval = bread(vp, logBlockNo, ioxfersize, NOCRED, &bp); - }; - - if (retval != E_NONE) { - if (bp) { - brelse(bp); - bp = NULL; - } - break; - }; - vp->v_lastr = logBlockNo; - - /* - * We should only get non-zero b_resid when an I/O retval - * has occurred, which should cause us to break above. - * However, if the short read did not cause an retval, - * then we want to ensure that we do not uiomove bad - * or uninitialized data. - */ - ioxfersize -= bp->b_resid; - - if (ioxfersize < moveSize) { /* XXX PPD This should take the offset into account, too! */ - if (ioxfersize == 0) - break; - moveSize = ioxfersize; - } - if ((startOffset + moveSize) > bp->b_bcount) - panic("hfs_read: bad startOffset or moveSize\n"); - - if ((retval = uiomove((caddr_t)bp->b_data + startOffset, (int)moveSize, uio))) - break; - - if (S_ISREG(cp->c_mode) && - (((startOffset + moveSize) == fragSize) || (uio->uio_offset == filesize))) { - bp->b_flags |= B_AGE; - }; - - brelse(bp); - /* Start of loop resets bp to NULL before reaching outside this block... 
*/ - } - - if (bp != NULL) { - brelse(bp); - } - } - - cp->c_flag |= C_ACCESS; + cp->c_touch_acctime = TRUE; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, (int)filesize, (int)filebytes, 0); + (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); + + /* + * Keep track blocks read + */ + if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) { + int took_cnode_lock = 0; + off_t bytesread; + bytesread = start_resid - uio_resid(uio); + + /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ + if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) { + hfs_lock(cp, HFS_FORCE_LOCK); + took_cnode_lock = 1; + } + /* + * If this file hasn't been seen since the start of + * the current sampling period then start over. + */ + if (cp->c_atime < hfsmp->hfc_timebase) { + struct timeval tv; + + fp->ff_bytesread = bytesread; + microtime(&tv); + cp->c_atime = tv.tv_sec; + } else { + fp->ff_bytesread += bytesread; + } + if (took_cnode_lock) + hfs_unlock(cp); + } +exit: + hfs_unlock_truncate(cp, 0); return (retval); } /* - * Write data to a file or directory. -#% write vp L L L -# - vop_write { - IN struct vnode *vp; - INOUT struct uio *uio; - IN int ioflag; - IN struct ucred *cred; - - */ + * Write data to a file. 
+ */ int -hfs_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; +hfs_vnop_write(struct vnop_write_args *ap) { + uio_t uio = ap->a_uio; struct vnode *vp = ap->a_vp; - struct uio *uio = ap->a_uio; struct cnode *cp; struct filefork *fp; - struct buf *bp; - struct proc *p; - struct timeval tv; - ExtendedVCB *vcb; - int devBlockSize = 0; - daddr_t logBlockNo; - long fragSize; - off_t origFileSize, currOffset, writelimit, bytesToAdd; - off_t actualBytesAdded; - u_long blkoffset, resid, xfersize, clearSize; - int eflags, ioflag; - int retval; - off_t filebytes; - u_long fileblocks; struct hfsmount *hfsmp; - int started_tr = 0, grabbed_lock = 0; + kauth_cred_t cred = NULL; + off_t origFileSize; + off_t writelimit; + off_t bytesToAdd = 0; + off_t actualBytesAdded; + off_t filebytes; + off_t offset; + ssize_t resid; + int eflags; + int ioflag = ap->a_ioflag; + int retval = 0; + int lockflags; + int cnode_locked = 0; + int partialwrite = 0; + int exclusive_lock = 0; + +#if HFS_COMPRESSION + if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */ + int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp)); + switch(state) { + case FILE_IS_COMPRESSED: + return EACCES; + case FILE_IS_CONVERTING: + /* if FILE_IS_CONVERTING, we allow writes */ + break; + default: + printf("invalid state %d for compressed file\n", state); + /* fall through */ + } + } +#endif - ioflag = ap->a_ioflag; + // LP64todo - fix this! 
uio_resid may be 64-bit value + resid = uio_resid(uio); + offset = uio_offset(uio); - if (uio->uio_offset < 0) + if (ioflag & IO_APPEND) { + exclusive_lock = 1; + } + + if (offset < 0) return (EINVAL); - if (uio->uio_resid == 0) + if (resid == 0) return (E_NONE); - if (vp->v_type != VREG && vp->v_type != VLNK) - return (EISDIR); /* Can only write files */ + if (!vnode_isreg(vp)) + return (EPERM); /* Can only write regular files */ cp = VTOC(vp); fp = VTOF(vp); - vcb = VTOVCB(vp); - fileblocks = fp->ff_blocks; - filebytes = (off_t)fileblocks * (off_t)vcb->blockSize; - - if (ioflag & IO_APPEND) - uio->uio_offset = fp->ff_size; - if ((cp->c_flags & APPEND) && uio->uio_offset != fp->ff_size) - return (EPERM); - - // XXXdbg - don't allow modification of the journal or journal_info_block - if (VTOHFS(vp)->jnl && cp->c_datafork) { - struct HFSPlusExtentDescriptor *extd; + hfsmp = VTOHFS(vp); - extd = &cp->c_datafork->ff_data.cf_extents[0]; - if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { - return EPERM; - } + eflags = kEFDeferMask; /* defer file block allocations */ +#ifdef HFS_SPARSE_DEV + /* + * When the underlying device is sparse and space + * is low (< 8MB), stop doing delayed allocations + * and begin doing synchronous I/O. + */ + if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && + (hfs_freeblks(hfsmp, 0) < 2048)) { + eflags &= ~kEFDeferMask; + ioflag |= IO_SYNC; } +#endif /* HFS_SPARSE_DEV */ - writelimit = uio->uio_offset + uio->uio_resid; +again: + /* Protect against a size change. */ + hfs_lock_truncate(cp, exclusive_lock); - /* - * Maybe this should be above the vnode op call, but so long as - * file servers have no limits, I don't think it matters. 
- */ - p = uio->uio_procp; - if (vp->v_type == VREG && p && - writelimit > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { - psignal(p, SIGXFSZ); - return (EFBIG); + if (ioflag & IO_APPEND) { + uio_setoffset(uio, fp->ff_size); + offset = fp->ff_size; + } + if ((cp->c_flags & APPEND) && offset != fp->ff_size) { + retval = EPERM; + goto exit; } - p = current_proc(); - - VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize); - resid = uio->uio_resid; origFileSize = fp->ff_size; - eflags = kEFDeferMask; /* defer file block allocations */ - filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; - - /* - * NOTE: In the following loop there are two positions tracked: - * currOffset is the current I/O starting offset. currOffset - * is never >LEOF; the LEOF is nudged along with currOffset as - * data is zeroed or written. uio->uio_offset is the start of - * the current I/O operation. It may be arbitrarily beyond - * currOffset. - * - * The following is true at all times: - * currOffset <= LEOF <= uio->uio_offset <= writelimit + writelimit = offset + resid; + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; + + /* If the truncate lock is shared, and if we either have virtual + * blocks or will need to extend the file, upgrade the truncate + * to exclusive lock. If upgrade fails, we lose the lock and + * have to get exclusive lock again. Note that we want to + * grab the truncate lock exclusive even if we're not allocating new blocks + * because we could still be growing past the LEOF. 
*/ - currOffset = MIN(uio->uio_offset, fp->ff_size); + if ((exclusive_lock == 0) && + ((fp->ff_unallocblocks != 0) || (writelimit > origFileSize))) { + exclusive_lock = 1; + /* Lock upgrade failed and we lost our shared lock, try again */ + if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) { + goto again; + } + } + + if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { + goto exit; + } + cnode_locked = 1; + + if (!exclusive_lock) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START, + (int)offset, uio_resid(uio), (int)fp->ff_size, + (int)filebytes, 0); + } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START, - (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0); - retval = 0; + /* Check if we do not need to extend the file */ + if (writelimit <= filebytes) { + goto sizeok; + } - /* Now test if we need to extend the file */ - /* Doing so will adjust the filebytes for us */ + cred = vfs_context_ucred(ap->a_context); + bytesToAdd = writelimit - filebytes; #if QUOTA - if(writelimit > filebytes) { - bytesToAdd = writelimit - filebytes; - - retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, vcb->blockSize)), - ap->a_cred, 0); - if (retval) - return (retval); - } + retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)), + cred, 0); + if (retval) + goto exit; #endif /* QUOTA */ - hfsmp = VTOHFS(vp); - if (writelimit > filebytes) { - hfs_global_shared_lock_acquire(hfsmp); - grabbed_lock = 1; - } - if (hfsmp->jnl && (writelimit > filebytes)) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - return EINVAL; - } - started_tr = 1; + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto exit; } while (writelimit > filebytes) { - bytesToAdd = writelimit - filebytes; - if (suser(ap->a_cred, NULL) != 0) + if (cred && suser(cred, NULL) != 0) eflags |= kEFReserveMask; - /* lock extents b-tree (also protects volume bitmap) */ - retval = 
hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, current_proc()); - if (retval != E_NONE) - break; - - retval = MacToVFSError(ExtendFileC (vcb, (FCB*)fp, bytesToAdd, + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + /* Files that are changing size are not hot file candidates. */ + if (hfsmp->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd, 0, eflags, &actualBytesAdded)); - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p); + hfs_systemfile_unlock(hfsmp, lockflags); + if ((actualBytesAdded == 0) && (retval == E_NONE)) retval = ENOSPC; if (retval != E_NONE) break; - filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE, - (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0); + (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); } + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + (void) hfs_end_transaction(hfsmp); - // XXXdbg - if (started_tr) { - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); - started_tr = 0; - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); - grabbed_lock = 0; + /* + * If we didn't grow the file enough try a partial write. + * POSIX expects this behavior. 
+ */ + if ((retval == ENOSPC) && (filebytes > offset)) { + retval = 0; + partialwrite = 1; + uio_setresid(uio, (uio_resid(uio) - bytesToAdd)); + resid -= bytesToAdd; + writelimit = filebytes; } - - if (UBCISVALID(vp) && retval == E_NONE) { +sizeok: + if (retval == E_NONE) { off_t filesize; off_t zero_off; off_t tail_off; off_t inval_start; off_t inval_end; - off_t io_start, io_end; + off_t io_start; int lflag; struct rl_entry *invalid_range; @@ -415,17 +412,19 @@ hfs_write(ap) else filesize = fp->ff_size; - lflag = (ioflag & IO_SYNC); + lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY); - if (uio->uio_offset <= fp->ff_size) { - zero_off = uio->uio_offset & ~PAGE_MASK_64; + if (offset <= fp->ff_size) { + zero_off = offset & ~PAGE_MASK_64; /* Check to see whether the area between the zero_offset and the start of the transfer to see whether is invalid and should be zero-filled as part of the transfer: */ - if (rl_scan(&fp->ff_invalidranges, zero_off, uio->uio_offset - 1, &invalid_range) != RL_NOOVERLAP) - lflag |= IO_HEADZEROFILL; + if (offset > zero_off) { + if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP) + lflag |= IO_HEADZEROFILL; + } } else { off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64; @@ -442,7 +441,7 @@ hfs_write(ap) will be handled by the cluser_write of the actual data. */ inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64; - inval_end = uio->uio_offset & ~PAGE_MASK_64; + inval_end = offset & ~PAGE_MASK_64; zero_off = fp->ff_size; if ((fp->ff_size & PAGE_MASK_64) && @@ -466,6 +465,7 @@ hfs_write(ap) }; if (inval_start < inval_end) { + struct timeval tv; /* There's some range of data that's going to be marked invalid */ if (zero_off < inval_start) { @@ -473,20 +473,26 @@ hfs_write(ap) and the actual write will start on a page past inval_end. 
Now's the last chance to zero-fill the page containing the EOF: */ - retval = cluster_write(vp, (struct uio *) 0, + hfs_unlock(cp); + cnode_locked = 0; + retval = cluster_write(vp, (uio_t) 0, fp->ff_size, inval_start, - zero_off, (off_t)0, devBlockSize, + zero_off, (off_t)0, lflag | IO_HEADZEROFILL | IO_NOZERODIRTY); + hfs_lock(cp, HFS_FORCE_LOCK); + cnode_locked = 1; if (retval) goto ioerr_exit; + offset = uio_offset(uio); }; /* Mark the remaining area of the newly allocated space as invalid: */ rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges); - cp->c_zftimeout = time.tv_sec + ZFTIMELIMIT; + microuptime(&tv); + cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; zero_off = fp->ff_size = inval_end; }; - if (uio->uio_offset > zero_off) lflag |= IO_HEADZEROFILL; + if (offset > zero_off) lflag |= IO_HEADZEROFILL; }; /* Check to see whether the area between the end of the write and the end of @@ -510,118 +516,76 @@ hfs_write(ap) * made readable (removed from the invalid ranges) before cluster_write * tries to write it: */ - io_start = (lflag & IO_HEADZEROFILL) ? zero_off : uio->uio_offset; - io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit; + io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset; if (io_start < fp->ff_size) { + off_t io_end; + + io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit; rl_remove(io_start, io_end - 1, &fp->ff_invalidranges); }; - retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off, - tail_off, devBlockSize, lflag | IO_NOZERODIRTY); - - if (uio->uio_offset > fp->ff_size) { - fp->ff_size = uio->uio_offset; - ubc_setsize(vp, fp->ff_size); /* XXX check errors */ + hfs_unlock(cp); + cnode_locked = 0; + + /* + * We need to tell UBC the fork's new size BEFORE calling + * cluster_write, in case any of the new pages need to be + * paged out before cluster_write completes (which does happen + * in embedded systems due to extreme memory pressure). 
+ * Similarly, we need to tell hfs_vnop_pageout what the new EOF + * will be, so that it can pass that on to cluster_pageout, and + * allow those pageouts. + * + * We don't update ff_size yet since we don't want pageins to + * be able to see uninitialized data between the old and new + * EOF, until cluster_write has completed and initialized that + * part of the file. + * + * The vnode pager relies on the file size last given to UBC via + * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or + * ff_size (whichever is larger). NOTE: ff_new_size is always + * zero, unless we are extending the file via write. + */ + if (filesize > fp->ff_size) { + fp->ff_new_size = filesize; + ubc_setsize(vp, filesize); } - if (resid > uio->uio_resid) - cp->c_flag |= C_CHANGE | C_UPDATE; - } else { - while (retval == E_NONE && uio->uio_resid > 0) { - logBlockNo = currOffset / PAGE_SIZE; - blkoffset = currOffset & PAGE_MASK; - - if ((filebytes - currOffset) < PAGE_SIZE_64) - fragSize = filebytes - ((off_t)logBlockNo * PAGE_SIZE_64); - else - fragSize = PAGE_SIZE; - xfersize = fragSize - blkoffset; - - /* Make any adjustments for boundary conditions */ - if (currOffset + (off_t)xfersize > writelimit) - xfersize = writelimit - currOffset; - - /* - * There is no need to read into bp if: - * We start on a block boundary and will overwrite the whole block - * - * OR - */ - if ((blkoffset == 0) && (xfersize >= fragSize)) { - bp = getblk(vp, logBlockNo, fragSize, 0, 0, BLK_READ); - retval = 0; - - if (bp->b_blkno == -1) { - brelse(bp); - retval = EIO; /* XXX */ - break; - } - } else { - - if (currOffset == fp->ff_size && blkoffset == 0) { - bp = getblk(vp, logBlockNo, fragSize, 0, 0, BLK_READ); - retval = 0; - if (bp->b_blkno == -1) { - brelse(bp); - retval = EIO; /* XXX */ - break; - } - } else { - /* - * This I/O transfer is not sufficiently aligned, - * so read the affected block into a buffer: - */ - retval = bread(vp, logBlockNo, fragSize, ap->a_cred, &bp); - if (retval != E_NONE) 
{ - if (bp) - brelse(bp); - break; - } - } - } - - /* See if we are starting to write within file boundaries: - * If not, then we need to present a "hole" for the area - * between the current EOF and the start of the current - * I/O operation: - * - * Note that currOffset is only less than uio_offset if - * uio_offset > LEOF... - */ - if (uio->uio_offset > currOffset) { - clearSize = MIN(uio->uio_offset - currOffset, xfersize); - bzero(bp->b_data + blkoffset, clearSize); - currOffset += clearSize; - blkoffset += clearSize; - xfersize -= clearSize; - } - - if (xfersize > 0) { - retval = uiomove((caddr_t)bp->b_data + blkoffset, (int)xfersize, uio); - currOffset += xfersize; - } - - if (ioflag & IO_SYNC) { - (void)VOP_BWRITE(bp); - } else if ((xfersize + blkoffset) == fragSize) { - bp->b_flags |= B_AGE; - bawrite(bp); - } else { - bdwrite(bp); + retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off, + tail_off, lflag | IO_NOZERODIRTY); + if (retval) { + fp->ff_new_size = 0; /* no longer extending; use ff_size */ + if (filesize > origFileSize) { + ubc_setsize(vp, origFileSize); } - - /* Update the EOF if we just extended the file - * (the PEOF has already been moved out and the - * block mapping table has been updated): - */ - if (currOffset > fp->ff_size) { - fp->ff_size = currOffset; - if (UBCISVALID(vp)) - ubc_setsize(vp, fp->ff_size); /* XXX check errors */ + goto ioerr_exit; + } + + if (filesize > origFileSize) { + fp->ff_size = filesize; + + /* Files that are changing size are not hot file candidates. 
*/ + if (hfsmp->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; } - if (retval || (resid == 0)) - break; - cp->c_flag |= C_CHANGE | C_UPDATE; - } /* endwhile */ + } + fp->ff_new_size = 0; /* ff_size now has the correct size */ + + /* If we wrote some bytes, then touch the change and mod times */ + if (resid > uio_resid(uio)) { + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + } + } + if (partialwrite) { + uio_setresid(uio, (uio_resid(uio) + bytesToAdd)); + resid += bytesToAdd; + } + + // XXXdbg - see radar 4871353 for more info + { + if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) { + VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL); + } } ioerr_exit: @@ -630,1389 +594,3511 @@ ioerr_exit: * we clear the setuid and setgid bits as a precaution against * tampering. */ - if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) - cp->c_mode &= ~(S_ISUID | S_ISGID); - + if (cp->c_mode & (S_ISUID | S_ISGID)) { + cred = vfs_context_ucred(ap->a_context); + if (resid > uio_resid(uio) && cred && suser(cred, NULL)) { + if (!cnode_locked) { + hfs_lock(cp, HFS_FORCE_LOCK); + cnode_locked = 1; + } + cp->c_mode &= ~(S_ISUID | S_ISGID); + } + } if (retval) { if (ioflag & IO_UNIT) { - (void)VOP_TRUNCATE(vp, origFileSize, - ioflag & IO_SYNC, ap->a_cred, uio->uio_procp); - uio->uio_offset -= resid - uio->uio_resid; - uio->uio_resid = resid; - filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; + if (!cnode_locked) { + hfs_lock(cp, HFS_FORCE_LOCK); + cnode_locked = 1; + } + (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC, + 0, 0, ap->a_context); + // LP64todo - fix this! 
resid needs to by user_ssize_t + uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio)))); + uio_setresid(uio, resid); + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; + } + } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) { + if (!cnode_locked) { + hfs_lock(cp, HFS_FORCE_LOCK); + cnode_locked = 1; } - } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) { - tv = time; - retval = VOP_UPDATE(vp, &tv, &tv, 1); + retval = hfs_update(vp, TRUE); } + /* Updating vcbWrCnt doesn't need to be atomic. */ + hfsmp->vcbWrCnt++; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0); - + (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); +exit: + if (cnode_locked) + hfs_unlock(cp); + hfs_unlock_truncate(cp, exclusive_lock); return (retval); } +/* support for the "bulk-access" fcntl */ -/* - -#% ioctl vp U U U -# - vop_ioctl { - IN struct vnode *vp; - IN u_long command; - IN caddr_t data; - IN int fflag; - IN struct ucred *cred; - IN struct proc *p; - - */ +#define CACHE_LEVELS 16 +#define NUM_CACHE_ENTRIES (64*16) +#define PARENT_IDS_FLAG 0x100 +struct access_cache { + int numcached; + int cachehits; /* these two for statistics gathering */ + int lookups; + unsigned int *acache; + unsigned char *haveaccess; +}; -/* ARGSUSED */ -int -hfs_ioctl(ap) - struct vop_ioctl_args /* { - struct vnode *a_vp; - int a_command; - caddr_t a_data; - int a_fflag; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; -{ - switch (ap->a_command) { - case 1: { - register struct cnode *cp; - register struct vnode *vp; - register struct radvisory *ra; - struct filefork *fp; - int devBlockSize = 0; - int error; +struct access_t { + uid_t uid; /* IN: effective user id */ + short flags; /* IN: access requested (i.e. 
R_OK) */ + short num_groups; /* IN: number of groups user belongs to */ + int num_files; /* IN: number of files to process */ + int *file_ids; /* IN: array of file ids */ + gid_t *groups; /* IN: array of groups */ + short *access; /* OUT: access info for each file (0 for 'has access') */ +} __attribute__((unavailable)); // this structure is for reference purposes only + +struct user32_access_t { + uid_t uid; /* IN: effective user id */ + short flags; /* IN: access requested (i.e. R_OK) */ + short num_groups; /* IN: number of groups user belongs to */ + int num_files; /* IN: number of files to process */ + user32_addr_t file_ids; /* IN: array of file ids */ + user32_addr_t groups; /* IN: array of groups */ + user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */ +}; - vp = ap->a_vp; +struct user64_access_t { + uid_t uid; /* IN: effective user id */ + short flags; /* IN: access requested (i.e. R_OK) */ + short num_groups; /* IN: number of groups user belongs to */ + int num_files; /* IN: number of files to process */ + user64_addr_t file_ids; /* IN: array of file ids */ + user64_addr_t groups; /* IN: array of groups */ + user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */ +}; - if (vp->v_type != VREG) - return EINVAL; - - VOP_LEASE(vp, ap->a_p, ap->a_cred, LEASE_READ); - error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); - if (error) - return (error); - ra = (struct radvisory *)(ap->a_data); - cp = VTOC(vp); - fp = VTOF(vp); +// these are the "extended" versions of the above structures +// note that it is crucial that they be different sized than +// the regular version +struct ext_access_t { + uint32_t flags; /* IN: access requested (i.e. 
R_OK) */ + uint32_t num_files; /* IN: number of files to process */ + uint32_t map_size; /* IN: size of the bit map */ + uint32_t *file_ids; /* IN: Array of file ids */ + char *bitmap; /* OUT: hash-bitmap of interesting directory ids */ + short *access; /* OUT: access info for each file (0 for 'has access') */ + uint32_t num_parents; /* future use */ + cnid_t *parents; /* future use */ +} __attribute__((unavailable)); // this structure is for reference purposes only + +struct user32_ext_access_t { + uint32_t flags; /* IN: access requested (i.e. R_OK) */ + uint32_t num_files; /* IN: number of files to process */ + uint32_t map_size; /* IN: size of the bit map */ + user32_addr_t file_ids; /* IN: Array of file ids */ + user32_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */ + user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */ + uint32_t num_parents; /* future use */ + user32_addr_t parents; /* future use */ +}; - if (ra->ra_offset >= fp->ff_size) { - VOP_UNLOCK(vp, 0, ap->a_p); - return (EFBIG); - } - VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize); +struct user64_ext_access_t { + uint32_t flags; /* IN: access requested (i.e. R_OK) */ + uint32_t num_files; /* IN: number of files to process */ + uint32_t map_size; /* IN: size of the bit map */ + user64_addr_t file_ids; /* IN: array of file ids */ + user64_addr_t bitmap; /* IN: array of groups */ + user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */ + uint32_t num_parents;/* future use */ + user64_addr_t parents;/* future use */ +}; - error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count, devBlockSize); - VOP_UNLOCK(vp, 0, ap->a_p); - return (error); +/* + * Perform a binary search for the given parent_id. Return value is + * the index if there is a match. If no_match_indexp is non-NULL it + * will be assigned with the index to insert the item (even if it was + * not found). 
+ */ +static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp) +{ + int index=-1; + unsigned int lo=0; + + do { + unsigned int mid = ((hi - lo)/2) + lo; + unsigned int this_id = array[mid]; + + if (parent_id == this_id) { + hi = mid; + break; } + + if (parent_id < this_id) { + hi = mid; + continue; + } + + if (parent_id > this_id) { + lo = mid + 1; + continue; + } + } while(lo < hi); - case 2: /* F_READBOOTBLOCKS */ - case 3: /* F_WRITEBOOTBLOCKS */ - { - struct vnode *vp = ap->a_vp; - struct vnode *devvp = NULL; - struct fbootstraptransfer *btd = (struct fbootstraptransfer *)ap->a_data; - int devBlockSize; - int error; - struct iovec aiov; - struct uio auio; - u_long blockNumber; - u_long blockOffset; - u_long xfersize; - struct buf *bp; - - if ((vp->v_flag & VROOT) == 0) return EINVAL; - if (btd->fbt_offset + btd->fbt_length > 1024) return EINVAL; - - devvp = VTOHFS(vp)->hfs_devvp; - aiov.iov_base = btd->fbt_buffer; - aiov.iov_len = btd->fbt_length; - - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = btd->fbt_offset; - auio.uio_resid = btd->fbt_length; - auio.uio_segflg = UIO_USERSPACE; - auio.uio_rw = (ap->a_command == 3) ? 
UIO_WRITE : UIO_READ; /* F_WRITEBOOTSTRAP / F_READBOOTSTRAP */ - auio.uio_procp = ap->a_p; - - VOP_DEVBLOCKSIZE(devvp, &devBlockSize); - - while (auio.uio_resid > 0) { - blockNumber = auio.uio_offset / devBlockSize; - error = bread(devvp, blockNumber, devBlockSize, ap->a_cred, &bp); - if (error) { - if (bp) brelse(bp); - return error; - }; - - blockOffset = auio.uio_offset % devBlockSize; - xfersize = devBlockSize - blockOffset; - error = uiomove((caddr_t)bp->b_data + blockOffset, (int)xfersize, &auio); - if (error) { - brelse(bp); - return error; - }; - if (auio.uio_rw == UIO_WRITE) { - error = VOP_BWRITE(bp); - if (error) return error; - } else { - brelse(bp); - }; - }; - }; - return 0; - - case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */ - { - *(time_t *)(ap->a_data) = to_bsd_time(VTOVCB(ap->a_vp)->localCreateDate); - return 0; - } - - default: - return (ENOTTY); + /* check if lo and hi converged on the match */ + if (parent_id == array[hi]) { + index = hi; + } + + if (no_match_indexp) { + *no_match_indexp = hi; } - /* Should never get here */ - return 0; + return index; } - -/* ARGSUSED */ -int -hfs_select(ap) - struct vop_select_args /* { - struct vnode *a_vp; - int a_which; - int a_fflags; - struct ucred *a_cred; - void *a_wql; - struct proc *a_p; - } */ *ap; + + +static int +lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id) { - /* - * We should really check to see if I/O is possible. - */ - return (1); + unsigned int hi; + int matches = 0; + int index, no_match_index; + + if (cache->numcached == 0) { + *indexp = 0; + return 0; // table is empty, so insert at index=0 and report no match + } + + if (cache->numcached > NUM_CACHE_ENTRIES) { + /*printf("hfs: EGAD! numcached is %d... 
cut our losses and trim to %d\n", + cache->numcached, NUM_CACHE_ENTRIES);*/ + cache->numcached = NUM_CACHE_ENTRIES; + } + + hi = cache->numcached - 1; + + index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index); + + /* if no existing entry found, find index for new one */ + if (index == -1) { + index = no_match_index; + matches = 0; + } else { + matches = 1; + } + + *indexp = index; + return matches; } /* - * Bmap converts a the logical block number of a file to its physical block - * number on the disk. - */ - -/* - * vp - address of vnode file the file - * bn - which logical block to convert to a physical block number. - * vpp - returns the vnode for the block special file holding the filesystem - * containing the file of interest - * bnp - address of where to return the filesystem physical block number -#% bmap vp L L L -#% bmap vpp - U - -# - vop_bmap { - IN struct vnode *vp; - IN daddr_t bn; - OUT struct vnode **vpp; - IN daddr_t *bnp; - OUT int *runp; - */ -/* - * Converts a logical block number to a physical block, and optionally returns - * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize. - * The physical block number is based on the device block size, currently its 512. - * The block run is returned in logical blocks, and is the REMAINING amount of blocks + * Add a node to the access_cache at the given index (or do a lookup first + * to find the index if -1 is passed in). We currently do a replace rather + * than an insert if the cache is full. 
*/ - -int -hfs_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - daddr_t a_bn; - struct vnode **a_vpp; - daddr_t *a_bnp; - int *a_runp; - } */ *ap; +static void +add_node(struct access_cache *cache, int index, cnid_t nodeID, int access) { - struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - struct filefork *fp = VTOF(vp); - struct hfsmount *hfsmp = VTOHFS(vp); - int retval = E_NONE; - daddr_t logBlockSize; - size_t bytesContAvail = 0; - off_t blockposition; - struct proc *p = NULL; - int lockExtBtree; - struct rl_entry *invalid_range; - enum rl_overlaptype overlaptype; + int lookup_index = -1; + + /* need to do a lookup first if -1 passed for index */ + if (index == -1) { + if (lookup_bucket(cache, &lookup_index, nodeID)) { + if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) { + // only update an entry if the previous access was ESRCH (i.e. a scope checking error) + cache->haveaccess[lookup_index] = access; + } - /* - * Check for underlying vnode requests and ensure that logical - * to physical mapping is requested. - */ - if (ap->a_vpp != NULL) - *ap->a_vpp = cp->c_devvp; - if (ap->a_bnp == NULL) - return (0); + /* mission accomplished */ + return; + } else { + index = lookup_index; + } - /* Only clustered I/O should have delayed allocations. */ - DBG_ASSERT(fp->ff_unallocblocks == 0); + } - logBlockSize = GetLogicalBlockSize(vp); - blockposition = (off_t)ap->a_bn * (off_t)logBlockSize; + /* if the cache is full, do a replace rather than an insert */ + if (cache->numcached >= NUM_CACHE_ENTRIES) { + //printf("hfs: cache is full (%d). 
replace at index %d\n", cache->numcached, index); + cache->numcached = NUM_CACHE_ENTRIES-1; - lockExtBtree = overflow_extents(fp); - if (lockExtBtree) { - p = current_proc(); - retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, - LK_EXCLUSIVE | LK_CANRECURSE, p); - if (retval) - return (retval); + if (index > cache->numcached) { + // printf("hfs: index %d pinned to %d\n", index, cache->numcached); + index = cache->numcached; } + } - retval = MacToVFSError( - MapFileBlockC (HFSTOVCB(hfsmp), - (FCB*)fp, - MAXPHYSIO, - blockposition, - ap->a_bnp, - &bytesContAvail)); + if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) { + index++; + } - if (lockExtBtree) (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p); - - if (retval == E_NONE) { - /* Adjust the mapping information for invalid file ranges: */ - overlaptype = rl_scan(&fp->ff_invalidranges, - blockposition, - blockposition + MAXPHYSIO - 1, - &invalid_range); - if (overlaptype != RL_NOOVERLAP) { - switch(overlaptype) { - case RL_MATCHINGOVERLAP: - case RL_OVERLAPCONTAINSRANGE: - case RL_OVERLAPSTARTSBEFORE: - /* There's no valid block for this byte offset: */ - *ap->a_bnp = (daddr_t)-1; - bytesContAvail = invalid_range->rl_end + 1 - blockposition; - break; - - case RL_OVERLAPISCONTAINED: - case RL_OVERLAPENDSAFTER: - /* The range of interest hits an invalid block before the end: */ - if (invalid_range->rl_start == blockposition) { - /* There's actually no valid information to be had starting here: */ - *ap->a_bnp = (daddr_t)-1; - if ((fp->ff_size > (invalid_range->rl_end + 1)) && - (invalid_range->rl_end + 1 - blockposition < bytesContAvail)) { - bytesContAvail = invalid_range->rl_end + 1 - blockposition; - }; - } else { - bytesContAvail = invalid_range->rl_start - blockposition; - }; - break; - }; - if (bytesContAvail > MAXPHYSIO) bytesContAvail = MAXPHYSIO; - }; - - /* Figure out how many read ahead blocks there are */ - if (ap->a_runp != NULL) { - if 
(can_cluster(logBlockSize)) { - /* Make sure this result never goes negative: */ - *ap->a_runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1; - } else { - *ap->a_runp = 0; - }; - }; - }; - - return (retval); + if (index >= 0 && index < cache->numcached) { + /* only do bcopy if we're inserting */ + bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) ); + bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) ); + } + + cache->acache[index] = nodeID; + cache->haveaccess[index] = access; + cache->numcached++; } -/* blktooff converts logical block number to file offset */ -int -hfs_blktooff(ap) - struct vop_blktooff_args /* { - struct vnode *a_vp; - daddr_t a_lblkno; - off_t *a_offset; - } */ *ap; -{ - if (ap->a_vp == NULL) - return (EINVAL); - *ap->a_offset = (off_t)ap->a_lblkno * PAGE_SIZE_64; +struct cinfo { + uid_t uid; + gid_t gid; + mode_t mode; + cnid_t parentcnid; + u_int16_t recflags; +}; - return(0); -} +static int +snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg) +{ + struct cinfo *cip = (struct cinfo *)arg; -int -hfs_offtoblk(ap) - struct vop_offtoblk_args /* { - struct vnode *a_vp; - off_t a_offset; - daddr_t *a_lblkno; - } */ *ap; -{ - if (ap->a_vp == NULL) - return (EINVAL); - *ap->a_lblkno = ap->a_offset / PAGE_SIZE_64; + cip->uid = attrp->ca_uid; + cip->gid = attrp->ca_gid; + cip->mode = attrp->ca_mode; + cip->parentcnid = descp->cd_parentcnid; + cip->recflags = attrp->ca_recflags; + + return (0); +} - return(0); +/* + * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item + * isn't incore, then go to the catalog. 
+ */ +static int +do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid, + struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp) +{ + int error = 0; + + /* if this id matches the one the fsctl was called with, skip the lookup */ + if (cnid == skip_cp->c_cnid) { + cnattrp->ca_uid = skip_cp->c_uid; + cnattrp->ca_gid = skip_cp->c_gid; + cnattrp->ca_mode = skip_cp->c_mode; + cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags; + keyp->hfsPlus.parentID = skip_cp->c_parentcnid; + } else { + struct cinfo c_info; + + /* otherwise, check the cnode hash incase the file/dir is incore */ + if (hfs_chash_snoop(hfsmp, cnid, snoop_callback, &c_info) == 0) { + cnattrp->ca_uid = c_info.uid; + cnattrp->ca_gid = c_info.gid; + cnattrp->ca_mode = c_info.mode; + cnattrp->ca_recflags = c_info.recflags; + keyp->hfsPlus.parentID = c_info.parentcnid; + } else { + int lockflags; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + /* lookup this cnid in the catalog */ + error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp); + + hfs_systemfile_unlock(hfsmp, lockflags); + + cache->lookups++; + } + } + + return (error); } -int -hfs_cmap(ap) - struct vop_cmap_args /* { - struct vnode *a_vp; - off_t a_foffset; - size_t a_size; - daddr_t *a_bpn; - size_t *a_run; - void *a_poff; - } */ *ap; + +/* + * Compute whether we have access to the given directory (nodeID) and all its parents. Cache + * up to CACHE_LEVELS as we progress towards the root. 
+ */ +static int +do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID, + struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred, + struct vfs_context *my_context, + char *bitmap, + uint32_t map_size, + cnid_t* parents, + uint32_t num_parents) { - struct hfsmount *hfsmp = VTOHFS(ap->a_vp); - struct filefork *fp = VTOF(ap->a_vp); - size_t bytesContAvail = 0; - int retval = E_NONE; - int lockExtBtree = 0; - struct proc *p = NULL; - struct rl_entry *invalid_range; - enum rl_overlaptype overlaptype; - int started_tr = 0, grabbed_lock = 0; + int myErr = 0; + int myResult; + HFSCatalogNodeID thisNodeID; + unsigned int myPerms; + struct cat_attr cnattr; + int cache_index = -1, scope_index = -1, scope_idx_start = -1; + CatalogKey catkey; + + int i = 0, ids_to_cache = 0; + int parent_ids[CACHE_LEVELS]; + + thisNodeID = nodeID; + while (thisNodeID >= kRootDirID) { + myResult = 0; /* default to "no access" */ + + /* check the cache before resorting to hitting the catalog */ + + /* ASSUMPTION: access info of cached entries is "final"... i.e. no need + * to look any further after hitting cached dir */ + + if (lookup_bucket(cache, &cache_index, thisNodeID)) { + cache->cachehits++; + myErr = cache->haveaccess[cache_index]; + if (scope_index != -1) { + if (myErr == ESRCH) { + myErr = 0; + } + } else { + scope_index = 0; // so we'll just use the cache result + scope_idx_start = ids_to_cache; + } + myResult = (myErr == 0) ? 1 : 0; + goto ExitThisRoutine; + } - /* - * Check for underlying vnode requests and ensure that logical - * to physical mapping is requested. 
- */ - if (ap->a_bpn == NULL) - return (0); - p = current_proc(); - if (fp->ff_unallocblocks) { - lockExtBtree = 1; + if (parents) { + int tmp; + tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL); + if (scope_index == -1) + scope_index = tmp; + if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) { + scope_idx_start = ids_to_cache; + } + } - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - grabbed_lock = 1; + /* remember which parents we want to cache */ + if (ids_to_cache < CACHE_LEVELS) { + parent_ids[ids_to_cache] = thisNodeID; + ids_to_cache++; + } + // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"... + if (bitmap && map_size) { + bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7)); + } + - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - hfs_global_shared_lock_release(hfsmp); - return EINVAL; - } else { - started_tr = 1; - } - } + /* do the lookup (checks the cnode hash, then the catalog) */ + myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr); + if (myErr) { + goto ExitThisRoutine; /* no access */ + } - if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) { - if (started_tr) { - journal_end_transaction(hfsmp->jnl); - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); - } - return (retval); - } - } else if (overflow_extents(fp)) { - lockExtBtree = 1; - if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) { - return retval; - } + /* Root always gets access. */ + if (suser(myp_ucred, NULL) == 0) { + thisNodeID = catkey.hfsPlus.parentID; + myResult = 1; + continue; } - /* - * Check for any delayed allocations. 
- */ - if (fp->ff_unallocblocks) { - SInt64 reqbytes, actbytes; + // if the thing has acl's, do the full permission check + if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) { + struct vnode *vp; - reqbytes = (SInt64)fp->ff_unallocblocks * - (SInt64)HFSTOVCB(hfsmp)->blockSize; - /* - * Release the blocks on loan and aquire some real ones. - * Note that we can race someone else for these blocks - * (and lose) so cmap needs to handle a failure here. - * Currently this race can't occur because all allocations - * are protected by an exclusive lock on the Extents - * Overflow file. - */ - HFSTOVCB(hfsmp)->loanedBlocks -= fp->ff_unallocblocks; - FTOC(fp)->c_blocks -= fp->ff_unallocblocks; - fp->ff_blocks -= fp->ff_unallocblocks; - fp->ff_unallocblocks = 0; - - while (retval == 0 && reqbytes > 0) { - retval = MacToVFSError(ExtendFileC(HFSTOVCB(hfsmp), - (FCB*)fp, reqbytes, 0, - kEFAllMask | kEFNoClumpMask, &actbytes)); - if (retval == 0 && actbytes == 0) - retval = ENOSPC; - - if (retval) { - fp->ff_unallocblocks = - reqbytes / HFSTOVCB(hfsmp)->blockSize; - HFSTOVCB(hfsmp)->loanedBlocks += fp->ff_unallocblocks; - FTOC(fp)->c_blocks += fp->ff_unallocblocks; - fp->ff_blocks += fp->ff_unallocblocks; - } - reqbytes -= actbytes; - } + /* get the vnode for this cnid */ + myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0); + if ( myErr ) { + myResult = 0; + goto ExitThisRoutine; + } - if (retval) { - (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p); - if (started_tr) { - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); - } - return (retval); - } - VTOC(ap->a_vp)->c_flag |= C_MODIFIED; - } + thisNodeID = VTOC(vp)->c_parentcnid; - retval = MacToVFSError( - MapFileBlockC (HFSTOVCB(hfsmp), - (FCB *)fp, - ap->a_size, - ap->a_foffset, - ap->a_bpn, - &bytesContAvail)); + hfs_unlock(VTOC(vp)); - if (lockExtBtree) - (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, 
LK_RELEASE, p); + if (vnode_vtype(vp) == VDIR) { + myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context); + } else { + myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context); + } - // XXXdbg - if (started_tr) { - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); - started_tr = 0; - } - if (grabbed_lock) { - hfs_global_shared_lock_release(hfsmp); - grabbed_lock = 0; + vnode_put(vp); + if (myErr) { + myResult = 0; + goto ExitThisRoutine; + } + } else { + unsigned int flags; + + myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, + cnattr.ca_mode, hfsmp->hfs_mp, + myp_ucred, theProcPtr); + + if (cnattr.ca_mode & S_IFDIR) { + flags = R_OK | X_OK; + } else { + flags = R_OK; + } + if ( (myPerms & flags) != flags) { + myResult = 0; + myErr = EACCES; + goto ExitThisRoutine; /* no access */ + } + + /* up the hierarchy we go */ + thisNodeID = catkey.hfsPlus.parentID; } - - if (retval == E_NONE) { - /* Adjust the mapping information for invalid file ranges: */ - overlaptype = rl_scan(&fp->ff_invalidranges, - ap->a_foffset, - ap->a_foffset + (off_t)bytesContAvail - 1, - &invalid_range); - if (overlaptype != RL_NOOVERLAP) { - switch(overlaptype) { - case RL_MATCHINGOVERLAP: - case RL_OVERLAPCONTAINSRANGE: - case RL_OVERLAPSTARTSBEFORE: - /* There's no valid block for this byte offset: */ - *ap->a_bpn = (daddr_t)-1; - - /* There's no point limiting the amount to be returned if the - invalid range that was hit extends all the way to the EOF - (i.e. 
there's no valid bytes between the end of this range - and the file's EOF): - */ - if ((fp->ff_size > (invalid_range->rl_end + 1)) && - (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) { - bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; - }; - break; - - case RL_OVERLAPISCONTAINED: - case RL_OVERLAPENDSAFTER: - /* The range of interest hits an invalid block before the end: */ - if (invalid_range->rl_start == ap->a_foffset) { - /* There's actually no valid information to be had starting here: */ - *ap->a_bpn = (daddr_t)-1; - if ((fp->ff_size > (invalid_range->rl_end + 1)) && - (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) { - bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; - }; - } else { - bytesContAvail = invalid_range->rl_start - ap->a_foffset; - }; - break; - }; - if (bytesContAvail > ap->a_size) bytesContAvail = ap->a_size; - }; - - if (ap->a_run) *ap->a_run = bytesContAvail; - }; + } - if (ap->a_poff) - *(int *)ap->a_poff = 0; + /* if here, we have access to this node */ + myResult = 1; - return (retval); -} + ExitThisRoutine: + if (parents && myErr == 0 && scope_index == -1) { + myErr = ESRCH; + } + + if (myErr) { + myResult = 0; + } + *err = myErr; + + /* cache the parent directory(ies) */ + for (i = 0; i < ids_to_cache; i++) { + if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) { + add_node(cache, -1, parent_ids[i], ESRCH); + } else { + add_node(cache, -1, parent_ids[i], myErr); + } + } + return (myResult); +} -/* - * Read or write a buffer that is not contiguous on disk. We loop over - * each device block, copying to or from caller's buffer. - * - * We could be a bit more efficient by transferring as much data as is - * contiguous. But since this routine should rarely be called, and that - * would be more complicated; best to keep it simple. 
- */ static int -hfs_strategy_fragmented(struct buf *bp) +do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, + struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context) { - register struct vnode *vp = bp->b_vp; - register struct cnode *cp = VTOC(vp); - register struct vnode *devvp = cp->c_devvp; - caddr_t ioaddr; /* Address of fragment within bp */ - struct buf *frag = NULL; /* For reading or writing a single block */ - int retval = 0; - long remaining; /* Bytes (in bp) left to transfer */ - off_t offset; /* Logical offset of current fragment in vp */ - u_long block_size; /* Size of one device block (and one I/O) */ + boolean_t is64bit; + + /* + * NOTE: on entry, the vnode is locked. Incase this vnode + * happens to be in our list of file_ids, we'll note it + * avoid calling hfs_chashget_nowait() on that id as that + * will cause a "locking against myself" panic. + */ + Boolean check_leaf = true; + + struct user64_ext_access_t *user_access_structp; + struct user64_ext_access_t tmp_user_access; + struct access_cache cache; + + int error = 0, prev_parent_check_ok=1; + unsigned int i; + + short flags; + unsigned int num_files = 0; + int map_size = 0; + int num_parents = 0; + int *file_ids=NULL; + short *access=NULL; + char *bitmap=NULL; + cnid_t *parents=NULL; + int leaf_index; - /* Make sure we redo this mapping for the next I/O */ - bp->b_blkno = bp->b_lblkno; + cnid_t cnid; + cnid_t prevParent_cnid = 0; + unsigned int myPerms; + short myaccess = 0; + struct cat_attr cnattr; + CatalogKey catkey; + struct cnode *skip_cp = VTOC(vp); + kauth_cred_t cred = vfs_context_ucred(context); + proc_t p = vfs_context_proc(context); + + is64bit = proc_is64bit(p); + + /* initialize the local cache and buffers */ + cache.numcached = 0; + cache.cachehits = 0; + cache.lookups = 0; + cache.acache = NULL; + cache.haveaccess = NULL; + + /* struct copyin done during dispatch... 
need to copy file_id array separately */ + if (ap->a_data == NULL) { + error = EINVAL; + goto err_exit_bulk_access; + } + + if (is64bit) { + if (arg_size != sizeof(struct user64_ext_access_t)) { + error = EINVAL; + goto err_exit_bulk_access; + } + + user_access_structp = (struct user64_ext_access_t *)ap->a_data; + + } else if (arg_size == sizeof(struct user32_access_t)) { + struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data; + + // convert an old style bulk-access struct to the new style + tmp_user_access.flags = accessp->flags; + tmp_user_access.num_files = accessp->num_files; + tmp_user_access.map_size = 0; + tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids); + tmp_user_access.bitmap = USER_ADDR_NULL; + tmp_user_access.access = CAST_USER_ADDR_T(accessp->access); + tmp_user_access.num_parents = 0; + user_access_structp = &tmp_user_access; + + } else if (arg_size == sizeof(struct user32_ext_access_t)) { + struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data; + + // up-cast from a 32-bit version of the struct + tmp_user_access.flags = accessp->flags; + tmp_user_access.num_files = accessp->num_files; + tmp_user_access.map_size = accessp->map_size; + tmp_user_access.num_parents = accessp->num_parents; + + tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids); + tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap); + tmp_user_access.access = CAST_USER_ADDR_T(accessp->access); + tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents); + + user_access_structp = &tmp_user_access; + } else { + error = EINVAL; + goto err_exit_bulk_access; + } + + map_size = user_access_structp->map_size; + + num_files = user_access_structp->num_files; + + num_parents= user_access_structp->num_parents; + + if (num_files < 1) { + goto err_exit_bulk_access; + } + if (num_files > 1024) { + error = EINVAL; + goto err_exit_bulk_access; + } + + if (num_parents > 1024) { + error = EINVAL; + goto err_exit_bulk_access; + } 
+ + file_ids = (int *) kalloc(sizeof(int) * num_files); + access = (short *) kalloc(sizeof(short) * num_files); + if (map_size) { + bitmap = (char *) kalloc(sizeof(char) * map_size); + } + + if (num_parents) { + parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents); + } + + cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES); + cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES); + + if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) { + if (file_ids) { + kfree(file_ids, sizeof(int) * num_files); + } + if (bitmap) { + kfree(bitmap, sizeof(char) * map_size); + } + if (access) { + kfree(access, sizeof(short) * num_files); + } + if (cache.acache) { + kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES); + } + if (cache.haveaccess) { + kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES); + } + if (parents) { + kfree(parents, sizeof(cnid_t) * num_parents); + } + return ENOMEM; + } + + // make sure the bitmap is zero'ed out... 
+ if (bitmap) { + bzero(bitmap, (sizeof(char) * map_size)); + } + + if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids, + num_files * sizeof(int)))) { + goto err_exit_bulk_access; + } - /* Set up the logical position and number of bytes to read/write */ - offset = (off_t) bp->b_lblkno * (off_t) GetLogicalBlockSize(vp); - block_size = VTOHFS(vp)->hfs_phys_block_size; + if (num_parents) { + if ((error = copyin(user_access_structp->parents, (caddr_t)parents, + num_parents * sizeof(cnid_t)))) { + goto err_exit_bulk_access; + } + } - /* Get an empty buffer to do the deblocking */ - frag = geteblk(block_size); - if (ISSET(bp->b_flags, B_READ)) - SET(frag->b_flags, B_READ); - - for (ioaddr = bp->b_data, remaining = bp->b_bcount; remaining != 0; - ioaddr += block_size, offset += block_size, - remaining -= block_size) { - frag->b_resid = frag->b_bcount; - CLR(frag->b_flags, B_DONE); - - /* Map the current position to a physical block number */ - retval = VOP_CMAP(vp, offset, block_size, &frag->b_lblkno, - NULL, NULL); - if (retval != 0) - break; + flags = user_access_structp->flags; + if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) { + flags = R_OK; + } + + /* check if we've been passed leaf node ids or parent ids */ + if (flags & PARENT_IDS_FLAG) { + check_leaf = false; + } + + /* Check access to each file_id passed in */ + for (i = 0; i < num_files; i++) { + leaf_index=-1; + cnid = (cnid_t) file_ids[i]; + + /* root always has access */ + if ((!parents) && (!suser(cred, NULL))) { + access[i] = 0; + continue; + } + + if (check_leaf) { + /* do the lookup (checks the cnode hash, then the catalog) */ + error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr); + if (error) { + access[i] = (short) error; + continue; + } + + if (parents) { + // Check if the leaf matches one of the parent scopes + leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL); + if (leaf_index >= 0 && parents[leaf_index] == cnid) + prev_parent_check_ok = 0; + else 
if (leaf_index >= 0) + prev_parent_check_ok = 1; + } - /* - * Did we try to read a hole? - * (Should never happen for metadata!) - */ - if ((long)frag->b_lblkno == -1) { - bzero(ioaddr, block_size); - continue; + // if the thing has acl's, do the full permission check + if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) { + struct vnode *cvp; + int myErr = 0; + /* get the vnode for this cnid */ + myErr = hfs_vget(hfsmp, cnid, &cvp, 0); + if ( myErr ) { + access[i] = myErr; + continue; } - /* If writing, copy before I/O */ - if (!ISSET(bp->b_flags, B_READ)) - bcopy(ioaddr, frag->b_data, block_size); - - /* Call the device to do the I/O and wait for it */ - frag->b_blkno = frag->b_lblkno; - frag->b_vp = devvp; /* Used to dispatch via VOP_STRATEGY */ - frag->b_dev = devvp->v_rdev; - retval = VOP_STRATEGY(frag); - frag->b_vp = NULL; - if (retval != 0) - break; - retval = biowait(frag); - if (retval != 0) - break; + hfs_unlock(VTOC(cvp)); + + if (vnode_vtype(cvp) == VDIR) { + myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context); + } else { + myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context); + } + + vnode_put(cvp); + if (myErr) { + access[i] = myErr; + continue; + } + } else { + /* before calling CheckAccess(), check the target file for read access */ + myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, + cnattr.ca_mode, hfsmp->hfs_mp, cred, p); - /* If reading, copy after the I/O */ - if (ISSET(bp->b_flags, B_READ)) - bcopy(frag->b_data, ioaddr, block_size); + /* fail fast if no access */ + if ((myPerms & flags) == 0) { + access[i] = EACCES; + continue; + } + } + } else { + /* we were passed an array of parent ids */ + catkey.hfsPlus.parentID = cnid; } + + /* if the last guy had the same parent and had access, we're done */ + if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) { + cache.cachehits++; + access[i] = 0; + continue; + } + + myaccess = 
do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, + skip_cp, p, cred, context,bitmap, map_size, parents, num_parents); + + if (myaccess || (error == ESRCH && leaf_index != -1)) { + access[i] = 0; // have access.. no errors to report + } else { + access[i] = (error != 0 ? (short) error : EACCES); + } + + prevParent_cnid = catkey.hfsPlus.parentID; + } + + /* copyout the access array */ + if ((error = copyout((caddr_t)access, user_access_structp->access, + num_files * sizeof (short)))) { + goto err_exit_bulk_access; + } + if (map_size && bitmap) { + if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap, + map_size * sizeof (char)))) { + goto err_exit_bulk_access; + } + } - frag->b_vp = NULL; - // - // XXXdbg - in the case that this is a meta-data block, it won't affect - // the journal because this bp is for a physical disk block, - // not a logical block that is part of the catalog or extents - // files. - SET(frag->b_flags, B_INVAL); - brelse(frag); - - if ((bp->b_error = retval) != 0) - SET(bp->b_flags, B_ERROR); - - biodone(bp); /* This I/O is now complete */ - return retval; + + err_exit_bulk_access: + + //printf("hfs: on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups); + + if (file_ids) + kfree(file_ids, sizeof(int) * num_files); + if (parents) + kfree(parents, sizeof(cnid_t) * num_parents); + if (bitmap) + kfree(bitmap, sizeof(char) * map_size); + if (access) + kfree(access, sizeof(short) * num_files); + if (cache.acache) + kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES); + if (cache.haveaccess) + kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES); + + return (error); } +/* end "bulk-access" support */ + + /* - * Calculate the logical to physical mapping if not done already, - * then call the device strategy routine. -# -#vop_strategy { -# IN struct buf *bp; - */ + * Callback for use with freeze ioctl. 
+ */ +static int +hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs) +{ + vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze"); + + return 0; +} + +/* + * Control filesystem operating characteristics. + */ int -hfs_strategy(ap) - struct vop_strategy_args /* { - struct buf *a_bp; - } */ *ap; +hfs_vnop_ioctl( struct vnop_ioctl_args /* { + vnode_t a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + vfs_context_t a_context; + } */ *ap) { - register struct buf *bp = ap->a_bp; - register struct vnode *vp = bp->b_vp; - register struct cnode *cp = VTOC(vp); - int retval = 0; - off_t offset; - size_t bytes_contig; + struct vnode * vp = ap->a_vp; + struct hfsmount *hfsmp = VTOHFS(vp); + vfs_context_t context = ap->a_context; + kauth_cred_t cred = vfs_context_ucred(context); + proc_t p = vfs_context_proc(context); + struct vfsstatfs *vfsp; + boolean_t is64bit; + off_t jnl_start, jnl_size; + struct hfs_journal_info *jip; +#if HFS_COMPRESSION + int compressed = 0; + off_t uncompressed_size = -1; + int decmpfs_error = 0; - if ( !(bp->b_flags & B_VECTORLIST)) { - if (vp->v_type == VBLK || vp->v_type == VCHR) - panic("hfs_strategy: device vnode passed!"); + if (ap->a_command == F_RDADVISE) { + /* we need to inspect the decmpfs state of the file as early as possible */ + compressed = hfs_file_is_compressed(VTOC(vp), 0); + if (compressed) { + if (VNODE_IS_RSRC(vp)) { + /* if this is the resource fork, treat it as if it were empty */ + uncompressed_size = 0; + } else { + decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0); + if (decmpfs_error != 0) { + /* failed to get the uncompressed size, we'll check for this later */ + uncompressed_size = -1; + } + } + } + } +#endif /* HFS_COMPRESSION */ - if (bp->b_flags & B_PAGELIST) { - /* - * If we have a page list associated with this bp, - * then go through cluster_bp since it knows how to - * deal with a page request that might span non- - * contiguous physical blocks on the disk... 
- */ - retval = cluster_bp(bp); - vp = cp->c_devvp; - bp->b_dev = vp->v_rdev; + is64bit = proc_is64bit(p); - return (retval); + switch (ap->a_command) { + + case HFS_GETPATH: + { + struct vnode *file_vp; + cnid_t cnid; + int outlen; + char *bufptr; + int error; + + /* Caller must be owner of file system. */ + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); } - - /* - * If we don't already know the filesystem relative block - * number then get it using VOP_BMAP(). If VOP_BMAP() - * returns the block number as -1 then we've got a hole in - * the file. Although HFS filesystems don't create files with - * holes, invalidating of subranges of the file (lazy zero - * filling) may create such a situation. + /* Target vnode must be file system's root. */ + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + bufptr = (char *)ap->a_data; + cnid = strtoul(bufptr, NULL, 10); + + /* We need to call hfs_vfs_vget to leverage the code that will + * fix the origin list for us if needed, as opposed to calling + * hfs_vget, since we will need the parent for build_path call. */ - if (bp->b_blkno == bp->b_lblkno) { - offset = (off_t) bp->b_lblkno * - (off_t) GetLogicalBlockSize(vp); - - if ((retval = VOP_CMAP(vp, offset, bp->b_bcount, - &bp->b_blkno, &bytes_contig, NULL))) { - bp->b_error = retval; - bp->b_flags |= B_ERROR; - biodone(bp); - return (retval); - } - if (bytes_contig < bp->b_bcount) - { - /* - * We were asked to read a block that wasn't - * contiguous, so we have to read each of the - * pieces and copy them into the buffer. - * Since ordinary file I/O goes through - * cluster_io (which won't ask us for - * discontiguous data), this is probably an - * attempt to read or write metadata. 
- */ - return hfs_strategy_fragmented(bp); - } - if ((long)bp->b_blkno == -1) - clrbuf(bp); + + if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) { + return (error); } - if ((long)bp->b_blkno == -1) { - biodone(bp); - return (0); + error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context); + vnode_put(file_vp); + + return (error); + } + + case HFS_PREV_LINK: + case HFS_NEXT_LINK: + { + cnid_t linkfileid; + cnid_t nextlinkid; + cnid_t prevlinkid; + int error; + + /* Caller must be owner of file system. */ + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); } - if (bp->b_validend == 0) { - /* - * Record the exact size of the I/O transfer about to - * be made: - */ - bp->b_validend = bp->b_bcount; + /* Target vnode must be file system's root. */ + if (!vnode_isvroot(vp)) { + return (EINVAL); } + linkfileid = *(cnid_t *)ap->a_data; + if (linkfileid < kHFSFirstUserCatalogNodeID) { + return (EINVAL); + } + if ((error = hfs_lookuplink(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) { + return (error); + } + if (ap->a_command == HFS_NEXT_LINK) { + *(cnid_t *)ap->a_data = nextlinkid; + } else { + *(cnid_t *)ap->a_data = prevlinkid; + } + return (0); } - vp = cp->c_devvp; - bp->b_dev = vp->v_rdev; - return VOCALL (vp->v_op, VOFFSET(vop_strategy), ap); -} + case HFS_RESIZE_PROGRESS: { + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + /* file system must not be mounted read-only */ + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } -/* -# -#% truncate vp L L L -# -vop_truncate { - IN struct vnode *vp; - IN off_t length; - IN int flags; (IO_SYNC) - IN struct ucred *cred; - IN struct proc *p; -}; - * Truncate a cnode to at most length size, freeing (or adding) the - * disk blocks. 
- */ -int hfs_truncate(ap) - struct vop_truncate_args /* { - struct vnode *a_vp; - off_t a_length; - int a_flags; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; -{ - register struct vnode *vp = ap->a_vp; - register struct cnode *cp = VTOC(vp); - struct filefork *fp = VTOF(vp); - off_t length; - long vflags; - struct timeval tv; - int retval; - off_t bytesToAdd; - off_t actualBytesAdded; - off_t filebytes; - u_long fileblocks; - int blksize; - struct hfsmount *hfsmp; + return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data); + } - if (vp->v_type != VREG && vp->v_type != VLNK) - return (EISDIR); /* cannot truncate an HFS directory! */ + case HFS_RESIZE_VOLUME: { + u_int64_t newsize; + u_int64_t cursize; - length = ap->a_length; - blksize = VTOVCB(vp)->blockSize; - fileblocks = fp->ff_blocks; - filebytes = (off_t)fileblocks * (off_t)blksize; + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + + /* filesystem must not be mounted read only */ + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + newsize = *(u_int64_t *)ap->a_data; + cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; + + if (newsize > cursize) { + return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context); + } else if (newsize < cursize) { + return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context); + } else { + return (0); + } + } + case HFS_CHANGE_NEXT_ALLOCATION: { + int error = 0; /* Assume success */ + u_int32_t location; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START, - (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); + if (vnode_vfsisrdonly(vp)) { + return (EROFS); + } + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + if (!vnode_isvroot(vp)) { 
+ return (EINVAL); + } + HFS_MOUNT_LOCK(hfsmp, TRUE); + location = *(u_int32_t *)ap->a_data; + if ((location >= hfsmp->allocLimit) && + (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) { + error = EINVAL; + goto fail_change_next_allocation; + } + /* Return previous value. */ + *(u_int32_t *)ap->a_data = hfsmp->nextAllocation; + if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) { + /* On magic value for location, set nextAllocation to next block + * after metadata zone and set flag in mount structure to indicate + * that nextAllocation should not be updated again. + */ + if (hfsmp->hfs_metazone_end != 0) { + HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1); + } + hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION; + } else { + hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION; + HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location); + } + MarkVCBDirty(hfsmp); +fail_change_next_allocation: + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + return (error); + } - if (length < 0) - return (EINVAL); +#ifdef HFS_SPARSE_DEV + case HFS_SETBACKINGSTOREINFO: { + struct vnode * bsfs_rootvp; + struct vnode * di_vp; + struct hfs_backingstoreinfo *bsdata; + int error = 0; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + return (EALREADY); + } + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + bsdata = (struct hfs_backingstoreinfo *)ap->a_data; + if (bsdata == NULL) { + return (EINVAL); + } + if ((error = file_vnode(bsdata->backingfd, &di_vp))) { + return (error); + } + if ((error = vnode_getwithref(di_vp))) { + file_drop(bsdata->backingfd); + return(error); + } - if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE)) - return (EFBIG); + if (vnode_mount(vp) == vnode_mount(di_vp)) { + (void)vnode_put(di_vp); + file_drop(bsdata->backingfd); + return (EINVAL); + } - hfsmp = VTOHFS(vp); + /* + * Obtain 
the backing fs root vnode and keep a reference + * on it. This reference will be dropped in hfs_unmount. + */ + error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */ + if (error) { + (void)vnode_put(di_vp); + file_drop(bsdata->backingfd); + return (error); + } + vnode_ref(bsfs_rootvp); + vnode_put(bsfs_rootvp); - tv = time; - retval = E_NONE; + hfsmp->hfs_backingfs_rootvp = bsfs_rootvp; + hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; + hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize; + hfsmp->hfs_sparsebandblks *= 4; - /* - * We cannot just check if fp->ff_size == length (as an optimization) - * since there may be extra physical blocks that also need truncation. - */ -#if QUOTA - if (retval = hfs_getinoquota(cp)) - return(retval); -#endif /* QUOTA */ + vfs_markdependency(hfsmp->hfs_mp); - /* - * Lengthen the size of the file. We must ensure that the - * last byte of the file is allocated. Since the smallest - * value of ff_size is 0, length will be at least 1. - */ - if (length > fp->ff_size) { -#if QUOTA - retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)), - ap->a_cred, 0); - if (retval) - goto Err_Exit; -#endif /* QUOTA */ /* - * If we don't have enough physical space then - * we need to extend the physical size. + * If the sparse image is on a sparse image file (as opposed to a sparse + * bundle), then we may need to limit the free space to the maximum size + * of a file on that volume. So we query (using pathconf), and if we get + * a meaningful result, we cache the number of blocks for later use in + * hfs_freeblks(). */ - if (length > filebytes) { - int eflags; - - /* All or nothing and don't round up to clumpsize. 
*/ - eflags = kEFAllMask | kEFNoClumpMask; + hfsmp->hfs_backingfs_maxblocks = 0; + if (vnode_vtype(di_vp) == VREG) { + int terr; + int hostbits; + terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context); + if (terr == 0 && hostbits != 0 && hostbits < 64) { + u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits; + + hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize; + } + } + + (void)vnode_put(di_vp); + file_drop(bsdata->backingfd); + return (0); + } + case HFS_CLRBACKINGSTOREINFO: { + struct vnode * tmpvp; - if (suser(ap->a_cred, NULL) != 0) - eflags |= kEFReserveMask; /* keep a reserve */ + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - retval = EINVAL; - goto Err_Exit; - } - } + if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && + hfsmp->hfs_backingfs_rootvp) { - /* lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (retval) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; + tmpvp = hfsmp->hfs_backingfs_rootvp; + hfsmp->hfs_backingfs_rootvp = NULLVP; + hfsmp->hfs_sparsebandblks = 0; + vnode_rele(tmpvp); + } + return (0); + } +#endif /* HFS_SPARSE_DEV */ - goto Err_Exit; - } + case F_FREEZE_FS: { + struct mount *mp; + + mp = vnode_mount(vp); + hfsmp = VFSTOHFS(mp); - while ((length > filebytes) && (retval == E_NONE)) { - bytesToAdd = length - filebytes; - retval = MacToVFSError(ExtendFileC(VTOVCB(vp), - (FCB*)fp, - bytesToAdd, - 0, - eflags, - &actualBytesAdded)); + if (!(hfsmp->jnl)) + return (ENOTSUP); - filebytes = (off_t)fp->ff_blocks * 
(off_t)blksize; - if (actualBytesAdded == 0 && retval == E_NONE) { - if (length > filebytes) - length = filebytes; - break; - } - } /* endwhile */ + vfsp = vfs_statfs(mp); + + if (kauth_cred_getuid(cred) != vfsp->f_owner && + !kauth_cred_issuser(cred)) + return (EACCES); - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); + lck_rw_lock_exclusive(&hfsmp->hfs_insync); + + // flush things before we get started to try and prevent + // dirty data from being paged out while we're frozen. + // note: can't do this after taking the lock as it will + // deadlock against ourselves. + vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL); + hfs_global_exclusive_lock_acquire(hfsmp); + + // DO NOT call hfs_journal_flush() because that takes a + // shared lock on the global exclusive lock! + journal_flush(hfsmp->jnl); + + // don't need to iterate on all vnodes, we just need to + // wait for writes to the system files and the device vnode + if (HFSTOVCB(hfsmp)->extentsRefNum) + vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze"); + if (HFSTOVCB(hfsmp)->catalogRefNum) + vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze"); + if (HFSTOVCB(hfsmp)->allocationsRefNum) + vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze"); + if (hfsmp->hfs_attribute_vp) + vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze"); + vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze"); + + hfsmp->hfs_freezing_proc = current_proc(); - // XXXdbg - if (hfsmp->jnl) { - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + return (0); + } - if (retval) - goto Err_Exit; + case F_THAW_FS: { + vfsp = vfs_statfs(vnode_mount(vp)); + if (kauth_cred_getuid(cred) != vfsp->f_owner && + !kauth_cred_issuser(cred)) + return (EACCES); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE, - (int)length, (int)fp->ff_size, 
(int)filebytes, 0, 0); + // if we're not the one who froze the fs then we + // can't thaw it. + if (hfsmp->hfs_freezing_proc != current_proc()) { + return EPERM; } - - if (!(ap->a_flags & IO_NOZEROFILL)) { - if (UBCINFOEXISTS(vp) && retval == E_NONE) { - struct rl_entry *invalid_range; - int devBlockSize; - off_t zero_limit; - - zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64; - if (length < zero_limit) zero_limit = length; - if (length > fp->ff_size) { - /* Extending the file: time to fill out the current last page w. zeroes? */ - if ((fp->ff_size & PAGE_MASK_64) && - (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64, - fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) { - - /* There's some valid data at the start of the (current) last page - of the file, so zero out the remainder of that page to ensure the - entire page contains valid data. Since there is no invalid range - possible past the (current) eof, there's no need to remove anything - from the invalid range list before calling cluster_write(): */ - VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize); - retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit, - fp->ff_size, (off_t)0, devBlockSize, - (ap->a_flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY); - if (retval) goto Err_Exit; - - /* Merely invalidate the remaining area, if necessary: */ - if (length > zero_limit) { - rl_add(zero_limit, length - 1, &fp->ff_invalidranges); - cp->c_zftimeout = time.tv_sec + ZFTIMELIMIT; - } - } else { - /* The page containing the (current) eof is invalid: just add the - remainder of the page to the invalid list, along with the area - being newly allocated: - */ - rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges); - cp->c_zftimeout = time.tv_sec + ZFTIMELIMIT; - }; - } - } else { - panic("hfs_truncate: invoked on non-UBC object?!"); - }; - } - cp->c_flag |= C_UPDATE; - fp->ff_size = length; + // NOTE: if you add code here, also go check the + // code that "thaws" the fs in 
hfs_vnop_close() + // + hfsmp->hfs_freezing_proc = NULL; + hfs_global_exclusive_lock_release(hfsmp); + lck_rw_unlock_exclusive(&hfsmp->hfs_insync); - if (UBCISVALID(vp)) - ubc_setsize(vp, fp->ff_size); /* XXX check errors */ + return (0); + } - } else { /* Shorten the size of the file */ + case HFS_BULKACCESS_FSCTL: { + int size; + + if (hfsmp->hfs_flags & HFS_STANDARD) { + return EINVAL; + } - if (fp->ff_size > length) { - /* - * Any buffers that are past the truncation point need to be - * invalidated (to maintain buffer cache consistency). For - * simplicity, we invalidate all the buffers by calling vinvalbuf. - */ - if (UBCISVALID(vp)) - ubc_setsize(vp, length); /* XXX check errors */ + if (is64bit) { + size = sizeof(struct user64_access_t); + } else { + size = sizeof(struct user32_access_t); + } + + return do_bulk_access_check(hfsmp, vp, ap, size, context); + } - vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA; - retval = vinvalbuf(vp, vflags, ap->a_cred, ap->a_p, 0, 0); + case HFS_EXT_BULKACCESS_FSCTL: { + int size; - /* Any space previously marked as invalid is now irrelevant: */ - rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges); + if (hfsmp->hfs_flags & HFS_STANDARD) { + return EINVAL; + } + + if (is64bit) { + size = sizeof(struct user64_ext_access_t); + } else { + size = sizeof(struct user32_ext_access_t); + } + + return do_bulk_access_check(hfsmp, vp, ap, size, context); + } + + case HFS_SETACLSTATE: { + int state; + + if (ap->a_data == NULL) { + return (EINVAL); } - /* - * Account for any unmapped blocks. Note that the new - * file length can still end up with unmapped blocks. 
- */ - if (fp->ff_unallocblocks > 0) { - u_int32_t finalblks; + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + state = *(int *)ap->a_data; - /* lock extents b-tree */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, - LK_EXCLUSIVE, ap->a_p); - if (retval) - goto Err_Exit; + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + // super-user can enable or disable acl's on a volume. + // the volume owner can only enable acl's + if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) { + return (EPERM); + } + if (state == 0 || state == 1) + return hfs_set_volxattr(hfsmp, HFS_SETACLSTATE, state); + else + return (EINVAL); + } - VTOVCB(vp)->loanedBlocks -= fp->ff_unallocblocks; - cp->c_blocks -= fp->ff_unallocblocks; - fp->ff_blocks -= fp->ff_unallocblocks; - fp->ff_unallocblocks = 0; + case HFS_SET_XATTREXTENTS_STATE: { + int state; - finalblks = (length + blksize - 1) / blksize; - if (finalblks > fp->ff_blocks) { - /* calculate required unmapped blocks */ - fp->ff_unallocblocks = finalblks - fp->ff_blocks; - VTOVCB(vp)->loanedBlocks += fp->ff_unallocblocks; - cp->c_blocks += fp->ff_unallocblocks; - fp->ff_blocks += fp->ff_unallocblocks; - } - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, - LK_RELEASE, ap->a_p); + if (ap->a_data == NULL) { + return (EINVAL); } - /* - * For a TBE process the deallocation of the file blocks is - * delayed until the file is closed. And hfs_close calls - * truncate with the IO_NDELAY flag set. So when IO_NDELAY - * isn't set, we make sure this isn't a TBE process. 
+ state = *(int *)ap->a_data; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + + /* Super-user can enable or disable extent-based extended + * attribute support on a volume */ - if ((ap->a_flags & IO_NDELAY) || (!ISSET(ap->a_p->p_flag, P_TBE))) { -#if QUOTA - off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize); -#endif /* QUOTA */ - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - retval = EINVAL; - goto Err_Exit; - } - } + if (!is_suser()) { + return (EPERM); + } + if (state == 0 || state == 1) + return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state); + else + return (EINVAL); + } - /* lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (retval) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - goto Err_Exit; - } - - if (fp->ff_unallocblocks == 0) - retval = MacToVFSError(TruncateFileC(VTOVCB(vp), - (FCB*)fp, length, false)); + case F_FULLFSYNC: { + int error; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK); + if (error == 0) { + error = hfs_fsync(vp, MNT_WAIT, TRUE, p); + hfs_unlock(VTOC(vp)); + } - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); + return error; + } - // XXXdbg - if (hfsmp->jnl) { - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); + case F_CHKCLEAN: { + register struct cnode *cp; + int error; - filebytes = (off_t)fp->ff_blocks * (off_t)blksize; - if (retval) - goto Err_Exit; -#if QUOTA - /* These are bytesreleased */ - (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0); -#endif /* QUOTA */ + if (!vnode_isreg(vp)) + return EINVAL; + + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK); + if (error == 
0) { + cp = VTOC(vp); + /* + * used by regression test to determine if + * all the dirty pages (via write) have been cleaned + * after a call to 'fsysnc'. + */ + error = is_file_clean(vp, VTOF(vp)->ff_size); + hfs_unlock(cp); } - /* Only set update flag if the logical length changes */ - if (fp->ff_size != length) - cp->c_flag |= C_UPDATE; - fp->ff_size = length; + return (error); } - cp->c_flag |= C_CHANGE; - retval = VOP_UPDATE(vp, &tv, &tv, MNT_WAIT); - if (retval) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE, - -1, -1, -1, retval, 0); + + case F_RDADVISE: { + register struct radvisory *ra; + struct filefork *fp; + int error; + + if (!vnode_isreg(vp)) + return EINVAL; + + ra = (struct radvisory *)(ap->a_data); + fp = VTOF(vp); + + /* Protect against a size change. */ + hfs_lock_truncate(VTOC(vp), TRUE); + +#if HFS_COMPRESSION + if (compressed && (uncompressed_size == -1)) { + /* fetching the uncompressed size failed above, so return the error */ + error = decmpfs_error; + } else if ((compressed && (ra->ra_offset >= uncompressed_size)) || + (!compressed && (ra->ra_offset >= fp->ff_size))) { + error = EFBIG; + } +#else /* HFS_COMPRESSION */ + if (ra->ra_offset >= fp->ff_size) { + error = EFBIG; + } +#endif /* HFS_COMPRESSION */ + else { + error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count); + } + + hfs_unlock_truncate(VTOC(vp), TRUE); + return (error); + } + + case F_READBOOTSTRAP: + case F_WRITEBOOTSTRAP: + { + struct vnode *devvp = NULL; + user_fbootstraptransfer_t *user_bootstrapp; + int devBlockSize; + int error; + uio_t auio; + daddr64_t blockNumber; + u_int32_t blockOffset; + u_int32_t xfersize; + struct buf *bp; + user_fbootstraptransfer_t user_bootstrap; + + if (!vnode_isvroot(vp)) + return (EINVAL); + /* LP64 - when caller is a 64 bit process then we are passed a pointer + * to a user_fbootstraptransfer_t else we get a pointer to a + * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t + */ + if 
(hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + if (is64bit) { + user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data; + } + else { + user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data; + user_bootstrapp = &user_bootstrap; + user_bootstrap.fbt_offset = bootstrapp->fbt_offset; + user_bootstrap.fbt_length = bootstrapp->fbt_length; + user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer); + } + if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024) + return EINVAL; + + devvp = VTOHFS(vp)->hfs_devvp; + auio = uio_create(1, user_bootstrapp->fbt_offset, + is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32, + (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ); + uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length); + + devBlockSize = vfs_devblocksize(vnode_mount(vp)); + + while (uio_resid(auio) > 0) { + blockNumber = uio_offset(auio) / devBlockSize; + error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp); + if (error) { + if (bp) buf_brelse(bp); + uio_free(auio); + return error; + }; + + blockOffset = uio_offset(auio) % devBlockSize; + xfersize = devBlockSize - blockOffset; + error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio); + if (error) { + buf_brelse(bp); + uio_free(auio); + return error; + }; + if (uio_rw(auio) == UIO_WRITE) { + error = VNOP_BWRITE(bp); + if (error) { + uio_free(auio); + return error; + } + } else { + buf_brelse(bp); + }; + }; + uio_free(auio); + }; + return 0; + + case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */ + { + if (is64bit) { + *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate)); + } + else { + *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate)); + } + return 0; + } + + case SPOTLIGHT_FSCTL_GET_MOUNT_TIME: + *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time; + break; + + case SPOTLIGHT_FSCTL_GET_LAST_MTIME: + 
*(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime; + break; + + case HFS_FSCTL_SET_VERY_LOW_DISK: + if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) { + return EINVAL; + } + + hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data; + break; + + case HFS_FSCTL_SET_LOW_DISK: + if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel + || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) { + + return EINVAL; + } + + hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data; + break; + + case HFS_FSCTL_SET_DESIRED_DISK: + if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) { + return EINVAL; + } + + hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data; + break; + + case HFS_VOLUME_STATUS: + *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions; + break; + + case HFS_SET_BOOT_INFO: + if (!vnode_isvroot(vp)) + return(EINVAL); + if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner)) + return(EACCES); /* must be superuser or owner of filesystem */ + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + HFS_MOUNT_LOCK(hfsmp, TRUE); + bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo)); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); + break; + + case HFS_GET_BOOT_INFO: + if (!vnode_isvroot(vp)) + return(EINVAL); + HFS_MOUNT_LOCK(hfsmp, TRUE); + bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo)); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + break; + + case HFS_MARK_BOOT_CORRUPT: + /* Mark the boot volume corrupt by setting + * kHFSVolumeInconsistentBit in the volume header. This will + * force fsck_hfs on next mount. 
+ */ + if (!is_suser()) { + return EACCES; + } + + /* Allowed only on the root vnode of the boot volume */ + if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) || + !vnode_isvroot(vp)) { + return EINVAL; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n"); + hfs_mark_volume_inconsistent(hfsmp); + break; + + case HFS_FSCTL_GET_JOURNAL_INFO: + jip = (struct hfs_journal_info*)ap->a_data; + + if (vp == NULLVP) + return EINVAL; + + if (hfsmp->jnl == NULL) { + jnl_start = 0; + jnl_size = 0; + } else { + jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset; + jnl_size = (off_t)hfsmp->jnl_size; + } + + jip->jstart = jnl_start; + jip->jsize = jnl_size; + break; + + case HFS_SET_ALWAYS_ZEROFILL: { + struct cnode *cp = VTOC(vp); + + if (*(int *)ap->a_data) { + cp->c_flag |= C_ALWAYS_ZEROFILL; + } else { + cp->c_flag &= ~C_ALWAYS_ZEROFILL; + } + break; + } + + default: + return (ENOTTY); + } + + return 0; +} + +/* + * select + */ +int +hfs_vnop_select(__unused struct vnop_select_args *ap) +/* + struct vnop_select_args { + vnode_t a_vp; + int a_which; + int a_fflags; + void *a_wql; + vfs_context_t a_context; + }; +*/ +{ + /* + * We should really check to see if I/O is possible. + */ + return (1); +} + +/* + * Converts a logical block number to a physical block, and optionally returns + * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize. + * The physical block number is based on the device block size, currently its 512. 
+ * The block run is returned in logical blocks, and is the REMAINING amount of blocks + */ +int +hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp) +{ + struct filefork *fp = VTOF(vp); + struct hfsmount *hfsmp = VTOHFS(vp); + int retval = E_NONE; + u_int32_t logBlockSize; + size_t bytesContAvail = 0; + off_t blockposition; + int lockExtBtree; + int lockflags = 0; + + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. + */ + if (vpp != NULL) + *vpp = hfsmp->hfs_devvp; + if (bnp == NULL) + return (0); + + logBlockSize = GetLogicalBlockSize(vp); + blockposition = (off_t)bn * logBlockSize; + + lockExtBtree = overflow_extents(fp); + + if (lockExtBtree) + lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + + retval = MacToVFSError( + MapFileBlockC (HFSTOVCB(hfsmp), + (FCB*)fp, + MAXPHYSIO, + blockposition, + bnp, + &bytesContAvail)); + + if (lockExtBtree) + hfs_systemfile_unlock(hfsmp, lockflags); + + if (retval == E_NONE) { + /* Figure out how many read ahead blocks there are */ + if (runp != NULL) { + if (can_cluster(logBlockSize)) { + /* Make sure this result never goes negative: */ + *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1; + } else { + *runp = 0; + } + } + } + return (retval); +} + +/* + * Convert logical block number to file offset. + */ +int +hfs_vnop_blktooff(struct vnop_blktooff_args *ap) +/* + struct vnop_blktooff_args { + vnode_t a_vp; + daddr64_t a_lblkno; + off_t *a_offset; + }; +*/ +{ + if (ap->a_vp == NULL) + return (EINVAL); + *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp); + + return(0); +} + +/* + * Convert file offset to logical block number. 
+ */ +int +hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap) +/* + struct vnop_offtoblk_args { + vnode_t a_vp; + off_t a_offset; + daddr64_t *a_lblkno; + }; +*/ +{ + if (ap->a_vp == NULL) + return (EINVAL); + *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp)); + + return(0); +} + +/* + * Map file offset to physical block number. + * + * If this function is called for write operation, and if the file + * had virtual blocks allocated (delayed allocation), real blocks + * are allocated by calling ExtendFileC(). + * + * If this function is called for read operation, and if the file + * had virtual blocks allocated (delayed allocation), no change + * to the size of file is done, and if required, rangelist is + * searched for mapping. + * + * System file cnodes are expected to be locked (shared or exclusive). + */ +int +hfs_vnop_blockmap(struct vnop_blockmap_args *ap) +/* + struct vnop_blockmap_args { + vnode_t a_vp; + off_t a_foffset; + size_t a_size; + daddr64_t *a_bpn; + size_t *a_run; + void *a_poff; + int a_flags; + vfs_context_t a_context; + }; +*/ +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct filefork *fp; + struct hfsmount *hfsmp; + size_t bytesContAvail = 0; + int retval = E_NONE; + int syslocks = 0; + int lockflags = 0; + struct rl_entry *invalid_range; + enum rl_overlaptype overlaptype; + int started_tr = 0; + int tooklock = 0; + +#if HFS_COMPRESSION + if (VNODE_IS_RSRC(vp)) { + /* allow blockmaps to the resource fork */ + } else { + if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */ + int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp)); + switch(state) { + case FILE_IS_COMPRESSED: + return ENOTSUP; + case FILE_IS_CONVERTING: + /* if FILE_IS_CONVERTING, we allow blockmap */ + break; + default: + printf("invalid state %d for compressed file\n", state); + /* fall through */ + } + } + } +#endif /* HFS_COMPRESSION */ + + /* Do not allow blockmap operation on a directory */ + if 
(vnode_isdir(vp)) { + return (ENOTSUP); + } + + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. + */ + if (ap->a_bpn == NULL) + return (0); + + if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) { + if (VTOC(vp)->c_lockowner != current_thread()) { + hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + tooklock = 1; + } + } + hfsmp = VTOHFS(vp); + cp = VTOC(vp); + fp = VTOF(vp); + +retry: + /* Check virtual blocks only when performing write operation */ + if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) { + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto exit; + } else { + started_tr = 1; + } + syslocks = SFL_EXTENTS | SFL_BITMAP; + + } else if (overflow_extents(fp)) { + syslocks = SFL_EXTENTS; + } + + if (syslocks) + lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK); + + /* + * Check for any delayed allocations. + */ + if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) { + int64_t actbytes; + u_int32_t loanedBlocks; + + // + // Make sure we have a transaction. It's possible + // that we came in and fp->ff_unallocblocks was zero + // but during the time we blocked acquiring the extents + // btree, ff_unallocblocks became non-zero and so we + // will need to start a transaction. + // + if (started_tr == 0) { + if (syslocks) { + hfs_systemfile_unlock(hfsmp, lockflags); + syslocks = 0; + } + goto retry; + } + + /* + * Note: ExtendFileC will Release any blocks on loan and + * aquire real blocks. So we ask to extend by zero bytes + * since ExtendFileC will account for the virtual blocks. 
+ */ + + loanedBlocks = fp->ff_unallocblocks; + retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0, + kEFAllMask | kEFNoClumpMask, &actbytes); + + if (retval) { + fp->ff_unallocblocks = loanedBlocks; + cp->c_blocks += loanedBlocks; + fp->ff_blocks += loanedBlocks; + + HFS_MOUNT_LOCK(hfsmp, TRUE); + hfsmp->loanedBlocks += loanedBlocks; + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + + hfs_systemfile_unlock(hfsmp, lockflags); + cp->c_flag |= C_MODIFIED; + if (started_tr) { + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + + hfs_end_transaction(hfsmp); + started_tr = 0; + } + goto exit; + } + } + + retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset, + ap->a_bpn, &bytesContAvail); + if (syslocks) { + hfs_systemfile_unlock(hfsmp, lockflags); + syslocks = 0; + } + + if (started_tr) { + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + hfs_end_transaction(hfsmp); + started_tr = 0; + } + if (retval) { + /* On write, always return error because virtual blocks, if any, + * should have been allocated in ExtendFileC(). We do not + * allocate virtual blocks on read, therefore return error + * only if no virtual blocks are allocated. Otherwise we search + * rangelist for zero-fills + */ + if ((MacToVFSError(retval) != ERANGE) || + (ap->a_flags & VNODE_WRITE) || + ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) { + goto exit; + } + + /* Validate if the start offset is within logical file size */ + if (ap->a_foffset > fp->ff_size) { + goto exit; + } + + /* Searching file extents has failed for read operation, therefore + * search rangelist for any uncommitted holes in the file. 
+ */ + overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset, + ap->a_foffset + (off_t)(ap->a_size - 1), + &invalid_range); + switch(overlaptype) { + case RL_OVERLAPISCONTAINED: + /* start_offset <= rl_start, end_offset >= rl_end */ + if (ap->a_foffset != invalid_range->rl_start) { + break; + } + case RL_MATCHINGOVERLAP: + /* start_offset = rl_start, end_offset = rl_end */ + case RL_OVERLAPCONTAINSRANGE: + /* start_offset >= rl_start, end_offset <= rl_end */ + case RL_OVERLAPSTARTSBEFORE: + /* start_offset > rl_start, end_offset >= rl_start */ + if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) { + bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset; + } else { + bytesContAvail = fp->ff_size - ap->a_foffset; + } + if (bytesContAvail > ap->a_size) { + bytesContAvail = ap->a_size; + } + *ap->a_bpn = (daddr64_t)-1; + retval = 0; + break; + case RL_OVERLAPENDSAFTER: + /* start_offset < rl_start, end_offset < rl_end */ + case RL_NOOVERLAP: + break; + } + goto exit; + } + + /* MapFileBlockC() found a valid extent in the filefork. Search the + * mapping information further for invalid file ranges + */ + overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset, + ap->a_foffset + (off_t)bytesContAvail - 1, + &invalid_range); + if (overlaptype != RL_NOOVERLAP) { + switch(overlaptype) { + case RL_MATCHINGOVERLAP: + case RL_OVERLAPCONTAINSRANGE: + case RL_OVERLAPSTARTSBEFORE: + /* There's no valid block for this byte offset */ + *ap->a_bpn = (daddr64_t)-1; + /* There's no point limiting the amount to be returned + * if the invalid range that was hit extends all the way + * to the EOF (i.e. 
there's no valid bytes between the + * end of this range and the file's EOF): + */ + if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && + ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { + bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; + } + break; + + case RL_OVERLAPISCONTAINED: + case RL_OVERLAPENDSAFTER: + /* The range of interest hits an invalid block before the end: */ + if (invalid_range->rl_start == ap->a_foffset) { + /* There's actually no valid information to be had starting here: */ + *ap->a_bpn = (daddr64_t)-1; + if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && + ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { + bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; + } + } else { + bytesContAvail = invalid_range->rl_start - ap->a_foffset; + } + break; + + case RL_NOOVERLAP: + break; + } /* end switch */ + if (bytesContAvail > ap->a_size) + bytesContAvail = ap->a_size; + } + +exit: + if (retval == 0) { + if (ap->a_run) + *ap->a_run = bytesContAvail; + + if (ap->a_poff) + *(int *)ap->a_poff = 0; + } + + if (tooklock) + hfs_unlock(cp); + + return (MacToVFSError(retval)); +} + + +/* + * prepare and issue the I/O + * buf_strategy knows how to deal + * with requests that require + * fragmented I/Os + */ +int +hfs_vnop_strategy(struct vnop_strategy_args *ap) +{ + buf_t bp = ap->a_bp; + vnode_t vp = buf_vnode(bp); + + return (buf_strategy(VTOHFS(vp)->hfs_devvp, ap)); +} + +static int +hfs_minorupdate(struct vnode *vp) { + struct cnode *cp = VTOC(vp); + cp->c_flag &= ~C_MODIFIED; + cp->c_touch_acctime = 0; + cp->c_touch_chgtime = 0; + cp->c_touch_modtime = 0; + + return 0; +} + +static int +do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context) +{ + register struct cnode *cp = VTOC(vp); + struct filefork *fp = VTOF(vp); + struct proc *p = vfs_context_proc(context);; + kauth_cred_t cred = vfs_context_ucred(context); + int retval; + off_t 
bytesToAdd; + off_t actualBytesAdded; + off_t filebytes; + u_int32_t fileblocks; + int blksize; + struct hfsmount *hfsmp; + int lockflags; + + blksize = VTOVCB(vp)->blockSize; + fileblocks = fp->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START, + (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); + + if (length < 0) + return (EINVAL); + + /* This should only happen with a corrupt filesystem */ + if ((off_t)fp->ff_size < 0) + return (EINVAL); + + if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE)) + return (EFBIG); + + hfsmp = VTOHFS(vp); + + retval = E_NONE; + + /* Files that are changing size are not hot file candidates. */ + if (hfsmp->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + + /* + * We cannot just check if fp->ff_size == length (as an optimization) + * since there may be extra physical blocks that also need truncation. + */ +#if QUOTA + if ((retval = hfs_getinoquota(cp))) + return(retval); +#endif /* QUOTA */ + + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of ff_size is 0, length will be at least 1. + */ + if (length > (off_t)fp->ff_size) { +#if QUOTA + retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)), + cred, 0); + if (retval) + goto Err_Exit; +#endif /* QUOTA */ + /* + * If we don't have enough physical space then + * we need to extend the physical size. + */ + if (length > filebytes) { + int eflags; + u_int32_t blockHint = 0; + + /* All or nothing and don't round up to clumpsize. */ + eflags = kEFAllMask | kEFNoClumpMask; + + if (cred && suser(cred, NULL) != 0) + eflags |= kEFReserveMask; /* keep a reserve */ + + /* + * Allocate Journal and Quota files in metadata zone. 
+ */ + if (filebytes == 0 && + hfsmp->hfs_flags & HFS_METADATA_ZONE && + hfs_virtualmetafile(cp)) { + eflags |= kEFMetadataMask; + blockHint = hfsmp->hfs_metazone_start; + } + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; + } + + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + while ((length > filebytes) && (retval == E_NONE)) { + bytesToAdd = length - filebytes; + retval = MacToVFSError(ExtendFileC(VTOVCB(vp), + (FCB*)fp, + bytesToAdd, + blockHint, + eflags, + &actualBytesAdded)); + + filebytes = (off_t)fp->ff_blocks * (off_t)blksize; + if (actualBytesAdded == 0 && retval == E_NONE) { + if (length > filebytes) + length = filebytes; + break; + } + } /* endwhile */ + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (hfsmp->jnl) { + if (skipupdate) { + (void) hfs_minorupdate(vp); + } + else { + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + } + } + + hfs_end_transaction(hfsmp); + + if (retval) + goto Err_Exit; + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE, + (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); + } + + if (!(flags & IO_NOZEROFILL)) { + if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) { + struct rl_entry *invalid_range; + off_t zero_limit; + + zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64; + if (length < zero_limit) zero_limit = length; + + if (length > (off_t)fp->ff_size) { + struct timeval tv; + + /* Extending the file: time to fill out the current last page w. zeroes? 
*/ + if ((fp->ff_size & PAGE_MASK_64) && + (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64, + fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) { + + /* There's some valid data at the start of the (current) last page + of the file, so zero out the remainder of that page to ensure the + entire page contains valid data. Since there is no invalid range + possible past the (current) eof, there's no need to remove anything + from the invalid range list before calling cluster_write(): */ + hfs_unlock(cp); + retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit, + fp->ff_size, (off_t)0, + (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY); + hfs_lock(cp, HFS_FORCE_LOCK); + if (retval) goto Err_Exit; + + /* Merely invalidate the remaining area, if necessary: */ + if (length > zero_limit) { + microuptime(&tv); + rl_add(zero_limit, length - 1, &fp->ff_invalidranges); + cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; + } + } else { + /* The page containing the (current) eof is invalid: just add the + remainder of the page to the invalid list, along with the area + being newly allocated: + */ + microuptime(&tv); + rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges); + cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; + }; + } + } else { + panic("hfs_truncate: invoked on non-UBC object?!"); + }; + } + cp->c_touch_modtime = TRUE; + fp->ff_size = length; + + } else { /* Shorten the size of the file */ + + if ((off_t)fp->ff_size > length) { + /* Any space previously marked as invalid is now irrelevant: */ + rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges); + } + + /* + * Account for any unmapped blocks. Note that the new + * file length can still end up with unmapped blocks. 
+ */ + if (fp->ff_unallocblocks > 0) { + u_int32_t finalblks; + u_int32_t loanedBlocks; + + HFS_MOUNT_LOCK(hfsmp, TRUE); + + loanedBlocks = fp->ff_unallocblocks; + cp->c_blocks -= loanedBlocks; + fp->ff_blocks -= loanedBlocks; + fp->ff_unallocblocks = 0; + + hfsmp->loanedBlocks -= loanedBlocks; + + finalblks = (length + blksize - 1) / blksize; + if (finalblks > fp->ff_blocks) { + /* calculate required unmapped blocks */ + loanedBlocks = finalblks - fp->ff_blocks; + hfsmp->loanedBlocks += loanedBlocks; + + fp->ff_unallocblocks = loanedBlocks; + cp->c_blocks += loanedBlocks; + fp->ff_blocks += loanedBlocks; + } + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + } + + /* + * For a TBE process the deallocation of the file blocks is + * delayed until the file is closed. And hfs_close calls + * truncate with the IO_NDELAY flag set. So when IO_NDELAY + * isn't set, we make sure this isn't a TBE process. + */ + if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) { +#if QUOTA + off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize); +#endif /* QUOTA */ + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; + } + + if (fp->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + retval = MacToVFSError(TruncateFileC(VTOVCB(vp), + (FCB*)fp, length, false)); + + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (hfsmp->jnl) { + if (retval == 0) { + fp->ff_size = length; + } + if (skipupdate) { + (void) hfs_minorupdate(vp); + } + else { + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + } + } + hfs_end_transaction(hfsmp); + + filebytes = (off_t)fp->ff_blocks * (off_t)blksize; + if (retval) + goto Err_Exit; +#if QUOTA + /* These are bytesreleased */ + (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0); +#endif /* QUOTA */ + } + /* Only set update flag 
if the logical length changes */ + if ((off_t)fp->ff_size != length) + cp->c_touch_modtime = TRUE; + fp->ff_size = length; + } + if (cp->c_mode & (S_ISUID | S_ISGID)) { + if (!vfs_context_issuser(context)) { + cp->c_mode &= ~(S_ISUID | S_ISGID); + skipupdate = 0; + } + } + if (skipupdate) { + retval = hfs_minorupdate(vp); + } + else { + cp->c_touch_chgtime = TRUE; /* status changed */ + cp->c_touch_modtime = TRUE; /* file data was modified */ + retval = hfs_update(vp, MNT_WAIT); + } + if (retval) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE, + -1, -1, -1, retval, 0); + } + +Err_Exit: + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END, + (int)length, (int)fp->ff_size, (int)filebytes, retval, 0); + + return (retval); +} + + + +/* + * Truncate a cnode to at most length size, freeing (or adding) the + * disk blocks. + */ +__private_extern__ +int +hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, + int skipupdate, vfs_context_t context) +{ + struct filefork *fp = VTOF(vp); + off_t filebytes; + u_int32_t fileblocks; + int blksize, error = 0; + struct cnode *cp = VTOC(vp); + + /* Cannot truncate an HFS directory! */ + if (vnode_isdir(vp)) { + return (EISDIR); + } + /* A swap file cannot change size. */ + if (vnode_isswap(vp) && (length != 0)) { + return (EPERM); + } + + blksize = VTOVCB(vp)->blockSize; + fileblocks = fp->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + // + // Have to do this here so that we don't wind up with + // i/o pending for blocks that are about to be released + // if we truncate the file. + // + // If skipsetsize is set, then the caller is responsible + // for the ubc_setsize. + // + // Even if skipsetsize is set, if the length is zero we + // want to call ubc_setsize() because as of SnowLeopard + // it will no longer cause any page-ins and it will drop + // any dirty pages so that we don't do any i/o that we + // don't have to. 
This also prevents a race where i/o + // for truncated blocks may overwrite later data if the + // blocks get reallocated to a different file. + // + if (!skipsetsize || length == 0) + ubc_setsize(vp, length); + + // have to loop truncating or growing files that are + // really big because otherwise transactions can get + // enormous and consume too many kernel resources. + + if (length < filebytes) { + while (filebytes > length) { + if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = length; + } + cp->c_flag |= C_FORCEUPDATE; + error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context); + if (error) + break; + } + } else if (length > filebytes) { + while (filebytes < length) { + if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) { + filebytes += HFS_BIGFILE_SIZE; + } else { + filebytes = length; + } + cp->c_flag |= C_FORCEUPDATE; + error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context); + if (error) + break; + } + } else /* Same logical size */ { + + error = do_hfs_truncate(vp, length, flags, skipupdate, context); + } + /* Files that are changing size are not hot file candidates. */ + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + + return (error); +} + + + +/* + * Preallocate file storage space. 
+ */ +int +hfs_vnop_allocate(struct vnop_allocate_args /* { + vnode_t a_vp; + off_t a_length; + u_int32_t a_flags; + off_t *a_bytesallocated; + off_t a_offset; + vfs_context_t a_context; + } */ *ap) +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct filefork *fp; + ExtendedVCB *vcb; + off_t length = ap->a_length; + off_t startingPEOF; + off_t moreBytesRequested; + off_t actualBytesAdded; + off_t filebytes; + u_int32_t fileblocks; + int retval, retval2; + u_int32_t blockHint; + u_int32_t extendFlags; /* For call to ExtendFileC */ + struct hfsmount *hfsmp; + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + int lockflags; + + *(ap->a_bytesallocated) = 0; + + if (!vnode_isreg(vp)) + return (EISDIR); + if (length < (off_t)0) + return (EINVAL); + + cp = VTOC(vp); + + hfs_lock_truncate(cp, TRUE); + + if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { + goto Err_Exit; + } + + fp = VTOF(vp); + hfsmp = VTOHFS(vp); + vcb = VTOVCB(vp); + + fileblocks = fp->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)vcb->blockSize; + + if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) { + retval = EINVAL; + goto Err_Exit; + } + + /* Fill in the flags word for the call to Extend the file */ + + extendFlags = kEFNoClumpMask; + if (ap->a_flags & ALLOCATECONTIG) + extendFlags |= kEFContigMask; + if (ap->a_flags & ALLOCATEALL) + extendFlags |= kEFAllMask; + if (cred && suser(cred, NULL) != 0) + extendFlags |= kEFReserveMask; + if (hfs_virtualmetafile(cp)) + extendFlags |= kEFMetadataMask; + + retval = E_NONE; + blockHint = 0; + startingPEOF = filebytes; + + if (ap->a_flags & ALLOCATEFROMPEOF) + length += filebytes; + else if (ap->a_flags & ALLOCATEFROMVOL) + blockHint = ap->a_offset / VTOVCB(vp)->blockSize; + + /* If no changes are necessary, then we're done */ + if (filebytes == length) + goto Std_Exit; + + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. 
Since the smallest + * value of filebytes is 0, length will be at least 1. + */ + if (length > filebytes) { + off_t total_bytes_added = 0, orig_request_size; + + orig_request_size = moreBytesRequested = length - filebytes; + +#if QUOTA + retval = hfs_chkdq(cp, + (int64_t)(roundup(moreBytesRequested, vcb->blockSize)), + cred, 0); + if (retval) + goto Err_Exit; + +#endif /* QUOTA */ + /* + * Metadata zone checks. + */ + if (hfsmp->hfs_flags & HFS_METADATA_ZONE) { + /* + * Allocate Journal and Quota files in metadata zone. + */ + if (hfs_virtualmetafile(cp)) { + blockHint = hfsmp->hfs_metazone_start; + } else if ((blockHint >= hfsmp->hfs_metazone_start) && + (blockHint <= hfsmp->hfs_metazone_end)) { + /* + * Move blockHint outside metadata zone. + */ + blockHint = hfsmp->hfs_metazone_end + 1; + } + } + + + while ((length > filebytes) && (retval == E_NONE)) { + off_t bytesRequested; + + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; + } + + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + if (moreBytesRequested >= HFS_BIGFILE_SIZE) { + bytesRequested = HFS_BIGFILE_SIZE; + } else { + bytesRequested = moreBytesRequested; + } + + if (extendFlags & kEFContigMask) { + // if we're on a sparse device, this will force it to do a + // full scan to find the space needed. 
+ hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN; + } + + retval = MacToVFSError(ExtendFileC(vcb, + (FCB*)fp, + bytesRequested, + blockHint, + extendFlags, + &actualBytesAdded)); + + if (retval == E_NONE) { + *(ap->a_bytesallocated) += actualBytesAdded; + total_bytes_added += actualBytesAdded; + moreBytesRequested -= actualBytesAdded; + if (blockHint != 0) { + blockHint += actualBytesAdded / vcb->blockSize; + } + } + filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (hfsmp->jnl) { + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + } + + hfs_end_transaction(hfsmp); + } + + + /* + * if we get an error and no changes were made then exit + * otherwise we must do the hfs_update to reflect the changes + */ + if (retval && (startingPEOF == filebytes)) + goto Err_Exit; + + /* + * Adjust actualBytesAdded to be allocation block aligned, not + * clump size aligned. + * NOTE: So what we are reporting does not affect reality + * until the file is closed, when we truncate the file to allocation + * block size. + */ + if (total_bytes_added != 0 && orig_request_size < total_bytes_added) + *(ap->a_bytesallocated) = + roundup(orig_request_size, (off_t)vcb->blockSize); + + } else { /* Shorten the size of the file */ + + if (fp->ff_size > length) { + /* + * Any buffers that are past the truncation point need to be + * invalidated (to maintain buffer cache consistency). 
+ */ + } + + retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context); + filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; + + /* + * if we get an error and no changes were made then exit + * otherwise we must do the hfs_update to reflect the changes + */ + if (retval && (startingPEOF == filebytes)) goto Err_Exit; +#if QUOTA + /* These are bytesreleased */ + (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0); +#endif /* QUOTA */ + + if (fp->ff_size > filebytes) { + fp->ff_size = filebytes; + + hfs_unlock(cp); + ubc_setsize(vp, fp->ff_size); + hfs_lock(cp, HFS_FORCE_LOCK); + } + } + +Std_Exit: + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + retval2 = hfs_update(vp, MNT_WAIT); + + if (retval == 0) + retval = retval2; +Err_Exit: + hfs_unlock_truncate(cp, TRUE); + hfs_unlock(cp); + return (retval); +} + + +/* + * Pagein for HFS filesystem + */ +int +hfs_vnop_pagein(struct vnop_pagein_args *ap) +/* + struct vnop_pagein_args { + vnode_t a_vp, + upl_t a_pl, + vm_offset_t a_pl_offset, + off_t a_f_offset, + size_t a_size, + int a_flags + vfs_context_t a_context; + }; +*/ +{ + vnode_t vp = ap->a_vp; + int error; + +#if HFS_COMPRESSION + if (VNODE_IS_RSRC(vp)) { + /* allow pageins of the resource fork */ + } else { + int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ + if (compressed) { + error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp)); + if (compressed) { + if (error == 0) { + /* successful page-in, update the access time */ + VTOC(vp)->c_touch_acctime = TRUE; + + /* compressed files are not hot file candidates */ + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { + VTOF(vp)->ff_bytesread = 0; + } + } + return error; + } + /* otherwise the file was converted back to a regular file while we were reading it */ + } + } +#endif + + error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, + ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags); + /* + * Keep track of blocks 
read. + */ + if (!vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) { + struct cnode *cp; + struct filefork *fp; + int bytesread; + int took_cnode_lock = 0; + + cp = VTOC(vp); + fp = VTOF(vp); + + if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE) + bytesread = fp->ff_size; + else + bytesread = ap->a_size; + + /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ + if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) { + hfs_lock(cp, HFS_FORCE_LOCK); + took_cnode_lock = 1; + } + /* + * If this file hasn't been seen since the start of + * the current sampling period then start over. + */ + if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { + struct timeval tv; + + fp->ff_bytesread = bytesread; + microtime(&tv); + cp->c_atime = tv.tv_sec; + } else { + fp->ff_bytesread += bytesread; + } + cp->c_touch_acctime = TRUE; + if (took_cnode_lock) + hfs_unlock(cp); + } + return (error); +} + +/* + * Pageout for HFS filesystem. + */ +int +hfs_vnop_pageout(struct vnop_pageout_args *ap) +/* + struct vnop_pageout_args { + vnode_t a_vp, + upl_t a_pl, + vm_offset_t a_pl_offset, + off_t a_f_offset, + size_t a_size, + int a_flags + vfs_context_t a_context; + }; +*/ +{ + vnode_t vp = ap->a_vp; + struct cnode *cp; + struct filefork *fp; + int retval = 0; + off_t filesize; + upl_t upl; + upl_page_info_t* pl; + vm_offset_t a_pl_offset; + int a_flags; + int is_pageoutv2 = 0; + + cp = VTOC(vp); + fp = VTOF(vp); + + /* + * Figure out where the file ends, for pageout purposes. If + * ff_new_size > ff_size, then we're in the middle of extending the + * file via a write, so it is safe (and necessary) that we be able + * to pageout up to that point. 
+ */ + filesize = fp->ff_size; + if (fp->ff_new_size > filesize) + filesize = fp->ff_new_size; + + a_flags = ap->a_flags; + a_pl_offset = ap->a_pl_offset; + + /* + * we can tell if we're getting the new or old behavior from the UPL + */ + if ((upl = ap->a_pl) == NULL) { + int request_flags; + + is_pageoutv2 = 1; + /* + * we're in control of any UPL we commit + * make sure someone hasn't accidentally passed in UPL_NOCOMMIT + */ + a_flags &= ~UPL_NOCOMMIT; + a_pl_offset = 0; + + /* + * take truncate lock (shared) to guard against + * zero-fill thru fsync interfering, but only for v2 + */ + hfs_lock_truncate(cp, 0); + + if (a_flags & UPL_MSYNC) { + request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY; + } + else { + request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY; + } + ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags); + + if (upl == (upl_t) NULL) { + retval = EINVAL; + goto pageout_done; + } + } + /* + * from this point forward upl points at the UPL we're working with + * it was either passed in or we successfully created it + */ + + /* + * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own + * UPL instead of relying on the UPL passed into us. We go ahead and do that here, + * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for + * N dirty ranges in the UPL. Note that this is almost a direct copy of the + * logic in vnode_pageout except that we need to do it after grabbing the truncate + * lock in HFS so that we don't lock invert ourselves. + * + * Note that we can still get into this function on behalf of the default pager with + * non-V2 behavior (swapfiles). However in that case, we did not grab locks above + * since fsync and other writing threads will grab the locks, then mark the + * relevant pages as busy. But the pageout codepath marks the pages as busy, + * and THEN would attempt to grab the truncate lock, which would result in deadlock. 
So + * we do not try to grab anything for the pre-V2 case, which should only be accessed + * by the paging/VM system. + */ + + if (is_pageoutv2) { + off_t f_offset; + int offset; + int isize; + int pg_index; + int error; + int error_ret = 0; + + isize = ap->a_size; + f_offset = ap->a_f_offset; + + /* + * Scan from the back to find the last page in the UPL, so that we + * aren't looking at a UPL that may have already been freed by the + * preceding aborts/completions. + */ + for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) { + if (upl_page_present(pl, --pg_index)) + break; + if (pg_index == 0) { + ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY); + goto pageout_done; + } + } + + /* + * initialize the offset variables before we touch the UPL. + * a_f_offset is the position into the file, in bytes + * offset is the position into the UPL, in bytes + * pg_index is the pg# of the UPL we're operating on. + * isize is the offset into the UPL of the last non-clean page. + */ + isize = ((pg_index + 1) * PAGE_SIZE); + + offset = 0; + pg_index = 0; + + while (isize) { + int xsize; + int num_of_pages; + + if ( !upl_page_present(pl, pg_index)) { + /* + * we asked for RET_ONLY_DIRTY, so it's possible + * to get back empty slots in the UPL. + * just skip over them + */ + f_offset += PAGE_SIZE; + offset += PAGE_SIZE; + isize -= PAGE_SIZE; + pg_index++; + + continue; + } + if ( !upl_dirty_page(pl, pg_index)) { + panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl); + } + + /* + * We know that we have at least one dirty page. 
+ * Now checking to see how many in a row we have + */ + num_of_pages = 1; + xsize = isize - PAGE_SIZE; + + while (xsize) { + if ( !upl_dirty_page(pl, pg_index + num_of_pages)) + break; + num_of_pages++; + xsize -= PAGE_SIZE; + } + xsize = num_of_pages * PAGE_SIZE; + + if (!vnode_isswap(vp)) { + off_t end_of_range; + int tooklock; + + tooklock = 0; + + if (cp->c_lockowner != current_thread()) { + if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { + /* + * we're in the v2 path, so we are the + * owner of the UPL... we may have already + * processed some of the UPL, so abort it + * from the current working offset to the + * end of the UPL + */ + ubc_upl_abort_range(upl, + offset, + ap->a_size - offset, + UPL_ABORT_FREE_ON_EMPTY); + goto pageout_done; + } + tooklock = 1; + } + end_of_range = f_offset + xsize - 1; + + if (end_of_range >= filesize) { + end_of_range = (off_t)(filesize - 1); + } + if (f_offset < filesize) { + rl_remove(f_offset, end_of_range, &fp->ff_invalidranges); + cp->c_flag |= C_MODIFIED; /* leof is dirty */ + } + if (tooklock) { + hfs_unlock(cp); + } + } + if ((error = cluster_pageout(vp, upl, offset, f_offset, + xsize, filesize, a_flags))) { + if (error_ret == 0) + error_ret = error; + } + f_offset += xsize; + offset += xsize; + isize -= xsize; + pg_index += num_of_pages; + } + /* capture errnos bubbled out of cluster_pageout if they occurred */ + if (error_ret != 0) { + retval = error_ret; + } + } /* end block for v2 pageout behavior */ + else { + if (!vnode_isswap(vp)) { + off_t end_of_range; + int tooklock = 0; + + if (cp->c_lockowner != current_thread()) { + if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { + if (!(a_flags & UPL_NOCOMMIT)) { + ubc_upl_abort_range(upl, + a_pl_offset, + ap->a_size, + UPL_ABORT_FREE_ON_EMPTY); + } + goto pageout_done; + } + tooklock = 1; + } + end_of_range = ap->a_f_offset + ap->a_size - 1; + + if (end_of_range >= filesize) { + end_of_range = (off_t)(filesize - 1); + } + if (ap->a_f_offset < filesize) { + 
rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges); + cp->c_flag |= C_MODIFIED; /* leof is dirty */ + } + + if (tooklock) { + hfs_unlock(cp); + } + } + /* + * just call cluster_pageout for old pre-v2 behavior + */ + retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset, + ap->a_size, filesize, a_flags); + } + + /* + * If data was written, update the modification time of the file. + * If setuid or setgid bits are set and this process is not the + * superuser then clear the setuid and setgid bits as a precaution + * against tampering. + */ + if (retval == 0) { + cp->c_touch_modtime = TRUE; + cp->c_touch_chgtime = TRUE; + if ((cp->c_mode & (S_ISUID | S_ISGID)) && + (vfs_context_suser(ap->a_context) != 0)) { + hfs_lock(cp, HFS_FORCE_LOCK); + cp->c_mode &= ~(S_ISUID | S_ISGID); + hfs_unlock(cp); + } + } + +pageout_done: + if (is_pageoutv2) { + /* release truncate lock (shared) */ + hfs_unlock_truncate(cp, 0); + } + return (retval); +} + +/* + * Intercept B-Tree node writes to unswap them if necessary. + */ +int +hfs_vnop_bwrite(struct vnop_bwrite_args *ap) +{ + int retval = 0; + register struct buf *bp = ap->a_bp; + register struct vnode *vp = buf_vnode(bp); + BlockDescriptor block; + + /* Trap B-Tree writes */ + if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) || + (VTOC(vp)->c_fileid == kHFSCatalogFileID) || + (VTOC(vp)->c_fileid == kHFSAttributesFileID) || + (vp == VTOHFS(vp)->hfc_filevp)) { + + /* + * Swap and validate the node if it is in native byte order. + * This is always true on big endian, so we always validate + * before writing here. On little endian, the node typically has + * been swapped and validated when it was written to the journal, + * so we won't do anything here. 
+ */ + if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) { + /* Prepare the block pointer */ + block.blockHeader = bp; + block.buffer = (char *)buf_dataptr(bp); + block.blockNum = buf_lblkno(bp); + /* not found in cache ==> came from disk */ + block.blockReadFromDisk = (buf_fromcache(bp) == 0); + block.blockSize = buf_count(bp); + + /* Endian un-swap B-Tree node */ + retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false); + if (retval) + panic("hfs_vnop_bwrite: about to write corrupt node!\n"); + } } -Err_Exit: - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END, - (int)length, (int)fp->ff_size, (int)filebytes, retval, 0); + /* This buffer shouldn't be locked anymore but if it is clear it */ + if ((buf_flags(bp) & B_LOCKED)) { + // XXXdbg + if (VTOHFS(vp)->jnl) { + panic("hfs: CLEARING the lock bit on bp %p\n", bp); + } + buf_clearflags(bp, B_LOCKED); + } + retval = vn_bwrite (ap); return (retval); } - - /* -# -#% allocate vp L L L -# -vop_allocate { - IN struct vnode *vp; - IN off_t length; - IN int flags; - OUT off_t *bytesallocated; - IN off_t offset; - IN struct ucred *cred; - IN struct proc *p; -}; - * allocate a cnode to at most length size + * Relocate a file to a new location on disk + * cnode must be locked on entry + * + * Relocation occurs by cloning the file's data from its + * current set of blocks to a new set of blocks. During + * the relocation all of the blocks (old and new) are + * owned by the file. 
+ * + * ----------------- + * |///////////////| + * ----------------- + * 0 N (file offset) + * + * ----------------- ----------------- + * |///////////////| | | STEP 1 (acquire new blocks) + * ----------------- ----------------- + * 0 N N+1 2N + * + * ----------------- ----------------- + * |///////////////| |///////////////| STEP 2 (clone data) + * ----------------- ----------------- + * 0 N N+1 2N + * + * ----------------- + * |///////////////| STEP 3 (head truncate blocks) + * ----------------- + * 0 N + * + * During steps 2 and 3 page-outs to file offsets less + * than or equal to N are suspended. + * + * During step 3 page-ins to the file get suspended. */ -int hfs_allocate(ap) - struct vop_allocate_args /* { - struct vnode *a_vp; - off_t a_length; - u_int32_t a_flags; - off_t *a_bytesallocated; - off_t a_offset; - struct ucred *a_cred; - struct proc *a_p; - } */ *ap; +__private_extern__ +int +hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, + struct proc *p) { - struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - struct filefork *fp = VTOF(vp); - off_t length = ap->a_length; - off_t startingPEOF; - off_t moreBytesRequested; - off_t actualBytesAdded; - off_t filebytes; - u_long fileblocks; - long vflags; - struct timeval tv; - int retval, retval2; - UInt32 blockHint; - UInt32 extendFlags =0; /* For call to ExtendFileC */ - struct hfsmount *hfsmp; - + struct cnode *cp; + struct filefork *fp; + struct hfsmount *hfsmp; + u_int32_t headblks; + u_int32_t datablks; + u_int32_t blksize; + u_int32_t growsize; + u_int32_t nextallocsave; + daddr64_t sector_a, sector_b; + int eflags; + off_t newbytes; + int retval; + int lockflags = 0; + int took_trunc_lock = 0; + int started_tr = 0; + enum vtype vnodetype; + + vnodetype = vnode_vtype(vp); + if (vnodetype != VREG && vnodetype != VLNK) { + return (EPERM); + } + hfsmp = VTOHFS(vp); + if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) { + return (ENOSPC); + } - *(ap->a_bytesallocated) = 0; - 
fileblocks = fp->ff_blocks; - filebytes = (off_t)fileblocks * (off_t)VTOVCB(vp)->blockSize; - - if (length < (off_t)0) - return (EINVAL); - if (vp->v_type != VREG && vp->v_type != VLNK) - return (EISDIR); - if ((ap->a_flags & ALLOCATEFROMVOL) && (length <= filebytes)) + cp = VTOC(vp); + fp = VTOF(vp); + if (fp->ff_unallocblocks) return (EINVAL); + blksize = hfsmp->blockSize; + if (blockHint == 0) + blockHint = hfsmp->nextAllocation; - /* Fill in the flags word for the call to Extend the file */ - - if (ap->a_flags & ALLOCATECONTIG) - extendFlags |= kEFContigMask; - - if (ap->a_flags & ALLOCATEALL) - extendFlags |= kEFAllMask; - - if (suser(ap->a_cred, NULL) != 0) - extendFlags |= kEFReserveMask; - - tv = time; - retval = E_NONE; - blockHint = 0; - startingPEOF = filebytes; + if ((fp->ff_size > 0x7fffffff) || + ((fp->ff_size > blksize) && vnodetype == VLNK)) { + return (EFBIG); + } - if (ap->a_flags & ALLOCATEFROMPEOF) - length += filebytes; - else if (ap->a_flags & ALLOCATEFROMVOL) - blockHint = ap->a_offset / VTOVCB(vp)->blockSize; + // + // We do not believe that this call to hfs_fsync() is + // necessary and it causes a journal transaction + // deadlock so we are removing it. + // + //if (vnodetype == VREG && !vnode_issystem(vp)) { + // retval = hfs_fsync(vp, MNT_WAIT, 0, p); + // if (retval) + // return (retval); + //} + + if (!vnode_issystem(vp) && (vnodetype != VLNK)) { + hfs_unlock(cp); + hfs_lock_truncate(cp, TRUE); + /* Force lock since callers expects lock to be held. */ + if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) { + hfs_unlock_truncate(cp, TRUE); + return (retval); + } + /* No need to continue if file was removed. 
*/ + if (cp->c_flag & C_NOEXISTS) { + hfs_unlock_truncate(cp, TRUE); + return (ENOENT); + } + took_trunc_lock = 1; + } + headblks = fp->ff_blocks; + datablks = howmany(fp->ff_size, blksize); + growsize = datablks * blksize; + eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask; + if (blockHint >= hfsmp->hfs_metazone_start && + blockHint <= hfsmp->hfs_metazone_end) + eflags |= kEFMetadataMask; + + if (hfs_start_transaction(hfsmp) != 0) { + if (took_trunc_lock) + hfs_unlock_truncate(cp, TRUE); + return (EINVAL); + } + started_tr = 1; + /* + * Protect the extents b-tree and the allocation bitmap + * during MapFileBlockC and ExtendFileC operations. + */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - /* If no changes are necesary, then we're done */ - if (filebytes == length) - goto Std_Exit; + retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL); + if (retval) { + retval = MacToVFSError(retval); + goto out; + } /* - * Lengthen the size of the file. We must ensure that the - * last byte of the file is allocated. Since the smallest - * value of filebytes is 0, length will be at least 1. + * STEP 1 - acquire new allocation blocks. 
*/ - if (length > filebytes) { - moreBytesRequested = length - filebytes; - -#if QUOTA - retval = hfs_chkdq(cp, - (int64_t)(roundup(moreBytesRequested, VTOVCB(vp)->blockSize)), - ap->a_cred, 0); - if (retval) - return (retval); + nextallocsave = hfsmp->nextAllocation; + retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes); + if (eflags & kEFMetadataMask) { + HFS_MOUNT_LOCK(hfsmp, TRUE); + HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave); + MarkVCBDirty(hfsmp); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + } -#endif /* QUOTA */ - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - retval = EINVAL; - goto Err_Exit; - } + retval = MacToVFSError(retval); + if (retval == 0) { + cp->c_flag |= C_MODIFIED; + if (newbytes < growsize) { + retval = ENOSPC; + goto restore; + } else if (fp->ff_blocks < (headblks + datablks)) { + printf("hfs_relocate: allocation failed"); + retval = ENOSPC; + goto restore; } - /* lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); + retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL); if (retval) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); + retval = MacToVFSError(retval); + } else if ((sector_a + 1) == sector_b) { + retval = ENOSPC; + goto restore; + } else if ((eflags & kEFMetadataMask) && + ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) > + hfsmp->hfs_metazone_end)) { +#if 0 + const char * filestr; + char emptystr = '\0'; + + if (cp->c_desc.cd_nameptr != NULL) { + filestr = (const char *)&cp->c_desc.cd_nameptr[0]; + } else if (vnode_name(vp) != NULL) { + filestr = vnode_name(vp); + } else { + filestr = &emptystr; } - hfs_global_shared_lock_release(hfsmp); - goto Err_Exit; - } - - retval = MacToVFSError(ExtendFileC(VTOVCB(vp), - (FCB*)fp, - moreBytesRequested, - blockHint, - extendFlags, - &actualBytesAdded)); - - 
*(ap->a_bytesallocated) = actualBytesAdded; - filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize; - - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); - - // XXXdbg - if (hfsmp->jnl) { - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); +#endif + retval = ENOSPC; + goto restore; } - hfs_global_shared_lock_release(hfsmp); + } + /* Done with system locks and journal for now. */ + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + hfs_end_transaction(hfsmp); + started_tr = 0; + if (retval) { /* - * if we get an error and no changes were made then exit - * otherwise we must do the VOP_UPDATE to reflect the changes - */ - if (retval && (startingPEOF == filebytes)) - goto Err_Exit; - - /* - * Adjust actualBytesAdded to be allocation block aligned, not - * clump size aligned. - * NOTE: So what we are reporting does not affect reality - * until the file is closed, when we truncate the file to allocation - * block size. + * Check to see if failure is due to excessive fragmentation. */ - if ((actualBytesAdded != 0) && (moreBytesRequested < actualBytesAdded)) - *(ap->a_bytesallocated) = - roundup(moreBytesRequested, (off_t)VTOVCB(vp)->blockSize); - - } else { /* Shorten the size of the file */ - - if (fp->ff_size > length) { - /* - * Any buffers that are past the truncation point need to be - * invalidated (to maintain buffer cache consistency). For - * simplicity, we invalidate all the buffers by calling vinvalbuf. - */ - vflags = ((length > 0) ? 
V_SAVE : 0) | V_SAVEMETA; - (void) vinvalbuf(vp, vflags, ap->a_cred, ap->a_p, 0, 0); - } - - // XXXdbg - hfs_global_shared_lock_acquire(hfsmp); - if (hfsmp->jnl) { - if (journal_start_transaction(hfsmp->jnl) != 0) { - retval = EINVAL; - goto Err_Exit; - } + if ((retval == ENOSPC) && + (hfs_freeblks(hfsmp, 0) > (datablks * 2))) { + hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE; } + goto out; + } + /* + * STEP 2 - clone file data into the new allocation blocks. + */ - /* lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (retval) { - if (hfsmp->jnl) { - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - - goto Err_Exit; - } - - retval = MacToVFSError( - TruncateFileC( - VTOVCB(vp), - (FCB*)fp, - length, - false)); - (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); - filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize; + if (vnodetype == VLNK) + retval = hfs_clonelink(vp, blksize, cred, p); + else if (vnode_issystem(vp)) + retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p); + else + retval = hfs_clonefile(vp, headblks, datablks, blksize); + + /* Start transaction for step 3 or for a restore. */ + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto out; + } + started_tr = 1; + if (retval) + goto restore; - if (hfsmp->jnl) { - hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); - journal_end_transaction(hfsmp->jnl); - } - hfs_global_shared_lock_release(hfsmp); - + /* + * STEP 3 - switch to cloned data and remove old blocks. 
+ */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks); + + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + if (retval) + goto restore; +out: + if (took_trunc_lock) + hfs_unlock_truncate(cp, TRUE); + + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + } - /* - * if we get an error and no changes were made then exit - * otherwise we must do the VOP_UPDATE to reflect the changes - */ - if (retval && (startingPEOF == filebytes)) goto Err_Exit; -#if QUOTA - /* These are bytesreleased */ - (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0); -#endif /* QUOTA */ + /* Push cnode's new extent data to disk. */ + if (retval == 0) { + (void) hfs_update(vp, MNT_WAIT); + } + if (hfsmp->jnl) { + if (cp->c_cnid < kHFSFirstUserCatalogNodeID) + (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); + else + (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); + } +exit: + if (started_tr) + hfs_end_transaction(hfsmp); - if (fp->ff_size > filebytes) { - fp->ff_size = filebytes; + return (retval); - if (UBCISVALID(vp)) - ubc_setsize(vp, fp->ff_size); /* XXX check errors */ - } +restore: + if (fp->ff_blocks == headblks) { + if (took_trunc_lock) + hfs_unlock_truncate(cp, TRUE); + goto exit; + } + /* + * Give back any newly allocated space. 
+ */ + if (lockflags == 0) { + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); } -Std_Exit: - cp->c_flag |= C_CHANGE | C_UPDATE; - retval2 = VOP_UPDATE(vp, &tv, &tv, MNT_WAIT); + (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false); - if (retval == 0) - retval = retval2; -Err_Exit: - return (retval); + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + + if (took_trunc_lock) + hfs_unlock_truncate(cp, TRUE); + goto exit; } /* - * pagein for HFS filesystem + * Clone a symlink. + * */ -int -hfs_pagein(ap) - struct vop_pagein_args /* { - struct vnode *a_vp, - upl_t a_pl, - vm_offset_t a_pl_offset, - off_t a_f_offset, - size_t a_size, - struct ucred *a_cred, - int a_flags - } */ *ap; +static int +hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p) { - register struct vnode *vp = ap->a_vp; - int devBlockSize = 0; + struct buf *head_bp = NULL; + struct buf *tail_bp = NULL; int error; - if (vp->v_type != VREG && vp->v_type != VLNK) - panic("hfs_pagein: vp not UBC type\n"); - VOP_DEVBLOCKSIZE(VTOC(vp)->c_devvp, &devBlockSize); + error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp); + if (error) + goto out; + + tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META); + if (tail_bp == NULL) { + error = EIO; + goto out; + } + bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize); + error = (int)buf_bwrite(tail_bp); +out: + if (head_bp) { + buf_markinvalid(head_bp); + buf_brelse(head_bp); + } + (void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0); - error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, (off_t)VTOF(vp)->ff_size, devBlockSize, - ap->a_flags); return (error); } -/* - * pageout for HFS filesystem. +/* + * Clone a file's data within the file. 
+ * */ -int -hfs_pageout(ap) - struct vop_pageout_args /* { - struct vnode *a_vp, - upl_t a_pl, - vm_offset_t a_pl_offset, - off_t a_f_offset, - size_t a_size, - struct ucred *a_cred, - int a_flags - } */ *ap; +static int +hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize) { - struct vnode *vp = ap->a_vp; - struct cnode *cp = VTOC(vp); - struct filefork *fp = VTOF(vp); - int retval; - int devBlockSize = 0; - off_t end_of_range; - off_t filesize; - - if (UBCINVALID(vp)) - panic("hfs_pageout: Not a VREG: vp=%x", vp); - - VOP_DEVBLOCKSIZE(cp->c_devvp, &devBlockSize); - filesize = fp->ff_size; - end_of_range = ap->a_f_offset + ap->a_size - 1; + caddr_t bufp; + size_t bufsize; + size_t copysize; + size_t iosize; + size_t offset; + off_t writebase; + uio_t auio; + int error = 0; + + writebase = blkstart * blksize; + copysize = blkcnt * blksize; + iosize = bufsize = MIN(copysize, 128 * 1024); + offset = 0; + + if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) { + return (ENOMEM); + } + hfs_unlock(VTOC(vp)); + + auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); + + while (offset < copysize) { + iosize = MIN(copysize - offset, iosize); + + uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ); + uio_addiov(auio, (uintptr_t)bufp, iosize); + + error = cluster_read(vp, auio, copysize, IO_NOCACHE); + if (error) { + printf("hfs_clonefile: cluster_read failed - %d\n", error); + break; + } + if (uio_resid(auio) != 0) { + printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio)); + error = EIO; + break; + } - if (end_of_range >= filesize) - end_of_range = (off_t)(filesize - 1); - if (ap->a_f_offset < filesize) - rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges); + uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE); + uio_addiov(auio, (uintptr_t)bufp, iosize); - retval = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, ap->a_size, - filesize, devBlockSize, ap->a_flags); + error = cluster_write(vp, auio, 
writebase + offset, + writebase + offset + iosize, + uio_offset(auio), 0, IO_NOCACHE | IO_SYNC); + if (error) { + printf("hfs_clonefile: cluster_write failed - %d\n", error); + break; + } + if (uio_resid(auio) != 0) { + printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n"); + error = EIO; + break; + } + offset += iosize; + } + uio_free(auio); - /* - * If we successfully wrote any data, and we are not the superuser - * we clear the setuid and setgid bits as a precaution against - * tampering. - */ - if (retval == 0 && ap->a_cred && ap->a_cred->cr_uid != 0) - cp->c_mode &= ~(S_ISUID | S_ISGID); + if ((blksize & PAGE_MASK)) { + /* + * since the copy may not have started on a PAGE + * boundary (or may not have ended on one), we + * may have pages left in the cache since NOCACHE + * will let partially written pages linger... + * lets just flush the entire range to make sure + * we don't have any pages left that are beyond + * (or intersect) the real LEOF of this file + */ + ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY); + } else { + /* + * No need to call ubc_sync_range or hfs_invalbuf + * since the file was copied using IO_NOCACHE and + * the copy was done starting and ending on a page + * boundary in the file. + */ + } + kmem_free(kernel_map, (vm_offset_t)bufp, bufsize); - return (retval); + hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + return (error); } /* - * Intercept B-Tree node writes to unswap them if necessary. -# -#vop_bwrite { -# IN struct buf *bp; + * Clone a system (metadata) file. 
+ * */ -int -hfs_bwrite(ap) - struct vop_bwrite_args /* { - struct buf *a_bp; - } */ *ap; +static int +hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize, + kauth_cred_t cred, struct proc *p) { - int retval = 0; - register struct buf *bp = ap->a_bp; - register struct vnode *vp = bp->b_vp; -#if BYTE_ORDER == LITTLE_ENDIAN - BlockDescriptor block; + caddr_t bufp; + char * offset; + size_t bufsize; + size_t iosize; + struct buf *bp = NULL; + daddr64_t blkno; + daddr64_t blk; + daddr64_t start_blk; + daddr64_t last_blk; + int breadcnt; + int i; + int error = 0; + + + iosize = GetLogicalBlockSize(vp); + bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1); + breadcnt = bufsize / iosize; + + if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) { + return (ENOMEM); + } + start_blk = ((daddr64_t)blkstart * blksize) / iosize; + last_blk = ((daddr64_t)blkcnt * blksize) / iosize; + blkno = 0; + + while (blkno < last_blk) { + /* + * Read up to a megabyte + */ + offset = bufp; + for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) { + error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp); + if (error) { + printf("hfs_clonesysfile: meta_bread error %d\n", error); + goto out; + } + if (buf_count(bp) != iosize) { + printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp)); + goto out; + } + bcopy((char *)buf_dataptr(bp), offset, iosize); - /* Trap B-Tree writes */ - if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) || - (VTOC(vp)->c_fileid == kHFSCatalogFileID)) { + buf_markinvalid(bp); + buf_brelse(bp); + bp = NULL; - /* Swap if the B-Tree node is in native byte order */ - if (((UInt16 *)((char *)bp->b_data + bp->b_bcount - 2))[0] == 0x000e) { - /* Prepare the block pointer */ - block.blockHeader = bp; - block.buffer = bp->b_data; - /* not found in cache ==> came from disk */ - block.blockReadFromDisk = (bp->b_flags & B_CACHE) == 0; - block.blockSize = bp->b_bcount; - - /* Endian un-swap B-Tree node */ - SWAP_BT_NODE 
(&block, ISHFSPLUS (VTOVCB(vp)), VTOC(vp)->c_fileid, 1); + offset += iosize; + } + + /* + * Write up to a megabyte + */ + offset = bufp; + for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) { + bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META); + if (bp == NULL) { + printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno); + error = EIO; + goto out; + } + bcopy(offset, (char *)buf_dataptr(bp), iosize); + error = (int)buf_bwrite(bp); + bp = NULL; + if (error) + goto out; + offset += iosize; } - - /* We don't check to make sure that it's 0x0e00 because it could be all zeros */ } -#endif - /* This buffer shouldn't be locked anymore but if it is clear it */ - if (ISSET(bp->b_flags, B_LOCKED)) { - // XXXdbg - if (VTOHFS(vp)->jnl) { - panic("hfs: CLEARING the lock bit on bp 0x%x\n", bp); - } - CLR(bp->b_flags, B_LOCKED); - printf("hfs_bwrite: called with lock bit set\n"); +out: + if (bp) { + buf_brelse(bp); } - retval = vn_bwrite (ap); - return (retval); + kmem_free(kernel_map, (vm_offset_t)bufp, bufsize); + + error = hfs_fsync(vp, MNT_WAIT, 0, p); + + return (error); }