X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/89b3af67bb32e691275bf6fa803d1834b2284115..c18c124eaa464aaaa5549e99e5a70fc9cbb50944:/bsd/hfs/hfs_readwrite.c diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index b9c8bf912..f09bdc7d2 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -41,71 +41,62 @@ #include #include #include +#include #include #include #include +#include #include #include +#include +#include #include #include +#include +#include +#include #include #include +#include + #include #include #include #include "hfs.h" +#include "hfs_attrlist.h" #include "hfs_endian.h" -#include "hfs_fsctl.h" +#include "hfs_fsctl.h" #include "hfs_quota.h" #include "hfscommon/headers/FileMgrInternal.h" #include "hfscommon/headers/BTreesInternal.h" #include "hfs_cnode.h" #include "hfs_dbg.h" -extern int overflow_extents(struct filefork *fp); - #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2))) enum { MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */ }; -extern u_int32_t GetLogicalBlockSize(struct vnode *vp); - -extern int hfs_setextendedsecurity(struct hfsmount *, int); - +/* from bsd/hfs/hfs_vfsops.c */ +extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context); -static int hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *); static int hfs_clonefile(struct vnode *, int, int, int); static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *); +static int hfs_minorupdate(struct vnode *vp); +static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context); + +/* from bsd/hfs/hfs_vnops.c */ +extern decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp); -int 
flush_cache_on_write = 0; -SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files"); - - -/***************************************************************************** -* -* I/O Operations on vnodes -* -*****************************************************************************/ -int hfs_vnop_read(struct vnop_read_args *); -int hfs_vnop_write(struct vnop_write_args *); -int hfs_vnop_ioctl(struct vnop_ioctl_args *); -int hfs_vnop_select(struct vnop_select_args *); -int hfs_vnop_blktooff(struct vnop_blktooff_args *); -int hfs_vnop_offtoblk(struct vnop_offtoblk_args *); -int hfs_vnop_blockmap(struct vnop_blockmap_args *); -int hfs_vnop_strategy(struct vnop_strategy_args *); -int hfs_vnop_allocate(struct vnop_allocate_args *); -int hfs_vnop_pagein(struct vnop_pagein_args *); -int hfs_vnop_pageout(struct vnop_pageout_args *); -int hfs_vnop_bwrite(struct vnop_bwrite_args *); +int flush_cache_on_write = 0; +SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files"); /* * Read data from a file. 
@@ -113,6 +104,16 @@ int hfs_vnop_bwrite(struct vnop_bwrite_args *); int hfs_vnop_read(struct vnop_read_args *ap) { + /* + struct vnop_read_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; + }; + */ + uio_t uio = ap->a_uio; struct vnode *vp = ap->a_vp; struct cnode *cp; @@ -123,7 +124,9 @@ hfs_vnop_read(struct vnop_read_args *ap) off_t start_resid = uio_resid(uio); off_t offset = uio_offset(uio); int retval = 0; - + int took_truncate_lock = 0; + int io_throttle = 0; + int throttled_count = 0; /* Preflight checks */ if (!vnode_isreg(vp)) { @@ -138,15 +141,84 @@ hfs_vnop_read(struct vnop_read_args *ap) if (offset < 0) return (EINVAL); /* cant read from a negative offset */ + if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) == + (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) { + /* Don't allow unencrypted io request from user space */ + return EPERM; + } + + + +#if HFS_COMPRESSION + if (VNODE_IS_RSRC(vp)) { + if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */ + return 0; + } + /* otherwise read the resource fork normally */ + } else { + int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ + if (compressed) { + retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp)); + if (compressed) { + if (retval == 0) { + /* successful read, update the access time */ + VTOC(vp)->c_touch_acctime = TRUE; + + /* compressed files are not hot file candidates */ + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { + VTOF(vp)->ff_bytesread = 0; + } + } + return retval; + } + /* otherwise the file was converted back to a regular file while we were reading it */ + retval = 0; + } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { + int error; + + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); + if (error) { + return error; + } + + } + } +#endif /* HFS_COMPRESSION */ + cp = VTOC(vp); fp = VTOF(vp); hfsmp = VTOHFS(vp); +#if 
CONFIG_PROTECT + if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) { + goto exit; + } +#endif + + /* + * If this read request originated from a syscall (as opposed to + * an in-kernel page fault or something), then set it up for + * throttle checks + */ + if (ap->a_ioflag & IO_SYSCALL_DISPATCH) { + io_throttle = IO_RETURN_ON_THROTTLE; + } + +read_again: + /* Protect against a size change. */ - hfs_lock_truncate(cp, 0); + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + took_truncate_lock = 1; filesize = fp->ff_size; filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; + + /* + * Check the file size. Note that per POSIX spec, we return 0 at + * file EOF, so attempting a read at an offset that is too big + * should just return 0 on HFS+. Since the return value was initialized + * to 0 above, we just jump to exit. HFS Standard has its own behavior. + */ if (offset > filesize) { if ((hfsmp->hfs_flags & HFS_STANDARD) && (offset > (off_t)MAXHFSFILESIZE)) { @@ -155,20 +227,20 @@ hfs_vnop_read(struct vnop_read_args *ap) goto exit; } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START, + KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START, (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); - retval = cluster_read(vp, uio, filesize, 0); + retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle); cp->c_touch_acctime = TRUE; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END, + KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END, (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); /* * Keep track blocks read */ - if (VTOHFS(vp)->hfc_stage == HFC_RECORDING && retval == 0) { + if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) { int took_cnode_lock = 0; off_t bytesread; @@ -176,14 +248,14 @@ hfs_vnop_read(struct vnop_read_args *ap) /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. 
*/ if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) { - hfs_lock(cp, HFS_FORCE_LOCK); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); took_cnode_lock = 1; } /* * If this file hasn't been seen since the start of * the current sampling period then start over. */ - if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { + if (cp->c_atime < hfsmp->hfc_timebase) { struct timeval tv; fp->ff_bytesread = bytesread; @@ -196,7 +268,19 @@ hfs_vnop_read(struct vnop_read_args *ap) hfs_unlock(cp); } exit: - hfs_unlock_truncate(cp); + if (took_truncate_lock) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + if (retval == EAGAIN) { + throttle_lowpri_io(1); + throttled_count++; + + retval = 0; + goto read_again; + } + if (throttled_count) { + throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread())); + } return (retval); } @@ -214,18 +298,62 @@ hfs_vnop_write(struct vnop_write_args *ap) kauth_cred_t cred = NULL; off_t origFileSize; off_t writelimit; - off_t bytesToAdd; + off_t bytesToAdd = 0; off_t actualBytesAdded; off_t filebytes; off_t offset; - size_t resid; + ssize_t resid; int eflags; int ioflag = ap->a_ioflag; int retval = 0; int lockflags; int cnode_locked = 0; + int partialwrite = 0; + int do_snapshot = 1; + time_t orig_ctime=VTOC(vp)->c_ctime; + int took_truncate_lock = 0; + int io_return_on_throttle = 0; + int throttled_count = 0; + struct rl_entry *invalid_range; + +#if HFS_COMPRESSION + if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */ + int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp)); + switch(state) { + case FILE_IS_COMPRESSED: + return EACCES; + case FILE_IS_CONVERTING: + /* if FILE_IS_CONVERTING, we allow writes but do not + bother with snapshots or else we will deadlock. 
+ */ + do_snapshot = 0; + break; + default: + printf("invalid state %d for compressed file\n", state); + /* fall through */ + } + } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { + int error; + + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP); + if (error != 0) { + return error; + } + } + + if (do_snapshot) { + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio); + } + +#endif + + if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) == + (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) { + /* Don't allow unencrypted io request from user space */ + return EPERM; + } + - // LP64todo - fix this! uio_resid may be 64-bit value resid = uio_resid(uio); offset = uio_offset(uio); @@ -236,32 +364,18 @@ hfs_vnop_write(struct vnop_write_args *ap) if (!vnode_isreg(vp)) return (EPERM); /* Can only write regular files */ - /* Protect against a size change. */ - hfs_lock_truncate(VTOC(vp), TRUE); - - if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { - hfs_unlock_truncate(VTOC(vp)); - return (retval); - } - cnode_locked = 1; cp = VTOC(vp); fp = VTOF(vp); hfsmp = VTOHFS(vp); - filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; - if (ioflag & IO_APPEND) { - uio_setoffset(uio, fp->ff_size); - offset = fp->ff_size; - } - if ((cp->c_flags & APPEND) && offset != fp->ff_size) { - retval = EPERM; +#if CONFIG_PROTECT + if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) { goto exit; } +#endif - origFileSize = fp->ff_size; eflags = kEFDeferMask; /* defer file block allocations */ - -#ifdef HFS_SPARSE_DEV +#if HFS_SPARSE_DEV /* * When the underlying device is sparse and space * is low (< 8MB), stop doing delayed allocations @@ -274,19 +388,122 @@ hfs_vnop_write(struct vnop_write_args *ap) } #endif /* HFS_SPARSE_DEV */ - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START, - (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); + if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) == + (IO_SINGLE_WRITER | 
IO_SYSCALL_DISPATCH)) { + io_return_on_throttle = IO_RETURN_ON_THROTTLE; + } + +again: + /* + * Protect against a size change. + * + * Note: If took_truncate_lock is true, then we previously got the lock shared + * but needed to upgrade to exclusive. So try getting it exclusive from the + * start. + */ + if (ioflag & IO_APPEND || took_truncate_lock) { + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + } + else { + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + } + took_truncate_lock = 1; - /* Now test if we need to extend the file */ - /* Doing so will adjust the filebytes for us */ + /* Update UIO */ + if (ioflag & IO_APPEND) { + uio_setoffset(uio, fp->ff_size); + offset = fp->ff_size; + } + if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) { + retval = EPERM; + goto exit; + } + origFileSize = fp->ff_size; writelimit = offset + resid; - if (writelimit <= filebytes) + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; + + /* + * We may need an exclusive truncate lock for several reasons, all + * of which are because we may be writing to a (portion of a) block + * for the first time, and we need to make sure no readers see the + * prior, uninitialized contents of the block. The cases are: + * + * 1. We have unallocated (delayed allocation) blocks. We may be + * allocating new blocks to the file and writing to them. + * (A more precise check would be whether the range we're writing + * to contains delayed allocation blocks.) + * 2. We need to extend the file. The bytes between the old EOF + * and the new EOF are not yet initialized. This is important + * even if we're not allocating new blocks to the file. If the + * old EOF and new EOF are in the same block, we still need to + * protect that range of bytes until they are written for the + * first time. + * 3. The write overlaps some invalid ranges (delayed zero fill; that + * part of the file has been allocated, but not yet written). 
+ * + * If we had a shared lock with the above cases, we need to try to upgrade + * to an exclusive lock. If the upgrade fails, we will lose the shared + * lock, and will need to take the truncate lock again; the took_truncate_lock + * flag will still be set, causing us to try for an exclusive lock next time. + * + * NOTE: Testing for #3 (delayed zero fill) needs to be done while the cnode + * lock is held, since it protects the range lists. + */ + if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) && + ((fp->ff_unallocblocks != 0) || + (writelimit > origFileSize))) { + if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) { + /* + * Lock upgrade failed and we lost our shared lock, try again. + * Note: we do not set took_truncate_lock=0 here. Leaving it + * set to 1 will cause us to try to get the lock exclusive. + */ + goto again; + } + else { + /* Store the owner in the c_truncatelockowner field if we successfully upgrade */ + cp->c_truncatelockowner = current_thread(); + } + } + + if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + goto exit; + } + cnode_locked = 1; + + /* + * Now that we have the cnode lock, see if there are delayed zero fill ranges + * overlapping our write. If so, we need the truncate lock exclusive (see above). + */ + if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) && + (rl_scan(&fp->ff_invalidranges, offset, writelimit-1, &invalid_range) != RL_NOOVERLAP)) { + /* + * When testing, it appeared that calling lck_rw_lock_shared_to_exclusive() causes + * a deadlock, rather than simply returning failure. (That is, it apparently does + * not behave like a "try_lock"). Since this condition is rare, just drop the + * cnode lock and try again. Since took_truncate_lock is set, we will + * automatically take the truncate lock exclusive. 
+ */ + hfs_unlock(cp); + cnode_locked = 0; + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + goto again; + } + + KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START, + (int)offset, uio_resid(uio), (int)fp->ff_size, + (int)filebytes, 0); + + /* Check if we do not need to extend the file */ + if (writelimit <= filebytes) { goto sizeok; + } cred = vfs_context_ucred(ap->a_context); -#if QUOTA bytesToAdd = writelimit - filebytes; + +#if QUOTA retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)), cred, 0); if (retval) @@ -323,13 +540,24 @@ hfs_vnop_write(struct vnop_write_args *ap) if (retval != E_NONE) break; filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE, + KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE, (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); } (void) hfs_update(vp, TRUE); (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); (void) hfs_end_transaction(hfsmp); + /* + * If we didn't grow the file enough try a partial write. + * POSIX expects this behavior. 
+ */ + if ((retval == ENOSPC) && (filebytes > offset)) { + retval = 0; + partialwrite = 1; + uio_setresid(uio, (uio_resid(uio) - bytesToAdd)); + resid -= bytesToAdd; + writelimit = filebytes; + } sizeok: if (retval == E_NONE) { off_t filesize; @@ -339,14 +567,13 @@ sizeok: off_t inval_end; off_t io_start; int lflag; - struct rl_entry *invalid_range; if (writelimit > fp->ff_size) filesize = writelimit; else filesize = fp->ff_size; - lflag = (ioflag & IO_SYNC); + lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY); if (offset <= fp->ff_size) { zero_off = offset & ~PAGE_MASK_64; @@ -413,7 +640,7 @@ sizeok: fp->ff_size, inval_start, zero_off, (off_t)0, lflag | IO_HEADZEROFILL | IO_NOZERODIRTY); - hfs_lock(cp, HFS_FORCE_LOCK); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); cnode_locked = 1; if (retval) goto ioerr_exit; offset = uio_offset(uio); @@ -460,100 +687,164 @@ sizeok: hfs_unlock(cp); cnode_locked = 0; + + /* + * We need to tell UBC the fork's new size BEFORE calling + * cluster_write, in case any of the new pages need to be + * paged out before cluster_write completes (which does happen + * in embedded systems due to extreme memory pressure). + * Similarly, we need to tell hfs_vnop_pageout what the new EOF + * will be, so that it can pass that on to cluster_pageout, and + * allow those pageouts. + * + * We don't update ff_size yet since we don't want pageins to + * be able to see uninitialized data between the old and new + * EOF, until cluster_write has completed and initialized that + * part of the file. + * + * The vnode pager relies on the file size last given to UBC via + * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or + * ff_size (whichever is larger). NOTE: ff_new_size is always + * zero, unless we are extending the file via write. 
+ */ + if (filesize > fp->ff_size) { + fp->ff_new_size = filesize; + ubc_setsize(vp, filesize); + } retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off, - tail_off, lflag | IO_NOZERODIRTY); - offset = uio_offset(uio); - if (offset > fp->ff_size) { - fp->ff_size = offset; + tail_off, lflag | IO_NOZERODIRTY | io_return_on_throttle); + if (retval) { + fp->ff_new_size = 0; /* no longer extending; use ff_size */ + + if (retval == EAGAIN) { + /* + * EAGAIN indicates that we still have I/O to do, but + * that we now need to be throttled + */ + if (resid != uio_resid(uio)) { + /* + * did manage to do some I/O before returning EAGAIN + */ + resid = uio_resid(uio); + offset = uio_offset(uio); + + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + hfs_incr_gencount(cp); + } + if (filesize > fp->ff_size) { + /* + * we called ubc_setsize before the call to + * cluster_write... since we only partially + * completed the I/O, we need to + * re-adjust our idea of the filesize based + * on our interim EOF + */ + ubc_setsize(vp, offset); - ubc_setsize(vp, fp->ff_size); /* XXX check errors */ + fp->ff_size = offset; + } + goto exit; + } + if (filesize > origFileSize) { + ubc_setsize(vp, origFileSize); + } + goto ioerr_exit; + } + + if (filesize > origFileSize) { + fp->ff_size = filesize; + /* Files that are changing size are not hot file candidates. 
*/ - if (hfsmp->hfc_stage == HFC_RECORDING) + if (hfsmp->hfc_stage == HFC_RECORDING) { fp->ff_bytesread = 0; + } } - if (resid > uio_resid(uio)) { - cp->c_touch_chgtime = TRUE; - cp->c_touch_modtime = TRUE; - } + fp->ff_new_size = 0; /* ff_size now has the correct size */ + } + if (partialwrite) { + uio_setresid(uio, (uio_resid(uio) + bytesToAdd)); + resid += bytesToAdd; } - // XXXdbg - testing for vivek and paul lambert + // XXXdbg - see radar 4871353 for more info { if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) { VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL); } } - HFS_KNOTE(vp, NOTE_WRITE); ioerr_exit: - /* - * If we successfully wrote any data, and we are not the superuser - * we clear the setuid and setgid bits as a precaution against - * tampering. - */ - if (cp->c_mode & (S_ISUID | S_ISGID)) { - cred = vfs_context_ucred(ap->a_context); - if (resid > uio_resid(uio) && cred && suser(cred, NULL)) { - if (!cnode_locked) { - hfs_lock(cp, HFS_FORCE_LOCK); - cnode_locked = 1; + if (resid > uio_resid(uio)) { + if (!cnode_locked) { + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + cnode_locked = 1; + } + + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + hfs_incr_gencount(cp); + + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. + */ + if (cp->c_mode & (S_ISUID | S_ISGID)) { + cred = vfs_context_ucred(ap->a_context); + if (cred && suser(cred, NULL)) { + cp->c_mode &= ~(S_ISUID | S_ISGID); } - cp->c_mode &= ~(S_ISUID | S_ISGID); } } if (retval) { if (ioflag & IO_UNIT) { - if (!cnode_locked) { - hfs_lock(cp, HFS_FORCE_LOCK); - cnode_locked = 1; - } (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC, 0, ap->a_context); - // LP64todo - fix this! 
resid needs to by user_ssize_t uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio)))); uio_setresid(uio, resid); filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; } - } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) { - if (!cnode_locked) { - hfs_lock(cp, HFS_FORCE_LOCK); - cnode_locked = 1; - } + } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) retval = hfs_update(vp, TRUE); - } + /* Updating vcbWrCnt doesn't need to be atomic. */ hfsmp->vcbWrCnt++; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END, + KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END, (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); exit: if (cnode_locked) hfs_unlock(cp); - hfs_unlock_truncate(cp); + + if (took_truncate_lock) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + if (retval == EAGAIN) { + throttle_lowpri_io(1); + throttled_count++; + + retval = 0; + goto again; + } + if (throttled_count) { + throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread())); + } return (retval); } /* support for the "bulk-access" fcntl */ -#define CACHE_ELEMS 64 #define CACHE_LEVELS 16 +#define NUM_CACHE_ENTRIES (64*16) #define PARENT_IDS_FLAG 0x100 -/* from hfs_attrlist.c */ -extern unsigned long DerivePermissionSummary(uid_t obj_uid, gid_t obj_gid, - mode_t obj_mode, struct mount *mp, - kauth_cred_t cred, struct proc *p); - -/* from vfs/vfs_fsevents.c */ -extern char *get_pathbuff(void); -extern void release_pathbuff(char *buff); - struct access_cache { int numcached; int cachehits; /* these two for statistics gathering */ int lookups; unsigned int *acache; - Boolean *haveaccess; + unsigned char *haveaccess; }; struct access_t { @@ -564,80 +855,140 @@ struct access_t { int *file_ids; /* IN: array of file ids */ gid_t *groups; /* IN: array of groups */ short *access; /* OUT: access info for each file (0 for 'has access') */ +} __attribute__((unavailable)); // this structure is for reference purposes only + +struct 
user32_access_t { + uid_t uid; /* IN: effective user id */ + short flags; /* IN: access requested (i.e. R_OK) */ + short num_groups; /* IN: number of groups user belongs to */ + int num_files; /* IN: number of files to process */ + user32_addr_t file_ids; /* IN: array of file ids */ + user32_addr_t groups; /* IN: array of groups */ + user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */ }; -struct user_access_t { +struct user64_access_t { uid_t uid; /* IN: effective user id */ short flags; /* IN: access requested (i.e. R_OK) */ short num_groups; /* IN: number of groups user belongs to */ - int num_files; /* IN: number of files to process */ - user_addr_t file_ids; /* IN: array of file ids */ - user_addr_t groups; /* IN: array of groups */ - user_addr_t access; /* OUT: access info for each file (0 for 'has access') */ + int num_files; /* IN: number of files to process */ + user64_addr_t file_ids; /* IN: array of file ids */ + user64_addr_t groups; /* IN: array of groups */ + user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */ +}; + + +// these are the "extended" versions of the above structures +// note that it is crucial that they be different sized than +// the regular version +struct ext_access_t { + uint32_t flags; /* IN: access requested (i.e. R_OK) */ + uint32_t num_files; /* IN: number of files to process */ + uint32_t map_size; /* IN: size of the bit map */ + uint32_t *file_ids; /* IN: Array of file ids */ + char *bitmap; /* OUT: hash-bitmap of interesting directory ids */ + short *access; /* OUT: access info for each file (0 for 'has access') */ + uint32_t num_parents; /* future use */ + cnid_t *parents; /* future use */ +} __attribute__((unavailable)); // this structure is for reference purposes only + +struct user32_ext_access_t { + uint32_t flags; /* IN: access requested (i.e. 
R_OK) */ + uint32_t num_files; /* IN: number of files to process */ + uint32_t map_size; /* IN: size of the bit map */ + user32_addr_t file_ids; /* IN: Array of file ids */ + user32_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */ + user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */ + uint32_t num_parents; /* future use */ + user32_addr_t parents; /* future use */ +}; + +struct user64_ext_access_t { + uint32_t flags; /* IN: access requested (i.e. R_OK) */ + uint32_t num_files; /* IN: number of files to process */ + uint32_t map_size; /* IN: size of the bit map */ + user64_addr_t file_ids; /* IN: array of file ids */ + user64_addr_t bitmap; /* IN: array of groups */ + user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */ + uint32_t num_parents;/* future use */ + user64_addr_t parents;/* future use */ }; + /* * Perform a binary search for the given parent_id. Return value is - * found/not found boolean, and indexp will be the index of the item - * or the index at which to insert the item if it's not found. + * the index if there is a match. If no_match_indexp is non-NULL it + * will be assigned with the index to insert the item (even if it was + * not found). */ -static int -lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id) +static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp) { - unsigned int lo, hi; - int index, matches = 0; + int index=-1; + unsigned int lo=0; - if (cache->numcached == 0) { - *indexp = 0; - return 0; // table is empty, so insert at index=0 and report no match + do { + unsigned int mid = ((hi - lo)/2) + lo; + unsigned int this_id = array[mid]; + + if (parent_id == this_id) { + hi = mid; + break; } - - if (cache->numcached > CACHE_ELEMS) { - /*printf("EGAD! numcached is %d... 
cut our losses and trim to %d\n", - cache->numcached, CACHE_ELEMS);*/ - cache->numcached = CACHE_ELEMS; + + if (parent_id < this_id) { + hi = mid; + continue; } + + if (parent_id > this_id) { + lo = mid + 1; + continue; + } + } while(lo < hi); + + /* check if lo and hi converged on the match */ + if (parent_id == array[hi]) { + index = hi; + } + + if (no_match_indexp) { + *no_match_indexp = hi; + } + + return index; +} + + +static int +lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id) +{ + unsigned int hi; + int matches = 0; + int index, no_match_index; - lo = 0; - hi = cache->numcached - 1; - index = -1; + if (cache->numcached == 0) { + *indexp = 0; + return 0; // table is empty, so insert at index=0 and report no match + } - /* perform binary search for parent_id */ - do { - unsigned int mid = (hi - lo)/2 + lo; - unsigned int this_id = cache->acache[mid]; - - if (parent_id == this_id) { - index = mid; - break; - } - - if (parent_id < this_id) { - hi = mid; - continue; - } - - if (parent_id > this_id) { - lo = mid + 1; - continue; - } - } while(lo < hi); + if (cache->numcached > NUM_CACHE_ENTRIES) { + cache->numcached = NUM_CACHE_ENTRIES; + } - /* check if lo and hi converged on the match */ - if (parent_id == cache->acache[hi]) { - index = hi; - } + hi = cache->numcached - 1; - /* if no existing entry found, find index for new one */ - if (index == -1) { - index = (parent_id < cache->acache[hi]) ? 
hi : hi + 1; - matches = 0; - } else { - matches = 1; - } + index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index); + + /* if no existing entry found, find index for new one */ + if (index == -1) { + index = no_match_index; + matches = 0; + } else { + matches = 1; + } - *indexp = index; - return matches; + *indexp = index; + return matches; } /* @@ -648,63 +999,69 @@ lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id) static void add_node(struct access_cache *cache, int index, cnid_t nodeID, int access) { - int lookup_index = -1; - - /* need to do a lookup first if -1 passed for index */ - if (index == -1) { - if (lookup_bucket(cache, &lookup_index, nodeID)) { - if (cache->haveaccess[lookup_index] != access) { - /* change access info for existing entry... should never happen */ - cache->haveaccess[lookup_index] = access; - } - - /* mission accomplished */ - return; - } else { - index = lookup_index; - } - - } - - /* if the cache is full, do a replace rather than an insert */ - if (cache->numcached >= CACHE_ELEMS) { - //printf("cache is full (%d). 
replace at index %d\n", cache->numcached, index); - cache->numcached = CACHE_ELEMS-1; - - if (index > cache->numcached) { - // printf("index %d pinned to %d\n", index, cache->numcached); - index = cache->numcached; - } - } else if (index >= 0 && index < cache->numcached) { - /* only do bcopy if we're inserting */ - bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) ); - bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(Boolean) ); - } - - cache->acache[index] = nodeID; - cache->haveaccess[index] = access; - cache->numcached++; + int lookup_index = -1; + + /* need to do a lookup first if -1 passed for index */ + if (index == -1) { + if (lookup_bucket(cache, &lookup_index, nodeID)) { + if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) { + // only update an entry if the previous access was ESRCH (i.e. a scope checking error) + cache->haveaccess[lookup_index] = access; + } + + /* mission accomplished */ + return; + } else { + index = lookup_index; + } + + } + + /* if the cache is full, do a replace rather than an insert */ + if (cache->numcached >= NUM_CACHE_ENTRIES) { + cache->numcached = NUM_CACHE_ENTRIES-1; + + if (index > cache->numcached) { + index = cache->numcached; + } + } + + if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) { + index++; + } + + if (index >= 0 && index < cache->numcached) { + /* only do bcopy if we're inserting */ + bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) ); + bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) ); + } + + cache->acache[index] = nodeID; + cache->haveaccess[index] = access; + cache->numcached++; } struct cinfo { - uid_t uid; - gid_t gid; - mode_t mode; - cnid_t parentcnid; + uid_t uid; + gid_t gid; + mode_t mode; + cnid_t parentcnid; + u_int16_t recflags; 
}; static int -snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg) +snoop_callback(const cnode_t *cp, void *arg) { - struct cinfo *cip = (struct cinfo *)arg; + struct cinfo *cip = arg; - cip->uid = attrp->ca_uid; - cip->gid = attrp->ca_gid; - cip->mode = attrp->ca_mode; - cip->parentcnid = descp->cd_parentcnid; + cip->uid = cp->c_uid; + cip->gid = cp->c_gid; + cip->mode = cp->c_mode; + cip->parentcnid = cp->c_parentcnid; + cip->recflags = cp->c_attr.ca_recflags; - return (0); + return (0); } /* @@ -712,31 +1069,41 @@ snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * isn't incore, then go to the catalog. */ static int -do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, dev_t dev, cnid_t cnid, - struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp, struct proc *p) +do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid, + struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp) { - int error = 0; + int error = 0; - /* if this id matches the one the fsctl was called with, skip the lookup */ - if (cnid == skip_cp->c_cnid) { + /* if this id matches the one the fsctl was called with, skip the lookup */ + if (cnid == skip_cp->c_cnid) { cnattrp->ca_uid = skip_cp->c_uid; cnattrp->ca_gid = skip_cp->c_gid; cnattrp->ca_mode = skip_cp->c_mode; + cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags; keyp->hfsPlus.parentID = skip_cp->c_parentcnid; - } else { + } else { struct cinfo c_info; /* otherwise, check the cnode hash incase the file/dir is incore */ - if (hfs_chash_snoop(dev, cnid, snoop_callback, &c_info) == 0) { + error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info); + + if (error == EACCES) { + // File is deleted + return ENOENT; + } else if (!error) { cnattrp->ca_uid = c_info.uid; cnattrp->ca_gid = c_info.gid; cnattrp->ca_mode = c_info.mode; + cnattrp->ca_recflags = c_info.recflags; keyp->hfsPlus.parentID = c_info.parentcnid; } 
else { int lockflags; - + + if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp))) + throttle_lowpri_io(1); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - + /* lookup this cnid in the catalog */ error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp); @@ -744,133 +1111,723 @@ do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, dev_t dev, cn cache->lookups++; } - } + } - return (error); + return (error); } + /* * Compute whether we have access to the given directory (nodeID) and all its parents. Cache * up to CACHE_LEVELS as we progress towards the root. */ static int do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID, - struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred, dev_t dev ) + struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred, + struct vfs_context *my_context, + char *bitmap, + uint32_t map_size, + cnid_t* parents, + uint32_t num_parents) { - int myErr = 0; - int myResult; - HFSCatalogNodeID thisNodeID; - unsigned long myPerms; - struct cat_attr cnattr; - int cache_index = -1; - CatalogKey catkey; - - int i = 0, ids_to_cache = 0; - int parent_ids[CACHE_LEVELS]; - - /* root always has access */ - if (!suser(myp_ucred, NULL)) { - return (1); - } - - thisNodeID = nodeID; - while (thisNodeID >= kRootDirID) { - myResult = 0; /* default to "no access" */ - - /* check the cache before resorting to hitting the catalog */ - - /* ASSUMPTION: access info of cached entries is "final"... i.e. 
no need - * to look any further after hitting cached dir */ - - if (lookup_bucket(cache, &cache_index, thisNodeID)) { - cache->cachehits++; - myResult = cache->haveaccess[cache_index]; - goto ExitThisRoutine; - } - - /* remember which parents we want to cache */ - if (ids_to_cache < CACHE_LEVELS) { - parent_ids[ids_to_cache] = thisNodeID; - ids_to_cache++; - } + int myErr = 0; + int myResult; + HFSCatalogNodeID thisNodeID; + unsigned int myPerms; + struct cat_attr cnattr; + int cache_index = -1, scope_index = -1, scope_idx_start = -1; + CatalogKey catkey; + + int i = 0, ids_to_cache = 0; + int parent_ids[CACHE_LEVELS]; + + thisNodeID = nodeID; + while (thisNodeID >= kRootDirID) { + myResult = 0; /* default to "no access" */ - /* do the lookup (checks the cnode hash, then the catalog) */ - myErr = do_attr_lookup(hfsmp, cache, dev, thisNodeID, skip_cp, &catkey, &cnattr, theProcPtr); - if (myErr) { - goto ExitThisRoutine; /* no access */ - } - - myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, - cnattr.ca_mode, hfsmp->hfs_mp, - myp_ucred, theProcPtr); - - if ( (myPerms & X_OK) == 0 ) { - myResult = 0; - goto ExitThisRoutine; /* no access */ - } - - /* up the hierarchy we go */ - thisNodeID = catkey.hfsPlus.parentID; - } - - /* if here, we have access to this node */ - myResult = 1; - - ExitThisRoutine: - if (myErr) { - //printf("*** error %d from catalog looking up parent %d/%d!\n", myErr, dev, thisNodeID); - myResult = 0; - } - *err = myErr; - - /* cache the parent directory(ies) */ - for (i = 0; i < ids_to_cache; i++) { - /* small optimization: get rid of double-lookup for all these */ - // printf("adding %d to cache with result: %d\n", parent_ids[i], myResult); - add_node(cache, -1, parent_ids[i], myResult); - } - - return (myResult); -} -/* end "bulk-access" support */ + /* check the cache before resorting to hitting the catalog */ + /* ASSUMPTION: access info of cached entries is "final"... i.e. 
no need + * to look any further after hitting cached dir */ + if (lookup_bucket(cache, &cache_index, thisNodeID)) { + cache->cachehits++; + myErr = cache->haveaccess[cache_index]; + if (scope_index != -1) { + if (myErr == ESRCH) { + myErr = 0; + } + } else { + scope_index = 0; // so we'll just use the cache result + scope_idx_start = ids_to_cache; + } + myResult = (myErr == 0) ? 1 : 0; + goto ExitThisRoutine; + } -/* - * Callback for use with freeze ioctl. - */ -static int -hfs_freezewrite_callback(struct vnode *vp, void *cargs) -{ - vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze"); - return 0; -} + if (parents) { + int tmp; + tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL); + if (scope_index == -1) + scope_index = tmp; + if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) { + scope_idx_start = ids_to_cache; + } + } -/* - * Control filesystem operating characteristics. - */ -int -hfs_vnop_ioctl( struct vnop_ioctl_args /* { - vnode_t a_vp; - int a_command; - caddr_t a_data; - int a_fflag; - vfs_context_t a_context; - } */ *ap) -{ - struct vnode * vp = ap->a_vp; - struct hfsmount *hfsmp = VTOHFS(vp); - vfs_context_t context = ap->a_context; - kauth_cred_t cred = vfs_context_ucred(context); + /* remember which parents we want to cache */ + if (ids_to_cache < CACHE_LEVELS) { + parent_ids[ids_to_cache] = thisNodeID; + ids_to_cache++; + } + // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"... + if (bitmap && map_size) { + bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7)); + } + + + /* do the lookup (checks the cnode hash, then the catalog) */ + myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr); + if (myErr) { + goto ExitThisRoutine; /* no access */ + } + + /* Root always gets access. 
*/ + if (suser(myp_ucred, NULL) == 0) { + thisNodeID = catkey.hfsPlus.parentID; + myResult = 1; + continue; + } + + // if the thing has acl's, do the full permission check + if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) { + struct vnode *vp; + + /* get the vnode for this cnid */ + myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0); + if ( myErr ) { + myResult = 0; + goto ExitThisRoutine; + } + + thisNodeID = VTOC(vp)->c_parentcnid; + + hfs_unlock(VTOC(vp)); + + if (vnode_vtype(vp) == VDIR) { + myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context); + } else { + myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context); + } + + vnode_put(vp); + if (myErr) { + myResult = 0; + goto ExitThisRoutine; + } + } else { + unsigned int flags; + int mode = cnattr.ca_mode & S_IFMT; + myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr); + + if (mode == S_IFDIR) { + flags = R_OK | X_OK; + } else { + flags = R_OK; + } + if ( (myPerms & flags) != flags) { + myResult = 0; + myErr = EACCES; + goto ExitThisRoutine; /* no access */ + } + + /* up the hierarchy we go */ + thisNodeID = catkey.hfsPlus.parentID; + } + } + + /* if here, we have access to this node */ + myResult = 1; + + ExitThisRoutine: + if (parents && myErr == 0 && scope_index == -1) { + myErr = ESRCH; + } + + if (myErr) { + myResult = 0; + } + *err = myErr; + + /* cache the parent directory(ies) */ + for (i = 0; i < ids_to_cache; i++) { + if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) { + add_node(cache, -1, parent_ids[i], ESRCH); + } else { + add_node(cache, -1, parent_ids[i], myErr); + } + } + + return (myResult); +} + +static int +do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, + struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context) +{ + boolean_t is64bit; + + /* + * NOTE: on entry, the vnode has an io_ref. 
In case this vnode + * happens to be in our list of file_ids, we'll note it + * avoid calling hfs_chashget_nowait() on that id as that + * will cause a "locking against myself" panic. + */ + Boolean check_leaf = true; + + struct user64_ext_access_t *user_access_structp; + struct user64_ext_access_t tmp_user_access; + struct access_cache cache; + + int error = 0, prev_parent_check_ok=1; + unsigned int i; + + short flags; + unsigned int num_files = 0; + int map_size = 0; + int num_parents = 0; + int *file_ids=NULL; + short *access=NULL; + char *bitmap=NULL; + cnid_t *parents=NULL; + int leaf_index; + + cnid_t cnid; + cnid_t prevParent_cnid = 0; + unsigned int myPerms; + short myaccess = 0; + struct cat_attr cnattr; + CatalogKey catkey; + struct cnode *skip_cp = VTOC(vp); + kauth_cred_t cred = vfs_context_ucred(context); + proc_t p = vfs_context_proc(context); + + is64bit = proc_is64bit(p); + + /* initialize the local cache and buffers */ + cache.numcached = 0; + cache.cachehits = 0; + cache.lookups = 0; + cache.acache = NULL; + cache.haveaccess = NULL; + + /* struct copyin done during dispatch... 
need to copy file_id array separately */ + if (ap->a_data == NULL) { + error = EINVAL; + goto err_exit_bulk_access; + } + + if (is64bit) { + if (arg_size != sizeof(struct user64_ext_access_t)) { + error = EINVAL; + goto err_exit_bulk_access; + } + + user_access_structp = (struct user64_ext_access_t *)ap->a_data; + + } else if (arg_size == sizeof(struct user32_access_t)) { + struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data; + + // convert an old style bulk-access struct to the new style + tmp_user_access.flags = accessp->flags; + tmp_user_access.num_files = accessp->num_files; + tmp_user_access.map_size = 0; + tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids); + tmp_user_access.bitmap = USER_ADDR_NULL; + tmp_user_access.access = CAST_USER_ADDR_T(accessp->access); + tmp_user_access.num_parents = 0; + user_access_structp = &tmp_user_access; + + } else if (arg_size == sizeof(struct user32_ext_access_t)) { + struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data; + + // up-cast from a 32-bit version of the struct + tmp_user_access.flags = accessp->flags; + tmp_user_access.num_files = accessp->num_files; + tmp_user_access.map_size = accessp->map_size; + tmp_user_access.num_parents = accessp->num_parents; + + tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids); + tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap); + tmp_user_access.access = CAST_USER_ADDR_T(accessp->access); + tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents); + + user_access_structp = &tmp_user_access; + } else { + error = EINVAL; + goto err_exit_bulk_access; + } + + map_size = user_access_structp->map_size; + + num_files = user_access_structp->num_files; + + num_parents= user_access_structp->num_parents; + + if (num_files < 1) { + goto err_exit_bulk_access; + } + if (num_files > 1024) { + error = EINVAL; + goto err_exit_bulk_access; + } + + if (num_parents > 1024) { + error = EINVAL; + goto err_exit_bulk_access; + } 
+ + file_ids = (int *) kalloc(sizeof(int) * num_files); + access = (short *) kalloc(sizeof(short) * num_files); + if (map_size) { + bitmap = (char *) kalloc(sizeof(char) * map_size); + } + + if (num_parents) { + parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents); + } + + cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES); + cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES); + + if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) { + if (file_ids) { + kfree(file_ids, sizeof(int) * num_files); + } + if (bitmap) { + kfree(bitmap, sizeof(char) * map_size); + } + if (access) { + kfree(access, sizeof(short) * num_files); + } + if (cache.acache) { + kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES); + } + if (cache.haveaccess) { + kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES); + } + if (parents) { + kfree(parents, sizeof(cnid_t) * num_parents); + } + return ENOMEM; + } + + // make sure the bitmap is zero'ed out... 
+ if (bitmap) { + bzero(bitmap, (sizeof(char) * map_size)); + } + + if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids, + num_files * sizeof(int)))) { + goto err_exit_bulk_access; + } + + if (num_parents) { + if ((error = copyin(user_access_structp->parents, (caddr_t)parents, + num_parents * sizeof(cnid_t)))) { + goto err_exit_bulk_access; + } + } + + flags = user_access_structp->flags; + if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) { + flags = R_OK; + } + + /* check if we've been passed leaf node ids or parent ids */ + if (flags & PARENT_IDS_FLAG) { + check_leaf = false; + } + + /* Check access to each file_id passed in */ + for (i = 0; i < num_files; i++) { + leaf_index=-1; + cnid = (cnid_t) file_ids[i]; + + /* root always has access */ + if ((!parents) && (!suser(cred, NULL))) { + access[i] = 0; + continue; + } + + if (check_leaf) { + /* do the lookup (checks the cnode hash, then the catalog) */ + error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr); + if (error) { + access[i] = (short) error; + continue; + } + + if (parents) { + // Check if the leaf matches one of the parent scopes + leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL); + if (leaf_index >= 0 && parents[leaf_index] == cnid) + prev_parent_check_ok = 0; + else if (leaf_index >= 0) + prev_parent_check_ok = 1; + } + + // if the thing has acl's, do the full permission check + if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) { + struct vnode *cvp; + int myErr = 0; + /* get the vnode for this cnid */ + myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0); + if ( myErr ) { + access[i] = myErr; + continue; + } + + hfs_unlock(VTOC(cvp)); + + if (vnode_vtype(cvp) == VDIR) { + myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context); + } else { + myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context); + } + + vnode_put(cvp); + if (myErr) { + access[i] = myErr; + continue; + } + } else { + /* before calling 
CheckAccess(), check the target file for read access */ + myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, + cnattr.ca_mode, hfsmp->hfs_mp, cred, p); + + /* fail fast if no access */ + if ((myPerms & flags) == 0) { + access[i] = EACCES; + continue; + } + } + } else { + /* we were passed an array of parent ids */ + catkey.hfsPlus.parentID = cnid; + } + + /* if the last guy had the same parent and had access, we're done */ + if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) { + cache.cachehits++; + access[i] = 0; + continue; + } + + myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, + skip_cp, p, cred, context,bitmap, map_size, parents, num_parents); + + if (myaccess || (error == ESRCH && leaf_index != -1)) { + access[i] = 0; // have access.. no errors to report + } else { + access[i] = (error != 0 ? (short) error : EACCES); + } + + prevParent_cnid = catkey.hfsPlus.parentID; + } + + /* copyout the access array */ + if ((error = copyout((caddr_t)access, user_access_structp->access, + num_files * sizeof (short)))) { + goto err_exit_bulk_access; + } + if (map_size && bitmap) { + if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap, + map_size * sizeof (char)))) { + goto err_exit_bulk_access; + } + } + + + err_exit_bulk_access: + + if (file_ids) + kfree(file_ids, sizeof(int) * num_files); + if (parents) + kfree(parents, sizeof(cnid_t) * num_parents); + if (bitmap) + kfree(bitmap, sizeof(char) * map_size); + if (access) + kfree(access, sizeof(short) * num_files); + if (cache.acache) + kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES); + if (cache.haveaccess) + kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES); + + return (error); +} + + +/* end "bulk-access" support */ + + +/* + * Control filesystem operating characteristics. 
+ */ +int +hfs_vnop_ioctl( struct vnop_ioctl_args /* { + vnode_t a_vp; + long a_command; + caddr_t a_data; + int a_fflag; + vfs_context_t a_context; + } */ *ap) +{ + struct vnode * vp = ap->a_vp; + struct hfsmount *hfsmp = VTOHFS(vp); + vfs_context_t context = ap->a_context; + kauth_cred_t cred = vfs_context_ucred(context); proc_t p = vfs_context_proc(context); struct vfsstatfs *vfsp; boolean_t is64bit; + off_t jnl_start, jnl_size; + struct hfs_journal_info *jip; +#if HFS_COMPRESSION + int compressed = 0; + off_t uncompressed_size = -1; + int decmpfs_error = 0; + + if (ap->a_command == F_RDADVISE) { + /* we need to inspect the decmpfs state of the file as early as possible */ + compressed = hfs_file_is_compressed(VTOC(vp), 0); + if (compressed) { + if (VNODE_IS_RSRC(vp)) { + /* if this is the resource fork, treat it as if it were empty */ + uncompressed_size = 0; + } else { + decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0); + if (decmpfs_error != 0) { + /* failed to get the uncompressed size, we'll check for this later */ + uncompressed_size = -1; + } + } + } + } +#endif /* HFS_COMPRESSION */ is64bit = proc_is64bit(p); +#if CONFIG_PROTECT + { + int error = 0; + if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { + return error; + } + } +#endif /* CONFIG_PROTECT */ + switch (ap->a_command) { + case HFS_GETPATH: + { + struct vnode *file_vp; + cnid_t cnid; + int outlen; + char *bufptr; + int error; + int flags = 0; + + /* Caller must be owner of file system. */ + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); + } + /* Target vnode must be file system's root. 
*/ + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + bufptr = (char *)ap->a_data; + cnid = strtoul(bufptr, NULL, 10); + if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) { + flags |= BUILDPATH_VOLUME_RELATIVE; + } + + /* We need to call hfs_vfs_vget to leverage the code that will + * fix the origin list for us if needed, as opposed to calling + * hfs_vget, since we will need the parent for build_path call. + */ + + if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) { + return (error); + } + error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context); + vnode_put(file_vp); + + return (error); + } + + case HFS_TRANSFER_DOCUMENT_ID: + { + struct cnode *cp = NULL; + int error; + u_int32_t to_fd = *(u_int32_t *)ap->a_data; + struct fileproc *to_fp; + struct vnode *to_vp; + struct cnode *to_cp; + + cp = VTOC(vp); + + if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) { + //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error); + return error; + } + if ( (error = vnode_getwithref(to_vp)) ) { + file_drop(to_fd); + return error; + } + + if (VTOHFS(to_vp) != hfsmp) { + error = EXDEV; + goto transfer_cleanup; + } + + int need_unlock = 1; + to_cp = VTOC(to_vp); + error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK); + if (error != 0) { + //printf("could not lock the pair of cnodes (error %d)\n", error); + goto transfer_cleanup; + } + + if (!(cp->c_bsdflags & UF_TRACKED)) { + error = EINVAL; + } else if (to_cp->c_bsdflags & UF_TRACKED) { + // + // if the destination is already tracked, return an error + // as otherwise it's a silent deletion of the target's + // document-id + // + error = EEXIST; + } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { + // + // we can use the FndrExtendedFileInfo because the doc-id is the first + // thing in both it and the ExtendedDirInfo struct which is fixed in + // format and can not change layout + // + struct FndrExtendedFileInfo 
*f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16); + struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16); + + if (f_extinfo->document_id == 0) { + uint32_t new_id; + + hfs_unlockpair(cp, to_cp); // have to unlock to be able to get a new-id + + if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) { + // + // re-lock the pair now that we have the document-id + // + hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK); + f_extinfo->document_id = new_id; + } else { + goto transfer_cleanup; + } + } + + to_extinfo->document_id = f_extinfo->document_id; + f_extinfo->document_id = 0; + //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid); + + // make sure the destination is also UF_TRACKED + to_cp->c_bsdflags |= UF_TRACKED; + cp->c_bsdflags &= ~UF_TRACKED; + + // mark the cnodes dirty + cp->c_flag |= C_MODIFIED | C_FORCEUPDATE; + to_cp->c_flag |= C_MODIFIED | C_FORCEUPDATE; + + int lockflags; + if ((error = hfs_start_transaction(hfsmp)) == 0) { + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + + (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL); + (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL); + + hfs_systemfile_unlock (hfsmp, lockflags); + (void) hfs_end_transaction(hfsmp); + } + +#if CONFIG_FSE + add_fsevent(FSE_DOCID_CHANGED, context, + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode # + FSE_ARG_INO, (ino64_t)to_cp->c_fileid, // dst inode # + FSE_ARG_INT32, to_extinfo->document_id, + FSE_ARG_DONE); + + hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents + need_unlock = 0; + + if (need_fsevent(FSE_STAT_CHANGED, vp)) { + add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE); + } + if (need_fsevent(FSE_STAT_CHANGED, to_vp)) { + add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, 
FSE_ARG_DONE); + } +#else + hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents + need_unlock = 0; +#endif + } + + if (need_unlock) { + hfs_unlockpair(cp, to_cp); + } + + transfer_cleanup: + vnode_put(to_vp); + file_drop(to_fd); + + return error; + } + + + + case HFS_PREV_LINK: + case HFS_NEXT_LINK: + { + cnid_t linkfileid; + cnid_t nextlinkid; + cnid_t prevlinkid; + int error; + + /* Caller must be owner of file system. */ + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); + } + /* Target vnode must be file system's root. */ + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + linkfileid = *(cnid_t *)ap->a_data; + if (linkfileid < kHFSFirstUserCatalogNodeID) { + return (EINVAL); + } + if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) { + return (error); + } + if (ap->a_command == HFS_NEXT_LINK) { + *(cnid_t *)ap->a_data = nextlinkid; + } else { + *(cnid_t *)ap->a_data = prevlinkid; + } + return (0); + } + case HFS_RESIZE_PROGRESS: { vfsp = vfs_statfs(HFSTOVFS(hfsmp)); @@ -881,8 +1838,14 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { if (!vnode_isvroot(vp)) { return (EINVAL); } + /* file system must not be mounted read-only */ + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data); } + case HFS_RESIZE_VOLUME: { u_int64_t newsize; u_int64_t cursize; @@ -895,6 +1858,11 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { if (!vnode_isvroot(vp)) { return (EINVAL); } + + /* filesystem must not be mounted read only */ + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } newsize = *(u_int64_t *)ap->a_data; cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; @@ -907,6 +1875,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { } } case HFS_CHANGE_NEXT_ALLOCATION: { + int error = 0; /* Assume success */ u_int32_t location; if (vnode_vfsisrdonly(vp)) { @@ 
-920,26 +1889,44 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { if (!vnode_isvroot(vp)) { return (EINVAL); } + hfs_lock_mount(hfsmp); location = *(u_int32_t *)ap->a_data; - if (location > hfsmp->totalBlocks - 1) { - return (EINVAL); + if ((location >= hfsmp->allocLimit) && + (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) { + error = EINVAL; + goto fail_change_next_allocation; } /* Return previous value. */ *(u_int32_t *)ap->a_data = hfsmp->nextAllocation; - HFS_MOUNT_LOCK(hfsmp, TRUE); - hfsmp->nextAllocation = location; - hfsmp->vcbFlags |= 0xFF00; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); - return (0); + if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) { + /* On magic value for location, set nextAllocation to next block + * after metadata zone and set flag in mount structure to indicate + * that nextAllocation should not be updated again. + */ + if (hfsmp->hfs_metazone_end != 0) { + HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1); + } + hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION; + } else { + hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION; + HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location); + } + MarkVCBDirty(hfsmp); +fail_change_next_allocation: + hfs_unlock_mount(hfsmp); + return (error); } -#ifdef HFS_SPARSE_DEV +#if HFS_SPARSE_DEV case HFS_SETBACKINGSTOREINFO: { struct vnode * bsfs_rootvp; struct vnode * di_vp; struct hfs_backingstoreinfo *bsdata; int error = 0; + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { return (EALREADY); } @@ -979,10 +1966,41 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { vnode_ref(bsfs_rootvp); vnode_put(bsfs_rootvp); + hfs_lock_mount(hfsmp); hfsmp->hfs_backingfs_rootvp = bsfs_rootvp; hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; - hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize; - hfsmp->hfs_sparsebandblks *= 4; + hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4; + hfs_unlock_mount(hfsmp); + + /* We check the 
MNTK_VIRTUALDEV bit instead of marking the dependent process */ + + /* + * If the sparse image is on a sparse image file (as opposed to a sparse + * bundle), then we may need to limit the free space to the maximum size + * of a file on that volume. So we query (using pathconf), and if we get + * a meaningful result, we cache the number of blocks for later use in + * hfs_freeblks(). + */ + hfsmp->hfs_backingfs_maxblocks = 0; + if (vnode_vtype(di_vp) == VREG) { + int terr; + int hostbits; + terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context); + if (terr == 0 && hostbits != 0 && hostbits < 64) { + u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits; + + hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize; + } + } + + /* The free extent cache is managed differently for sparse devices. + * There is a window between which the volume is mounted and the + * device is marked as sparse, so the free extent cache for this + * volume is currently initialized as normal volume (sorted by block + * count). Reset the cache so that it will be rebuilt again + * for sparse device (sorted by start block). 
+ */ + ResetVCBFreeExtCache(hfsmp); (void)vnode_put(di_vp); file_drop(bsdata->backingfd); @@ -996,349 +2014,420 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { kauth_cred_getuid(cred) != vfsp->f_owner) { return (EACCES); /* must be owner of file system */ } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) { + hfs_lock_mount(hfsmp); hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; tmpvp = hfsmp->hfs_backingfs_rootvp; hfsmp->hfs_backingfs_rootvp = NULLVP; hfsmp->hfs_sparsebandblks = 0; + hfs_unlock_mount(hfsmp); + vnode_rele(tmpvp); } return (0); } #endif /* HFS_SPARSE_DEV */ + /* Change the next CNID stored in the VH */ + case HFS_CHANGE_NEXTCNID: { + int error = 0; /* Assume success */ + u_int32_t fileid; + int wraparound = 0; + int lockflags = 0; + + if (vnode_vfsisrdonly(vp)) { + return (EROFS); + } + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + + fileid = *(u_int32_t *)ap->a_data; + + /* Must have catalog lock excl. to advance the CNID pointer */ + lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK); + + hfs_lock_mount(hfsmp); + + /* If it is less than the current next CNID, force the wraparound bit to be set */ + if (fileid < hfsmp->vcbNxtCNID) { + wraparound=1; + } + + /* Return previous value. 
*/ + *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID; + + hfsmp->vcbNxtCNID = fileid; + + if (wraparound) { + hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask; + } + + MarkVCBDirty(hfsmp); + hfs_unlock_mount(hfsmp); + hfs_systemfile_unlock (hfsmp, lockflags); + + return (error); + } + case F_FREEZE_FS: { struct mount *mp; - task_t task; - if (!is_suser()) - return (EACCES); - mp = vnode_mount(vp); hfsmp = VFSTOHFS(mp); if (!(hfsmp->jnl)) return (ENOTSUP); - lck_rw_lock_exclusive(&hfsmp->hfs_insync); - - task = current_task(); - task_working_set_disable(task); - - // flush things before we get started to try and prevent - // dirty data from being paged out while we're frozen. - // note: can't do this after taking the lock as it will - // deadlock against ourselves. - vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL); - hfs_global_exclusive_lock_acquire(hfsmp); - journal_flush(hfsmp->jnl); - - // don't need to iterate on all vnodes, we just need to - // wait for writes to the system files and the device vnode - if (HFSTOVCB(hfsmp)->extentsRefNum) - vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze"); - if (HFSTOVCB(hfsmp)->catalogRefNum) - vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze"); - if (HFSTOVCB(hfsmp)->allocationsRefNum) - vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze"); - if (hfsmp->hfs_attribute_vp) - vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze"); - vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze"); - - hfsmp->hfs_freezing_proc = current_proc(); + vfsp = vfs_statfs(mp); + + if (kauth_cred_getuid(cred) != vfsp->f_owner && + !kauth_cred_issuser(cred)) + return (EACCES); - return (0); + return hfs_freeze(hfsmp); } case F_THAW_FS: { - if (!is_suser()) + vfsp = vfs_statfs(vnode_mount(vp)); + if (kauth_cred_getuid(cred) != vfsp->f_owner && + !kauth_cred_issuser(cred)) return (EACCES); - // if we're not the one who froze the fs then we - // can't thaw 
it. - if (hfsmp->hfs_freezing_proc != current_proc()) { - return EPERM; + return hfs_thaw(hfsmp, current_proc()); + } + + case HFS_BULKACCESS_FSCTL: { + int size; + + if (hfsmp->hfs_flags & HFS_STANDARD) { + return EINVAL; + } + + if (is64bit) { + size = sizeof(struct user64_access_t); + } else { + size = sizeof(struct user32_access_t); + } + + return do_bulk_access_check(hfsmp, vp, ap, size, context); + } + + case HFS_EXT_BULKACCESS_FSCTL: { + int size; + + if (hfsmp->hfs_flags & HFS_STANDARD) { + return EINVAL; + } + + if (is64bit) { + size = sizeof(struct user64_ext_access_t); + } else { + size = sizeof(struct user32_ext_access_t); + } + + return do_bulk_access_check(hfsmp, vp, ap, size, context); + } + + case HFS_SET_XATTREXTENTS_STATE: { + int state; + + if (ap->a_data == NULL) { + return (EINVAL); } - // NOTE: if you add code here, also go check the - // code that "thaws" the fs in hfs_vnop_close() - // - hfsmp->hfs_freezing_proc = NULL; - hfs_global_exclusive_lock_release(hfsmp); - lck_rw_unlock_exclusive(&hfsmp->hfs_insync); + state = *(int *)ap->a_data; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } - return (0); + /* Super-user can enable or disable extent-based extended + * attribute support on a volume + * Note: Starting Mac OS X 10.7, extent-based extended attributes + * are enabled by default, so any change will be transient only + * till the volume is remounted. + */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return (EPERM); + } + if (state == 0 || state == 1) + return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state); + else + return (EINVAL); } -#define HFSIOC_BULKACCESS _IOW('h', 9, struct access_t) -#define HFS_BULKACCESS_FSCTL IOCBASECMD(HFSIOC_BULKACCESS) + case F_SETSTATICCONTENT: { + int error; + int enable_static = 0; + struct cnode *cp = NULL; + /* + * lock the cnode, decorate the cnode flag, and bail out. + * VFS should have already authenticated the caller for us. 
+ */ - case HFS_BULKACCESS_FSCTL: - case HFS_BULKACCESS: { - /* - * NOTE: on entry, the vnode is locked. Incase this vnode - * happens to be in our list of file_ids, we'll note it - * avoid calling hfs_chashget_nowait() on that id as that - * will cause a "locking against myself" panic. + if (ap->a_data) { + /* + * Note that even though ap->a_data is of type caddr_t, + * the fcntl layer at the syscall handler will pass in NULL + * or 1 depending on what the argument supplied to the fcntl + * was. So it is in fact correct to check the ap->a_data + * argument for zero or non-zero value when deciding whether or not + * to enable the static bit in the cnode. + */ + enable_static = 1; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + cp = VTOC(vp); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + if (enable_static) { + cp->c_flag |= C_SSD_STATIC; + } + else { + cp->c_flag &= ~C_SSD_STATIC; + } + hfs_unlock (cp); + } + return error; + } + + case F_SET_GREEDY_MODE: { + int error; + int enable_greedy_mode = 0; + struct cnode *cp = NULL; + /* + * lock the cnode, decorate the cnode flag, and bail out. + * VFS should have already authenticated the caller for us. 
*/ - Boolean check_leaf = true; - - struct user_access_t *user_access_structp; - struct user_access_t tmp_user_access_t; - struct access_cache cache; - - int error = 0, i; - - dev_t dev = VTOC(vp)->c_dev; - - short flags; - struct ucred myucred; - int num_files; - int *file_ids = NULL; - short *access = NULL; - - cnid_t cnid; - cnid_t prevParent_cnid = 0; - unsigned long myPerms; - short myaccess = 0; - struct cat_attr cnattr; - CatalogKey catkey; - struct cnode *skip_cp = VTOC(vp); - struct vfs_context my_context; - - /* set up front for common exit code */ - my_context.vc_ucred = NOCRED; - - /* first, return error if not run as root */ - if (cred->cr_ruid != 0) { - return EPERM; + + if (ap->a_data) { + /* + * Note that even though ap->a_data is of type caddr_t, + * the fcntl layer at the syscall handler will pass in NULL + * or 1 depending on what the argument supplied to the fcntl + * was. So it is in fact correct to check the ap->a_data + * argument for zero or non-zero value when deciding whether or not + * to enable the greedy mode bit in the cnode. 
+ */ + enable_greedy_mode = 1; } - - /* initialize the local cache and buffers */ - cache.numcached = 0; - cache.cachehits = 0; - cache.lookups = 0; - - file_ids = (int *) get_pathbuff(); - access = (short *) get_pathbuff(); - cache.acache = (int *) get_pathbuff(); - cache.haveaccess = (Boolean *) get_pathbuff(); - - if (file_ids == NULL || access == NULL || cache.acache == NULL || cache.haveaccess == NULL) { - release_pathbuff((char *) file_ids); - release_pathbuff((char *) access); - release_pathbuff((char *) cache.acache); - release_pathbuff((char *) cache.haveaccess); - - return ENOMEM; + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; } + cp = VTOC(vp); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + if (enable_greedy_mode) { + cp->c_flag |= C_SSD_GREEDY_MODE; + } + else { + cp->c_flag &= ~C_SSD_GREEDY_MODE; + } + hfs_unlock (cp); + } + return error; + } + + case F_SETIOTYPE: { + int error; + uint32_t iotypeflag = 0; - /* struct copyin done during dispatch... need to copy file_id array separately */ + struct cnode *cp = NULL; + /* + * lock the cnode, decorate the cnode flag, and bail out. + * VFS should have already authenticated the caller for us. + */ + if (ap->a_data == NULL) { - error = EINVAL; - goto err_exit_bulk_access; + return EINVAL; } - if (is64bit) { - user_access_structp = (struct user_access_t *)ap->a_data; + /* + * Note that even though ap->a_data is of type caddr_t, we + * can only use 32 bits of flag values. 
+ */ + iotypeflag = (uint32_t) ap->a_data; + switch (iotypeflag) { + case F_IOTYPE_ISOCHRONOUS: + break; + default: + return EINVAL; } - else { - struct access_t * accessp = (struct access_t *)ap->a_data; - tmp_user_access_t.uid = accessp->uid; - tmp_user_access_t.flags = accessp->flags; - tmp_user_access_t.num_groups = accessp->num_groups; - tmp_user_access_t.num_files = accessp->num_files; - tmp_user_access_t.file_ids = CAST_USER_ADDR_T(accessp->file_ids); - tmp_user_access_t.groups = CAST_USER_ADDR_T(accessp->groups); - tmp_user_access_t.access = CAST_USER_ADDR_T(accessp->access); - user_access_structp = &tmp_user_access_t; + + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; } - - num_files = user_access_structp->num_files; - if (num_files < 1) { - goto err_exit_bulk_access; + cp = VTOC(vp); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + switch (iotypeflag) { + case F_IOTYPE_ISOCHRONOUS: + cp->c_flag |= C_IO_ISOCHRONOUS; + break; + default: + break; + } + hfs_unlock (cp); } - if (num_files > 256) { - error = EINVAL; - goto err_exit_bulk_access; + return error; + } + + case F_MAKECOMPRESSED: { + int error = 0; + uint32_t gen_counter; + struct cnode *cp = NULL; + int reset_decmp = 0; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; } - - if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids, - num_files * sizeof(int)))) { - goto err_exit_bulk_access; + + /* + * acquire & lock the cnode. + * VFS should have already authenticated the caller for us. + */ + + if (ap->a_data) { + /* + * Cast the pointer into a uint32_t so we can extract the + * supplied generation counter. 
+ */ + gen_counter = *((uint32_t*)ap->a_data); } - - /* fill in the ucred structure */ - flags = user_access_structp->flags; - if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) { - flags = R_OK; + else { + return EINVAL; } - - /* check if we've been passed leaf node ids or parent ids */ - if (flags & PARENT_IDS_FLAG) { - check_leaf = false; + +#if HFS_COMPRESSION + cp = VTOC(vp); + /* Grab truncate lock first; we may truncate the file */ + hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + return error; } - - /* - * Create a templated credential; this credential may *NOT* - * be used unless instantiated with a kauth_cred_create(); - * there must be a correcponding kauth_cred_unref() when it - * is no longer in use (i.e. before it goes out of scope). - */ - memset(&myucred, 0, sizeof(myucred)); - myucred.cr_ref = 1; - myucred.cr_uid = myucred.cr_ruid = myucred.cr_svuid = user_access_structp->uid; - myucred.cr_ngroups = user_access_structp->num_groups; - if (myucred.cr_ngroups < 1 || myucred.cr_ngroups > 16) { - myucred.cr_ngroups = 0; - } else if ((error = copyin(user_access_structp->groups, (caddr_t)myucred.cr_groups, - myucred.cr_ngroups * sizeof(gid_t)))) { - goto err_exit_bulk_access; - } - myucred.cr_rgid = myucred.cr_svgid = myucred.cr_groups[0]; - myucred.cr_gmuid = myucred.cr_uid; - - my_context.vc_proc = p; - my_context.vc_ucred = kauth_cred_create(&myucred); - /* Check access to each file_id passed in */ - for (i = 0; i < num_files; i++) { -#if 0 - cnid = (cnid_t) file_ids[i]; - - /* root always has access */ - if (!suser(my_context.vc_ucred, NULL)) { - access[i] = 0; - continue; - } - - if (check_leaf) { - - /* do the lookup (checks the cnode hash, then the catalog) */ - error = do_attr_lookup(hfsmp, &cache, dev, cnid, skip_cp, &catkey, &cnattr, p); - if (error) { - access[i] = (short) error; - continue; - } - - /* before 
calling CheckAccess(), check the target file for read access */ - myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, - cnattr.ca_mode, hfsmp->hfs_mp, my_context.vc_ucred, p ); - - - /* fail fast if no access */ - if ((myPerms & flags) == 0) { - access[i] = EACCES; - continue; - } - } else { - /* we were passed an array of parent ids */ - catkey.hfsPlus.parentID = cnid; - } - - /* if the last guy had the same parent and had access, we're done */ - if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0) { - cache.cachehits++; - access[i] = 0; - continue; - } - - myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, - skip_cp, p, my_context.vc_ucred, dev); - - if ( myaccess ) { - access[i] = 0; // have access.. no errors to report - } else { - access[i] = (error != 0 ? (short) error : EACCES); - } - - prevParent_cnid = catkey.hfsPlus.parentID; -#else - int myErr; - - cnid = (cnid_t)file_ids[i]; - - while (cnid >= kRootDirID) { - /* get the vnode for this cnid */ - myErr = hfs_vget(hfsmp, cnid, &vp, 0); - if ( myErr ) { - access[i] = EACCES; - break; - } + /* Are there any other usecounts/FDs? */ + if (vnode_isinuse(vp, 1)) { + hfs_unlock(cp); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + return EBUSY; + } + + /* now we have the cnode locked down; Validate arguments */ + if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) { + /* EINVAL if you are trying to manipulate an IMMUTABLE file */ + hfs_unlock(cp); + hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); + return EINVAL; + } + + if ((hfs_get_gencount (cp)) == gen_counter) { + /* + * OK, the gen_counter matched. 
Go for it: + * Toggle state bits, truncate file, and suppress mtime update + */ + reset_decmp = 1; + cp->c_bsdflags |= UF_COMPRESSED; + + error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES, + ap->a_context); + } + else { + error = ESTALE; + } - cnid = VTOC(vp)->c_parentcnid; + /* Unlock cnode before executing decmpfs ; they may need to get an EA */ + hfs_unlock(cp); - hfs_unlock(VTOC(vp)); - if (vnode_vtype(vp) == VDIR) { - /* - * XXX This code assumes that none of the - * XXX callbacks from vnode_authorize() will - * XXX take a persistent ref on the context - * XXX credential, which is a bad assumption. - */ - myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), &my_context); - } else { - myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, &my_context); - } - vnode_put(vp); - access[i] = myErr; - if (myErr) { - break; - } + /* + * Reset the decmp state while still holding the truncate lock. We need to + * serialize here against a listxattr on this node which may occur at any + * time. + * + * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed, + * that will still potentially require getting the com.apple.decmpfs EA. If the + * EA is required, then we can't hold the cnode lock, because the getxattr call is + * generic(through VFS), and can't pass along any info telling it that we're already + * holding it (the lock). If we don't serialize, then we risk listxattr stopping + * and trying to fill in the hfs_file_is_compressed info during the callback + * operation, which will result in deadlock against the b-tree node. + * + * So, to serialize against listxattr (which will grab buf_t meta references on + * the b-tree blocks), we hold the truncate lock as we're manipulating the + * decmpfs payload. 
+ */ + if ((reset_decmp) && (error == 0)) { + decmpfs_cnode *dp = VTOCMP (vp); + if (dp != NULL) { + decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0); } -#endif - } - - /* copyout the access array */ - if ((error = copyout((caddr_t)access, user_access_structp->access, - num_files * sizeof (short)))) { - goto err_exit_bulk_access; + + /* Initialize the decmpfs node as needed */ + (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */ } - - err_exit_bulk_access: - - //printf("on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups); - - release_pathbuff((char *) cache.acache); - release_pathbuff((char *) cache.haveaccess); - release_pathbuff((char *) file_ids); - release_pathbuff((char *) access); - /* clean up local context, if needed */ - if (IS_VALID_CRED(my_context.vc_ucred)) - kauth_cred_unref(&my_context.vc_ucred); - - return (error); - } /* HFS_BULKACCESS */ - case HFS_SETACLSTATE: { - int state; + hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); - if (ap->a_data == NULL) { - return (EINVAL); - } +#endif + return error; + } - vfsp = vfs_statfs(HFSTOVFS(hfsmp)); - state = *(int *)ap->a_data; + case F_SETBACKINGSTORE: { - // super-user can enable or disable acl's on a volume. 
- // the volume owner can only enable acl's - if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) { - return (EPERM); + int error = 0; + + /* + * See comment in F_SETSTATICCONTENT re: using + * a null check for a_data + */ + if (ap->a_data) { + error = hfs_set_backingstore (vp, 1); } - if (state == 0 || state == 1) - return hfs_setextendedsecurity(hfsmp, state); - else - return (EINVAL); + else { + error = hfs_set_backingstore (vp, 0); + } + + return error; + } + + case F_GETPATH_MTMINFO: { + int error = 0; + + int *data = (int*) ap->a_data; + + /* Ask if this is a backingstore vnode */ + error = hfs_is_backingstore (vp, data); + + return error; } case F_FULLFSYNC: { int error; - - error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK); + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); if (error == 0) { - error = hfs_fsync(vp, MNT_NOWAIT, TRUE, p); + error = hfs_fsync(vp, MNT_WAIT, TRUE, p); hfs_unlock(VTOC(vp)); } @@ -1352,7 +2441,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { if (!vnode_isreg(vp)) return EINVAL; - error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK); + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); if (error == 0) { cp = VTOC(vp); /* @@ -1378,107 +2467,88 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { fp = VTOF(vp); /* Protect against a size change. 
*/ - hfs_lock_truncate(VTOC(vp), TRUE); - + hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + +#if HFS_COMPRESSION + if (compressed && (uncompressed_size == -1)) { + /* fetching the uncompressed size failed above, so return the error */ + error = decmpfs_error; + } else if ((compressed && (ra->ra_offset >= uncompressed_size)) || + (!compressed && (ra->ra_offset >= fp->ff_size))) { + error = EFBIG; + } +#else /* HFS_COMPRESSION */ if (ra->ra_offset >= fp->ff_size) { error = EFBIG; - } else { + } +#endif /* HFS_COMPRESSION */ + else { error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count); } - hfs_unlock_truncate(VTOC(vp)); + hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT); return (error); } - case F_READBOOTSTRAP: - case F_WRITEBOOTSTRAP: - { - struct vnode *devvp = NULL; - user_fbootstraptransfer_t *user_bootstrapp; - int devBlockSize; - int error; - uio_t auio; - daddr64_t blockNumber; - u_long blockOffset; - u_long xfersize; - struct buf *bp; - user_fbootstraptransfer_t user_bootstrap; - - if (!vnode_isvroot(vp)) - return (EINVAL); - /* LP64 - when caller is a 64 bit process then we are passed a pointer - * to a user_fbootstraptransfer_t else we get a pointer to a - * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t - */ - if (is64bit) { - user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data; - } - else { - fbootstraptransfer_t *bootstrapp = (fbootstraptransfer_t *)ap->a_data; - user_bootstrapp = &user_bootstrap; - user_bootstrap.fbt_offset = bootstrapp->fbt_offset; - user_bootstrap.fbt_length = bootstrapp->fbt_length; - user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer); - } - if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024) - return EINVAL; - - devvp = VTOHFS(vp)->hfs_devvp; - auio = uio_create(1, user_bootstrapp->fbt_offset, - is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32, - (ap->a_command == F_WRITEBOOTSTRAP) ? 
UIO_WRITE : UIO_READ); - uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length); - - devBlockSize = vfs_devblocksize(vnode_mount(vp)); - - while (uio_resid(auio) > 0) { - blockNumber = uio_offset(auio) / devBlockSize; - error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp); - if (error) { - if (bp) buf_brelse(bp); - uio_free(auio); - return error; - }; - - blockOffset = uio_offset(auio) % devBlockSize; - xfersize = devBlockSize - blockOffset; - error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio); - if (error) { - buf_brelse(bp); - uio_free(auio); - return error; - }; - if (uio_rw(auio) == UIO_WRITE) { - error = VNOP_BWRITE(bp); - if (error) { - uio_free(auio); - return error; - } - } else { - buf_brelse(bp); - }; - }; - uio_free(auio); - }; - return 0; - case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */ { if (is64bit) { *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate)); } else { - *(time_t *)(ap->a_data) = to_bsd_time(VTOVCB(vp)->localCreateDate); + *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate)); } return 0; } - case HFS_GET_MOUNT_TIME: - return copyout(&hfsmp->hfs_mount_time, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_mount_time)); + case SPOTLIGHT_FSCTL_GET_MOUNT_TIME: + *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time; + break; + + case SPOTLIGHT_FSCTL_GET_LAST_MTIME: + *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime; + break; + + case HFS_FSCTL_GET_VERY_LOW_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit; + break; + + case HFS_FSCTL_SET_VERY_LOW_DISK: + if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) { + return EINVAL; + } + + hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data; + break; + + case HFS_FSCTL_GET_LOW_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit; + break; + + case HFS_FSCTL_SET_LOW_DISK: + if ( 
*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel + || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) { + + return EINVAL; + } + + hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data; + break; + + case HFS_FSCTL_GET_DESIRED_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel; + break; + + case HFS_FSCTL_SET_DESIRED_DISK: + if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) { + return EINVAL; + } + + hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data; break; - case HFS_GET_LAST_MTIME: - return copyout(&hfsmp->hfs_last_mounted_mtime, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_last_mounted_mtime)); + case HFS_VOLUME_STATUS: + *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions; break; case HFS_SET_BOOT_INFO: @@ -1486,25 +2556,229 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { return(EINVAL); if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner)) return(EACCES); /* must be superuser or owner of filesystem */ - HFS_MOUNT_LOCK(hfsmp, TRUE); + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + hfs_lock_mount (hfsmp); bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo)); - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfs_unlock_mount (hfsmp); (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); break; case HFS_GET_BOOT_INFO: if (!vnode_isvroot(vp)) return(EINVAL); - HFS_MOUNT_LOCK(hfsmp, TRUE); + hfs_lock_mount (hfsmp); bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo)); - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfs_unlock_mount(hfsmp); + break; + + case HFS_MARK_BOOT_CORRUPT: + /* Mark the boot volume corrupt by setting + * kHFSVolumeInconsistentBit in the volume header. This will + * force fsck_hfs on next mount. 
+ */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + + /* Allowed only on the root vnode of the boot volume */ + if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) || + !vnode_isvroot(vp)) { + return EINVAL; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n"); + hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED); + break; + + case HFS_FSCTL_GET_JOURNAL_INFO: + jip = (struct hfs_journal_info*)ap->a_data; + + if (vp == NULLVP) + return EINVAL; + + if (hfsmp->jnl == NULL) { + jnl_start = 0; + jnl_size = 0; + } else { + jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset; + jnl_size = (off_t)hfsmp->jnl_size; + } + + jip->jstart = jnl_start; + jip->jsize = jnl_size; + break; + + case HFS_SET_ALWAYS_ZEROFILL: { + struct cnode *cp = VTOC(vp); + + if (*(int *)ap->a_data) { + cp->c_flag |= C_ALWAYS_ZEROFILL; + } else { + cp->c_flag &= ~C_ALWAYS_ZEROFILL; + } + break; + } + + case HFS_DISABLE_METAZONE: { + /* Only root can disable metadata zone */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + + /* Disable metadata zone now */ + (void) hfs_metadatazone_init(hfsmp, true); + printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN); + break; + } + + + case HFS_FSINFO_METADATA_BLOCKS: { + int error; + struct hfsinfo_metadata *hinfo; + + hinfo = (struct hfsinfo_metadata *)ap->a_data; + + /* Get information about number of metadata blocks */ + error = hfs_getinfo_metadata_blocks(hfsmp, hinfo); + if (error) { + return error; + } + + break; + } + + case HFS_GET_FSINFO: { + hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data; + + /* Only root is allowed to get fsinfo */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + + /* + * Make sure that the caller's version number matches with + * the kernel's version number. 
This will make sure that + * if the structures being read/written into are changed + * by the kernel, the caller will not read incorrect data. + * + * The first three fields --- request_type, version and + * flags are same for all the hfs_fsinfo structures, so + * we can access the version number by assuming any + * structure for now. + */ + if (fsinfo->header.version != HFS_FSINFO_VERSION) { + return ENOTSUP; + } + + /* Make sure that the current file system is not marked inconsistent */ + if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) { + return EIO; + } + + return hfs_get_fsinfo(hfsmp, ap->a_data); + } + + case HFS_CS_FREESPACE_TRIM: { + int error = 0; + int lockflags = 0; + + /* Only root allowed */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + + /* + * This core functionality is similar to hfs_scan_blocks(). + * The main difference is that hfs_scan_blocks() is called + * as part of mount where we are assured that the journal is + * empty to start with. This fcntl() can be called on a + * mounted volume, therefore it has to flush the content of + * the journal as well as ensure the state of summary table. + * + * This fcntl scans over the entire allocation bitmap, + * creates list of all the free blocks, and issues TRIM + * down to the underlying device. This can take long time + * as it can generate up to 512MB of read I/O. + */ + + if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) { + error = hfs_init_summary(hfsmp); + if (error) { + printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN); + return error; + } + } + + /* + * The journal maintains list of recently deallocated blocks to + * issue DKIOCUNMAPs when the corresponding journal transaction is + * flushed to the disk. To avoid any race conditions, we only + * want one active trim list and only one thread issuing DKIOCUNMAPs. + * Therefore we make sure that the journal trim list is sync'ed, + * empty, and not modifiable for the duration of our scan. 
+ * + * Take the journal lock before flushing the journal to the disk. + * We will keep on holding the journal lock till we don't get the + * bitmap lock to make sure that no new journal transactions can + * start. This will make sure that the journal trim list is not + * modified after the journal flush and before getting bitmap lock. + * We can release the journal lock after we acquire the bitmap + * lock as it will prevent any further block deallocations. + */ + hfs_journal_lock(hfsmp); + + /* Flush the journal and wait for all I/Os to finish up */ + error = hfs_journal_flush(hfsmp, TRUE); + if (error) { + hfs_journal_unlock(hfsmp); + return error; + } + + /* Take bitmap lock to ensure it is not being modified */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* Release the journal lock */ + hfs_journal_unlock(hfsmp); + + /* + * ScanUnmapBlocks reads the bitmap in large block size + * (up to 1MB) unlike the runtime which reads the bitmap + * in the 4K block size. This can cause buf_t collisions + * and potential data corruption. To avoid this, we + * invalidate all the existing buffers associated with + * the bitmap vnode before scanning it. + * + * Note: ScanUnmapBlock() cleans up all the buffers + * after itself, so there won't be any large buffers left + * for us to clean up after it returns. 
+ */ + error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); + if (error) { + hfs_systemfile_unlock(hfsmp, lockflags); + return error; + } + + /* Traverse bitmap and issue DKIOCUNMAPs */ + error = ScanUnmapBlocks(hfsmp); + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + return error; + } + break; + } default: return (ENOTTY); } - /* Should never get here */ return 0; } @@ -1536,13 +2810,12 @@ hfs_vnop_select(__unused struct vnop_select_args *ap) * The block run is returned in logical blocks, and is the REMAINING amount of blocks */ int -hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, int *runp) +hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp) { - struct cnode *cp = VTOC(vp); struct filefork *fp = VTOF(vp); struct hfsmount *hfsmp = VTOHFS(vp); int retval = E_NONE; - daddr_t logBlockSize; + u_int32_t logBlockSize; size_t bytesContAvail = 0; off_t blockposition; int lockExtBtree; @@ -1553,17 +2826,17 @@ hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, int * * to physical mapping is requested. */ if (vpp != NULL) - *vpp = cp->c_devvp; + *vpp = hfsmp->hfs_devvp; if (bnp == NULL) return (0); logBlockSize = GetLogicalBlockSize(vp); - blockposition = (off_t)bn * (off_t)logBlockSize; + blockposition = (off_t)bn * logBlockSize; lockExtBtree = overflow_extents(fp); if (lockExtBtree) - lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK); + lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); retval = MacToVFSError( MapFileBlockC (HFSTOVCB(hfsmp), @@ -1633,6 +2906,15 @@ hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap) /* * Map file offset to physical block number. * + * If this function is called for write operation, and if the file + * had virtual blocks allocated (delayed allocation), real blocks + * are allocated by calling ExtendFileC(). 
+ * + * If this function is called for read operation, and if the file + * had virtual blocks allocated (delayed allocation), no change + * to the size of file is done, and if required, rangelist is + * searched for mapping. + * * System file cnodes are expected to be locked (shared or exclusive). */ int @@ -1663,6 +2945,26 @@ hfs_vnop_blockmap(struct vnop_blockmap_args *ap) int started_tr = 0; int tooklock = 0; +#if HFS_COMPRESSION + if (VNODE_IS_RSRC(vp)) { + /* allow blockmaps to the resource fork */ + } else { + if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */ + int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp)); + switch(state) { + case FILE_IS_COMPRESSED: + return ENOTSUP; + case FILE_IS_CONVERTING: + /* if FILE_IS_CONVERTING, we allow blockmap */ + break; + default: + printf("invalid state %d for compressed file\n", state); + /* fall through */ + } + } + } +#endif /* HFS_COMPRESSION */ + /* Do not allow blockmap operation on a directory */ if (vnode_isdir(vp)) { return (ENOTSUP); @@ -1675,14 +2977,10 @@ hfs_vnop_blockmap(struct vnop_blockmap_args *ap) if (ap->a_bpn == NULL) return (0); - if ( !vnode_issystem(vp) && !vnode_islnk(vp)) { + if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) { if (VTOC(vp)->c_lockowner != current_thread()) { - hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); tooklock = 1; - } else { - cp = VTOC(vp); - panic("blockmap: %s cnode lock already held!\n", - cp->c_desc.cd_nameptr ? cp->c_desc.cd_nameptr : ""); } } hfsmp = VTOHFS(vp); @@ -1690,7 +2988,8 @@ hfs_vnop_blockmap(struct vnop_blockmap_args *ap) fp = VTOF(vp); retry: - if (fp->ff_unallocblocks) { + /* Check virtual blocks only when performing write operation */ + if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) { if (hfs_start_transaction(hfsmp) != 0) { retval = EINVAL; goto exit; @@ -1709,8 +3008,8 @@ retry: /* * Check for any delayed allocations. 
*/ - if (fp->ff_unallocblocks) { - SInt64 actbytes; + if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) { + int64_t actbytes; u_int32_t loanedBlocks; // @@ -1743,12 +3042,10 @@ retry: cp->c_blocks += loanedBlocks; fp->ff_blocks += loanedBlocks; - HFS_MOUNT_LOCK(hfsmp, TRUE); + hfs_lock_mount (hfsmp); hfsmp->loanedBlocks += loanedBlocks; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); - } + hfs_unlock_mount (hfsmp); - if (retval) { hfs_systemfile_unlock(hfsmp, lockflags); cp->c_flag |= C_MODIFIED; if (started_tr) { @@ -1756,6 +3053,7 @@ retry: (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); hfs_end_transaction(hfsmp); + started_tr = 0; } goto exit; } @@ -1775,10 +3073,62 @@ retry: started_tr = 0; } if (retval) { + /* On write, always return error because virtual blocks, if any, + * should have been allocated in ExtendFileC(). We do not + * allocate virtual blocks on read, therefore return error + * only if no virtual blocks are allocated. Otherwise we search + * rangelist for zero-fills + */ + if ((MacToVFSError(retval) != ERANGE) || + (ap->a_flags & VNODE_WRITE) || + ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) { + goto exit; + } + + /* Validate if the start offset is within logical file size */ + if (ap->a_foffset >= fp->ff_size) { + goto exit; + } + + /* + * At this point, we have encountered a failure during + * MapFileBlockC that resulted in ERANGE, and we are not servicing + * a write, and there are borrowed blocks. + * + * However, the cluster layer will not call blockmap for + * blocks that are borrowed and in-cache. We have to assume that + * because we observed ERANGE being emitted from MapFileBlockC, this + * extent range is not valid on-disk. So we treat this as a + * mapping that needs to be zero-filled prior to reading. 
+ * + * Note that under certain circumstances (such as non-contiguous + * userland VM mappings in the calling process), cluster_io + * may be forced to split a large I/O driven by hfs_vnop_write + * into multiple sub-I/Os that necessitate a RMW cycle. If this is + * the case here, then we have already removed the invalid range list + * mapping prior to getting to this blockmap call, so we should not + * search the invalid rangelist for this byte range. + */ + + bytesContAvail = fp->ff_size - ap->a_foffset; + /* + * Clip the contiguous available bytes to, at most, the allowable + * maximum or the amount requested. + */ + + if (bytesContAvail > ap->a_size) { + bytesContAvail = ap->a_size; + } + + *ap->a_bpn = (daddr64_t) -1; + retval = 0; + goto exit; } - /* Adjust the mapping information for invalid file ranges: */ + /* MapFileC() found a valid extent in the filefork. Search the + * mapping information further for invalid file ranges + */ overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset, ap->a_foffset + (off_t)bytesContAvail - 1, &invalid_range); @@ -1787,7 +3137,7 @@ retry: case RL_MATCHINGOVERLAP: case RL_OVERLAPCONTAINSRANGE: case RL_OVERLAPSTARTSBEFORE: - /* There's no valid block for this byte offset: */ + /* There's no valid block for this byte offset */ *ap->a_bpn = (daddr64_t)-1; /* There's no point limiting the amount to be returned * if the invalid range that was hit extends all the way @@ -1795,7 +3145,7 @@ retry: * end of this range and the file's EOF): */ if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && - (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) { + ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; } break; @@ -1807,7 +3157,7 @@ retry: /* There's actually no valid information to be had starting here: */ *ap->a_bpn = (daddr64_t)-1; if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && - (invalid_range->rl_end + 1 - ap->a_foffset 
< bytesContAvail)) { + ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; } } else { @@ -1820,20 +3170,23 @@ retry: } /* end switch */ if (bytesContAvail > ap->a_size) bytesContAvail = ap->a_size; + } + +exit: + if (retval == 0) { + if (ap->a_run) + *ap->a_run = bytesContAvail; + + if (ap->a_poff) + *(int *)ap->a_poff = 0; } - if (ap->a_run) - *ap->a_run = bytesContAvail; - if (ap->a_poff) - *(int *)ap->a_poff = 0; -exit: if (tooklock) hfs_unlock(cp); return (MacToVFSError(retval)); } - /* * prepare and issue the I/O * buf_strategy knows how to deal @@ -1845,35 +3198,126 @@ hfs_vnop_strategy(struct vnop_strategy_args *ap) { buf_t bp = ap->a_bp; vnode_t vp = buf_vnode(bp); - struct cnode *cp = VTOC(vp); + int error = 0; + + /* Mark buffer as containing static data if cnode flag set */ + if (VTOC(vp)->c_flag & C_SSD_STATIC) { + buf_markstatic(bp); + } + + /* Mark buffer as containing static data if cnode flag set */ + if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) { + bufattr_markgreedymode(&bp->b_attr); + } - return (buf_strategy(cp->c_devvp, ap)); + /* mark buffer as containing burst mode data if cnode flag set */ + if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) { + bufattr_markisochronous(&bp->b_attr); + } + +#if CONFIG_PROTECT + cnode_t *cp = NULL; + + if ((!bufattr_rawencrypted(&bp->b_attr)) && + ((cp = cp_get_protected_cnode(vp)) != NULL)) { + /* + * We rely upon the truncate lock to protect the + * CP cache key from getting tossed prior to our IO finishing here. + * Nearly all cluster io calls to manipulate file payload from HFS + * take the truncate lock before calling into the cluster + * layer to ensure the file size does not change, or that they + * have exclusive right to change the EOF of the file. + * That same guarantee protects us here since the code that + * deals with CP lock events must now take the truncate lock + * before doing anything. 
+ * + * There is one exception here: + * 1) VM swapfile IO, because HFS will + * funnel the VNOP_PAGEOUT directly into a cluster_pageout call for the + * swapfile code only without holding the truncate lock. This is because + * individual swapfiles are maintained at fixed-length sizes by the VM code. + * In non-swapfile IO we use PAGEOUT_V2 semantics which allow us to + * create our own UPL and thus take the truncate lock before calling + * into the cluster layer. In the swapfile case, however, we are not concerned + * with the CP blob being wiped out in the middle of the IO + * because there isn't anything to toss; the VM swapfile key stays + * in-core as long as the file is open. + */ + + + /* + * Last chance: If this data protected I/O does not have unwrapped keys + * present, then try to get them. We already know that it should, by this point. + */ + if (cp->c_cpentry->cp_flags & (CP_KEY_FLUSHED | CP_NEEDS_KEYS)) { + int io_op = ( (buf_flags(bp) & B_READ) ? CP_READ_ACCESS : CP_WRITE_ACCESS); + if ((error = cp_handle_vnop(vp, io_op, 0)) != 0) { + /* + * We have to be careful here. By this point in the I/O path, VM or the cluster + * engine has prepared a buf_t with the proper file offsets and all the rest, + * so simply erroring out will result in us leaking this particular buf_t. + * We need to properly decorate the buf_t just as buf_strategy would so as + * to make it appear that the I/O errored out with the particular error code. + */ + buf_seterror (bp, error); + buf_biodone(bp); + return error; + } + } + + /* + * NB: + * For filesystem resize, we may not have access to the underlying + * file's cache key for whatever reason (device may be locked). However, + * we do not need it since we are going to use the temporary HFS-wide resize key + * which is generated once we start relocating file content.
If this file's I/O + * should be done using the resize key, it will have been supplied already, so + * do not attach the file's cp blob to the buffer. + */ + if ((cp->c_cpentry->cp_flags & CP_RELOCATION_INFLIGHT) == 0) { + buf_setcpaddr(bp, cp->c_cpentry); + } + } +#endif /* CONFIG_PROTECT */ + + error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap); + + return error; } +static int +hfs_minorupdate(struct vnode *vp) { + struct cnode *cp = VTOC(vp); + cp->c_flag &= ~C_MODIFIED; + cp->c_touch_acctime = 0; + cp->c_touch_chgtime = 0; + cp->c_touch_modtime = 0; + + return 0; +} -static int -do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_context_t context) +int +do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context) { register struct cnode *cp = VTOC(vp); struct filefork *fp = VTOF(vp); - struct proc *p = vfs_context_proc(context);; kauth_cred_t cred = vfs_context_ucred(context); int retval; off_t bytesToAdd; off_t actualBytesAdded; off_t filebytes; - u_int64_t old_filesize; - u_long fileblocks; + u_int32_t fileblocks; int blksize; struct hfsmount *hfsmp; int lockflags; + int skipupdate = (truncateflags & HFS_TRUNCATE_SKIPUPDATE); + int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES); blksize = VTOVCB(vp)->blockSize; fileblocks = fp->ff_blocks; filebytes = (off_t)fileblocks * (off_t)blksize; - old_filesize = fp->ff_size; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START, + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START, (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); if (length < 0) @@ -1922,13 +3366,14 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_ */ if (length > filebytes) { int eflags; - u_long blockHint = 0; + u_int32_t blockHint = 0; /* All or nothing and don't round up to clumpsize. 
*/ eflags = kEFAllMask | kEFNoClumpMask; - if (cred && suser(cred, NULL) != 0) + if (cred && (suser(cred, NULL) != 0)) { eflags |= kEFReserveMask; /* keep a reserve */ + } /* * Allocate Journal and Quota files in metadata zone. @@ -1950,6 +3395,10 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_ lockflags |= SFL_EXTENTS; lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + /* + * Keep growing the file as long as the current EOF is + * less than the desired value. + */ while ((length > filebytes) && (retval == E_NONE)) { bytesToAdd = length - filebytes; retval = MacToVFSError(ExtendFileC(VTOVCB(vp), @@ -1970,8 +3419,13 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_ hfs_systemfile_unlock(hfsmp, lockflags); if (hfsmp->jnl) { - (void) hfs_update(vp, TRUE); - (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + if (skipupdate) { + (void) hfs_minorupdate(vp); + } + else { + (void) hfs_update(vp, TRUE); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + } } hfs_end_transaction(hfsmp); @@ -1979,12 +3433,16 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_ if (retval) goto Err_Exit; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE, + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE, (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); } - if (!(flags & IO_NOZEROFILL)) { - if (UBCINFOEXISTS(vp) && retval == E_NONE) { + if (ISSET(flags, IO_NOZEROFILL)) { + // An optimisation for the hibernation file + if (vnode_isswap(vp)) + rl_remove_all(&fp->ff_invalidranges); + } else { + if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) { struct rl_entry *invalid_range; off_t zero_limit; @@ -2008,7 +3466,7 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_ retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit, fp->ff_size, (off_t)0, (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY); - hfs_lock(cp, 
HFS_FORCE_LOCK); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); if (retval) goto Err_Exit; /* Merely invalidate the remaining area, if necessary: */ @@ -2031,39 +3489,17 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_ panic("hfs_truncate: invoked on non-UBC object?!"); }; } - cp->c_touch_modtime = TRUE; - fp->ff_size = length; - - /* Nested transactions will do their own ubc_setsize. */ - if (!skipsetsize) { - /* - * ubc_setsize can cause a pagein here - * so we need to drop cnode lock. - */ - hfs_unlock(cp); - ubc_setsize(vp, length); - hfs_lock(cp, HFS_FORCE_LOCK); + if (suppress_times == 0) { + cp->c_touch_modtime = TRUE; } + fp->ff_size = length; } else { /* Shorten the size of the file */ - if ((off_t)fp->ff_size > length) { - /* - * Any buffers that are past the truncation point need to be - * invalidated (to maintain buffer cache consistency). - */ - - /* Nested transactions will do their own ubc_setsize. */ - if (!skipsetsize) { - /* - * ubc_setsize can cause a pageout here - * so we need to drop cnode lock. 
- */ - hfs_unlock(cp); - ubc_setsize(vp, length); - hfs_lock(cp, HFS_FORCE_LOCK); - } - + // An optimisation for the hibernation file + if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) { + rl_remove_all(&fp->ff_invalidranges); + } else if ((off_t)fp->ff_size > length) { /* Any space previously marked as invalid is now irrelevant: */ rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges); } @@ -2076,8 +3512,7 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_ u_int32_t finalblks; u_int32_t loanedBlocks; - HFS_MOUNT_LOCK(hfsmp, TRUE); - + hfs_lock_mount(hfsmp); loanedBlocks = fp->ff_unallocblocks; cp->c_blocks -= loanedBlocks; fp->ff_blocks -= loanedBlocks; @@ -2095,138 +3530,413 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, vfs_ cp->c_blocks += loanedBlocks; fp->ff_blocks += loanedBlocks; } - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfs_unlock_mount (hfsmp); } - /* - * For a TBE process the deallocation of the file blocks is - * delayed until the file is closed. And hfs_close calls - * truncate with the IO_NDELAY flag set. So when IO_NDELAY - * isn't set, we make sure this isn't a TBE process. 
- */ - if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) { #if QUOTA - off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize); + off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize); #endif /* QUOTA */ - if (hfs_start_transaction(hfsmp) != 0) { - retval = EINVAL; - goto Err_Exit; - } + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; + } - if (fp->ff_unallocblocks == 0) { - /* Protect extents b-tree and allocation bitmap */ - lockflags = SFL_BITMAP; - if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + if (fp->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - retval = MacToVFSError(TruncateFileC(VTOVCB(vp), - (FCB*)fp, length, false)); + retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0, + FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false)); - hfs_systemfile_unlock(hfsmp, lockflags); + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (hfsmp->jnl) { + if (retval == 0) { + fp->ff_size = length; } - if (hfsmp->jnl) { - if (retval == 0) { - fp->ff_size = length; - } + if (skipupdate) { + (void) hfs_minorupdate(vp); + } + else { (void) hfs_update(vp, TRUE); (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); } + } + hfs_end_transaction(hfsmp); - hfs_end_transaction(hfsmp); - - filebytes = (off_t)fp->ff_blocks * (off_t)blksize; - if (retval) - goto Err_Exit; + filebytes = (off_t)fp->ff_blocks * (off_t)blksize; + if (retval) + goto Err_Exit; #if QUOTA - /* These are bytesreleased */ - (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0); + /* These are bytesreleased */ + (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0); #endif /* QUOTA */ - } - /* Only set update flag if the logical length changes */ - if (old_filesize != length) + + /* + * 
Only set update flag if the logical length changes & we aren't + * suppressing modtime updates. + */ + if (((off_t)fp->ff_size != length) && (suppress_times == 0)) { cp->c_touch_modtime = TRUE; + } fp->ff_size = length; } - cp->c_touch_chgtime = TRUE; - retval = hfs_update(vp, MNT_WAIT); + if (cp->c_mode & (S_ISUID | S_ISGID)) { + if (!vfs_context_issuser(context)) { + cp->c_mode &= ~(S_ISUID | S_ISGID); + skipupdate = 0; + } + } + if (skipupdate) { + retval = hfs_minorupdate(vp); + } + else { + cp->c_touch_chgtime = TRUE; /* status changed */ + if (suppress_times == 0) { + cp->c_touch_modtime = TRUE; /* file data was modified */ + + /* + * If we are not suppressing the modtime update, then + * update the gen count as well. + */ + if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) { + hfs_incr_gencount(cp); + } + } + + retval = hfs_update(vp, MNT_WAIT); + } if (retval) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE, + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE, -1, -1, -1, retval, 0); } Err_Exit: - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END, + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END, (int)length, (int)fp->ff_size, (int)filebytes, retval, 0); return (retval); } +/* + * Preparation which must be done prior to deleting the catalog record + * of a file or directory. In order to make the on-disk state as safe as possible, + * we remove the catalog entry before releasing the bitmap blocks and the + * overflow extent records. However, some work must be done prior to deleting + * the catalog record. + * + * When calling this function, the cnode must exist both in memory and on-disk. + * If there are both resource fork and data fork vnodes, this function should + * be called on both. + */ + +int +hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) { + + struct filefork *fp = VTOF(vp); + struct cnode *cp = VTOC(vp); +#if QUOTA + int retval = 0; +#endif /* QUOTA */ + + /* Cannot truncate an HFS directory!
*/ + if (vnode_isdir(vp)) { + return (EISDIR); + } + + /* + * See the comment below in hfs_truncate for why we need to call + * setsize here. Essentially we want to avoid pending IO if we + * already know that the blocks are going to be released here. + * This function is only called when totally removing all storage for a file, so + * we can take a shortcut and immediately setsize (0); + */ + ubc_setsize(vp, 0); + + /* This should only happen with a corrupt filesystem */ + if ((off_t)fp->ff_size < 0) + return (EINVAL); + + /* + * We cannot just check if fp->ff_size == length (as an optimization) + * since there may be extra physical blocks that also need truncation. + */ +#if QUOTA + if ((retval = hfs_getinoquota(cp))) { + return(retval); + } +#endif /* QUOTA */ + + /* Wipe out any invalid ranges which have yet to be backed by disk */ + rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges); + + /* + * Account for any unmapped blocks. Since we're deleting the + * entire file, we don't have to worry about just shrinking + * to a smaller number of borrowed blocks. + */ + if (fp->ff_unallocblocks > 0) { + u_int32_t loanedBlocks; + + hfs_lock_mount (hfsmp); + loanedBlocks = fp->ff_unallocblocks; + cp->c_blocks -= loanedBlocks; + fp->ff_blocks -= loanedBlocks; + fp->ff_unallocblocks = 0; + + hfsmp->loanedBlocks -= loanedBlocks; + + hfs_unlock_mount (hfsmp); + } + + return 0; +} + + +/* + * Special wrapper around calling TruncateFileC. This function is useable + * even when the catalog record does not exist any longer, making it ideal + * for use when deleting a file. The simplification here is that we know + * that we are releasing all blocks. + * + * Note that this function may be called when there is no vnode backing + * the file fork in question. We may call this from hfs_vnop_inactive + * to clear out resource fork data (and may not want to clear out the data + * fork yet). As a result, we pointer-check both sets of inputs before + * doing anything with them. 
+ * + * The caller is responsible for saving off a copy of the filefork(s) + * embedded within the cnode prior to calling this function. The pointers + * supplied as arguments must be valid even if the cnode is no longer valid. + */ + +int +hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, + struct filefork *rsrcfork, u_int32_t fileid) { + + off_t filebytes; + u_int32_t fileblocks; + int blksize = 0; + int error = 0; + int lockflags; + + blksize = hfsmp->blockSize; + + /* Data Fork */ + if (datafork) { + datafork->ff_size = 0; + + fileblocks = datafork->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ + + while (filebytes > 0) { + if (filebytes > HFS_BIGFILE_SIZE) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = 0; + } + + /* Start a transaction, and wipe out as many blocks as we can in this iteration */ + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + break; + } + + if (datafork->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(datafork)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false)); + + hfs_systemfile_unlock(hfsmp, lockflags); + } + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + + /* Finish the transaction and start over if necessary */ + hfs_end_transaction(hfsmp); + + if (error) { + break; + } + } + } + + /* Resource fork */ + if (error == 0 && rsrcfork) { + rsrcfork->ff_size = 0; + + fileblocks = rsrcfork->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ + + while (filebytes > 0) { + if (filebytes > HFS_BIGFILE_SIZE) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = 0; + } + + 
/* Start a transaction, and wipe out as many blocks as we can in this iteration */ + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + break; + } + + if (rsrcfork->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(rsrcfork)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false)); + + hfs_systemfile_unlock(hfsmp, lockflags); + } + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + + /* Finish the transaction and start over if necessary */ + hfs_end_transaction(hfsmp); + + if (error) { + break; + } + } + } + + return error; +} + +errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock) +{ + errno_t error; + + /* + * Call ubc_setsize to give the VM subsystem a chance to do + * whatever it needs to with existing pages before we delete + * blocks. Note that symlinks don't use the UBC so we'll + * get back ENOENT in that case. + */ + if (have_cnode_lock) { + error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY); + if (error == EAGAIN) { + cnode_t *cp = VTOC(vp); + + if (cp->c_truncatelockowner != current_thread()) { +#if DEVELOPMENT || DEBUG + panic("hfs: hfs_ubc_setsize called without exclusive truncate lock!"); +#else + printf("hfs: hfs_ubc_setsize called without exclusive truncate lock!\n"); +#endif + } + + hfs_unlock(cp); + error = ubc_setsize_ex(vp, len, 0); + hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); + } + } else + error = ubc_setsize_ex(vp, len, 0); + return error == ENOENT ? 0 : error; +} /* * Truncate a cnode to at most length size, freeing (or adding) the * disk blocks. 
*/ -__private_extern__ int -hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, - vfs_context_t context) +hfs_truncate(struct vnode *vp, off_t length, int flags, + int truncateflags, vfs_context_t context) { - struct filefork *fp = VTOF(vp); + struct filefork *fp = VTOF(vp); off_t filebytes; - u_long fileblocks; - int blksize, error = 0; + u_int32_t fileblocks; + int blksize; + errno_t error = 0; struct cnode *cp = VTOC(vp); - if (vnode_isdir(vp)) - return (EISDIR); /* cannot truncate an HFS directory! */ + /* Cannot truncate an HFS directory! */ + if (vnode_isdir(vp)) { + return (EISDIR); + } + /* A swap file cannot change size. */ + if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) { + return (EPERM); + } blksize = VTOVCB(vp)->blockSize; fileblocks = fp->ff_blocks; filebytes = (off_t)fileblocks * (off_t)blksize; + bool caller_has_cnode_lock = (cp->c_lockowner == current_thread()); + + error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock); + if (error) + return error; + + if (!caller_has_cnode_lock) { + error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error) + return error; + } + // have to loop truncating or growing files that are // really big because otherwise transactions can get // enormous and consume too many kernel resources. 
if (length < filebytes) { while (filebytes > length) { - if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) { + if ((filebytes - length) > HFS_BIGFILE_SIZE) { filebytes -= HFS_BIGFILE_SIZE; } else { filebytes = length; } cp->c_flag |= C_FORCEUPDATE; - error = do_hfs_truncate(vp, filebytes, flags, skipsetsize, context); + error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context); if (error) break; } } else if (length > filebytes) { while (filebytes < length) { - if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) { + if ((length - filebytes) > HFS_BIGFILE_SIZE) { filebytes += HFS_BIGFILE_SIZE; } else { filebytes = length; } cp->c_flag |= C_FORCEUPDATE; - error = do_hfs_truncate(vp, filebytes, flags, skipsetsize, context); + error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context); if (error) break; } } else /* Same logical size */ { - error = do_hfs_truncate(vp, length, flags, skipsetsize, context); + error = do_hfs_truncate(vp, length, flags, truncateflags, context); } /* Files that are changing size are not hot file candidates. 
*/ if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { fp->ff_bytesread = 0; } - return (error); -} + if (!caller_has_cnode_lock) + hfs_unlock(cp); + // Make sure UBC's size matches up (in case we didn't completely succeed) + errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock); + if (!error) + error = err2; + + return error; +} /* @@ -2251,13 +3961,14 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { off_t moreBytesRequested; off_t actualBytesAdded; off_t filebytes; - u_long fileblocks; + u_int32_t fileblocks; int retval, retval2; - UInt32 blockHint; - UInt32 extendFlags; /* For call to ExtendFileC */ + u_int32_t blockHint; + u_int32_t extendFlags; /* For call to ExtendFileC */ struct hfsmount *hfsmp; kauth_cred_t cred = vfs_context_ucred(ap->a_context); int lockflags; + time_t orig_ctime; *(ap->a_bytesallocated) = 0; @@ -2265,10 +3976,19 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { return (EISDIR); if (length < (off_t)0) return (EINVAL); - - if ((retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) - return (retval); + cp = VTOC(vp); + + orig_ctime = VTOC(vp)->c_ctime; + + check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL); + + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + + if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + goto Err_Exit; + } + fp = VTOF(vp); hfsmp = VTOHFS(vp); vcb = VTOVCB(vp); @@ -2290,6 +4010,8 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { extendFlags |= kEFAllMask; if (cred && suser(cred, NULL) != 0) extendFlags |= kEFReserveMask; + if (hfs_virtualmetafile(cp)) + extendFlags |= kEFMetadataMask; retval = E_NONE; blockHint = 0; @@ -2310,7 +4032,9 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { * value of filebytes is 0, length will be at least 1. 
*/ if (length > filebytes) { - moreBytesRequested = length - filebytes; + off_t total_bytes_added = 0, orig_request_size; + + orig_request_size = moreBytesRequested = length - filebytes; #if QUOTA retval = hfs_chkdq(cp, @@ -2328,7 +4052,6 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { * Allocate Journal and Quota files in metadata zone. */ if (hfs_virtualmetafile(cp)) { - extendFlags |= kEFMetadataMask; blockHint = hfsmp->hfs_metazone_start; } else if ((blockHint >= hfsmp->hfs_metazone_start) && (blockHint <= hfsmp->hfs_metazone_end)) { @@ -2339,35 +4062,60 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { } } - if (hfs_start_transaction(hfsmp) != 0) { - retval = EINVAL; - goto Err_Exit; - } - /* Protect extents b-tree and allocation bitmap */ - lockflags = SFL_BITMAP; - if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + while ((length > filebytes) && (retval == E_NONE)) { + off_t bytesRequested; + + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; + } + + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + if (moreBytesRequested >= HFS_BIGFILE_SIZE) { + bytesRequested = HFS_BIGFILE_SIZE; + } else { + bytesRequested = moreBytesRequested; + } - retval = MacToVFSError(ExtendFileC(vcb, + if (extendFlags & kEFContigMask) { + // if we're on a sparse device, this will force it to do a + // full scan to find the space needed. 
+ hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN; + } + + retval = MacToVFSError(ExtendFileC(vcb, (FCB*)fp, - moreBytesRequested, + bytesRequested, blockHint, extendFlags, &actualBytesAdded)); - *(ap->a_bytesallocated) = actualBytesAdded; - filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; - - hfs_systemfile_unlock(hfsmp, lockflags); + if (retval == E_NONE) { + *(ap->a_bytesallocated) += actualBytesAdded; + total_bytes_added += actualBytesAdded; + moreBytesRequested -= actualBytesAdded; + if (blockHint != 0) { + blockHint += actualBytesAdded / vcb->blockSize; + } + } + filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; + + hfs_systemfile_unlock(hfsmp, lockflags); - if (hfsmp->jnl) { + if (hfsmp->jnl) { (void) hfs_update(vp, TRUE); (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + } + + hfs_end_transaction(hfsmp); } - hfs_end_transaction(hfsmp); /* * if we get an error and no changes were made then exit @@ -2383,43 +4131,25 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { * until the file is closed, when we truncate the file to allocation * block size. */ - if ((actualBytesAdded != 0) && (moreBytesRequested < actualBytesAdded)) + if (total_bytes_added != 0 && orig_request_size < total_bytes_added) *(ap->a_bytesallocated) = - roundup(moreBytesRequested, (off_t)vcb->blockSize); + roundup(orig_request_size, (off_t)vcb->blockSize); } else { /* Shorten the size of the file */ - if (fp->ff_size > length) { - /* - * Any buffers that are past the truncation point need to be - * invalidated (to maintain buffer cache consistency). 
- */ - } - - if (hfs_start_transaction(hfsmp) != 0) { - retval = EINVAL; - goto Err_Exit; - } - - /* Protect extents b-tree and allocation bitmap */ - lockflags = SFL_BITMAP; - if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - - retval = MacToVFSError(TruncateFileC(vcb, (FCB*)fp, length, false)); - - hfs_systemfile_unlock(hfsmp, lockflags); - - filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; - - if (hfsmp->jnl) { - (void) hfs_update(vp, TRUE); - (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); - } + /* + * N.B. At present, this code is never called. If and when we + * do start using it, it looks like there might be slightly + * strange semantics with the file size: it's possible for the + * file size to *increase* e.g. if current file size is 5, + * length is 1024 and filebytes is 4096, the file size will + * end up being 1024 bytes. This isn't necessarily a problem + * but it's not consistent with the code above which doesn't + * change the file size. 
+ */ - hfs_end_transaction(hfsmp); - + retval = hfs_truncate(vp, length, 0, 0, ap->a_context); + filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; /* * if we get an error and no changes were made then exit @@ -2434,9 +4164,7 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { if (fp->ff_size > filebytes) { fp->ff_size = filebytes; - hfs_unlock(cp); - ubc_setsize(vp, fp->ff_size); - hfs_lock(cp, HFS_FORCE_LOCK); + hfs_ubc_setsize(vp, fp->ff_size, true); } } @@ -2448,6 +4176,7 @@ Std_Exit: if (retval == 0) retval = retval2; Err_Exit: + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); hfs_unlock(cp); return (retval); } @@ -2470,50 +4199,339 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap) }; */ { - vnode_t vp = ap->a_vp; - int error; + vnode_t vp; + struct cnode *cp; + struct filefork *fp; + int error = 0; + upl_t upl; + upl_page_info_t *pl; + off_t f_offset; + off_t page_needed_f_offset; + int offset; + int isize; + int upl_size; + int pg_index; + boolean_t truncate_lock_held = FALSE; + boolean_t file_converted = FALSE; + kern_return_t kret; + + vp = ap->a_vp; + cp = VTOC(vp); + fp = VTOF(vp); + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) { + /* + * If we errored here, then this means that one of two things occurred: + * 1. there was a problem with the decryption of the key. + * 2. the device is locked and we are not allowed to access this particular file. + * + * Either way, this means that we need to shut down this upl now. As long as + * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves) + * then we create a upl and immediately abort it. 
+ */ + if (ap->a_pl == NULL) { + /* create the upl */ + ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl, + UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT); + /* mark the range as needed so it doesn't immediately get discarded upon abort */ + ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1); + + /* Abort the range */ + ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); + } + + + return error; + } +#endif /* CONFIG_PROTECT */ - error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags); + if (ap->a_pl != NULL) { + /* + * this can only happen for swap files now that + * we're asking for V2 paging behavior... + * so don't need to worry about decompression, or + * keeping track of blocks read or taking the truncate lock + */ + error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, + ap->a_size, (off_t)fp->ff_size, ap->a_flags); + goto pagein_done; + } + + page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset; + +retry_pagein: /* - * Keep track of blocks read. + * take truncate lock (shared/recursive) to guard against + * zero-fill thru fsync interfering, but only for v2 + * + * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the + * lock shared and we are allowed to recurse 1 level if this thread already + * owns the lock exclusively... this can legally occur + * if we are doing a shrinking ftruncate against a file + * that is mapped private, and the pages being truncated + * do not currently exist in the cache... in that case + * we will have to page-in the missing pages in order + * to provide them to the private mapping... we must + * also call hfs_unlock_truncate with a positive been_recursed + * arg to indicate that if we have recursed, there is no need to drop + * the lock. Allowing this simple recursion is necessary + * in order to avoid a certain deadlock...
since the ftruncate + * already holds the truncate lock exclusively, if we try + * to acquire it shared to protect the pagein path, we will + * hang this thread + * + * NOTE: The if () block below is a workaround in order to prevent a + * VM deadlock. See rdar://7853471. + * + * If we are in a forced unmount, then launchd will still have the + * dyld_shared_cache file mapped as it is trying to reboot. If we + * take the truncate lock here to service a page fault, then our + * thread could deadlock with the forced-unmount. The forced unmount + * thread will try to reclaim the dyld_shared_cache vnode, but since it's + * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount + * thread will think it needs to copy all of the data out of the file + * and into a VM copy object. If we hold the cnode lock here, then that + * VM operation will not be able to proceed, because we'll set a busy page + * before attempting to grab the lock. Note that this isn't as simple as "don't + * call ubc_setsize" because doing that would just shift the problem to the + * ubc_msync done before the vnode is reclaimed. + * + * So, if a forced unmount on this volume is in flight AND the cnode is + * marked C_DELETED, then just go ahead and do the page in without taking + * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file + * that is not going to be available on the next mount, this seems like a + * OK solution from a correctness point of view, even though it is hacky. 
*/ - if (VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) { - struct cnode *cp; - struct filefork *fp; - int bytesread; - int took_cnode_lock = 0; - - cp = VTOC(vp); - fp = VTOF(vp); + if (vfs_isforce(vp->v_mount)) { + if (cp->c_flag & C_DELETED) { + /* If we don't get it, then just go ahead and operate without the lock */ + truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); + } + } + else { + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); + truncate_lock_held = TRUE; + } - if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE) - bytesread = fp->ff_size; - else - bytesread = ap->a_size; + kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT); - /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ - if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) { - hfs_lock(cp, HFS_FORCE_LOCK); - took_cnode_lock = 1; + if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) { + error = EINVAL; + goto pagein_done; + } + ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1); + + upl_size = isize = ap->a_size; + + /* + * Scan from the back to find the last page in the UPL, so that we + * aren't looking at a UPL that may have already been freed by the + * preceding aborts/completions. + */ + for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) { + if (upl_page_present(pl, --pg_index)) + break; + if (pg_index == 0) { + /* + * no absent pages were found in the range specified + * just abort the UPL to get rid of it and then we're done + */ + ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY); + goto pagein_done; } - /* - * If this file hasn't been seen since the start of - * the current sampling period then start over. + } + /* + * initialize the offset variables before we touch the UPL. 
+ * f_offset is the position into the file, in bytes + * offset is the position into the UPL, in bytes + * pg_index is the pg# of the UPL we're operating on + * isize is the offset into the UPL of the last page that is present. + */ + isize = ((pg_index + 1) * PAGE_SIZE); + pg_index = 0; + offset = 0; + f_offset = ap->a_f_offset; + + while (isize) { + int xsize; + int num_of_pages; + + if ( !upl_page_present(pl, pg_index)) { + /* + * we asked for RET_ONLY_ABSENT, so it's possible + * to get back empty slots in the UPL. + * just skip over them + */ + f_offset += PAGE_SIZE; + offset += PAGE_SIZE; + isize -= PAGE_SIZE; + pg_index++; + + continue; + } + /* + * We know that we have at least one absent page. + * Now checking to see how many in a row we have */ - if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { - struct timeval tv; + num_of_pages = 1; + xsize = isize - PAGE_SIZE; - fp->ff_bytesread = bytesread; - microtime(&tv); - cp->c_atime = tv.tv_sec; + while (xsize) { + if ( !upl_page_present(pl, pg_index + num_of_pages)) + break; + num_of_pages++; + xsize -= PAGE_SIZE; + } + xsize = num_of_pages * PAGE_SIZE; + +#if HFS_COMPRESSION + if (VNODE_IS_RSRC(vp)) { + /* allow pageins of the resource fork */ } else { - fp->ff_bytesread += bytesread; + int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ + + if (compressed) { + + if (truncate_lock_held) { + /* + * can't hold the truncate lock when calling into the decmpfs layer + * since it calls back into this layer... even though we're only + * holding the lock in shared mode, and the re-entrant path only + * takes the lock shared, we can deadlock if some other thread + * tries to grab the lock exclusively in between. 
+ */ + hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); + truncate_lock_held = FALSE; + } + ap->a_pl = upl; + ap->a_pl_offset = offset; + ap->a_f_offset = f_offset; + ap->a_size = xsize; + + error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp)); + /* + * note that decpfs_pagein_compressed can change the state of + * 'compressed'... it will set it to 0 if the file is no longer + * compressed once the compression lock is successfully taken + * i.e. we would block on that lock while the file is being inflated + */ + if (compressed) { + if (error == 0) { + /* successful page-in, update the access time */ + VTOC(vp)->c_touch_acctime = TRUE; + + /* compressed files are not hot file candidates */ + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + } else if (error == EAGAIN) { + /* + * EAGAIN indicates someone else already holds the compression lock... + * to avoid deadlocking, we'll abort this range of pages with an + * indication that the pagein needs to be redriven + */ + ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART); + } else if (error == ENOSPC) { + + if (upl_size == PAGE_SIZE) + panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n"); + + ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY); + + ap->a_size = PAGE_SIZE; + ap->a_pl = NULL; + ap->a_pl_offset = 0; + ap->a_f_offset = page_needed_f_offset; + + goto retry_pagein; + } + goto pagein_next_range; + } + else { + /* + * Set file_converted only if the file became decompressed while we were + * paging in. If it were still compressed, we would re-start the loop using the goto + * in the above block. This avoid us overloading truncate_lock_held as our retry_pagein + * condition below, since we could have avoided taking the truncate lock to prevent + * a deadlock in the force unmount case. 
+ */ + file_converted = TRUE; + } + } + if (file_converted == TRUE) { + /* + * the file was converted back to a regular file after we first saw it as compressed + * we need to abort the upl, retake the truncate lock, recreate the UPL and start over + * reset a_size so that we consider what remains of the original request + * and null out a_upl and a_pl_offset. + * + * We should only be able to get into this block if the decmpfs_pagein_compressed + * successfully decompressed the range in question for this file. + */ + ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY); + + ap->a_size = isize; + ap->a_pl = NULL; + ap->a_pl_offset = 0; + + /* Reset file_converted back to false so that we don't infinite-loop. */ + file_converted = FALSE; + goto retry_pagein; + } } - cp->c_touch_acctime = TRUE; - if (took_cnode_lock) - hfs_unlock(cp); +#endif + error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags); + + /* + * Keep track of blocks read. + */ + if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) { + int bytesread; + int took_cnode_lock = 0; + + if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE) + bytesread = fp->ff_size; + else + bytesread = xsize; + + /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ + if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) { + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + took_cnode_lock = 1; + } + /* + * If this file hasn't been seen since the start of + * the current sampling period then start over. 
+ */ + if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { + struct timeval tv; + + fp->ff_bytesread = bytesread; + microtime(&tv); + cp->c_atime = tv.tv_sec; + } else { + fp->ff_bytesread += bytesread; + } + cp->c_touch_acctime = TRUE; + if (took_cnode_lock) + hfs_unlock(cp); + } +pagein_next_range: + f_offset += xsize; + offset += xsize; + isize -= xsize; + pg_index += num_of_pages; + + error = 0; } + +pagein_done: + if (truncate_lock_held == TRUE) { + /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */ + hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); + } + return (error); } @@ -2537,54 +4555,306 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) vnode_t vp = ap->a_vp; struct cnode *cp; struct filefork *fp; - int retval; - off_t end_of_range; + int retval = 0; off_t filesize; + upl_t upl; + upl_page_info_t* pl; + vm_offset_t a_pl_offset; + int a_flags; + int is_pageoutv2 = 0; + kern_return_t kret; cp = VTOC(vp); - if (cp->c_lockowner == current_thread()) { - panic("pageout: %s cnode lock already held!\n", - cp->c_desc.cd_nameptr ? cp->c_desc.cd_nameptr : ""); - } - if ( (retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { - if (!(ap->a_flags & UPL_NOCOMMIT)) { - ubc_upl_abort_range(ap->a_pl, - ap->a_pl_offset, - ap->a_size, - UPL_ABORT_FREE_ON_EMPTY); - } - return (retval); - } fp = VTOF(vp); - + + /* + * Figure out where the file ends, for pageout purposes. If + * ff_new_size > ff_size, then we're in the middle of extending the + * file via a write, so it is safe (and necessary) that we be able + * to pageout up to that point. 
+ */ filesize = fp->ff_size; - end_of_range = ap->a_f_offset + ap->a_size - 1; + if (fp->ff_new_size > filesize) + filesize = fp->ff_new_size; - if (end_of_range >= filesize) { - end_of_range = (off_t)(filesize - 1); - } - if (ap->a_f_offset < filesize) { - rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges); - cp->c_flag |= C_MODIFIED; /* leof is dirty */ + a_flags = ap->a_flags; + a_pl_offset = ap->a_pl_offset; + + /* + * we can tell if we're getting the new or old behavior from the UPL + */ + if ((upl = ap->a_pl) == NULL) { + int request_flags; + + is_pageoutv2 = 1; + /* + * we're in control of any UPL we commit + * make sure someone hasn't accidentally passed in UPL_NOCOMMIT + */ + a_flags &= ~UPL_NOCOMMIT; + a_pl_offset = 0; + + /* + * For V2 semantics, we want to take the cnode truncate lock + * shared to guard against the file size changing via zero-filling. + * + * However, we have to be careful because we may be invoked + * via the ubc_msync path to write out dirty mmap'd pages + * in response to a lock event on a content-protected + * filesystem (e.g. to write out class A files). + * As a result, we want to take the truncate lock 'SHARED' with + * the mini-recursion locktype so that we don't deadlock/panic + * because we may be already holding the truncate lock exclusive to force any other + * IOs to have blocked behind us. 
+ */ + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); + + if (a_flags & UPL_MSYNC) { + request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY; + } + else { + request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY; + } + + kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags); + + if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) { + retval = EINVAL; + goto pageout_done; + } } - hfs_unlock(cp); + /* + * from this point forward upl points at the UPL we're working with + * it was either passed in or we succesfully created it + */ + + /* + * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own + * UPL instead of relying on the UPL passed into us. We go ahead and do that here, + * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for + * N dirty ranges in the UPL. Note that this is almost a direct copy of the + * logic in vnode_pageout except that we need to do it after grabbing the truncate + * lock in HFS so that we don't lock invert ourselves. + * + * Note that we can still get into this function on behalf of the default pager with + * non-V2 behavior (swapfiles). However in that case, we did not grab locks above + * since fsync and other writing threads will grab the locks, then mark the + * relevant pages as busy. But the pageout codepath marks the pages as busy, + * and THEN would attempt to grab the truncate lock, which would result in deadlock. So + * we do not try to grab anything for the pre-V2 case, which should only be accessed + * by the paging/VM system. + */ + + if (is_pageoutv2) { + off_t f_offset; + int offset; + int isize; + int pg_index; + int error; + int error_ret = 0; + + isize = ap->a_size; + f_offset = ap->a_f_offset; + + /* + * Scan from the back to find the last page in the UPL, so that we + * aren't looking at a UPL that may have already been freed by the + * preceding aborts/completions. 
+ */ + for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) { + if (upl_page_present(pl, --pg_index)) + break; + if (pg_index == 0) { + ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY); + goto pageout_done; + } + } + + /* + * initialize the offset variables before we touch the UPL. + * a_f_offset is the position into the file, in bytes + * offset is the position into the UPL, in bytes + * pg_index is the pg# of the UPL we're operating on. + * isize is the offset into the UPL of the last non-clean page. + */ + isize = ((pg_index + 1) * PAGE_SIZE); + + offset = 0; + pg_index = 0; + + while (isize) { + int xsize; + int num_of_pages; + + if ( !upl_page_present(pl, pg_index)) { + /* + * we asked for RET_ONLY_DIRTY, so it's possible + * to get back empty slots in the UPL. + * just skip over them + */ + f_offset += PAGE_SIZE; + offset += PAGE_SIZE; + isize -= PAGE_SIZE; + pg_index++; + + continue; + } + if ( !upl_dirty_page(pl, pg_index)) { + panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl); + } - retval = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, filesize, ap->a_flags); + /* + * We know that we have at least one dirty page. + * Now checking to see how many in a row we have + */ + num_of_pages = 1; + xsize = isize - PAGE_SIZE; + + while (xsize) { + if ( !upl_dirty_page(pl, pg_index + num_of_pages)) + break; + num_of_pages++; + xsize -= PAGE_SIZE; + } + xsize = num_of_pages * PAGE_SIZE; + + if (!vnode_isswap(vp)) { + off_t end_of_range; + int tooklock; + + tooklock = 0; + + if (cp->c_lockowner != current_thread()) { + if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + /* + * we're in the v2 path, so we are the + * owner of the UPL... 
we may have already + * processed some of the UPL, so abort it + * from the current working offset to the + * end of the UPL + */ + ubc_upl_abort_range(upl, + offset, + ap->a_size - offset, + UPL_ABORT_FREE_ON_EMPTY); + goto pageout_done; + } + tooklock = 1; + } + end_of_range = f_offset + xsize - 1; + + if (end_of_range >= filesize) { + end_of_range = (off_t)(filesize - 1); + } + if (f_offset < filesize) { + rl_remove(f_offset, end_of_range, &fp->ff_invalidranges); + cp->c_flag |= C_MODIFIED; /* leof is dirty */ + } + if (tooklock) { + hfs_unlock(cp); + } + } + if ((error = cluster_pageout(vp, upl, offset, f_offset, + xsize, filesize, a_flags))) { + if (error_ret == 0) + error_ret = error; + } + f_offset += xsize; + offset += xsize; + isize -= xsize; + pg_index += num_of_pages; + } + /* capture errnos bubbled out of cluster_pageout if they occurred */ + if (error_ret != 0) { + retval = error_ret; + } + } /* end block for v2 pageout behavior */ + else { + if (!vnode_isswap(vp)) { + off_t end_of_range; + int tooklock = 0; + + if (cp->c_lockowner != current_thread()) { + if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + if (!(a_flags & UPL_NOCOMMIT)) { + ubc_upl_abort_range(upl, + a_pl_offset, + ap->a_size, + UPL_ABORT_FREE_ON_EMPTY); + } + goto pageout_done; + } + tooklock = 1; + } + end_of_range = ap->a_f_offset + ap->a_size - 1; + + if (end_of_range >= filesize) { + end_of_range = (off_t)(filesize - 1); + } + if (ap->a_f_offset < filesize) { + rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges); + cp->c_flag |= C_MODIFIED; /* leof is dirty */ + } + + if (tooklock) { + hfs_unlock(cp); + } + } + /* + * just call cluster_pageout for old pre-v2 behavior + */ + retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset, + ap->a_size, filesize, a_flags); + } /* - * If data was written, and setuid or setgid bits are set and - * this process is not the superuser then clear the setuid and - * setgid bits as a precaution against tampering. 
+ * If data was written, update the modification time of the file + * but only if it's mapped writable; we will have touched the + * modifcation time for direct writes. */ - if ((retval == 0) && - (cp->c_mode & (S_ISUID | S_ISGID)) && - (vfs_context_suser(ap->a_context) != 0)) { - hfs_lock(cp, HFS_FORCE_LOCK); - cp->c_mode &= ~(S_ISUID | S_ISGID); - cp->c_touch_chgtime = TRUE; + if (retval == 0 && (ubc_is_mapped_writable(vp) + || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) { + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + + // Check again with lock + bool mapped_writable = ubc_is_mapped_writable(vp); + if (mapped_writable + || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) { + cp->c_touch_modtime = TRUE; + cp->c_touch_chgtime = TRUE; + + /* + * We only need to increment the generation counter if + * it's currently mapped writable because we incremented + * the counter in hfs_vnop_mnomap. + */ + if (mapped_writable) + hfs_incr_gencount(VTOC(vp)); + + /* + * If setuid or setgid bits are set and this process is + * not the superuser then clear the setuid and setgid bits + * as a precaution against tampering. + */ + if ((cp->c_mode & (S_ISUID | S_ISGID)) && + (vfs_context_suser(ap->a_context) != 0)) { + cp->c_mode &= ~(S_ISUID | S_ISGID); + } + } + hfs_unlock(cp); } + +pageout_done: + if (is_pageoutv2) { + /* + * Release the truncate lock. Note that because + * we may have taken the lock recursively by + * being invoked via ubc_msync due to lockdown, + * we should release it recursively, too. + */ + hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); + } return (retval); } @@ -2609,10 +4879,10 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap) * Swap and validate the node if it is in native byte order. * This is always be true on big endian, so we always validate * before writing here. 
On little endian, the node typically has - * been swapped and validatated when it was written to the journal, + * been swapped and validated when it was written to the journal, * so we won't do anything here. */ - if (((UInt16 *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) { + if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) { /* Prepare the block pointer */ block.blockHeader = bp; block.buffer = (char *)buf_dataptr(bp); @@ -2622,7 +4892,7 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap) block.blockSize = buf_count(bp); /* Endian un-swap B-Tree node */ - retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig); + retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false); if (retval) panic("hfs_vnop_bwrite: about to write corrupt node!\n"); } @@ -2632,7 +4902,7 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap) if ((buf_flags(bp) & B_LOCKED)) { // XXXdbg if (VTOHFS(vp)->jnl) { - panic("hfs: CLEARING the lock bit on bp 0x%x\n", bp); + panic("hfs: CLEARING the lock bit on bp %p\n", bp); } buf_clearflags(bp, B_LOCKED); } @@ -2656,7 +4926,7 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap) * 0 N (file offset) * * ----------------- ----------------- - * |///////////////| | | STEP 1 (aquire new blocks) + * |///////////////| | | STEP 1 (acquire new blocks) * ----------------- ----------------- * 0 N N+1 2N * @@ -2673,9 +4943,8 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap) * During steps 2 and 3 page-outs to file offsets less * than or equal to N are suspended. * - * During step 3 page-ins to the file get supended. + * During step 3 page-ins to the file get suspended. 
*/ -__private_extern__ int hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, struct proc *p) @@ -2689,7 +4958,6 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, u_int32_t growsize; u_int32_t nextallocsave; daddr64_t sector_a, sector_b; - int disabled_caching = 0; int eflags; off_t newbytes; int retval; @@ -2699,7 +4967,8 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, enum vtype vnodetype; vnodetype = vnode_vtype(vp); - if (vnodetype != VREG && vnodetype != VLNK) { + if (vnodetype != VREG) { + /* Not allowed to move symlinks. */ return (EPERM); } @@ -2712,12 +4981,27 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, fp = VTOF(vp); if (fp->ff_unallocblocks) return (EINVAL); + +#if CONFIG_PROTECT + /* + * + * Disable HFS file relocation on content-protected filesystems + */ + if (cp_fs_protected (hfsmp->hfs_mp)) { + return EINVAL; + } +#endif + /* If it's an SSD, also disable HFS relocation */ + if (hfsmp->hfs_flags & HFS_SSD) { + return EINVAL; + } + + blksize = hfsmp->blockSize; if (blockHint == 0) blockHint = hfsmp->nextAllocation; - if ((fp->ff_size > (u_int64_t)0x7fffffff) || - ((fp->ff_size > blksize) && vnodetype == VLNK)) { + if (fp->ff_size > 0x7fffffff) { return (EFBIG); } @@ -2734,11 +5018,17 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, if (!vnode_issystem(vp) && (vnodetype != VLNK)) { hfs_unlock(cp); - hfs_lock_truncate(cp, TRUE); - if ((retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { - hfs_unlock_truncate(cp); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + /* Force lock since callers expects lock to be held. */ + if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); return (retval); } + /* No need to continue if file was removed. 
*/ + if (cp->c_flag & C_NOEXISTS) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + return (ENOENT); + } took_trunc_lock = 1; } headblks = fp->ff_blocks; @@ -2751,7 +5041,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, if (hfs_start_transaction(hfsmp) != 0) { if (took_trunc_lock) - hfs_unlock_truncate(cp); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); return (EINVAL); } started_tr = 1; @@ -2771,20 +5061,15 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, } /* - * STEP 1 - aquire new allocation blocks. + * STEP 1 - acquire new allocation blocks. */ - if (!vnode_isnocache(vp)) { - vnode_setnocache(vp); - disabled_caching = 1; - - } nextallocsave = hfsmp->nextAllocation; retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes); if (eflags & kEFMetadataMask) { - HFS_MOUNT_LOCK(hfsmp, TRUE); - hfsmp->nextAllocation = nextallocsave; - hfsmp->vcbFlags |= 0xFF00; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfs_lock_mount(hfsmp); + HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave); + MarkVCBDirty(hfsmp); + hfs_unlock_mount(hfsmp); } retval = MacToVFSError(retval); @@ -2794,7 +5079,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, retval = ENOSPC; goto restore; } else if (fp->ff_blocks < (headblks + datablks)) { - printf("hfs_relocate: allocation failed"); + printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN); retval = ENOSPC; goto restore; } @@ -2806,9 +5091,20 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, retval = ENOSPC; goto restore; } else if ((eflags & kEFMetadataMask) && - ((((u_int64_t)sector_b * hfsmp->hfs_phys_block_size) / blksize) > + ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) > hfsmp->hfs_metazone_end)) { - printf("hfs_relocate: didn't move into metadata zone\n"); +#if 0 + const char * filestr; + char emptystr = '\0'; + + if (cp->c_desc.cd_nameptr != NULL) { + filestr = (const char 
*)&cp->c_desc.cd_nameptr[0]; + } else if (vnode_name(vp) != NULL) { + filestr = vnode_name(vp); + } else { + filestr = &emptystr; + } +#endif retval = ENOSPC; goto restore; } @@ -2834,7 +5130,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, */ if (vnodetype == VLNK) - retval = hfs_clonelink(vp, blksize, cred, p); + retval = EPERM; else if (vnode_issystem(vp)) retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p); else @@ -2865,7 +5161,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, goto restore; out: if (took_trunc_lock) - hfs_unlock_truncate(cp); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); if (lockflags) { hfs_systemfile_unlock(hfsmp, lockflags); @@ -2876,7 +5172,6 @@ out: if (retval == 0) { (void) hfs_update(vp, MNT_WAIT); } - if (hfsmp->jnl) { if (cp->c_cnid < kHFSFirstUserCatalogNodeID) (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); @@ -2884,17 +5179,17 @@ out: (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); } exit: - if (disabled_caching) { - vnode_clearnocache(vp); - } if (started_tr) hfs_end_transaction(hfsmp); return (retval); restore: - if (fp->ff_blocks == headblks) + if (fp->ff_blocks == headblks) { + if (took_trunc_lock) + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); goto exit; + } /* * Give back any newly allocated space. */ @@ -2905,50 +5200,18 @@ restore: lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); } - (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false); + (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp), + FTOC(fp)->c_fileid, false); hfs_systemfile_unlock(hfsmp, lockflags); lockflags = 0; if (took_trunc_lock) - hfs_unlock_truncate(cp); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); goto exit; } -/* - * Clone a symlink. 
- * - */ -static int -hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, struct proc *p) -{ - struct buf *head_bp = NULL; - struct buf *tail_bp = NULL; - int error; - - - error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp); - if (error) - goto out; - - tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META); - if (tail_bp == NULL) { - error = EIO; - goto out; - } - bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize); - error = (int)buf_bwrite(tail_bp); -out: - if (head_bp) { - buf_markinvalid(head_bp); - buf_brelse(head_bp); - } - (void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0); - - return (error); -} - /* * Clone a file's data within the file. * @@ -2957,50 +5220,57 @@ static int hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize) { caddr_t bufp; - size_t writebase; size_t bufsize; size_t copysize; size_t iosize; - off_t filesize; size_t offset; + off_t writebase; uio_t auio; int error = 0; - filesize = VTOF(vp)->ff_blocks * blksize; /* virtual file size */ writebase = blkstart * blksize; copysize = blkcnt * blksize; iosize = bufsize = MIN(copysize, 128 * 1024); offset = 0; + hfs_unlock(VTOC(vp)); + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + return (error); + } +#endif /* CONFIG_PROTECT */ + if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) { + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); return (ENOMEM); - } - hfs_unlock(VTOC(vp)); + } - auio = uio_create(1, 0, UIO_SYSSPACE32, UIO_READ); + auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); while (offset < copysize) { iosize = MIN(copysize - offset, iosize); - uio_reset(auio, offset, UIO_SYSSPACE32, UIO_READ); + uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ); uio_addiov(auio, (uintptr_t)bufp, iosize); - error = cluster_read(vp, auio, copysize, 0); + error = cluster_read(vp, auio, copysize, 
IO_NOCACHE); if (error) { printf("hfs_clonefile: cluster_read failed - %d\n", error); break; } if (uio_resid(auio) != 0) { - printf("clonedata: cluster_read: uio_resid = %lld\n", uio_resid(auio)); + printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio)); error = EIO; break; } - uio_reset(auio, writebase + offset, UIO_SYSSPACE32, UIO_WRITE); + uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE); uio_addiov(auio, (uintptr_t)bufp, iosize); - error = cluster_write(vp, auio, filesize + offset, - filesize + offset + iosize, + error = cluster_write(vp, auio, writebase + offset, + writebase + offset + iosize, uio_offset(auio), 0, IO_NOCACHE | IO_SYNC); if (error) { printf("hfs_clonefile: cluster_write failed - %d\n", error); @@ -3015,14 +5285,28 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize) } uio_free(auio); - /* - * No need to call ubc_sync_range or hfs_invalbuf - * since the file was copied using IO_NOCACHE. - */ - + if ((blksize & PAGE_MASK)) { + /* + * since the copy may not have started on a PAGE + * boundary (or may not have ended on one), we + * may have pages left in the cache since NOCACHE + * will let partially written pages linger... + * lets just flush the entire range to make sure + * we don't have any pages left that are beyond + * (or intersect) the real LEOF of this file + */ + ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY); + } else { + /* + * No need to call ubc_msync or hfs_invalbuf + * since the file was copied using IO_NOCACHE and + * the copy was done starting and ending on a page + * boundary in the file. + */ + } kmem_free(kernel_map, (vm_offset_t)bufp, bufsize); - hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); return (error); }