X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/ebb1b9f42b62218f29061826217bb0f71cd375a6..7e41aa883dd258f888d0470250eead40a53ef1f5:/bsd/hfs/hfs_readwrite.c diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 7bf65093c..78719c069 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,10 @@ #include #include #include +#include +#include + +#include #include @@ -61,6 +66,8 @@ #include #include +#include + #include #include "hfs.h" @@ -73,6 +80,7 @@ #include "hfs_cnode.h" #include "hfs_dbg.h" + #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2))) enum { @@ -82,12 +90,18 @@ enum { /* from bsd/hfs/hfs_vfsops.c */ extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context); -static int hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *); +/* from hfs_hotfiles.c */ +extern int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid, + uint8_t forktype, uint32_t *pinned); + static int hfs_clonefile(struct vnode *, int, int, int); static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *); -static int hfs_minorupdate(struct vnode *vp); static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context); +/* from bsd/hfs/hfs_vnops.c */ +extern decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp); + + int flush_cache_on_write = 0; SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files"); @@ -98,6 +112,16 @@ SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, int hfs_vnop_read(struct vnop_read_args *ap) { + /* + struct vnop_read_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; + }; + */ + uio_t uio = ap->a_uio; struct vnode *vp = ap->a_vp; struct cnode *cp; @@ -109,6 +133,8 @@ hfs_vnop_read(struct vnop_read_args *ap) off_t offset = uio_offset(uio); int retval = 0; int took_truncate_lock = 0; + int io_throttle = 0; + int throttled_count = 0; /* Preflight checks */ if (!vnode_isreg(vp)) { @@ -122,7 +148,15 @@ hfs_vnop_read(struct vnop_read_args *ap) return (0); /* Nothing left to do */ if (offset < 0) return (EINVAL); /* cant read from a negative offset */ - + +#if SECURE_KERNEL + if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) == + (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) { + /* Don't allow unencrypted io request from user space */ + return EPERM; + } +#endif + #if HFS_COMPRESSION if (VNODE_IS_RSRC(vp)) { if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */ @@ -133,12 +167,19 @@ hfs_vnop_read(struct vnop_read_args *ap) int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ if (compressed) { retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp)); + if (retval == 0 && !(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) { + (void) hfs_addhotfile(vp); + } if (compressed) { if (retval == 0) { /* successful read, update the access time */ VTOC(vp)->c_touch_acctime = TRUE; - /* compressed files are not hot file candidates */ + 
// + // compressed files are not traditional hot file candidates + // but they may be for CF (which ignores the ff_bytesread + // field) + // if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { VTOF(vp)->ff_bytesread = 0; } @@ -147,7 +188,7 @@ hfs_vnop_read(struct vnop_read_args *ap) } /* otherwise the file was converted back to a regular file while we were reading it */ retval = 0; - } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) { + } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { int error; error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); @@ -164,17 +205,36 @@ hfs_vnop_read(struct vnop_read_args *ap) hfsmp = VTOHFS(vp); #if CONFIG_PROTECT - if ((retval = cp_handle_vnop (cp, CP_READ_ACCESS)) != 0) { + if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) { goto exit; } -#endif + +#endif // CONFIG_PROTECT + + /* + * If this read request originated from a syscall (as opposed to + * an in-kernel page fault or something), then set it up for + * throttle checks + */ + if (ap->a_ioflag & IO_SYSCALL_DISPATCH) { + io_throttle = IO_RETURN_ON_THROTTLE; + } + +read_again: /* Protect against a size change. */ - hfs_lock_truncate(cp, HFS_SHARED_LOCK); + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); took_truncate_lock = 1; filesize = fp->ff_size; filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; + + /* + * Check the file size. Note that per POSIX spec, we return 0 at + * file EOF, so attempting a read at an offset that is too big + * should just return 0 on HFS+. Since the return value was initialized + * to 0 above, we just jump to exit. HFS Standard has its own behavior. + */ if (offset > filesize) { if ((hfsmp->hfs_flags & HFS_STANDARD) && (offset > (off_t)MAXHFSFILESIZE)) { @@ -183,14 +243,14 @@ hfs_vnop_read(struct vnop_read_args *ap) goto exit; } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START, + KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START, (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); - retval = cluster_read(vp, uio, filesize, ap->a_ioflag); + retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle); cp->c_touch_acctime = TRUE; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END, + KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END, (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); /* @@ -204,7 +264,7 @@ hfs_vnop_read(struct vnop_read_args *ap) /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. 
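+	 * (ff_bytesread is a 64-bit counter; the carry into its upper 32 bits
+	 * is made under the cnode lock, presumably so that readers on 32-bit
+	 * processors never observe a torn value.)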
*/ if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) { - hfs_lock(cp, HFS_FORCE_LOCK); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); took_cnode_lock = 1; } /* @@ -220,17 +280,60 @@ hfs_vnop_read(struct vnop_read_args *ap) } else { fp->ff_bytesread += bytesread; } + + if (!(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) { + // + // We don't add hotfiles for processes doing IO_EVTONLY I/O + // on the assumption that they're system processes such as + // mdworker which scan everything in the system (and thus + // do not represent user-initiated access to files) + // + (void) hfs_addhotfile(vp); + } if (took_cnode_lock) hfs_unlock(cp); } exit: if (took_truncate_lock) { - hfs_unlock_truncate(cp, 0); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); } + if (retval == EAGAIN) { + throttle_lowpri_io(1); + throttled_count++; + retval = 0; + goto read_again; + } + if (throttled_count) { + throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread())); + } return (retval); } +/* + * Ideally, this wouldn't be necessary; the cluster code should be + * able to handle this on the read-side. See . + */ +static errno_t hfs_zero_eof_page(vnode_t vp, off_t zero_up_to) +{ + assert(VTOC(vp)->c_lockowner != current_thread()); + assert(VTOC(vp)->c_truncatelockowner == current_thread()); + + struct filefork *fp = VTOF(vp); + + if (!(fp->ff_size & PAGE_MASK_64) || zero_up_to <= fp->ff_size) { + // Nothing to do + return 0; + } + + zero_up_to = MIN(zero_up_to, (off_t)round_page_64(fp->ff_size)); + + /* N.B. At present, @zero_up_to is not important because the cluster + code will always zero up to the end of the page anyway. */ + return cluster_write(vp, NULL, fp->ff_size, zero_up_to, + fp->ff_size, 0, IO_HEADZEROFILL); +} + /* * Write data to a file. */ @@ -259,6 +362,8 @@ hfs_vnop_write(struct vnop_write_args *ap) int do_snapshot = 1; time_t orig_ctime=VTOC(vp)->c_ctime; int took_truncate_lock = 0; + int io_return_on_throttle = 0; + int throttled_count = 0; #if HFS_COMPRESSION if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */ @@ -276,7 +381,7 @@ hfs_vnop_write(struct vnop_write_args *ap) printf("invalid state %d for compressed file\n", state); /* fall through */ } - } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) { + } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { int error; error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP); @@ -291,7 +396,14 @@ hfs_vnop_write(struct vnop_write_args *ap) #endif - // LP64todo - fix this! uio_resid may be 64-bit value +#if SECURE_KERNEL + if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) == + (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) { + /* Don't allow unencrypted io request from user space */ + return EPERM; + } +#endif + resid = uio_resid(uio); offset = uio_offset(uio); @@ -307,7 +419,7 @@ hfs_vnop_write(struct vnop_write_args *ap) hfsmp = VTOHFS(vp); #if CONFIG_PROTECT - if ((retval = cp_handle_vnop (cp, CP_WRITE_ACCESS)) != 0) { + if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) { goto exit; } #endif @@ -326,13 +438,24 @@ hfs_vnop_write(struct vnop_write_args *ap) } #endif /* HFS_SPARSE_DEV */ + if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) == + (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) { + io_return_on_throttle = IO_RETURN_ON_THROTTLE; + } + again: - /* Protect against a size change. */ - if (ioflag & IO_APPEND) { - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); + /* + * Protect against a size change. 
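+	 * (The truncate lock is taken shared by ordinary readers and writers,
+	 * and exclusive by anything that can change the file size or expose
+	 * uninitialized blocks; see the upgrade logic below.)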
+ * + * Note: If took_truncate_lock is true, then we previously got the lock shared + * but needed to upgrade to exclusive. So try getting it exclusive from the + * start. + */ + if (ioflag & IO_APPEND || took_truncate_lock) { + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); } else { - hfs_lock_truncate(cp, HFS_SHARED_LOCK); + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); } took_truncate_lock = 1; @@ -341,26 +464,49 @@ again: uio_setoffset(uio, fp->ff_size); offset = fp->ff_size; } - if ((cp->c_flags & APPEND) && offset != fp->ff_size) { + if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) { retval = EPERM; goto exit; } + cred = vfs_context_ucred(ap->a_context); + if (cred && suser(cred, NULL) != 0) + eflags |= kEFReserveMask; + origFileSize = fp->ff_size; writelimit = offset + resid; - filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; - /* If the truncate lock is shared, and if we either have virtual - * blocks or will need to extend the file, upgrade the truncate - * to exclusive lock. If upgrade fails, we lose the lock and - * have to get exclusive lock again. Note that we want to - * grab the truncate lock exclusive even if we're not allocating new blocks - * because we could still be growing past the LEOF. + /* + * We may need an exclusive truncate lock for several reasons, all + * of which are because we may be writing to a (portion of a) block + * for the first time, and we need to make sure no readers see the + * prior, uninitialized contents of the block. The cases are: + * + * 1. We have unallocated (delayed allocation) blocks. We may be + * allocating new blocks to the file and writing to them. + * (A more precise check would be whether the range we're writing + * to contains delayed allocation blocks.) + * 2. We need to extend the file. The bytes between the old EOF + * and the new EOF are not yet initialized. This is important + * even if we're not allocating new blocks to the file. If the + * old EOF and new EOF are in the same block, we still need to + * protect that range of bytes until they are written for the + * first time. + * + * If we had a shared lock with the above cases, we need to try to upgrade + * to an exclusive lock. If the upgrade fails, we will lose the shared + * lock, and will need to take the truncate lock again; the took_truncate_lock + * flag will still be set, causing us to try for an exclusive lock next time. */ if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) && - ((fp->ff_unallocblocks != 0) || (writelimit > origFileSize))) { - /* Lock upgrade failed and we lost our shared lock, try again */ + ((fp->ff_unallocblocks != 0) || + (writelimit > origFileSize))) { if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) { + /* + * Lock upgrade failed and we lost our shared lock, try again. + * Note: we do not set took_truncate_lock=0 here. Leaving it + * set to 1 will cause us to try to get the lock exclusive. 
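+			 * (lck_rw_lock_shared_to_exclusive() drops the shared hold
+			 * when the upgrade fails, so at this point we own no lock at
+			 * all and must start over from hfs_lock_truncate().)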
+ */ goto again; } else { @@ -369,23 +515,29 @@ again: } } - if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { + if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { goto exit; } cnode_locked = 1; - - if (cp->c_truncatelockowner == HFS_SHARED_OWNER) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START, - (int)offset, uio_resid(uio), (int)fp->ff_size, - (int)filebytes, 0); + + filebytes = hfs_blk_to_bytes(fp->ff_blocks, hfsmp->blockSize); + + if (offset > filebytes + && (hfs_blk_to_bytes(hfs_freeblks(hfsmp, ISSET(eflags, kEFReserveMask)), + hfsmp->blockSize) < offset - filebytes)) { + retval = ENOSPC; + goto exit; } + KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START, + (int)offset, uio_resid(uio), (int)fp->ff_size, + (int)filebytes, 0); + /* Check if we do not need to extend the file */ if (writelimit <= filebytes) { goto sizeok; } - cred = vfs_context_ucred(ap->a_context); bytesToAdd = writelimit - filebytes; #if QUOTA @@ -402,8 +554,6 @@ again: while (writelimit > filebytes) { bytesToAdd = writelimit - filebytes; - if (cred && suser(cred, NULL) != 0) - eflags |= kEFReserveMask; /* Protect extents b-tree and allocation bitmap */ lockflags = SFL_BITMAP; @@ -425,10 +575,10 @@ again: if (retval != E_NONE) break; filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE, + KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE, (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); } - (void) hfs_update(vp, TRUE); + (void) hfs_update(vp, 0); (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); (void) hfs_end_transaction(hfsmp); @@ -446,134 +596,45 @@ again: sizeok: if (retval == E_NONE) { off_t filesize; - off_t zero_off; - off_t tail_off; - off_t inval_start; - off_t inval_end; - off_t io_start; + off_t head_off; int lflag; - struct rl_entry *invalid_range; - if (writelimit > fp->ff_size) + if (writelimit > fp->ff_size) { filesize = writelimit; - else + struct timeval tv; + rl_add(fp->ff_size, writelimit - 1 , &fp->ff_invalidranges); + microuptime(&tv); + cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; + } else filesize = fp->ff_size; lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY); - if (offset <= fp->ff_size) { - zero_off = offset & ~PAGE_MASK_64; - - /* Check to see whether the area between the zero_offset and the start - of the transfer to see whether is invalid and should be zero-filled - as part of the transfer: - */ - if (offset > zero_off) { - if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP) - lflag |= IO_HEADZEROFILL; - } - } else { - off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64; - - /* The bytes between fp->ff_size and uio->uio_offset must never be - read without being zeroed. The current last block is filled with zeroes - if it holds valid data but in all cases merely do a little bookkeeping - to track the area from the end of the current last page to the start of - the area actually written. For the same reason only the bytes up to the - start of the page where this write will start is invalidated; any remainder - before uio->uio_offset is explicitly zeroed as part of the cluster_write. - - Note that inval_start, the start of the page after the current EOF, - may be past the start of the write, in which case the zeroing - will be handled by the cluser_write of the actual data. 
- */ - inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64; - inval_end = offset & ~PAGE_MASK_64; - zero_off = fp->ff_size; - - if ((fp->ff_size & PAGE_MASK_64) && - (rl_scan(&fp->ff_invalidranges, - eof_page_base, - fp->ff_size - 1, - &invalid_range) != RL_NOOVERLAP)) { - /* The page containing the EOF is not valid, so the - entire page must be made inaccessible now. If the write - starts on a page beyond the page containing the eof - (inval_end > eof_page_base), add the - whole page to the range to be invalidated. Otherwise - (i.e. if the write starts on the same page), zero-fill - the entire page explicitly now: - */ - if (inval_end > eof_page_base) { - inval_start = eof_page_base; - } else { - zero_off = eof_page_base; - }; - }; - - if (inval_start < inval_end) { - struct timeval tv; - /* There's some range of data that's going to be marked invalid */ - - if (zero_off < inval_start) { - /* The pages between inval_start and inval_end are going to be invalidated, - and the actual write will start on a page past inval_end. Now's the last - chance to zero-fill the page containing the EOF: - */ - hfs_unlock(cp); - cnode_locked = 0; - retval = cluster_write(vp, (uio_t) 0, - fp->ff_size, inval_start, - zero_off, (off_t)0, - lflag | IO_HEADZEROFILL | IO_NOZERODIRTY); - hfs_lock(cp, HFS_FORCE_LOCK); - cnode_locked = 1; - if (retval) goto ioerr_exit; - offset = uio_offset(uio); - }; - - /* Mark the remaining area of the newly allocated space as invalid: */ - rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges); - microuptime(&tv); - cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; - zero_off = fp->ff_size = inval_end; - }; - - if (offset > zero_off) lflag |= IO_HEADZEROFILL; - }; - - /* Check to see whether the area between the end of the write and the end of - the page it falls in is invalid and should be zero-filled as part of the transfer: - */ - tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64; - if (tail_off > filesize) tail_off = filesize; - if (tail_off > writelimit) { - if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) { - lflag |= IO_TAILZEROFILL; - }; - }; - /* - * if the write starts beyond the current EOF (possibly advanced in the - * zeroing of the last block, above), then we'll zero fill from the current EOF - * to where the write begins: - * - * NOTE: If (and ONLY if) the portion of the file about to be written is - * before the current EOF it might be marked as invalid now and must be - * made readable (removed from the invalid ranges) before cluster_write - * tries to write it: + * We no longer use IO_HEADZEROFILL or IO_TAILZEROFILL (except + * for one case below). For the regions that lie before the + * beginning and after the end of this write that are in the + * same page, we let the cluster code handle zeroing that out + * if necessary. If those areas are not cached, the cluster + * code will try and read those areas in, and in the case + * where those regions have never been written to, + * hfs_vnop_blockmap will consult the invalid ranges and then + * indicate that. The cluster code will zero out those areas. */ - io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset; - if (io_start < fp->ff_size) { - off_t io_end; - io_end = (lflag & IO_TAILZEROFILL) ? 
tail_off : writelimit; - rl_remove(io_start, io_end - 1, &fp->ff_invalidranges); - }; + head_off = trunc_page_64(offset); + + if (head_off < offset && head_off >= fp->ff_size) { + /* + * The first page is beyond current EOF, so as an + * optimisation, we can pass IO_HEADZEROFILL. + */ + lflag |= IO_HEADZEROFILL; + } hfs_unlock(cp); cnode_locked = 0; - + /* * We need to tell UBC the fork's new size BEFORE calling * cluster_write, in case any of the new pages need to be @@ -594,13 +655,47 @@ sizeok: * zero, unless we are extending the file via write. */ if (filesize > fp->ff_size) { + retval = hfs_zero_eof_page(vp, offset); + if (retval) + goto exit; fp->ff_new_size = filesize; ubc_setsize(vp, filesize); } - retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off, - tail_off, lflag | IO_NOZERODIRTY); + retval = cluster_write(vp, uio, fp->ff_size, filesize, head_off, + 0, lflag | IO_NOZERODIRTY | io_return_on_throttle); if (retval) { fp->ff_new_size = 0; /* no longer extending; use ff_size */ + + if (retval == EAGAIN) { + /* + * EAGAIN indicates that we still have I/O to do, but + * that we now need to be throttled + */ + if (resid != uio_resid(uio)) { + /* + * did manage to do some I/O before returning EAGAIN + */ + resid = uio_resid(uio); + offset = uio_offset(uio); + + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + hfs_incr_gencount(cp); + } + if (filesize > fp->ff_size) { + /* + * we called ubc_setsize before the call to + * cluster_write... since we only partially + * completed the I/O, we need to + * re-adjust our idea of the filesize based + * on our interim EOF + */ + ubc_setsize(vp, offset); + + fp->ff_size = offset; + } + goto exit; + } if (filesize > origFileSize) { ubc_setsize(vp, origFileSize); } @@ -615,13 +710,7 @@ sizeok: fp->ff_bytesread = 0; } } - fp->ff_new_size = 0; /* ff_size now has the correct size */ - - /* If we wrote some bytes, then touch the change and mod times */ - if (resid > uio_resid(uio)) { - cp->c_touch_chgtime = TRUE; - cp->c_touch_modtime = TRUE; - } + fp->ff_new_size = 0; /* ff_size now has the correct size */ } if (partialwrite) { uio_setresid(uio, (uio_resid(uio) + bytesToAdd)); @@ -631,57 +720,71 @@ sizeok: // XXXdbg - see radar 4871353 for more info { if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) { - VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL); + hfs_flush(hfsmp, HFS_FLUSH_CACHE); } } ioerr_exit: - /* - * If we successfully wrote any data, and we are not the superuser - * we clear the setuid and setgid bits as a precaution against - * tampering. - */ - if (cp->c_mode & (S_ISUID | S_ISGID)) { - cred = vfs_context_ucred(ap->a_context); - if (resid > uio_resid(uio) && cred && suser(cred, NULL)) { - if (!cnode_locked) { - hfs_lock(cp, HFS_FORCE_LOCK); - cnode_locked = 1; + if (!cnode_locked) { + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + cnode_locked = 1; + } + + if (resid > uio_resid(uio)) { + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + hfs_incr_gencount(cp); + + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. 
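+		 * (This is the standard Unix behaviour: a successful write by a
+		 * non-privileged process clears S_ISUID and S_ISGID.)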
+ */ + if (cp->c_mode & (S_ISUID | S_ISGID)) { + cred = vfs_context_ucred(ap->a_context); + if (cred && suser(cred, NULL)) { + cp->c_mode &= ~(S_ISUID | S_ISGID); } - cp->c_mode &= ~(S_ISUID | S_ISGID); } } if (retval) { if (ioflag & IO_UNIT) { - if (!cnode_locked) { - hfs_lock(cp, HFS_FORCE_LOCK); - cnode_locked = 1; - } (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC, - 0, 0, ap->a_context); - // LP64todo - fix this! resid needs to by user_ssize_t + 0, ap->a_context); uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio)))); uio_setresid(uio, resid); filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; } - } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) { - if (!cnode_locked) { - hfs_lock(cp, HFS_FORCE_LOCK); - cnode_locked = 1; - } - retval = hfs_update(vp, TRUE); - } + } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) + retval = hfs_update(vp, 0); + /* Updating vcbWrCnt doesn't need to be atomic. */ hfsmp->vcbWrCnt++; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END, + KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END, (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); exit: + if (retval && took_truncate_lock + && cp->c_truncatelockowner == current_thread()) { + fp->ff_new_size = 0; + rl_remove(fp->ff_size, RL_INFINITY, &fp->ff_invalidranges); + } + if (cnode_locked) hfs_unlock(cp); - + if (took_truncate_lock) { - hfs_unlock_truncate(cp, 0); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + if (retval == EAGAIN) { + throttle_lowpri_io(1); + throttled_count++; + + retval = 0; + goto again; + } + if (throttled_count) { + throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread())); } return (retval); } @@ -825,8 +928,6 @@ lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id) } if (cache->numcached > NUM_CACHE_ENTRIES) { - /*printf("hfs: EGAD! numcached is %d... cut our losses and trim to %d\n", - cache->numcached, NUM_CACHE_ENTRIES);*/ cache->numcached = NUM_CACHE_ENTRIES; } @@ -874,11 +975,9 @@ add_node(struct access_cache *cache, int index, cnid_t nodeID, int access) /* if the cache is full, do a replace rather than an insert */ if (cache->numcached >= NUM_CACHE_ENTRIES) { - //printf("hfs: cache is full (%d). 
replace at index %d\n", cache->numcached, index); cache->numcached = NUM_CACHE_ENTRIES-1; if (index > cache->numcached) { - // printf("hfs: index %d pinned to %d\n", index, cache->numcached); index = cache->numcached; } } @@ -908,15 +1007,15 @@ struct cinfo { }; static int -snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg) +snoop_callback(const cnode_t *cp, void *arg) { - struct cinfo *cip = (struct cinfo *)arg; + struct cinfo *cip = arg; - cip->uid = attrp->ca_uid; - cip->gid = attrp->ca_gid; - cip->mode = attrp->ca_mode; - cip->parentcnid = descp->cd_parentcnid; - cip->recflags = attrp->ca_recflags; + cip->uid = cp->c_uid; + cip->gid = cp->c_gid; + cip->mode = cp->c_mode; + cip->parentcnid = cp->c_parentcnid; + cip->recflags = cp->c_attr.ca_recflags; return (0); } @@ -933,33 +1032,41 @@ do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid, /* if this id matches the one the fsctl was called with, skip the lookup */ if (cnid == skip_cp->c_cnid) { - cnattrp->ca_uid = skip_cp->c_uid; - cnattrp->ca_gid = skip_cp->c_gid; - cnattrp->ca_mode = skip_cp->c_mode; - cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags; - keyp->hfsPlus.parentID = skip_cp->c_parentcnid; + cnattrp->ca_uid = skip_cp->c_uid; + cnattrp->ca_gid = skip_cp->c_gid; + cnattrp->ca_mode = skip_cp->c_mode; + cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags; + keyp->hfsPlus.parentID = skip_cp->c_parentcnid; } else { - struct cinfo c_info; - - /* otherwise, check the cnode hash incase the file/dir is incore */ - if (hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info) == 0) { - cnattrp->ca_uid = c_info.uid; - cnattrp->ca_gid = c_info.gid; - cnattrp->ca_mode = c_info.mode; - cnattrp->ca_recflags = c_info.recflags; - keyp->hfsPlus.parentID = c_info.parentcnid; - } else { - int lockflags; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - - /* lookup this cnid in the catalog */ - error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp); + struct cinfo c_info; + + /* otherwise, check the cnode hash incase the file/dir is incore */ + error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info); + + if (error == EACCES) { + // File is deleted + return ENOENT; + } else if (!error) { + cnattrp->ca_uid = c_info.uid; + cnattrp->ca_gid = c_info.gid; + cnattrp->ca_mode = c_info.mode; + cnattrp->ca_recflags = c_info.recflags; + keyp->hfsPlus.parentID = c_info.parentcnid; + } else { + int lockflags; + + if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp))) + throttle_lowpri_io(1); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + /* lookup this cnid in the catalog */ + error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp); - hfs_systemfile_unlock(hfsmp, lockflags); + hfs_systemfile_unlock(hfsmp, lockflags); - cache->lookups++; - } + cache->lookups++; + } } return (error); @@ -1128,7 +1235,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, boolean_t is64bit; /* - * NOTE: on entry, the vnode is locked. Incase this vnode + * NOTE: on entry, the vnode has an io_ref. In case this vnode * happens to be in our list of file_ids, we'll note it * avoid calling hfs_chashget_nowait() on that id as that * will cause a "locking against myself" panic. 
@@ -1373,7 +1480,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, access[i] = 0; continue; } - + myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, skip_cp, p, cred, context,bitmap, map_size, parents, num_parents); @@ -1401,8 +1508,6 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, err_exit_bulk_access: - //printf("hfs: on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups); - if (file_ids) kfree(file_ids, sizeof(int) * num_files); if (parents) @@ -1423,24 +1528,13 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, /* end "bulk-access" support */ -/* - * Callback for use with freeze ioctl. - */ -static int -hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs) -{ - vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze"); - - return 0; -} - /* * Control filesystem operating characteristics. */ int hfs_vnop_ioctl( struct vnop_ioctl_args /* { vnode_t a_vp; - int a_command; + long a_command; caddr_t a_data; int a_fflag; vfs_context_t a_context; @@ -1483,7 +1577,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { #if CONFIG_PROTECT { int error = 0; - if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { return error; } } @@ -1498,6 +1592,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { int outlen; char *bufptr; int error; + int flags = 0; /* Caller must be owner of file system. */ vfsp = vfs_statfs(HFSTOVFS(hfsmp)); @@ -1511,6 +1606,9 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { } bufptr = (char *)ap->a_data; cnid = strtoul(bufptr, NULL, 10); + if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) { + flags |= BUILDPATH_VOLUME_RELATIVE; + } /* We need to call hfs_vfs_vget to leverage the code that will * fix the origin list for us if needed, as opposed to calling @@ -1520,12 +1618,139 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) { return (error); } - error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context); + error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context); vnode_put(file_vp); return (error); } + case HFS_TRANSFER_DOCUMENT_ID: + { + struct cnode *cp = NULL; + int error; + u_int32_t to_fd = *(u_int32_t *)ap->a_data; + struct fileproc *to_fp; + struct vnode *to_vp; + struct cnode *to_cp; + + cp = VTOC(vp); + + if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) { + //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error); + return error; + } + if ( (error = vnode_getwithref(to_vp)) ) { + file_drop(to_fd); + return error; + } + + if (VTOHFS(to_vp) != hfsmp) { + error = EXDEV; + goto transfer_cleanup; + } + + int need_unlock = 1; + to_cp = VTOC(to_vp); + error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK); + if (error != 0) { + //printf("could not lock the pair of cnodes (error %d)\n", error); + goto transfer_cleanup; + } + + if (!(cp->c_bsdflags & UF_TRACKED)) { + error = EINVAL; + } else if (to_cp->c_bsdflags & UF_TRACKED) { + // + // if the destination is already tracked, return an error + // as otherwise it's a silent deletion of the target's + // document-id + // + error = EEXIST; + } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { + // + // we can use the FndrExtendedFileInfo because the doc-id is the first + // thing in both it and the ExtendedDirInfo struct which is fixed 
in + // format and can not change layout + // + struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16); + struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16); + + if (f_extinfo->document_id == 0) { + uint32_t new_id; + + hfs_unlockpair(cp, to_cp); // have to unlock to be able to get a new-id + + if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) { + // + // re-lock the pair now that we have the document-id + // + hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK); + f_extinfo->document_id = new_id; + } else { + goto transfer_cleanup; + } + } + + to_extinfo->document_id = f_extinfo->document_id; + f_extinfo->document_id = 0; + //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid); + + // make sure the destination is also UF_TRACKED + to_cp->c_bsdflags |= UF_TRACKED; + cp->c_bsdflags &= ~UF_TRACKED; + + // mark the cnodes dirty + cp->c_flag |= C_MODIFIED; + to_cp->c_flag |= C_MODIFIED; + + int lockflags; + if ((error = hfs_start_transaction(hfsmp)) == 0) { + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + + (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL); + (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL); + + hfs_systemfile_unlock (hfsmp, lockflags); + (void) hfs_end_transaction(hfsmp); + } + +#if CONFIG_FSE + add_fsevent(FSE_DOCID_CHANGED, context, + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode # + FSE_ARG_INO, (ino64_t)to_cp->c_fileid, // dst inode # + FSE_ARG_INT32, to_extinfo->document_id, + FSE_ARG_DONE); + + hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents + need_unlock = 0; + + if (need_fsevent(FSE_STAT_CHANGED, vp)) { + add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE); + } + if (need_fsevent(FSE_STAT_CHANGED, to_vp)) { + add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE); + } +#else + hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents + need_unlock = 0; +#endif + } + + if (need_unlock) { + hfs_unlockpair(cp, to_cp); + } + + transfer_cleanup: + vnode_put(to_vp); + file_drop(to_fd); + + return error; + } + + + case HFS_PREV_LINK: case HFS_NEXT_LINK: { @@ -1580,6 +1805,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { case HFS_RESIZE_VOLUME: { u_int64_t newsize; u_int64_t cursize; + int ret; vfsp = vfs_statfs(HFSTOVFS(hfsmp)); if (suser(cred, NULL) && @@ -1596,14 +1822,18 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { } newsize = *(u_int64_t *)ap->a_data; cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; - + + if (newsize == cursize) { + return (0); + } + IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeWillResize); if (newsize > cursize) { - return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context); - } else if (newsize < cursize) { - return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context); + ret = hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context); } else { - return (0); + ret = hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context); } + IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeDidResize); + return (ret); } case HFS_CHANGE_NEXT_ALLOCATION: { int error = 0; /* Assume success */ @@ -1620,7 +1850,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { if (!vnode_isvroot(vp)) { return (EINVAL); } - HFS_MOUNT_LOCK(hfsmp, TRUE); + hfs_lock_mount(hfsmp); location = *(u_int32_t *)ap->a_data; if 
((location >= hfsmp->allocLimit) && (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) { @@ -1644,7 +1874,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { } MarkVCBDirty(hfsmp); fail_change_next_allocation: - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfs_unlock_mount(hfsmp); return (error); } @@ -1697,22 +1927,13 @@ fail_change_next_allocation: vnode_ref(bsfs_rootvp); vnode_put(bsfs_rootvp); + hfs_lock_mount(hfsmp); hfsmp->hfs_backingfs_rootvp = bsfs_rootvp; - hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; - /* The free extent cache is managed differently for sparse devices. - * There is a window between which the volume is mounted and the - * device is marked as sparse, so the free extent cache for this - * volume is currently initialized as normal volume (sorted by block - * count). Reset the cache so that it will be rebuilt again - * for sparse device (sorted by start block). - */ - ResetVCBFreeExtCache(hfsmp); + hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4; + hfs_unlock_mount(hfsmp); - hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize; - hfsmp->hfs_sparsebandblks *= 4; - - vfs_markdependency(hfsmp->hfs_mp); + /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */ /* * If the sparse image is on a sparse image file (as opposed to a sparse @@ -1733,6 +1954,15 @@ fail_change_next_allocation: } } + /* The free extent cache is managed differently for sparse devices. + * There is a window between which the volume is mounted and the + * device is marked as sparse, so the free extent cache for this + * volume is currently initialized as normal volume (sorted by block + * count). Reset the cache so that it will be rebuilt again + * for sparse device (sorted by start block). + */ + ResetVCBFreeExtCache(hfsmp); + (void)vnode_put(di_vp); file_drop(bsdata->backingfd); return (0); @@ -1752,102 +1982,89 @@ fail_change_next_allocation: if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) { + hfs_lock_mount(hfsmp); hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; tmpvp = hfsmp->hfs_backingfs_rootvp; hfsmp->hfs_backingfs_rootvp = NULLVP; hfsmp->hfs_sparsebandblks = 0; + hfs_unlock_mount(hfsmp); + vnode_rele(tmpvp); } return (0); } #endif /* HFS_SPARSE_DEV */ - case F_FREEZE_FS: { - struct mount *mp; - - mp = vnode_mount(vp); - hfsmp = VFSTOHFS(mp); + /* Change the next CNID stored in the VH */ + case HFS_CHANGE_NEXTCNID: { + int error = 0; /* Assume success */ + u_int32_t fileid; + int wraparound = 0; + int lockflags = 0; - if (!(hfsmp->jnl)) - return (ENOTSUP); + if (vnode_vfsisrdonly(vp)) { + return (EROFS); + } + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + + fileid = *(u_int32_t *)ap->a_data; - vfsp = vfs_statfs(mp); - - if (kauth_cred_getuid(cred) != vfsp->f_owner && - !kauth_cred_issuser(cred)) - return (EACCES); + /* Must have catalog lock excl. to advance the CNID pointer */ + lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK); - lck_rw_lock_exclusive(&hfsmp->hfs_insync); - - // flush things before we get started to try and prevent - // dirty data from being paged out while we're frozen. - // note: can't do this after taking the lock as it will - // deadlock against ourselves. 
- vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL); - hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); - - // DO NOT call hfs_journal_flush() because that takes a - // shared lock on the global exclusive lock! - journal_flush(hfsmp->jnl, TRUE); - - // don't need to iterate on all vnodes, we just need to - // wait for writes to the system files and the device vnode - // - // Now that journal flush waits for all metadata blocks to - // be written out, waiting for btree writes is probably no - // longer required. - if (HFSTOVCB(hfsmp)->extentsRefNum) - vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze"); - if (HFSTOVCB(hfsmp)->catalogRefNum) - vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze"); - if (HFSTOVCB(hfsmp)->allocationsRefNum) - vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze"); - if (hfsmp->hfs_attribute_vp) - vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze"); - vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze"); - - hfsmp->hfs_freezing_proc = current_proc(); + hfs_lock_mount(hfsmp); - return (0); + /* If it is less than the current next CNID, force the wraparound bit to be set */ + if (fileid < hfsmp->vcbNxtCNID) { + wraparound=1; + } + + /* Return previous value. */ + *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID; + + hfsmp->vcbNxtCNID = fileid; + + if (wraparound) { + hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask; + } + + MarkVCBDirty(hfsmp); + hfs_unlock_mount(hfsmp); + hfs_systemfile_unlock (hfsmp, lockflags); + + return (error); } + + case F_FREEZE_FS: { + struct mount *mp; + + mp = vnode_mount(vp); + hfsmp = VFSTOHFS(mp); - case F_THAW_FS: { - vfsp = vfs_statfs(vnode_mount(vp)); + if (!(hfsmp->jnl)) + return (ENOTSUP); + + vfsp = vfs_statfs(mp); + if (kauth_cred_getuid(cred) != vfsp->f_owner && !kauth_cred_issuser(cred)) return (EACCES); - // if we're not the one who froze the fs then we - // can't thaw it. - if (hfsmp->hfs_freezing_proc != current_proc()) { - return EPERM; - } - - // NOTE: if you add code here, also go check the - // code that "thaws" the fs in hfs_vnop_close() - // - hfsmp->hfs_freezing_proc = NULL; - hfs_unlock_global (hfsmp); - lck_rw_unlock_exclusive(&hfsmp->hfs_insync); - - return (0); + return hfs_freeze(hfsmp); } - case HFS_BULKACCESS_FSCTL: { - int size; - - if (hfsmp->hfs_flags & HFS_STANDARD) { - return EINVAL; - } + case F_THAW_FS: { + vfsp = vfs_statfs(vnode_mount(vp)); + if (kauth_cred_getuid(cred) != vfsp->f_owner && + !kauth_cred_issuser(cred)) + return (EACCES); - if (is64bit) { - size = sizeof(struct user64_access_t); - } else { - size = sizeof(struct user32_access_t); - } - - return do_bulk_access_check(hfsmp, vp, ap, size, context); - } + return hfs_thaw(hfsmp, current_proc()); + } case HFS_EXT_BULKACCESS_FSCTL: { int size; @@ -1884,7 +2101,7 @@ fail_change_next_allocation: * are enabled by default, so any change will be transient only * till the volume is remounted. */ - if (!is_suser()) { + if (!kauth_cred_issuser(kauth_cred_get())) { return (EPERM); } if (state == 0 || state == 1) @@ -1893,15 +2110,284 @@ fail_change_next_allocation: return (EINVAL); } + case F_SETSTATICCONTENT: { + int error; + int enable_static = 0; + struct cnode *cp = NULL; + /* + * lock the cnode, decorate the cnode flag, and bail out. + * VFS should have already authenticated the caller for us. 
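+		 *
+		 * Illustrative (hypothetical) call, from user space:
+		 *
+		 *     fcntl(fd, F_SETSTATICCONTENT, 1);   // set the hint
+		 *     fcntl(fd, F_SETSTATICCONTENT, 0);   // clear it
+		 *
+		 * (Sketch only; as described below, the fcntl layer passes the
+		 * argument through as ap->a_data.)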
+ */ + + if (ap->a_data) { + /* + * Note that even though ap->a_data is of type caddr_t, + * the fcntl layer at the syscall handler will pass in NULL + * or 1 depending on what the argument supplied to the fcntl + * was. So it is in fact correct to check the ap->a_data + * argument for zero or non-zero value when deciding whether or not + * to enable the static bit in the cnode. + */ + enable_static = 1; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + cp = VTOC(vp); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + if (enable_static) { + cp->c_flag |= C_SSD_STATIC; + } + else { + cp->c_flag &= ~C_SSD_STATIC; + } + hfs_unlock (cp); + } + return error; + } + + case F_SET_GREEDY_MODE: { + int error; + int enable_greedy_mode = 0; + struct cnode *cp = NULL; + /* + * lock the cnode, decorate the cnode flag, and bail out. + * VFS should have already authenticated the caller for us. + */ + + if (ap->a_data) { + /* + * Note that even though ap->a_data is of type caddr_t, + * the fcntl layer at the syscall handler will pass in NULL + * or 1 depending on what the argument supplied to the fcntl + * was. So it is in fact correct to check the ap->a_data + * argument for zero or non-zero value when deciding whether or not + * to enable the greedy mode bit in the cnode. + */ + enable_greedy_mode = 1; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + cp = VTOC(vp); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + if (enable_greedy_mode) { + cp->c_flag |= C_SSD_GREEDY_MODE; + } + else { + cp->c_flag &= ~C_SSD_GREEDY_MODE; + } + hfs_unlock (cp); + } + return error; + } + + case F_SETIOTYPE: { + int error; + uint32_t iotypeflag = 0; + + struct cnode *cp = NULL; + /* + * lock the cnode, decorate the cnode flag, and bail out. + * VFS should have already authenticated the caller for us. + */ + + if (ap->a_data == NULL) { + return EINVAL; + } + + /* + * Note that even though ap->a_data is of type caddr_t, we + * can only use 32 bits of flag values. + */ + iotypeflag = (uint32_t) ap->a_data; + switch (iotypeflag) { + case F_IOTYPE_ISOCHRONOUS: + break; + default: + return EINVAL; + } + + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + cp = VTOC(vp); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + switch (iotypeflag) { + case F_IOTYPE_ISOCHRONOUS: + cp->c_flag |= C_IO_ISOCHRONOUS; + break; + default: + break; + } + hfs_unlock (cp); + } + return error; + } + + case F_MAKECOMPRESSED: { + int error = 0; + uint32_t gen_counter; + struct cnode *cp = NULL; + int reset_decmp = 0; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + + /* + * acquire & lock the cnode. + * VFS should have already authenticated the caller for us. + */ + + if (ap->a_data) { + /* + * Cast the pointer into a uint32_t so we can extract the + * supplied generation counter. + */ + gen_counter = *((uint32_t*)ap->a_data); + } + else { + return EINVAL; + } + +#if HFS_COMPRESSION + cp = VTOC(vp); + /* Grab truncate lock first; we may truncate the file */ + hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + return error; + } + + /* Are there any other usecounts/FDs? 
*/ + if (vnode_isinuse(vp, 1)) { + hfs_unlock(cp); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + return EBUSY; + } + + /* now we have the cnode locked down; Validate arguments */ + if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) { + /* EINVAL if you are trying to manipulate an IMMUTABLE file */ + hfs_unlock(cp); + hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); + return EINVAL; + } + + if ((hfs_get_gencount (cp)) == gen_counter) { + /* + * OK, the gen_counter matched. Go for it: + * Toggle state bits, truncate file, and suppress mtime update + */ + reset_decmp = 1; + cp->c_bsdflags |= UF_COMPRESSED; + + error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES, + ap->a_context); + } + else { + error = ESTALE; + } + + /* Unlock cnode before executing decmpfs ; they may need to get an EA */ + hfs_unlock(cp); + + /* + * Reset the decmp state while still holding the truncate lock. We need to + * serialize here against a listxattr on this node which may occur at any + * time. + * + * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed, + * that will still potentially require getting the com.apple.decmpfs EA. If the + * EA is required, then we can't hold the cnode lock, because the getxattr call is + * generic(through VFS), and can't pass along any info telling it that we're already + * holding it (the lock). If we don't serialize, then we risk listxattr stopping + * and trying to fill in the hfs_file_is_compressed info during the callback + * operation, which will result in deadlock against the b-tree node. + * + * So, to serialize against listxattr (which will grab buf_t meta references on + * the b-tree blocks), we hold the truncate lock as we're manipulating the + * decmpfs payload. + */ + if ((reset_decmp) && (error == 0)) { + decmpfs_cnode *dp = VTOCMP (vp); + if (dp != NULL) { + decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0); + } + + /* Initialize the decmpfs node as needed */ + (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */ + } + + hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); + +#endif + return error; + } + + case F_SETBACKINGSTORE: { + + int error = 0; + + /* + * See comment in F_SETSTATICCONTENT re: using + * a null check for a_data + */ + if (ap->a_data) { + error = hfs_set_backingstore (vp, 1); + } + else { + error = hfs_set_backingstore (vp, 0); + } + + return error; + } + + case F_GETPATH_MTMINFO: { + int error = 0; + + int *data = (int*) ap->a_data; + + /* Ask if this is a backingstore vnode */ + error = hfs_is_backingstore (vp, data); + + return error; + } + case F_FULLFSYNC: { int error; if (hfsmp->hfs_flags & HFS_READ_ONLY) { return (EROFS); } - error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK); + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); if (error == 0) { - error = hfs_fsync(vp, MNT_WAIT, TRUE, p); + error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_FULL, p); + hfs_unlock(VTOC(vp)); + } + + return error; + } + + case F_BARRIERFSYNC: { + int error; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_BARRIER, p); hfs_unlock(VTOC(vp)); } @@ -1915,7 +2401,7 @@ fail_change_next_allocation: if (!vnode_isreg(vp)) return EINVAL; - error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK); + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); if (error == 0) { cp = VTOC(vp); /* @@ -1941,7 +2427,7 @@ fail_change_next_allocation: fp = VTOF(vp); /* Protect against a size 
change. */ - hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK); + hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); #if HFS_COMPRESSION if (compressed && (uncompressed_size == -1)) { @@ -1960,92 +2446,10 @@ fail_change_next_allocation: error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count); } - hfs_unlock_truncate(VTOC(vp), 0); + hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT); return (error); } - case F_READBOOTSTRAP: - case F_WRITEBOOTSTRAP: - { - struct vnode *devvp = NULL; - user_fbootstraptransfer_t *user_bootstrapp; - int devBlockSize; - int error; - uio_t auio; - daddr64_t blockNumber; - u_int32_t blockOffset; - u_int32_t xfersize; - struct buf *bp; - user_fbootstraptransfer_t user_bootstrap; - - if (!vnode_isvroot(vp)) - return (EINVAL); - /* LP64 - when caller is a 64 bit process then we are passed a pointer - * to a user_fbootstraptransfer_t else we get a pointer to a - * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t - */ - if ((hfsmp->hfs_flags & HFS_READ_ONLY) - && (ap->a_command == F_WRITEBOOTSTRAP)) { - return (EROFS); - } - if (is64bit) { - user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data; - } - else { - user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data; - user_bootstrapp = &user_bootstrap; - user_bootstrap.fbt_offset = bootstrapp->fbt_offset; - user_bootstrap.fbt_length = bootstrapp->fbt_length; - user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer); - } - - if ((user_bootstrapp->fbt_offset < 0) || (user_bootstrapp->fbt_offset > 1024) || - (user_bootstrapp->fbt_length > 1024)) { - return EINVAL; - } - - if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024) - return EINVAL; - - devvp = VTOHFS(vp)->hfs_devvp; - auio = uio_create(1, user_bootstrapp->fbt_offset, - is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32, - (ap->a_command == F_WRITEBOOTSTRAP) ? 
UIO_WRITE : UIO_READ); - uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length); - - devBlockSize = vfs_devblocksize(vnode_mount(vp)); - - while (uio_resid(auio) > 0) { - blockNumber = uio_offset(auio) / devBlockSize; - error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp); - if (error) { - if (bp) buf_brelse(bp); - uio_free(auio); - return error; - }; - - blockOffset = uio_offset(auio) % devBlockSize; - xfersize = devBlockSize - blockOffset; - error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio); - if (error) { - buf_brelse(bp); - uio_free(auio); - return error; - }; - if (uio_rw(auio) == UIO_WRITE) { - error = VNOP_BWRITE(bp); - if (error) { - uio_free(auio); - return error; - } - } else { - buf_brelse(bp); - }; - }; - uio_free(auio); - }; - return 0; - case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */ { if (is64bit) { @@ -2065,6 +2469,10 @@ fail_change_next_allocation: *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime; break; + case HFS_FSCTL_GET_VERY_LOW_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit; + break; + case HFS_FSCTL_SET_VERY_LOW_DISK: if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) { return EINVAL; @@ -2073,6 +2481,10 @@ fail_change_next_allocation: hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data; break; + case HFS_FSCTL_GET_LOW_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit; + break; + case HFS_FSCTL_SET_LOW_DISK: if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) { @@ -2083,6 +2495,10 @@ fail_change_next_allocation: hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data; break; + case HFS_FSCTL_GET_DESIRED_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel; + break; + case HFS_FSCTL_SET_DESIRED_DISK: if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) { return EINVAL; @@ -2103,18 +2519,18 @@ fail_change_next_allocation: if (hfsmp->hfs_flags & HFS_READ_ONLY) { return (EROFS); } - HFS_MOUNT_LOCK(hfsmp, TRUE); + hfs_lock_mount (hfsmp); bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo)); - HFS_MOUNT_UNLOCK(hfsmp, TRUE); - (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); + hfs_unlock_mount (hfsmp); + (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); break; case HFS_GET_BOOT_INFO: if (!vnode_isvroot(vp)) return(EINVAL); - HFS_MOUNT_LOCK(hfsmp, TRUE); + hfs_lock_mount (hfsmp); bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo)); - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfs_unlock_mount(hfsmp); break; case HFS_MARK_BOOT_CORRUPT: @@ -2122,7 +2538,7 @@ fail_change_next_allocation: * kHFSVolumeInconsistentBit in the volume header. This will * force fsck_hfs on next mount. 
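+	 * (The actual flagging is done by hfs_mark_inconsistent(hfsmp,
+	 * HFS_FSCK_FORCED) below, after the privilege and read-only checks.)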
*/ - if (!is_suser()) { + if (!kauth_cred_issuser(kauth_cred_get())) { return EACCES; } @@ -2134,54 +2550,276 @@ fail_change_next_allocation: if (hfsmp->hfs_flags & HFS_READ_ONLY) { return (EROFS); } - printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n"); - hfs_mark_volume_inconsistent(hfsmp); + printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n"); + hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED); + break; + + case HFS_FSCTL_GET_JOURNAL_INFO: + jip = (struct hfs_journal_info*)ap->a_data; + + if (vp == NULLVP) + return EINVAL; + + if (hfsmp->jnl == NULL) { + jnl_start = 0; + jnl_size = 0; + } else { + jnl_start = hfs_blk_to_bytes(hfsmp->jnl_start, hfsmp->blockSize) + hfsmp->hfsPlusIOPosOffset; + jnl_size = hfsmp->jnl_size; + } + + jip->jstart = jnl_start; + jip->jsize = jnl_size; + break; + + case HFS_SET_ALWAYS_ZEROFILL: { + struct cnode *cp = VTOC(vp); + + if (*(int *)ap->a_data) { + cp->c_flag |= C_ALWAYS_ZEROFILL; + } else { + cp->c_flag &= ~C_ALWAYS_ZEROFILL; + } + break; + } + + case HFS_DISABLE_METAZONE: { + /* Only root can disable metadata zone */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + + /* Disable metadata zone now */ + (void) hfs_metadatazone_init(hfsmp, true); + printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN); + break; + } + + + case HFS_FSINFO_METADATA_BLOCKS: { + int error; + struct hfsinfo_metadata *hinfo; + + hinfo = (struct hfsinfo_metadata *)ap->a_data; + + /* Get information about number of metadata blocks */ + error = hfs_getinfo_metadata_blocks(hfsmp, hinfo); + if (error) { + return error; + } + + break; + } + + case HFS_GET_FSINFO: { + hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data; + + /* Only root is allowed to get fsinfo */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + + /* + * Make sure that the caller's version number matches with + * the kernel's version number. This will make sure that + * if the structures being read/written into are changed + * by the kernel, the caller will not read incorrect data. + * + * The first three fields --- request_type, version and + * flags are same for all the hfs_fsinfo structures, so + * we can access the version number by assuming any + * structure for now. + */ + if (fsinfo->header.version != HFS_FSINFO_VERSION) { + return ENOTSUP; + } + + /* Make sure that the current file system is not marked inconsistent */ + if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) { + return EIO; + } + + return hfs_get_fsinfo(hfsmp, ap->a_data); + } + + case HFS_CS_FREESPACE_TRIM: { + int error = 0; + int lockflags = 0; + + /* Only root allowed */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + + /* + * This core functionality is similar to hfs_scan_blocks(). + * The main difference is that hfs_scan_blocks() is called + * as part of mount where we are assured that the journal is + * empty to start with. This fcntl() can be called on a + * mounted volume, therefore it has to flush the content of + * the journal as well as ensure the state of summary table. + * + * This fcntl scans over the entire allocation bitmap, + * creates list of all the free blocks, and issues TRIM + * down to the underlying device. This can take long time + * as it can generate up to 512MB of read I/O. 
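+		 * (The 512MB figure follows from the bitmap format: one bit per
+		 * allocation block, and at most 2^32 allocation blocks per
+		 * volume, gives a bitmap of up to 2^32 / 8 bytes = 512MB.)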
+ */ + + if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) { + error = hfs_init_summary(hfsmp); + if (error) { + printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN); + return error; + } + } + + /* + * The journal maintains list of recently deallocated blocks to + * issue DKIOCUNMAPs when the corresponding journal transaction is + * flushed to the disk. To avoid any race conditions, we only + * want one active trim list and only one thread issuing DKIOCUNMAPs. + * Therefore we make sure that the journal trim list is sync'ed, + * empty, and not modifiable for the duration of our scan. + * + * Take the journal lock before flushing the journal to the disk. + * We will keep on holding the journal lock till we don't get the + * bitmap lock to make sure that no new journal transactions can + * start. This will make sure that the journal trim list is not + * modified after the journal flush and before getting bitmap lock. + * We can release the journal lock after we acquire the bitmap + * lock as it will prevent any further block deallocations. + */ + hfs_journal_lock(hfsmp); + + /* Flush the journal and wait for all I/Os to finish up */ + error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); + if (error) { + hfs_journal_unlock(hfsmp); + return error; + } + + /* Take bitmap lock to ensure it is not being modified */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* Release the journal lock */ + hfs_journal_unlock(hfsmp); + + /* + * ScanUnmapBlocks reads the bitmap in large block size + * (up to 1MB) unlike the runtime which reads the bitmap + * in the 4K block size. This can cause buf_t collisions + * and potential data corruption. To avoid this, we + * invalidate all the existing buffers associated with + * the bitmap vnode before scanning it. + * + * Note: ScanUnmapBlock() cleans up all the buffers + * after itself, so there won't be any large buffers left + * for us to clean up after it returns. 
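+		 * (Recap of the protocol above: journal lock, journal flush,
+		 * bitmap lock, journal unlock; the invalidation and the
+		 * ScanUnmapBlocks() call below then run with the bitmap lock
+		 * held exclusive.)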
+ */ + error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); + if (error) { + hfs_systemfile_unlock(hfsmp, lockflags); + return error; + } + + /* Traverse bitmap and issue DKIOCUNMAPs */ + error = ScanUnmapBlocks(hfsmp); + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + return error; + } + break; + } - case HFS_FSCTL_GET_JOURNAL_INFO: - jip = (struct hfs_journal_info*)ap->a_data; + case HFS_SET_HOTFILE_STATE: { + int error; + struct cnode *cp = VTOC(vp); + uint32_t hf_state = *((uint32_t*)ap->a_data); + uint32_t num_unpinned = 0; + + error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error) { + return error; + } - if (vp == NULLVP) - return EINVAL; + // printf("hfs: setting hotfile state %d on %s\n", hf_state, vp->v_name); + if (hf_state == HFS_MARK_FASTDEVCANDIDATE) { + vnode_setfastdevicecandidate(vp); - if (hfsmp->jnl == NULL) { - jnl_start = 0; - jnl_size = 0; - } else { - jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset; - jnl_size = (off_t)hfsmp->jnl_size; - } + cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask; + cp->c_attr.ca_recflags &= ~kHFSDoNotFastDevPinMask; + cp->c_flag |= C_MODIFIED; + } else if (hf_state == HFS_UNMARK_FASTDEVCANDIDATE || hf_state == HFS_NEVER_FASTDEVCANDIDATE) { + vnode_clearfastdevicecandidate(vp); + hfs_removehotfile(vp); - jip->jstart = jnl_start; - jip->jsize = jnl_size; - break; + if (cp->c_attr.ca_recflags & kHFSFastDevPinnedMask) { + hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &num_unpinned, ap->a_context); + } + + if (hf_state == HFS_NEVER_FASTDEVCANDIDATE) { + cp->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask; + } + cp->c_attr.ca_recflags &= ~(kHFSFastDevCandidateMask|kHFSFastDevPinnedMask); + cp->c_flag |= C_MODIFIED; - case HFS_SET_ALWAYS_ZEROFILL: { - struct cnode *cp = VTOC(vp); + } else { + error = EINVAL; + } - if (*(int *)ap->a_data) { - cp->c_flag |= C_ALWAYS_ZEROFILL; - } else { - cp->c_flag &= ~C_ALWAYS_ZEROFILL; - } - break; - } + if (num_unpinned != 0) { + lck_mtx_lock(&hfsmp->hfc_mutex); + hfsmp->hfs_hotfile_freeblks += num_unpinned; + lck_mtx_unlock(&hfsmp->hfc_mutex); + } - case HFS_DISABLE_METAZONE: { - /* Only root can disable metadata zone */ - if (!is_suser()) { + hfs_unlock(cp); + return error; + break; + } + + case HFS_REPIN_HOTFILE_STATE: { + int error=0; + uint32_t repin_what = *((uint32_t*)ap->a_data); + + /* Only root allowed */ + if (!kauth_cred_issuser(kauth_cred_get())) { return EACCES; } - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); + + if (!(hfsmp->hfs_flags & (HFS_CS_METADATA_PIN | HFS_CS_HOTFILE_PIN))) { + // this system is neither regular Fusion or Cooperative Fusion + // so this fsctl makes no sense. + return EINVAL; } - /* Disable metadata zone now */ - (void) hfs_metadatazone_init(hfsmp, true); - printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN); + // + // After a converting a CoreStorage volume to be encrypted, the + // extents could have moved around underneath us. This call + // allows corestoraged to re-pin everything that should be + // pinned (it would happen on the next reboot too but that could + // be a long time away). 
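+ //
+ // In sketch form, the corestoraged-side trigger would be just
+ // (HFSIOC_REPIN_HOTFILE_STATE is the assumed user-space request
+ // name from hfs_fsctl.h; the kernel sees it as HFS_REPIN_HOTFILE_STATE):
+ //
+ //	uint32_t what = HFS_REPIN_METADATA | HFS_REPIN_USERDATA;
+ //	fsctl("/", HFSIOC_REPIN_HOTFILE_STATE, &what, 0);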
+ //
+ if ((repin_what & HFS_REPIN_METADATA) && (hfsmp->hfs_flags & HFS_CS_METADATA_PIN)) {
+ hfs_pin_fs_metadata(hfsmp);
+ }
+ if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) {
+ hfs_repin_hotfiles(hfsmp);
+ }
+ if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_SWAPFILE_PIN)) {
+ //XXX Swapfiles (marked SWAP_PINNED) may have moved too.
+ //XXX Do we care? They have a more transient/dynamic nature/lifetime.
+ }
+
+ return error;
break;
- }
-
+ }
+
+
default:
return (ENOTTY);
}
@@ -2323,6 +2961,62 @@ hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
* searched for mapping.
*
* System file cnodes are expected to be locked (shared or exclusive).
+ *
+ * -- INVALID RANGES --
+ *
+ * Invalid ranges are used to keep track of where we have extended a
+ * file, but have not yet written that data to disk. In the past we
+ * would clear up the invalid ranges as we wrote to those areas, but
+ * before data was actually flushed to disk. The problem with that
+ * approach is that the data can be left in the cache and is therefore
+ * still not valid on disk. So now we clear up the ranges here, when
+ * the flags field has VNODE_WRITE set, indicating a write is about to
+ * occur. This isn't ideal (ideally we want to clear them up when we
+ * know the data has been successfully written), but it's the best we
+ * can do.
+ *
+ * For reads, we use the invalid ranges here in block map to indicate
+ * to the caller that the data should be zeroed (a_bpn == -1). We
+ * have to be careful about what ranges we return to the cluster code.
+ * Currently the cluster code can only handle non-rounded values for
+ * the EOF; it cannot handle funny sized ranges in the middle of the
+ * file (the main problem is that it sends down odd sized I/Os to the
+ * disk). Our code currently works because whilst the very first
+ * offset and the last offset in the invalid ranges are not aligned,
+ * gaps in the invalid ranges between the first and last have to be
+ * aligned (because we always write page sized blocks). For example,
+ * consider this arrangement:
+ *
+ * +-------------+-----+-------+------+
+ * | |XXXXX| |XXXXXX|
+ * +-------------+-----+-------+------+
+ * a b c d
+ *
+ * This shows two invalid ranges <a, b> and <c, d>. Whilst a and d
+ * are not necessarily aligned, b and c *must* be.
+ *
+ * Zero-filling occurs in a number of ways:
+ *
+ * 1. When a read occurs and we return with a_bpn == -1.
+ *
+ * 2. When hfs_fsync or hfs_filedone calls hfs_flush_invalid_ranges
+ * which will cause us to iterate over the ranges bringing in
+ * pages that are not present in the cache and zeroing them. Any
+ * pages that are already in the cache are left untouched. Note
+ * that hfs_fsync does not always flush invalid ranges.
+ *
+ * 3. When we extend a file we zero out from the old EOF to the end
+ * of the page. It would be nice if we didn't have to do this if
+ * the page wasn't present (and could defer it), but because of
+ * the problem described above, we have to.
+ *
+ * The invalid ranges are also used to restrict the size that we write
+ * out on disk: see hfs_prepare_fork_for_update.
+ *
+ * Note that invalid ranges are ignored when neither the VNODE_READ nor
+ * the VNODE_WRITE flag is specified. This is useful for the
+ * F_LOG2PHYS* fcntls which are not interested in invalid ranges: they
+ * just want to know whether blocks are physically allocated or not.
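+ *
+ * In sketch form, the read-side clipping done below once rl_scan
+ * reports an overlap (the -1 block number tells the caller to
+ * zero-fill):
+ *
+ *	if (invalid_range->rl_start == ap->a_foffset) {
+ *		*ap->a_bpn = (daddr64_t)-1;
+ *		bytesContAvail = MIN(bytesContAvail,
+ *		    invalid_range->rl_end + 1 - ap->a_foffset);
+ *	} else {
+ *		bytesContAvail = MIN(bytesContAvail,
+ *		    round_page_64(invalid_range->rl_start) - ap->a_foffset);
+ *	}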
*/ int hfs_vnop_blockmap(struct vnop_blockmap_args *ap) @@ -2343,7 +3037,7 @@ hfs_vnop_blockmap(struct vnop_blockmap_args *ap) struct cnode *cp; struct filefork *fp; struct hfsmount *hfsmp; - size_t bytesContAvail = 0; + size_t bytesContAvail = ap->a_size; int retval = E_NONE; int syslocks = 0; int lockflags = 0; @@ -2384,17 +3078,110 @@ hfs_vnop_blockmap(struct vnop_blockmap_args *ap) if (ap->a_bpn == NULL) return (0); + hfsmp = VTOHFS(vp); + cp = VTOC(vp); + fp = VTOF(vp); + if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) { - if (VTOC(vp)->c_lockowner != current_thread()) { - hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + if (cp->c_lockowner != current_thread()) { + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); tooklock = 1; } + + // For reads, check the invalid ranges + if (ISSET(ap->a_flags, VNODE_READ)) { + if (ap->a_foffset >= fp->ff_size) { + retval = ERANGE; + goto exit; + } + + overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset, + ap->a_foffset + (off_t)bytesContAvail - 1, + &invalid_range); + switch(overlaptype) { + case RL_MATCHINGOVERLAP: + case RL_OVERLAPCONTAINSRANGE: + case RL_OVERLAPSTARTSBEFORE: + /* There's no valid block for this byte offset */ + *ap->a_bpn = (daddr64_t)-1; + /* There's no point limiting the amount to be returned + * if the invalid range that was hit extends all the way + * to the EOF (i.e. there's no valid bytes between the + * end of this range and the file's EOF): + */ + if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && + ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { + bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; + } + + retval = 0; + goto exit; + + case RL_OVERLAPISCONTAINED: + case RL_OVERLAPENDSAFTER: + /* The range of interest hits an invalid block before the end: */ + if (invalid_range->rl_start == ap->a_foffset) { + /* There's actually no valid information to be had starting here: */ + *ap->a_bpn = (daddr64_t)-1; + if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && + ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { + bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; + } + + retval = 0; + goto exit; + } else { + /* + * Sadly, the lower layers don't like us to + * return unaligned ranges, so we skip over + * any invalid ranges here that are less than + * a page: zeroing of those bits is not our + * responsibility (it's dealt with elsewhere). + */ + do { + off_t rounded_start = round_page_64(invalid_range->rl_start); + if ((off_t)bytesContAvail < rounded_start - ap->a_foffset) + break; + if (rounded_start < invalid_range->rl_end + 1) { + bytesContAvail = rounded_start - ap->a_foffset; + break; + } + } while ((invalid_range = TAILQ_NEXT(invalid_range, + rl_link))); + } + break; + + case RL_NOOVERLAP: + break; + } // switch + } } - hfsmp = VTOHFS(vp); - cp = VTOC(vp); - fp = VTOF(vp); + +#if CONFIG_PROTECT + if (cp->c_cpentry) { + const int direction = (ISSET(ap->a_flags, VNODE_WRITE) + ? 
VNODE_WRITE : VNODE_READ); + + cp_io_params_t io_params; + cp_io_params(hfsmp, cp->c_cpentry, + off_rsrc_make(ap->a_foffset, VNODE_IS_RSRC(vp)), + direction, &io_params); + + if (io_params.max_len < (off_t)bytesContAvail) + bytesContAvail = io_params.max_len; + + if (io_params.phys_offset != -1) { + *ap->a_bpn = ((io_params.phys_offset + hfsmp->hfsPlusIOPosOffset) + / hfsmp->hfs_logical_block_size); + + retval = 0; + goto exit; + } + } +#endif retry: + /* Check virtual blocks only when performing write operation */ if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) { if (hfs_start_transaction(hfsmp) != 0) { @@ -2449,14 +3236,14 @@ retry: cp->c_blocks += loanedBlocks; fp->ff_blocks += loanedBlocks; - HFS_MOUNT_LOCK(hfsmp, TRUE); + hfs_lock_mount (hfsmp); hfsmp->loanedBlocks += loanedBlocks; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfs_unlock_mount (hfsmp); hfs_systemfile_unlock(hfsmp, lockflags); cp->c_flag |= C_MODIFIED; if (started_tr) { - (void) hfs_update(vp, TRUE); + (void) hfs_update(vp, 0); (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); hfs_end_transaction(hfsmp); @@ -2466,19 +3253,13 @@ retry: } } - retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset, + retval = MapFileBlockC(hfsmp, (FCB *)fp, bytesContAvail, ap->a_foffset, ap->a_bpn, &bytesContAvail); if (syslocks) { hfs_systemfile_unlock(hfsmp, lockflags); syslocks = 0; } - if (started_tr) { - (void) hfs_update(vp, TRUE); - (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); - hfs_end_transaction(hfsmp); - started_tr = 0; - } if (retval) { /* On write, always return error because virtual blocks, if any, * should have been allocated in ExtendFileC(). We do not @@ -2490,98 +3271,71 @@ retry: (ap->a_flags & VNODE_WRITE) || ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) { goto exit; - } - + } + /* Validate if the start offset is within logical file size */ - if (ap->a_foffset > fp->ff_size) { - goto exit; + if (ap->a_foffset >= fp->ff_size) { + goto exit; } - /* Searching file extents has failed for read operation, therefore - * search rangelist for any uncommitted holes in the file. + /* + * At this point, we have encountered a failure during + * MapFileBlockC that resulted in ERANGE, and we are not + * servicing a write, and there are borrowed blocks. + * + * However, the cluster layer will not call blockmap for + * blocks that are borrowed and in-cache. We have to assume + * that because we observed ERANGE being emitted from + * MapFileBlockC, this extent range is not valid on-disk. So + * we treat this as a mapping that needs to be zero-filled + * prior to reading. 
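+ *
+ * The fallback below is then simply a clamped zero-fill mapping,
+ * in sketch form:
+ *
+ *	bytesContAvail = MIN(bytesContAvail, fp->ff_size - ap->a_foffset);
+ *	*ap->a_bpn = (daddr64_t)-1;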
*/ - overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset, - ap->a_foffset + (off_t)(ap->a_size - 1), - &invalid_range); - switch(overlaptype) { - case RL_OVERLAPISCONTAINED: - /* start_offset <= rl_start, end_offset >= rl_end */ - if (ap->a_foffset != invalid_range->rl_start) { - break; - } - case RL_MATCHINGOVERLAP: - /* start_offset = rl_start, end_offset = rl_end */ - case RL_OVERLAPCONTAINSRANGE: - /* start_offset >= rl_start, end_offset <= rl_end */ - case RL_OVERLAPSTARTSBEFORE: - /* start_offset > rl_start, end_offset >= rl_start */ - if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) { - bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset; - } else { - bytesContAvail = fp->ff_size - ap->a_foffset; - } - if (bytesContAvail > ap->a_size) { - bytesContAvail = ap->a_size; - } - *ap->a_bpn = (daddr64_t)-1; - retval = 0; - break; - case RL_OVERLAPENDSAFTER: - /* start_offset < rl_start, end_offset < rl_end */ - case RL_NOOVERLAP: - break; - } + + if (fp->ff_size - ap->a_foffset < (off_t)bytesContAvail) + bytesContAvail = fp->ff_size - ap->a_foffset; + + *ap->a_bpn = (daddr64_t) -1; + retval = 0; + goto exit; } - /* MapFileC() found a valid extent in the filefork. Search the - * mapping information further for invalid file ranges - */ - overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset, - ap->a_foffset + (off_t)bytesContAvail - 1, - &invalid_range); - if (overlaptype != RL_NOOVERLAP) { - switch(overlaptype) { - case RL_MATCHINGOVERLAP: - case RL_OVERLAPCONTAINSRANGE: - case RL_OVERLAPSTARTSBEFORE: - /* There's no valid block for this byte offset */ - *ap->a_bpn = (daddr64_t)-1; - /* There's no point limiting the amount to be returned - * if the invalid range that was hit extends all the way - * to the EOF (i.e. there's no valid bytes between the - * end of this range and the file's EOF): - */ - if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && - ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { - bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; - } - break; - - case RL_OVERLAPISCONTAINED: - case RL_OVERLAPENDSAFTER: - /* The range of interest hits an invalid block before the end: */ - if (invalid_range->rl_start == ap->a_foffset) { - /* There's actually no valid information to be had starting here: */ - *ap->a_bpn = (daddr64_t)-1; - if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && - ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { - bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; +exit: + if (retval == 0) { + if (ISSET(ap->a_flags, VNODE_WRITE)) { + struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges); + + // See if we might be overlapping invalid ranges... + if (r && (ap->a_foffset + (off_t)bytesContAvail) > r->rl_start) { + /* + * Mark the file as needing an update if we think the + * on-disk EOF has changed. + */ + if (ap->a_foffset <= r->rl_start) + SET(cp->c_flag, C_MODIFIED); + + /* + * This isn't the ideal place to put this. Ideally, we + * should do something *after* we have successfully + * written to the range, but that's difficult to do + * because we cannot take locks in the callback. At + * present, the cluster code will call us with VNODE_WRITE + * set just before it's about to write the data so we know + * that data is about to be written. If we get an I/O + * error at this point then chances are the metadata + * update to follow will also have an I/O error so the + * risk here is small. 
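+ *
+ * The assumed contract with the cluster layer, in sketch form:
+ * a blockmap call with VNODE_WRITE set is immediately followed
+ * by the write itself, e.g.
+ *
+ *	VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &run, NULL,
+ *	    VNODE_WRITE, NULL);
+ *	... then issue the I/O for [f_offset, f_offset + run) ...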
+ */ + rl_remove(ap->a_foffset, ap->a_foffset + bytesContAvail - 1, + &fp->ff_invalidranges); + + if (!TAILQ_FIRST(&fp->ff_invalidranges)) { + cp->c_flag &= ~C_ZFWANTSYNC; + cp->c_zftimeout = 0; } - } else { - bytesContAvail = invalid_range->rl_start - ap->a_foffset; } - break; + } - case RL_NOOVERLAP: - break; - } /* end switch */ - if (bytesContAvail > ap->a_size) - bytesContAvail = ap->a_size; - } - -exit: - if (retval == 0) { if (ap->a_run) *ap->a_run = bytesContAvail; @@ -2589,13 +3343,19 @@ exit: *(int *)ap->a_poff = 0; } + if (started_tr) { + hfs_update(vp, TRUE); + hfs_volupdate(hfsmp, VOL_UPDATE, 0); + hfs_end_transaction(hfsmp); + started_tr = 0; + } + if (tooklock) hfs_unlock(cp); return (MacToVFSError(retval)); } - /* * prepare and issue the I/O * buf_strategy knows how to deal @@ -2609,49 +3369,38 @@ hfs_vnop_strategy(struct vnop_strategy_args *ap) vnode_t vp = buf_vnode(bp); int error = 0; -#if CONFIG_PROTECT - cnode_t *cp = NULL; - - if ((cp = cp_get_protected_cnode(vp)) != NULL) { - /* - * Some paths to hfs_vnop_strategy will take the cnode lock, - * and some won't. But since content protection is only enabled - * for files that (a) aren't system files and (b) are regular - * files, any valid cnode here will be unlocked. - */ - hfs_lock(cp, HFS_SHARED_LOCK); - buf_setcpaddr(bp, cp->c_cpentry); + /* Mark buffer as containing static data if cnode flag set */ + if (VTOC(vp)->c_flag & C_SSD_STATIC) { + buf_markstatic(bp); } -#endif /* CONFIG_PROTECT */ - error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap); + /* Mark buffer as containing static data if cnode flag set */ + if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) { + bufattr_markgreedymode(&bp->b_attr); + } -#if CONFIG_PROTECT - if (cp) { - hfs_unlock(cp); + /* mark buffer as containing burst mode data if cnode flag set */ + if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) { + bufattr_markisochronous(&bp->b_attr); } -#endif - return error; -} +#if CONFIG_PROTECT + error = cp_handle_strategy(bp); -static int -hfs_minorupdate(struct vnode *vp) { - struct cnode *cp = VTOC(vp); - cp->c_flag &= ~C_MODIFIED; - cp->c_touch_acctime = 0; - cp->c_touch_chgtime = 0; - cp->c_touch_modtime = 0; + if (error) + return error; +#endif - return 0; + error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap); + + return error; } int -do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context) +do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context) { register struct cnode *cp = VTOC(vp); struct filefork *fp = VTOF(vp); - struct proc *p = vfs_context_proc(context);; kauth_cred_t cred = vfs_context_ucred(context); int retval; off_t bytesToAdd; @@ -2661,12 +3410,13 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c int blksize; struct hfsmount *hfsmp; int lockflags; + int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES); blksize = VTOVCB(vp)->blockSize; fileblocks = fp->ff_blocks; filebytes = (off_t)fileblocks * (off_t)blksize; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START, + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START, (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); if (length < 0) @@ -2720,8 +3470,9 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c /* All or nothing and don't round up to clumpsize. 
*/ eflags = kEFAllMask | kEFNoClumpMask; - if (cred && suser(cred, NULL) != 0) + if (cred && (suser(cred, NULL) != 0)) { eflags |= kEFReserveMask; /* keep a reserve */ + } /* * Allocate Journal and Quota files in metadata zone. @@ -2743,6 +3494,10 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c lockflags |= SFL_EXTENTS; lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + /* + * Keep growing the file as long as the current EOF is + * less than the desired value. + */ while ((length > filebytes) && (retval == E_NONE)) { bytesToAdd = length - filebytes; retval = MacToVFSError(ExtendFileC(VTOVCB(vp), @@ -2763,13 +3518,8 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c hfs_systemfile_unlock(hfsmp, lockflags); if (hfsmp->jnl) { - if (skipupdate) { - (void) hfs_minorupdate(vp); - } - else { - (void) hfs_update(vp, TRUE); - (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); - } + hfs_update(vp, 0); + hfs_volupdate(hfsmp, VOL_UPDATE, 0); } hfs_end_transaction(hfsmp); @@ -2777,64 +3527,48 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c if (retval) goto Err_Exit; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE, + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE, (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); } - if (!(flags & IO_NOZEROFILL)) { + if (ISSET(flags, IO_NOZEROFILL)) { + // An optimisation for the hibernation file + if (vnode_isswap(vp)) + rl_remove_all(&fp->ff_invalidranges); + } else { if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) { - struct rl_entry *invalid_range; - off_t zero_limit; - - zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64; - if (length < zero_limit) zero_limit = length; - if (length > (off_t)fp->ff_size) { struct timeval tv; /* Extending the file: time to fill out the current last page w. zeroes? */ - if ((fp->ff_size & PAGE_MASK_64) && - (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64, - fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) { - - /* There's some valid data at the start of the (current) last page + if (fp->ff_size & PAGE_MASK_64) { + /* There might be some valid data at the start of the (current) last page of the file, so zero out the remainder of that page to ensure the - entire page contains valid data. Since there is no invalid range - possible past the (current) eof, there's no need to remove anything - from the invalid range list before calling cluster_write(): */ + entire page contains valid data. 
*/ hfs_unlock(cp); - retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit, - fp->ff_size, (off_t)0, - (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY); - hfs_lock(cp, HFS_FORCE_LOCK); + retval = hfs_zero_eof_page(vp, length); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); if (retval) goto Err_Exit; - - /* Merely invalidate the remaining area, if necessary: */ - if (length > zero_limit) { - microuptime(&tv); - rl_add(zero_limit, length - 1, &fp->ff_invalidranges); - cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; - } - } else { - /* The page containing the (current) eof is invalid: just add the - remainder of the page to the invalid list, along with the area - being newly allocated: - */ + } microuptime(&tv); rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges); cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; - }; } } else { panic("hfs_truncate: invoked on non-UBC object?!"); }; } - cp->c_touch_modtime = TRUE; + if (suppress_times == 0) { + cp->c_touch_modtime = TRUE; + } fp->ff_size = length; } else { /* Shorten the size of the file */ - if ((off_t)fp->ff_size > length) { + // An optimisation for the hibernation file + if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) { + rl_remove_all(&fp->ff_invalidranges); + } else if ((off_t)fp->ff_size > length) { /* Any space previously marked as invalid is now irrelevant: */ rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges); } @@ -2847,8 +3581,7 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c u_int32_t finalblks; u_int32_t loanedBlocks; - HFS_MOUNT_LOCK(hfsmp, TRUE); - + hfs_lock_mount(hfsmp); loanedBlocks = fp->ff_unallocblocks; cp->c_blocks -= loanedBlocks; fp->ff_blocks -= loanedBlocks; @@ -2866,85 +3599,89 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c cp->c_blocks += loanedBlocks; fp->ff_blocks += loanedBlocks; } - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfs_unlock_mount (hfsmp); } - /* - * For a TBE process the deallocation of the file blocks is - * delayed until the file is closed. And hfs_close calls - * truncate with the IO_NDELAY flag set. So when IO_NDELAY - * isn't set, we make sure this isn't a TBE process. 
- */ - if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) { -#if QUOTA - off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize); -#endif /* QUOTA */ - if (hfs_start_transaction(hfsmp) != 0) { - retval = EINVAL; - goto Err_Exit; - } + off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize); + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; + } - if (fp->ff_unallocblocks == 0) { - /* Protect extents b-tree and allocation bitmap */ - lockflags = SFL_BITMAP; - if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; - lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + if (fp->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0, - FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false)); + retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0, + FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false)); - hfs_systemfile_unlock(hfsmp, lockflags); - } - if (hfsmp->jnl) { - if (retval == 0) { - fp->ff_size = length; - } - if (skipupdate) { - (void) hfs_minorupdate(vp); - } - else { - (void) hfs_update(vp, TRUE); - (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); - } + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (hfsmp->jnl) { + if (retval == 0) { + fp->ff_size = length; } - hfs_end_transaction(hfsmp); + hfs_update(vp, 0); + hfs_volupdate(hfsmp, VOL_UPDATE, 0); + } + hfs_end_transaction(hfsmp); - filebytes = (off_t)fp->ff_blocks * (off_t)blksize; - if (retval) - goto Err_Exit; + filebytes = (off_t)fp->ff_blocks * (off_t)blksize; + if (retval) + goto Err_Exit; #if QUOTA - /* These are bytesreleased */ - (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0); + /* These are bytesreleased */ + (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0); #endif /* QUOTA */ - } - /* Only set update flag if the logical length changes */ - if ((off_t)fp->ff_size != length) + + // + // Unlike when growing a file, we adjust the hotfile block count here + // instead of deeper down in the block allocation code because we do + // not necessarily have a vnode or "fcb" at the time we're deleting + // the file and so we wouldn't know if it was hotfile cached or not + // + hfs_hotfile_adjust_blocks(vp, (int64_t)((savedbytes - filebytes) / blksize)); + + + /* + * Only set update flag if the logical length changes & we aren't + * suppressing modtime updates. + */ + if (((off_t)fp->ff_size != length) && (suppress_times == 0)) { cp->c_touch_modtime = TRUE; + } fp->ff_size = length; } if (cp->c_mode & (S_ISUID | S_ISGID)) { - if (!vfs_context_issuser(context)) { + if (!vfs_context_issuser(context)) cp->c_mode &= ~(S_ISUID | S_ISGID); - skipupdate = 0; - } - } - if (skipupdate) { - retval = hfs_minorupdate(vp); } - else { - cp->c_touch_chgtime = TRUE; /* status changed */ + cp->c_flag |= C_MODIFIED; + cp->c_touch_chgtime = TRUE; /* status changed */ + if (suppress_times == 0) { cp->c_touch_modtime = TRUE; /* file data was modified */ - retval = hfs_update(vp, MNT_WAIT); + + /* + * If we are not suppressing the modtime update, then + * update the gen count as well. 
+ */ + if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) { + hfs_incr_gencount(cp); + } } + + retval = hfs_update(vp, 0); if (retval) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE, + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE, -1, -1, -1, retval, 0); } Err_Exit: - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END, + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END, (int)length, (int)fp->ff_size, (int)filebytes, retval, 0); return (retval); @@ -2967,7 +3704,9 @@ hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) { struct filefork *fp = VTOF(vp); struct cnode *cp = VTOC(vp); +#if QUOTA int retval = 0; +#endif /* QUOTA */ /* Cannot truncate an HFS directory! */ if (vnode_isdir(vp)) { @@ -3008,8 +3747,7 @@ hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) { if (fp->ff_unallocblocks > 0) { u_int32_t loanedBlocks; - HFS_MOUNT_LOCK(hfsmp, TRUE); - + hfs_lock_mount (hfsmp); loanedBlocks = fp->ff_unallocblocks; cp->c_blocks -= loanedBlocks; fp->ff_blocks -= loanedBlocks; @@ -3017,7 +3755,7 @@ hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) { hfsmp->loanedBlocks -= loanedBlocks; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfs_unlock_mount (hfsmp); } return 0; @@ -3030,6 +3768,12 @@ hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) { * for use when deleting a file. The simplification here is that we know * that we are releasing all blocks. * + * Note that this function may be called when there is no vnode backing + * the file fork in question. We may call this from hfs_vnop_inactive + * to clear out resource fork data (and may not want to clear out the data + * fork yet). As a result, we pointer-check both sets of inputs before + * doing anything with them. + * * The caller is responsible for saving off a copy of the filefork(s) * embedded within the cnode prior to calling this function. The pointers * supplied as arguments must be valid even if the cnode is no longer valid. @@ -3048,14 +3792,18 @@ hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, blksize = hfsmp->blockSize; /* Data Fork */ - if (datafork->ff_blocks > 0) { + if (datafork) { + off_t prev_filebytes; + datafork->ff_size = 0; + fileblocks = datafork->ff_blocks; - filebytes = (off_t)fileblocks * (off_t)blksize; + filebytes = (off_t)fileblocks * (off_t)blksize; + prev_filebytes = filebytes; /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ while (filebytes > 0) { - if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(datafork)) { + if (filebytes > HFS_BIGFILE_SIZE) { filebytes -= HFS_BIGFILE_SIZE; } else { filebytes = 0; @@ -3078,11 +3826,14 @@ hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, hfs_systemfile_unlock(hfsmp, lockflags); } - if (error == 0) { - datafork->ff_size = filebytes; - } (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + struct cnode *cp = datafork ? FTOC(datafork) : NULL; + struct vnode *vp; + vp = cp ? 
CTOV(cp, 0) : NULL; + hfs_hotfile_adjust_blocks(vp, (int64_t)((prev_filebytes - filebytes) / blksize)); + prev_filebytes = filebytes; + /* Finish the transaction and start over if necessary */ hfs_end_transaction(hfsmp); @@ -3093,14 +3844,16 @@ hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, } /* Resource fork */ - if (error == 0 && (rsrcfork != NULL) && rsrcfork->ff_blocks > 0) { + if (error == 0 && rsrcfork) { + rsrcfork->ff_size = 0; + fileblocks = rsrcfork->ff_blocks; filebytes = (off_t)fileblocks * (off_t)blksize; /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ while (filebytes > 0) { - if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(rsrcfork)) { + if (filebytes > HFS_BIGFILE_SIZE) { filebytes -= HFS_BIGFILE_SIZE; } else { filebytes = 0; @@ -3123,9 +3876,6 @@ hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, hfs_systemfile_unlock(hfsmp, lockflags); } - if (error == 0) { - rsrcfork->ff_size = filebytes; - } (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); /* Finish the transaction and start over if necessary */ @@ -3140,52 +3890,79 @@ hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, return error; } +errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock) +{ + errno_t error; + + /* + * Call ubc_setsize to give the VM subsystem a chance to do + * whatever it needs to with existing pages before we delete + * blocks. Note that symlinks don't use the UBC so we'll + * get back ENOENT in that case. + */ + if (have_cnode_lock) { + error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY); + if (error == EAGAIN) { + cnode_t *cp = VTOC(vp); + + if (cp->c_truncatelockowner != current_thread()) { +#if DEVELOPMENT || DEBUG + panic("hfs: hfs_ubc_setsize called without exclusive truncate lock!"); +#else + printf("hfs: hfs_ubc_setsize called without exclusive truncate lock!\n"); +#endif + } + + hfs_unlock(cp); + error = ubc_setsize_ex(vp, len, 0); + hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); + } + } else + error = ubc_setsize_ex(vp, len, 0); + + return error == ENOENT ? 0 : error; +} /* * Truncate a cnode to at most length size, freeing (or adding) the * disk blocks. */ int -hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, - int skipupdate, vfs_context_t context) +hfs_truncate(struct vnode *vp, off_t length, int flags, + int truncateflags, vfs_context_t context) { - struct filefork *fp = VTOF(vp); + struct filefork *fp = VTOF(vp); off_t filebytes; u_int32_t fileblocks; - int blksize, error = 0; + int blksize; + errno_t error = 0; struct cnode *cp = VTOC(vp); + hfsmount_t *hfsmp = VTOHFS(vp); /* Cannot truncate an HFS directory! */ if (vnode_isdir(vp)) { return (EISDIR); } /* A swap file cannot change size. */ - if (vnode_isswap(vp) && (length != 0)) { + if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) { return (EPERM); } - blksize = VTOVCB(vp)->blockSize; + blksize = hfsmp->blockSize; fileblocks = fp->ff_blocks; filebytes = (off_t)fileblocks * (off_t)blksize; - // - // Have to do this here so that we don't wind up with - // i/o pending for blocks that are about to be released - // if we truncate the file. - // - // If skipsetsize is set, then the caller is responsible - // for the ubc_setsize. - // - // Even if skipsetsize is set, if the length is zero we - // want to call ubc_setsize() because as of SnowLeopard - // it will no longer cause any page-ins and it will drop - // any dirty pages so that we don't do any i/o that we - // don't have to. 
This also prevents a race where i/o - // for truncated blocks may overwrite later data if the - // blocks get reallocated to a different file. - // - if (!skipsetsize || length == 0) - ubc_setsize(vp, length); + bool caller_has_cnode_lock = (cp->c_lockowner == current_thread()); + + error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock); + if (error) + return error; + + if (!caller_has_cnode_lock) { + error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error) + return error; + } // have to loop truncating or growing files that are // really big because otherwise transactions can get @@ -3193,40 +3970,54 @@ hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, if (length < filebytes) { while (filebytes > length) { - if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) { + if ((filebytes - length) > HFS_BIGFILE_SIZE) { filebytes -= HFS_BIGFILE_SIZE; } else { filebytes = length; } - cp->c_flag |= C_FORCEUPDATE; - error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context); + error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context); if (error) break; } } else if (length > filebytes) { - while (filebytes < length) { - if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) { - filebytes += HFS_BIGFILE_SIZE; - } else { - filebytes = length; + kauth_cred_t cred = vfs_context_ucred(context); + const bool keep_reserve = cred && suser(cred, NULL) != 0; + + if (hfs_freeblks(hfsmp, keep_reserve) + < howmany(length - filebytes, blksize)) { + error = ENOSPC; + } else { + while (filebytes < length) { + if ((length - filebytes) > HFS_BIGFILE_SIZE) { + filebytes += HFS_BIGFILE_SIZE; + } else { + filebytes = length; + } + error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context); + if (error) + break; } - cp->c_flag |= C_FORCEUPDATE; - error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context); - if (error) - break; } } else /* Same logical size */ { - error = do_hfs_truncate(vp, length, flags, skipupdate, context); + error = do_hfs_truncate(vp, length, flags, truncateflags, context); } /* Files that are changing size are not hot file candidates. */ if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { fp->ff_bytesread = 0; } - return (error); -} + if (!caller_has_cnode_lock) + hfs_unlock(cp); + + // Make sure UBC's size matches up (in case we didn't completely succeed) + errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock); + if (!error) + error = err2; + + return error; +} /* @@ -3273,9 +4064,9 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL); - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { + if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { goto Err_Exit; } @@ -3322,6 +4113,13 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { * value of filebytes is 0, length will be at least 1. 
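+ *
+ * (The ENOSPC pre-check below compares the free block count with
+ * the blocks needed; e.g. with a 4096-byte allocation block,
+ * growing filebytes 8192 to length 20000 needs
+ * howmany(20000 - 8192, 4096) = 3 more blocks.)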
*/ if (length > filebytes) { + if (ISSET(extendFlags, kEFAllMask) + && (hfs_freeblks(hfsmp, ISSET(extendFlags, kEFReserveMask)) + < howmany(length - filebytes, hfsmp->blockSize))) { + retval = ENOSPC; + goto Err_Exit; + } + off_t total_bytes_added = 0, orig_request_size; orig_request_size = moreBytesRequested = length - filebytes; @@ -3364,13 +4162,13 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { /* Protect extents b-tree and allocation bitmap */ lockflags = SFL_BITMAP; if (overflow_extents(fp)) - lockflags |= SFL_EXTENTS; + lockflags |= SFL_EXTENTS; lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); if (moreBytesRequested >= HFS_BIGFILE_SIZE) { - bytesRequested = HFS_BIGFILE_SIZE; + bytesRequested = HFS_BIGFILE_SIZE; } else { - bytesRequested = moreBytesRequested; + bytesRequested = moreBytesRequested; } if (extendFlags & kEFContigMask) { @@ -3399,7 +4197,7 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { hfs_systemfile_unlock(hfsmp, lockflags); if (hfsmp->jnl) { - (void) hfs_update(vp, TRUE); + (void) hfs_update(vp, 0); (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); } @@ -3427,14 +4225,18 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { } else { /* Shorten the size of the file */ - if (fp->ff_size > length) { - /* - * Any buffers that are past the truncation point need to be - * invalidated (to maintain buffer cache consistency). - */ - } + /* + * N.B. At present, this code is never called. If and when we + * do start using it, it looks like there might be slightly + * strange semantics with the file size: it's possible for the + * file size to *increase* e.g. if current file size is 5, + * length is 1024 and filebytes is 4096, the file size will + * end up being 1024 bytes. This isn't necessarily a problem + * but it's not consistent with the code above which doesn't + * change the file size. + */ - retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context); + retval = hfs_truncate(vp, length, 0, 0, ap->a_context); filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; /* @@ -3450,21 +4252,20 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { if (fp->ff_size > filebytes) { fp->ff_size = filebytes; - hfs_unlock(cp); - ubc_setsize(vp, fp->ff_size); - hfs_lock(cp, HFS_FORCE_LOCK); + hfs_ubc_setsize(vp, fp->ff_size, true); } } Std_Exit: + cp->c_flag |= C_MODIFIED; cp->c_touch_chgtime = TRUE; cp->c_touch_modtime = TRUE; - retval2 = hfs_update(vp, MNT_WAIT); + retval2 = hfs_update(vp, 0); if (retval == 0) retval = retval2; Err_Exit: - hfs_unlock_truncate(cp, 0); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); hfs_unlock(cp); return (retval); } @@ -3494,8 +4295,10 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap) upl_t upl; upl_page_info_t *pl; off_t f_offset; + off_t page_needed_f_offset; int offset; int isize; + int upl_size; int pg_index; boolean_t truncate_lock_held = FALSE; boolean_t file_converted = FALSE; @@ -3506,7 +4309,28 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap) fp = VTOF(vp); #if CONFIG_PROTECT - if ((error = cp_handle_vnop(cp, CP_READ_ACCESS | CP_WRITE_ACCESS)) != 0) { + if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) { + /* + * If we errored here, then this means that one of two things occurred: + * 1. there was a problem with the decryption of the key. + * 2. the device is locked and we are not allowed to access this particular file. + * + * Either way, this means that we need to shut down this upl now. 
As long as + * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves) + * then we create a upl and immediately abort it. + */ + if (ap->a_pl == NULL) { + /* create the upl */ + ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl, + UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT); + /* mark the range as needed so it doesn't immediately get discarded upon abort */ + ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1); + + /* Abort the range */ + ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); + } + + return error; } #endif /* CONFIG_PROTECT */ @@ -3523,6 +4347,8 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap) goto pagein_done; } + page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset; + retry_pagein: /* * take truncate lock (shared/recursive) to guard against @@ -3569,11 +4395,11 @@ retry_pagein: if (vfs_isforce(vp->v_mount)) { if (cp->c_flag & C_DELETED) { /* If we don't get it, then just go ahead and operate without the lock */ - truncate_lock_held = hfs_try_trunclock(cp, HFS_RECURSE_TRUNCLOCK); + truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); } } else { - hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK); + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); truncate_lock_held = TRUE; } @@ -3583,9 +4409,11 @@ retry_pagein: error = EINVAL; goto pagein_done; } - isize = ap->a_size; + ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1); - /* + upl_size = isize = ap->a_size; + + /* * Scan from the back to find the last page in the UPL, so that we * aren't looking at a UPL that may have already been freed by the * preceding aborts/completions. @@ -3653,6 +4481,7 @@ retry_pagein: int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ if (compressed) { + if (truncate_lock_held) { /* * can't hold the truncate lock when calling into the decmpfs layer @@ -3661,7 +4490,7 @@ retry_pagein: * takes the lock shared, we can deadlock if some other thread * tries to grab the lock exclusively in between. */ - hfs_unlock_truncate(cp, 1); + hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); truncate_lock_held = FALSE; } ap->a_pl = upl; @@ -3676,12 +4505,19 @@ retry_pagein: * compressed once the compression lock is successfully taken * i.e. we would block on that lock while the file is being inflated */ + if (error == 0 && vnode_isfastdevicecandidate(vp)) { + (void) hfs_addhotfile(vp); + } if (compressed) { if (error == 0) { /* successful page-in, update the access time */ VTOC(vp)->c_touch_acctime = TRUE; - /* compressed files are not hot file candidates */ + // + // compressed files are not traditional hot file candidates + // but they may be for CF (which ignores the ff_bytesread + // field) + // if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { fp->ff_bytesread = 0; } @@ -3692,6 +4528,19 @@ retry_pagein: * indication that the pagein needs to be redriven */ ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART); + } else if (error == ENOSPC) { + + if (upl_size == PAGE_SIZE) + panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n"); + + ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY); + + ap->a_size = PAGE_SIZE; + ap->a_pl = NULL; + ap->a_pl_offset = 0; + ap->a_f_offset = page_needed_f_offset; + + goto retry_pagein; } goto pagein_next_range; } @@ -3744,7 +4593,7 @@ retry_pagein: /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. 
*/ if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) { - hfs_lock(cp, HFS_FORCE_LOCK); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); took_cnode_lock = 1; } /* @@ -3761,6 +4610,10 @@ retry_pagein: fp->ff_bytesread += bytesread; } cp->c_touch_acctime = TRUE; + + if (vnode_isfastdevicecandidate(vp)) { + (void) hfs_addhotfile(vp); + } if (took_cnode_lock) hfs_unlock(cp); } @@ -3776,7 +4629,7 @@ pagein_next_range: pagein_done: if (truncate_lock_held == TRUE) { /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */ - hfs_unlock_truncate(cp, 1); + hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); } return (error); @@ -3814,16 +4667,6 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) cp = VTOC(vp); fp = VTOF(vp); - /* - * Figure out where the file ends, for pageout purposes. If - * ff_new_size > ff_size, then we're in the middle of extending the - * file via a write, so it is safe (and necessary) that we be able - * to pageout up to that point. - */ - filesize = fp->ff_size; - if (fp->ff_new_size > filesize) - filesize = fp->ff_new_size; - a_flags = ap->a_flags; a_pl_offset = ap->a_pl_offset; @@ -3842,10 +4685,19 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) a_pl_offset = 0; /* - * take truncate lock (shared) to guard against - * zero-fill thru fsync interfering, but only for v2 + * For V2 semantics, we want to take the cnode truncate lock + * shared to guard against the file size changing via zero-filling. + * + * However, we have to be careful because we may be invoked + * via the ubc_msync path to write out dirty mmap'd pages + * in response to a lock event on a content-protected + * filesystem (e.g. to write out class A files). + * As a result, we want to take the truncate lock 'SHARED' with + * the mini-recursion locktype so that we don't deadlock/panic + * because we may be already holding the truncate lock exclusive to force any other + * IOs to have blocked behind us. */ - hfs_lock_truncate(cp, HFS_SHARED_LOCK); + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); if (a_flags & UPL_MSYNC) { request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY; @@ -3866,6 +4718,16 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) * it was either passed in or we succesfully created it */ + /* + * Figure out where the file ends, for pageout purposes. If + * ff_new_size > ff_size, then we're in the middle of extending the + * file via a write, so it is safe (and necessary) that we be able + * to pageout up to that point. + */ + filesize = fp->ff_size; + if (fp->ff_new_size > filesize) + filesize = fp->ff_new_size; + /* * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own * UPL instead of relying on the UPL passed into us. We go ahead and do that here, @@ -3956,42 +4818,6 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) } xsize = num_of_pages * PAGE_SIZE; - if (!vnode_isswap(vp)) { - off_t end_of_range; - int tooklock; - - tooklock = 0; - - if (cp->c_lockowner != current_thread()) { - if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { - /* - * we're in the v2 path, so we are the - * owner of the UPL... 
we may have already - * processed some of the UPL, so abort it - * from the current working offset to the - * end of the UPL - */ - ubc_upl_abort_range(upl, - offset, - ap->a_size - offset, - UPL_ABORT_FREE_ON_EMPTY); - goto pageout_done; - } - tooklock = 1; - } - end_of_range = f_offset + xsize - 1; - - if (end_of_range >= filesize) { - end_of_range = (off_t)(filesize - 1); - } - if (f_offset < filesize) { - rl_remove(f_offset, end_of_range, &fp->ff_invalidranges); - cp->c_flag |= C_MODIFIED; /* leof is dirty */ - } - if (tooklock) { - hfs_unlock(cp); - } - } if ((error = cluster_pageout(vp, upl, offset, f_offset, xsize, filesize, a_flags))) { if (error_ret == 0) @@ -4008,36 +4834,6 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) } } /* end block for v2 pageout behavior */ else { - if (!vnode_isswap(vp)) { - off_t end_of_range; - int tooklock = 0; - - if (cp->c_lockowner != current_thread()) { - if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { - if (!(a_flags & UPL_NOCOMMIT)) { - ubc_upl_abort_range(upl, - a_pl_offset, - ap->a_size, - UPL_ABORT_FREE_ON_EMPTY); - } - goto pageout_done; - } - tooklock = 1; - } - end_of_range = ap->a_f_offset + ap->a_size - 1; - - if (end_of_range >= filesize) { - end_of_range = (off_t)(filesize - 1); - } - if (ap->a_f_offset < filesize) { - rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges); - cp->c_flag |= C_MODIFIED; /* leof is dirty */ - } - - if (tooklock) { - hfs_unlock(cp); - } - } /* * just call cluster_pageout for old pre-v2 behavior */ @@ -4046,26 +4842,52 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) } /* - * If data was written, update the modification time of the file. - * If setuid or setgid bits are set and this process is not the - * superuser then clear the setuid and setgid bits as a precaution - * against tampering. + * If data was written, update the modification time of the file + * but only if it's mapped writable; we will have touched the + * modifcation time for direct writes. */ - if (retval == 0) { - cp->c_touch_modtime = TRUE; - cp->c_touch_chgtime = TRUE; - if ((cp->c_mode & (S_ISUID | S_ISGID)) && - (vfs_context_suser(ap->a_context) != 0)) { - hfs_lock(cp, HFS_FORCE_LOCK); - cp->c_mode &= ~(S_ISUID | S_ISGID); - hfs_unlock(cp); + if (retval == 0 && (ubc_is_mapped_writable(vp) + || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) { + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + + // Check again with lock + bool mapped_writable = ubc_is_mapped_writable(vp); + if (mapped_writable + || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) { + cp->c_touch_modtime = TRUE; + cp->c_touch_chgtime = TRUE; + + /* + * We only need to increment the generation counter if + * it's currently mapped writable because we incremented + * the counter in hfs_vnop_mnomap. + */ + if (mapped_writable) + hfs_incr_gencount(VTOC(vp)); + + /* + * If setuid or setgid bits are set and this process is + * not the superuser then clear the setuid and setgid bits + * as a precaution against tampering. + */ + if ((cp->c_mode & (S_ISUID | S_ISGID)) && + (vfs_context_suser(ap->a_context) != 0)) { + cp->c_mode &= ~(S_ISUID | S_ISGID); + } } + + hfs_unlock(cp); } pageout_done: if (is_pageoutv2) { - /* release truncate lock (shared) */ - hfs_unlock_truncate(cp, 0); + /* + * Release the truncate lock. Note that because + * we may have taken the lock recursively by + * being invoked via ubc_msync due to lockdown, + * we should release it recursively, too. 
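+ *
+ * The pairing, in sketch form:
+ *
+ *	hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
+ *	... pageout work, possibly re-entered via ubc_msync ...
+ *	hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);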
+ */ + hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); } return (retval); } @@ -4123,6 +4945,168 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap) return (retval); } + +int +hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks, vfs_context_t ctx) +{ + _dk_cs_pin_t pin; + unsigned ioc; + int err; + + memset(&pin, 0, sizeof(pin)); + pin.cp_extent.offset = ((uint64_t)start_block) * HFSTOVCB(hfsmp)->blockSize; + pin.cp_extent.length = ((uint64_t)nblocks) * HFSTOVCB(hfsmp)->blockSize; + switch (pin_state) { + case HFS_PIN_IT: + ioc = _DKIOCCSPINEXTENT; + pin.cp_flags = _DKIOCCSPINTOFASTMEDIA; + break; + case HFS_PIN_IT | HFS_TEMP_PIN: + ioc = _DKIOCCSPINEXTENT; + pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSTEMPORARYPIN; + break; + case HFS_PIN_IT | HFS_DATALESS_PIN: + ioc = _DKIOCCSPINEXTENT; + pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSPINFORSWAPFILE; + break; + case HFS_UNPIN_IT: + ioc = _DKIOCCSUNPINEXTENT; + pin.cp_flags = 0; + break; + case HFS_UNPIN_IT | HFS_EVICT_PIN: + ioc = _DKIOCCSPINEXTENT; + pin.cp_flags = _DKIOCCSPINTOSLOWMEDIA; + break; + default: + return EINVAL; + } + err = VNOP_IOCTL(hfsmp->hfs_devvp, ioc, (caddr_t)&pin, 0, ctx); + return err; +} + +// +// The cnode lock should already be held on entry to this function +// +int +hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned, vfs_context_t ctx) +{ + struct filefork *fp = VTOF(vp); + int i, err=0, need_put=0; + struct vnode *rsrc_vp=NULL; + uint32_t npinned = 0; + off_t offset; + + if (num_blocks_pinned) { + *num_blocks_pinned = 0; + } + + if (vnode_vtype(vp) != VREG) { + /* Not allowed to pin directories or symlinks */ + printf("hfs: can't pin vnode of type %d\n", vnode_vtype(vp)); + return (EPERM); + } + + if (fp->ff_unallocblocks) { + printf("hfs: can't pin a vnode w/unalloced blocks (%d)\n", fp->ff_unallocblocks); + return (EINVAL); + } + + /* + * It is possible that if the caller unlocked/re-locked the cnode after checking + * for C_NOEXISTS|C_DELETED that the file could have been deleted while the + * cnode was unlocked. So check the condition again and return ENOENT so that + * the caller knows why we failed to pin the vnode. 
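+ *
+ * Expected caller pattern, in sketch form (cf. the
+ * HFS_SET_HOTFILE_STATE handler above):
+ *
+ *	hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+ *	err = hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &num_unpinned, ctx);
+ *	hfs_unlock(cp);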
+ */ + if (VTOC(vp)->c_flag & (C_NOEXISTS|C_DELETED)) { + // makes no sense to pin something that's pending deletion + return ENOENT; + } + + if (fp->ff_blocks == 0 && (VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { + if (!VNODE_IS_RSRC(vp) && hfs_vgetrsrc(hfsmp, vp, &rsrc_vp) == 0) { + //printf("hfs: fileid %d resource fork nblocks: %d / size: %lld\n", VTOC(vp)->c_fileid, + // VTOC(rsrc_vp)->c_rsrcfork->ff_blocks,VTOC(rsrc_vp)->c_rsrcfork->ff_size); + + fp = VTOC(rsrc_vp)->c_rsrcfork; + need_put = 1; + } + } + if (fp->ff_blocks == 0) { + if (need_put) { + // + // use a distinct error code for a compressed file that has no resource fork; + // we return EALREADY to indicate that the data is already probably hot file + // cached because it's in an EA and the attributes btree is on the ssd + // + err = EALREADY; + } else { + err = EINVAL; + } + goto out; + } + + offset = 0; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (fp->ff_extents[i].startBlock == 0) { + break; + } + + err = hfs_pin_block_range(hfsmp, pin_state, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, ctx); + if (err) { + break; + } else { + npinned += fp->ff_extents[i].blockCount; + } + } + + if (err || npinned == 0) { + goto out; + } + + if (fp->ff_extents[kHFSPlusExtentDensity-1].startBlock) { + uint32_t pblocks; + uint8_t forktype = 0; + + if (fp == VTOC(vp)->c_rsrcfork) { + forktype = 0xff; + } + /* + * The file could have overflow extents, better pin them. + * + * We assume that since we are holding the cnode lock for this cnode, + * the files extents cannot be manipulated, but the tree could, so we + * need to ensure that it doesn't change behind our back as we iterate it. + */ + int lockflags = hfs_systemfile_lock (hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK); + err = hfs_pin_overflow_extents(hfsmp, VTOC(vp)->c_fileid, forktype, &pblocks); + hfs_systemfile_unlock (hfsmp, lockflags); + + if (err) { + goto out; + } + npinned += pblocks; + } + +out: + if (num_blocks_pinned) { + *num_blocks_pinned = npinned; + } + + if (need_put && rsrc_vp) { + // + // have to unlock the cnode since it's shared between the + // resource fork vnode and the data fork vnode (and the + // vnode_put() may need to re-acquire the cnode lock to + // reclaim the resource fork vnode) + // + hfs_unlock(VTOC(vp)); + vnode_put(rsrc_vp); + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + } + return err; +} + + /* * Relocate a file to a new location on disk * cnode must be locked on entry @@ -4179,7 +5163,8 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, enum vtype vnodetype; vnodetype = vnode_vtype(vp); - if (vnodetype != VREG && vnodetype != VLNK) { + if (vnodetype != VREG) { + /* Not allowed to move symlinks. */ return (EPERM); } @@ -4202,18 +5187,17 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, return EINVAL; } #endif - /* If it's an SSD, also disable HFS relocation */ if (hfsmp->hfs_flags & HFS_SSD) { return EINVAL; } + blksize = hfsmp->blockSize; if (blockHint == 0) blockHint = hfsmp->nextAllocation; - if ((fp->ff_size > 0x7fffffff) || - ((fp->ff_size > blksize) && vnodetype == VLNK)) { + if (fp->ff_size > 0x7fffffff) { return (EFBIG); } @@ -4230,15 +5214,15 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, if (!vnode_issystem(vp) && (vnodetype != VLNK)) { hfs_unlock(cp); - hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); /* Force lock since callers expects lock to be held. 
*/ - if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) { - hfs_unlock_truncate(cp, 0); + if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); return (retval); } /* No need to continue if file was removed. */ if (cp->c_flag & C_NOEXISTS) { - hfs_unlock_truncate(cp, 0); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); return (ENOENT); } took_trunc_lock = 1; @@ -4253,7 +5237,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, if (hfs_start_transaction(hfsmp) != 0) { if (took_trunc_lock) - hfs_unlock_truncate(cp, 0); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); return (EINVAL); } started_tr = 1; @@ -4278,10 +5262,10 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, nextallocsave = hfsmp->nextAllocation; retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes); if (eflags & kEFMetadataMask) { - HFS_MOUNT_LOCK(hfsmp, TRUE); + hfs_lock_mount(hfsmp); HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave); MarkVCBDirty(hfsmp); - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfs_unlock_mount(hfsmp); } retval = MacToVFSError(retval); @@ -4291,7 +5275,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, retval = ENOSPC; goto restore; } else if (fp->ff_blocks < (headblks + datablks)) { - printf("hfs_relocate: allocation failed"); + printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN); retval = ENOSPC; goto restore; } @@ -4342,7 +5326,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, */ if (vnodetype == VLNK) - retval = hfs_clonelink(vp, blksize, cred, p); + retval = EPERM; else if (vnode_issystem(vp)) retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p); else @@ -4373,7 +5357,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, goto restore; out: if (took_trunc_lock) - hfs_unlock_truncate(cp, 0); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); if (lockflags) { hfs_systemfile_unlock(hfsmp, lockflags); @@ -4382,13 +5366,13 @@ out: /* Push cnode's new extent data to disk. */ if (retval == 0) { - (void) hfs_update(vp, MNT_WAIT); + hfs_update(vp, 0); } if (hfsmp->jnl) { if (cp->c_cnid < kHFSFirstUserCatalogNodeID) - (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); + (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); else - (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); + (void) hfs_flushvolumeheader(hfsmp, 0); } exit: if (started_tr) @@ -4399,7 +5383,7 @@ exit: restore: if (fp->ff_blocks == headblks) { if (took_trunc_lock) - hfs_unlock_truncate(cp, 0); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); goto exit; } /* @@ -4419,44 +5403,11 @@ restore: lockflags = 0; if (took_trunc_lock) - hfs_unlock_truncate(cp, 0); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); goto exit; } -/* - * Clone a symlink. 
- *
- */
-static int
-hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
-{
-	struct buf *head_bp = NULL;
-	struct buf *tail_bp = NULL;
-	int error;
-
-
-	error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);
-	if (error)
-		goto out;
-
-	tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
-	if (tail_bp == NULL) {
-		error = EIO;
-		goto out;
-	}
-	bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
-	error = (int)buf_bwrite(tail_bp);
-out:
-	if (head_bp) {
-		buf_markinvalid(head_bp);
-		buf_brelse(head_bp);
-	}
-	(void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
-
-	return (error);
-}
-
 /*
  * Clone a file's data within the file.
  *
@@ -4481,14 +5432,14 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
 	hfs_unlock(VTOC(vp));
 
 #if CONFIG_PROTECT
-	if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) {
-		hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
+	if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
+		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
 		return (error);
 	}
 #endif /* CONFIG_PROTECT */
 
-	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
-		hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
+	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize, VM_KERN_MEMORY_FILE)) {
+		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
 		return (ENOMEM);
 	}
 
@@ -4506,7 +5457,7 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
 			break;
 		}
 		if (uio_resid(auio) != 0) {
-			printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", uio_resid(auio));
+			printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
 			error = EIO;
 			break;
 		}
@@ -4543,7 +5494,7 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
 		ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
 	} else {
 		/*
-		 * No need to call ubc_sync_range or hfs_invalbuf
+		 * No need to call ubc_msync or hfs_invalbuf
 		 * since the file was copied using IO_NOCACHE and
 		 * the copy was done starting and ending on a page
 		 * boundary in the file.
@@ -4551,7 +5502,7 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
 	}
 	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
 
-	hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
+	hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
 	return (error);
 }
 
@@ -4581,7 +5532,7 @@ hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
 	bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
 	breadcnt = bufsize / iosize;
 
-	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
+	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize, VM_KERN_MEMORY_FILE)) {
 		return (ENOMEM);
 	}
 	start_blk = ((daddr64_t)blkstart * blksize) / iosize;
@@ -4642,3 +5593,90 @@ out:
 
 	return (error);
 }
+
+errno_t hfs_flush_invalid_ranges(vnode_t vp)
+{
+	cnode_t *cp = VTOC(vp);
+
+	assert(cp->c_lockowner == current_thread());
+	assert(cp->c_truncatelockowner == current_thread());
+
+	if (!ISSET(cp->c_flag, C_ZFWANTSYNC) && !cp->c_zftimeout)
+		return 0;
+
+	filefork_t *fp = VTOF(vp);
+
+	/*
+	 * We can't hold the cnode lock whilst we call cluster_write, so we
+	 * need to copy the extents into a local buffer.
+	 */
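Both hfs_clonefile and hfs_clonesysfile above stage the copy through a bounce buffer sized as MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1), i.e. capped at 1 MiB and rounded down to a multiple of the transfer size. A minimal sketch of that arithmetic, assuming iosize is a power of two (clone_bufsize is a hypothetical name, not from the patch):

	#include <stddef.h>

	/*
	 * Sketch of the clone-buffer sizing (hypothetical helper): cap the
	 * bounce buffer at 1 MiB, then round down to a multiple of the
	 * transfer size. The mask trick requires iosize to be a power of two.
	 */
	static size_t
	clone_bufsize(size_t blkcnt, size_t blksize, size_t iosize)
	{
		size_t bufsize = blkcnt * blksize;

		if (bufsize > 1024 * 1024)
			bufsize = 1024 * 1024;	/* never stage more than 1 MiB at once */

		return bufsize & ~(iosize - 1);	/* round down to an iosize boundary */
	}

Rounding down rather than up keeps every staged transfer fully aligned to the I/O size, at the cost of a buffer slightly smaller than the cap.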
+	int max_exts = 16;
+	struct ext {
+		off_t start, end;
+	} exts_buf[max_exts];		// 256 bytes
+	struct ext *exts = exts_buf;
+	int ext_count = 0;
+	errno_t ret;
+
+	struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
+
+	while (r) {
+		/* If we have more than can fit in our stack buffer, switch
+		   to a heap buffer. */
+		if (exts == exts_buf && ext_count == max_exts) {
+			max_exts = 256;
+			MALLOC(exts, struct ext *, sizeof(struct ext) * max_exts,
+			       M_TEMP, M_WAITOK);
+			memcpy(exts, exts_buf, ext_count * sizeof(struct ext));
+		}
+
+		struct rl_entry *next = TAILQ_NEXT(r, rl_link);
+
+		exts[ext_count++] = (struct ext){ r->rl_start, r->rl_end };
+
+		if (!next || (ext_count == max_exts && exts != exts_buf)) {
+			hfs_unlock(cp);
+			for (int i = 0; i < ext_count; ++i) {
+				ret = cluster_write(vp, NULL, fp->ff_size, exts[i].end + 1,
+						    exts[i].start, 0,
+						    IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE);
+				if (ret) {
+					hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
+					goto exit;
+				}
+			}
+
+			if (!next) {
+				hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
+				break;
+			}
+
+			/* Push any existing clusters, which should clean up our invalid
+			   ranges as they go through hfs_vnop_blockmap. */
+			cluster_push(vp, 0);
+
+			hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
+
+			/*
+			 * Get back to where we were (given we dropped the lock).
+			 * There shouldn't be many ranges left because we pushed above.
+			 */
+			TAILQ_FOREACH(r, &fp->ff_invalidranges, rl_link) {
+				if (r->rl_end > exts[ext_count - 1].end)
+					break;
+			}
+
+			ext_count = 0;
+		} else
+			r = next;
+	}
+
+	ret = 0;
+
+exit:
+
+	if (exts != exts_buf)
+		FREE(exts, M_TEMP);
+
+	return ret;
+}
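hfs_flush_invalid_ranges above snapshots the invalid-range list into a 16-entry stack array and migrates to a single 256-entry heap allocation only if the list overflows it, so the common case allocates nothing. A minimal user-space sketch of that pattern, with malloc standing in for the kernel's MALLOC and hypothetical names throughout:

	#include <stdlib.h>
	#include <string.h>

	struct ext { long long start, end; };

	/*
	 * Copy up to 'n' ranges from 'src' into a buffer that starts on the
	 * caller's stack and moves to the heap at most once. Returns the
	 * number of ranges captured, or -1 on allocation failure. The caller
	 * frees *out only when it no longer points at stack_buf.
	 */
	static int
	collect_ranges(const struct ext *src, int n,
		       struct ext stack_buf[], int stack_cap, int heap_cap,
		       struct ext **out)
	{
		struct ext *exts = stack_buf;
		int cap = stack_cap;
		int count = 0;

		for (int i = 0; i < n; i++) {
			if (count == cap) {
				if (exts != stack_buf)
					break;		/* heap buffer full too: stop here */
				/* Outgrew the stack buffer: migrate to the heap exactly once. */
				struct ext *heap = malloc(sizeof(*heap) * heap_cap);
				if (heap == NULL)
					return -1;
				memcpy(heap, stack_buf, count * sizeof(*heap));
				exts = heap;
				cap = heap_cap;
			}
			exts[count++] = src[i];
		}
		*out = exts;
		return count;
	}

Keeping the first 16 entries on the stack means a handful of invalid ranges never touches the allocator, while the one-time migration bounds the cost of the rare overflow; the real function additionally drains the buffer through cluster_write whenever it fills, which the sketch leaves out.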