X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/b0d623f7f2ae71ed96e60569f61f9a9a27016e80..4d15aeb193b2c68f1d38666c317f8d3734f5f083:/bsd/vfs/vfs_cluster.c?ds=sidebyside diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 5aec1498a..70eecc5ff 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2014 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -82,14 +83,21 @@ #include #include #include +#include +#include #include #include #include +#include #include #include +#include + +#include + #if 0 #undef KERNEL_DEBUG #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT @@ -111,9 +119,15 @@ #define CL_DIRECT_IO 0x1000 #define CL_PASSIVE 0x2000 #define CL_IOSTREAMING 0x4000 +#define CL_CLOSE 0x8000 +#define CL_ENCRYPTED 0x10000 +#define CL_RAW_ENCRYPTED 0x20000 +#define CL_NOCACHE 0x40000 #define MAX_VECTOR_UPL_ELEMENTS 8 -#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE) * PAGE_SIZE +#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES) + +#define CLUSTER_IO_WAITING ((buf_t)1) extern upl_t vector_upl_create(vm_offset_t); extern boolean_t vector_upl_is_valid(upl_t); @@ -122,17 +136,31 @@ extern void vector_upl_set_pagelist(upl_t); extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t); struct clios { + lck_mtx_t io_mtxp; u_int io_completed; /* amount of io that has currently completed */ u_int io_issued; /* amount of io that was successfully issued */ int io_error; /* error code of first error encountered */ int io_wanted; /* someone is sleeping waiting for a change in state */ }; +struct cl_direct_read_lock { + LIST_ENTRY(cl_direct_read_lock) chain; + int32_t ref_count; + vnode_t vp; + lck_rw_t rw_lock; +}; + +#define CL_DIRECT_READ_LOCK_BUCKETS 61 + +static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock) + 
cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS]; + +static lck_spin_t cl_direct_read_spin_lock; + static lck_grp_t *cl_mtx_grp; static lck_attr_t *cl_mtx_attr; static lck_grp_attr_t *cl_mtx_grp_attr; -static lck_mtx_t *cl_mtxp; - +static lck_mtx_t *cl_transaction_mtxp; #define IO_UNKNOWN 0 #define IO_DIRECT 1 @@ -153,10 +181,12 @@ static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size, int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg); static int cluster_iodone(buf_t bp, void *callback_arg); -static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags); -static int cluster_hard_throttle_on(vnode_t vp, uint32_t); +static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp); +static int cluster_is_throttled(vnode_t vp); + +static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name); -static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg); +static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags); static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference); static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference); @@ -182,10 +212,10 @@ static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t files static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg); -static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg); +static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void 
*), void *callback_arg, int *err); static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg); -static void sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg); +static int sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg); static void sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg); static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp); @@ -193,32 +223,74 @@ static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *le static kern_return_t vfs_drt_control(void **cmapp, int op_type); +/* + * For throttled IO to check whether + * a block is cached by the boot cache + * and thus it can avoid delaying the IO. + * + * bootcache_contains_block is initially + * NULL. The BootCache will set it while + * the cache is active and clear it when + * the cache is jettisoned. + * + * Returns 0 if the block is not + * contained in the cache, 1 if it is + * contained. + * + * The function pointer remains valid + * after the cache has been evicted even + * if bootcache_contains_block has been + * cleared. + * + * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs + */ +int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL; + + /* * limit the internal I/O size so that we * can represent it in a 32 bit int */ #define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512) -#define MAX_IO_CONTIG_SIZE (MAX_UPL_SIZE * PAGE_SIZE) +#define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES #define MAX_VECTS 16 -#define MIN_DIRECT_WRITE_SIZE (4 * PAGE_SIZE) +/* + * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider + * allowing the caller to bypass the buffer cache. 
For small I/Os (less than 16k), + * we have not historically allowed the write to bypass the UBC. + */ +#define MIN_DIRECT_WRITE_SIZE (16384) -#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * base) -#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE)) -#define MAX_PREFETCH(vp, io_size) (io_size * IO_SCALE(vp, 3)) +#define WRITE_THROTTLE 6 +#define WRITE_THROTTLE_SSD 2 +#define WRITE_BEHIND 1 +#define WRITE_BEHIND_SSD 1 + +#define PREFETCH 3 +#define PREFETCH_SSD 2 +uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a specluative read-ahead */ +uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead on SSDs*/ -int speculative_reads_disabled = 0; +#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base)) +#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE)) +#define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH))) + +int ignore_is_ssd = 0; +int speculative_reads_disabled = 0; /* * throttle the number of async writes that * can be outstanding on a single vnode * before we issue a synchronous write */ -#define HARD_THROTTLE_MAXCNT 0 -#define HARD_THROTTLE_MAXSIZE (32 * 1024) +#define THROTTLE_MAXCNT 0 -int hard_throttle_on_root = 0; -struct timeval priority_IO_timestamp_for_root; +uint32_t throttle_max_iosize = (128 * 1024); + +#define THROTTLE_MAX_IOSIZE (throttle_max_iosize) + +SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, ""); void @@ -234,14 +306,15 @@ cluster_init(void) { */ cl_mtx_attr = lck_attr_alloc_init(); - /* - * allocate and initialize mutex's used to protect updates and waits - * on the cluster_io context - */ - cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr); + cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr); - if (cl_mtxp == NULL) - panic("cluster_init: failed to 
allocate cl_mtxp"); + if (cl_transaction_mtxp == NULL) + panic("cluster_init: failed to allocate cl_transaction_mtxp"); + + lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr); + + for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) + LIST_INIT(&cl_direct_read_locks[i]); } @@ -267,19 +340,19 @@ cluster_max_io_size(mount_t mp, int type) maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt); break; } - if (segcnt > MAX_UPL_SIZE) { + if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) { /* * don't allow a size beyond the max UPL size we can create */ - segcnt = MAX_UPL_SIZE; + segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT; } max_io_size = min((segcnt * PAGE_SIZE), maxcnt); - if (max_io_size < (MAX_UPL_TRANSFER * PAGE_SIZE)) { + if (max_io_size < MAX_UPL_TRANSFER_BYTES) { /* * don't allow a size smaller than the old fixed limit */ - max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE); + max_io_size = MAX_UPL_TRANSFER_BYTES; } else { /* * make sure the size specified is a multiple of PAGE_SIZE @@ -397,7 +470,7 @@ cluster_get_wbp(vnode_t vp, int flags) static void -cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg) +cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags) { struct cl_writebehind *wbp; @@ -406,7 +479,7 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c if (wbp->cl_number) { lck_mtx_lock(&wbp->cl_lockw); - cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, callback, callback_arg); + cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL); lck_mtx_unlock(&wbp->cl_lockw); } @@ -414,44 +487,199 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c } +static int +cluster_io_present_in_BC(vnode_t vp, off_t f_offset) +{ + daddr64_t blkno; + size_t io_size; + int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block; + + if (bootcache_check_fn) { + if 
(VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL)) + return(0); + + if (io_size == 0) + return (0); + + if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) + return(1); + } + return(0); +} + + static int -cluster_hard_throttle_on(vnode_t vp, uint32_t hard_throttle) +cluster_is_throttled(vnode_t vp) +{ + return (throttle_io_will_be_throttled(-1, vp->v_mount)); +} + + +static void +cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name) +{ + + lck_mtx_lock(&iostate->io_mtxp); + + while ((iostate->io_issued - iostate->io_completed) > target) { + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate->io_issued, iostate->io_completed, target, 0, 0); + + iostate->io_wanted = 1; + msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate->io_issued, iostate->io_completed, target, 0, 0); + } + lck_mtx_unlock(&iostate->io_mtxp); +} + +static void cluster_handle_associated_upl(struct clios *iostate, upl_t upl, + upl_offset_t upl_offset, upl_size_t size) { - struct uthread *ut; + if (!size) + return; + + upl_t associated_upl = upl_associated_upl(upl); + + if (!associated_upl) + return; + +#if 0 + printf("1: %d %d\n", upl_offset, upl_offset + size); +#endif + + /* + * The associated UPL is page aligned to file offsets whereas the + * UPL it's attached to has different alignment requirements. The + * upl_offset that we have refers to @upl. The code that follows + * has to deal with the first and last pages in this transaction + * which might straddle pages in the associated UPL. To keep + * track of these pages, we use the mark bits: if the mark bit is + * set, we know another transaction has completed its part of that + * page and so we can unlock that page here. 
+ * + * The following illustrates what we have to deal with: + * + * MEM u <------------ 1 PAGE ------------> e + * +-------------+----------------------+----------------- + * | |######################|################# + * +-------------+----------------------+----------------- + * FILE | <--- a ---> o <------------ 1 PAGE ------------> + * + * So here we show a write to offset @o. The data that is to be + * written is in a buffer that is not page aligned; it has offset + * @a in the page. The upl that carries the data starts in memory + * at @u. The associated upl starts in the file at offset @o. A + * transaction will always end on a page boundary (like @e above) + * except for the very last transaction in the group. We cannot + * unlock the page at @o in the associated upl until both the + * transaction ending at @e and the following transaction (that + * starts at @e) has completed. + */ + + /* + * We record whether or not the two UPLs are aligned as the mark + * bit in the first page of @upl. + */ + upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl); + bool is_unaligned = upl_page_get_mark(pl, 0); + + if (is_unaligned) { + upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl); - if (hard_throttle) { - static struct timeval hard_throttle_maxelapsed = { 0, 200000 }; + upl_offset_t upl_end = upl_offset + size; + assert(upl_end >= PAGE_SIZE); - if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) { - struct timeval elapsed; + upl_size_t assoc_upl_size = upl_get_size(associated_upl); - if (hard_throttle_on_root) - return(1); + /* + * In the very first transaction in the group, upl_offset will + * not be page aligned, but after that it will be and in that + * case we want the preceding page in the associated UPL hence + * the minus one. 
+ */ + assert(upl_offset); + if (upl_offset) + upl_offset = trunc_page_32(upl_offset - 1); - microuptime(&elapsed); - timevalsub(&elapsed, &priority_IO_timestamp_for_root); + lck_mtx_lock_spin(&iostate->io_mtxp); - if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <)) - return(1); + // Look at the first page... + if (upl_offset + && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) { + /* + * The first page isn't marked so let another transaction + * completion handle it. + */ + upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true); + upl_offset += PAGE_SIZE; } - } - if (throttle_get_io_policy(&ut) == IOPOL_THROTTLE) { - if (throttle_io_will_be_throttled(-1, vp->v_mount)) { - return(1); + + // And now the last page... + + /* + * This needs to be > rather than >= because if it's equal, it + * means there's another transaction that is sharing the last + * page. + */ + if (upl_end > assoc_upl_size) + upl_end = assoc_upl_size; + else { + upl_end = trunc_page_32(upl_end); + const int last_pg = (upl_end >> PAGE_SHIFT) - 1; + + if (!upl_page_get_mark(assoc_pl, last_pg)) { + /* + * The last page isn't marked so mark the page and let another + * transaction completion handle it. + */ + upl_page_set_mark(assoc_pl, last_pg, true); + upl_end -= PAGE_SIZE; + } } + + lck_mtx_unlock(&iostate->io_mtxp); + +#if 0 + printf("2: %d %d\n", upl_offset, upl_end); +#endif + + if (upl_end <= upl_offset) + return; + + size = upl_end - upl_offset; + } else { + assert(!(upl_offset & PAGE_MASK)); + assert(!(size & PAGE_MASK)); } - return(0); -} + boolean_t empty; + + /* + * We can unlock these pages now and as this is for a + * direct/uncached write, we want to dump the pages too. 
+ */ + kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size, + UPL_ABORT_DUMP_PAGES, &empty); + + assert(!kr); + + if (!kr && empty) { + upl_set_associated_upl(upl, NULL); + upl_deallocate(associated_upl); + } +} static int -cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags) +cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp) { int upl_abort_code = 0; int page_in = 0; int page_out = 0; - if (io_flags & B_PHYS) + if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) /* * direct write of any flavor, or a direct read that wasn't aligned */ @@ -468,7 +696,7 @@ cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_fla * leave pages in the cache unchanged on error */ upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; - else if (page_out && (error != ENXIO)) + else if (page_out && ((error != ENXIO) || vnode_isswap(vp))) /* * transient error... leave pages unchanged */ @@ -502,40 +730,62 @@ cluster_iodone(buf_t bp, void *callback_arg) buf_t cbp_head; buf_t cbp_next; buf_t real_bp; + vnode_t vp; struct clios *iostate; boolean_t transaction_complete = FALSE; - cbp_head = (buf_t)(bp->b_trans_head); + __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head)); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START, cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); - for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { - /* - * all I/O requests that are part of this transaction - * have to complete before we can process it - */ - if ( !(cbp->b_flags & B_DONE)) { + if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) { + lck_mtx_lock_spin(cl_transaction_mtxp); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, - cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0); + bp->b_flags |= B_TDONE; - return 0; + for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { + /* + * all I/O requests that are part of this transaction + * have to complete before we can 
process it + */ + if ( !(cbp->b_flags & B_TDONE)) { + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0); + + lck_mtx_unlock(cl_transaction_mtxp); + + return 0; + } + + if (cbp->b_trans_next == CLUSTER_IO_WAITING) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0); + + lck_mtx_unlock(cl_transaction_mtxp); + wakeup(cbp); + + return 0; + } + + if (cbp->b_flags & B_EOT) + transaction_complete = TRUE; } - if (cbp->b_flags & B_EOT) - transaction_complete = TRUE; - } - if (transaction_complete == FALSE) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, - cbp_head, 0, 0, 0, 0); + lck_mtx_unlock(cl_transaction_mtxp); - return 0; + if (transaction_complete == FALSE) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + cbp_head, 0, 0, 0, 0); + return 0; + } } error = 0; total_size = 0; total_resid = 0; cbp = cbp_head; + vp = cbp->b_vp; upl_offset = cbp->b_uploffset; upl = cbp->b_upl; b_flags = cbp->b_flags; @@ -569,6 +819,14 @@ cluster_iodone(buf_t bp, void *callback_arg) cbp = cbp_next; } + + if (ISSET(b_flags, B_COMMIT_UPL)) { + cluster_handle_associated_upl(iostate, + cbp_head->b_upl, + upl_offset, + transaction_size); + } + if (error == 0 && total_resid) error = EIO; @@ -593,7 +851,7 @@ cluster_iodone(buf_t bp, void *callback_arg) * someone has issued multiple I/Os asynchrounsly * and is waiting for them to complete (streaming) */ - lck_mtx_lock_spin(cl_mtxp); + lck_mtx_lock_spin(&iostate->io_mtxp); if (error && iostate->io_error == 0) iostate->io_error = error; @@ -608,21 +866,20 @@ cluster_iodone(buf_t bp, void *callback_arg) iostate->io_wanted = 0; need_wakeup = 1; } - lck_mtx_unlock(cl_mtxp); + lck_mtx_unlock(&iostate->io_mtxp); if (need_wakeup) wakeup((caddr_t)&iostate->io_wanted); } if (b_flags & B_COMMIT_UPL) { - - pg_offset = upl_offset & PAGE_MASK; + pg_offset = upl_offset & PAGE_MASK; commit_size = (pg_offset + transaction_size + 
(PAGE_SIZE - 1)) & ~PAGE_MASK; if (error) - upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags); + upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp); else { - upl_flags = UPL_COMMIT_FREE_ON_EMPTY; + upl_flags = UPL_COMMIT_FREE_ON_EMPTY; if ((b_flags & B_PHYS) && (b_flags & B_READ)) upl_flags |= UPL_COMMIT_SET_DIRTY; @@ -633,7 +890,7 @@ cluster_iodone(buf_t bp, void *callback_arg) ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags); } } - if ((b_flags & B_NEED_IODONE) && real_bp) { + if (real_bp) { if (error) { real_bp->b_flags |= B_ERROR; real_bp->b_error = error; @@ -650,10 +907,10 @@ cluster_iodone(buf_t bp, void *callback_arg) uint32_t -cluster_hard_throttle_limit(vnode_t vp, uint32_t *limit, uint32_t hard_throttle) +cluster_throttle_io_limit(vnode_t vp, uint32_t *limit) { - if (cluster_hard_throttle_on(vp, hard_throttle)) { - *limit = HARD_THROTTLE_MAXSIZE; + if (cluster_is_throttled(vp)) { + *limit = THROTTLE_MAX_IOSIZE; return 1; } return 0; @@ -674,7 +931,7 @@ cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp) pl = ubc_upl_pageinfo(upl); if (upl_device_page(pl) == TRUE) { - zero_addr = ((addr64_t)upl_phys_page(pl, 0) << 12) + upl_offset; + zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset; bzero_phys_nc(zero_addr, size); } else { @@ -686,7 +943,7 @@ cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp) page_index = upl_offset / PAGE_SIZE; page_offset = upl_offset & PAGE_MASK; - zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset; + zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset; zero_cnt = min(PAGE_SIZE - page_offset, size); bzero_phys(zero_addr, zero_cnt); @@ -716,31 +973,53 @@ cluster_wait_IO(buf_t cbp_head, int async) buf_t cbp; if (async) { - /* - * async callback completion will not normally - * generate a wakeup upon I/O completion... 
- * by setting BL_WANTED, we will force a wakeup - * to occur as any outstanding I/Os complete... - * I/Os already completed will have BL_CALLDONE already - * set and we won't block in buf_biowait_callback.. - * note that we're actually waiting for the bp to have - * completed the callback function... only then - * can we safely take back ownership of the bp - * need the main buf mutex in order to safely - * update b_lflags + /* + * Async callback completion will not normally generate a + * wakeup upon I/O completion. To get woken up, we set + * b_trans_next (which is safe for us to modify) on the last + * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows + * to wake us up when all buffers as part of this transaction + * are completed. This is done under the umbrella of + * cl_transaction_mtxp which is also taken in cluster_iodone. */ - buf_list_lock(); + bool done = true; + buf_t last = NULL; - for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) - cbp->b_lflags |= BL_WANTED; + lck_mtx_lock_spin(cl_transaction_mtxp); - buf_list_unlock(); - } - for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { - if (async) - buf_biowait_callback(cbp); - else - buf_biowait(cbp); + for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) { + if (!ISSET(cbp->b_flags, B_TDONE)) + done = false; + } + + if (!done) { + last->b_trans_next = CLUSTER_IO_WAITING; + + DTRACE_IO1(wait__start, buf_t, last); + do { + msleep(last, cl_transaction_mtxp, PSPIN | (PRIBIO+1), "cluster_wait_IO", NULL); + + /* + * We should only have been woken up if all the + * buffers are completed, but just in case... 
+ */ + done = true; + for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) { + if (!ISSET(cbp->b_flags, B_TDONE)) { + done = false; + break; + } + } + } while (!done); + DTRACE_IO1(wait__done, buf_t, last); + + last->b_trans_next = NULL; + } + + lck_mtx_unlock(cl_transaction_mtxp); + } else { // !async + for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) + buf_biowait(cbp); } } @@ -749,6 +1028,7 @@ cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, i { buf_t cbp; int error; + boolean_t isswapout = FALSE; /* * cluster_complete_transaction will @@ -759,11 +1039,25 @@ cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, i for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) buf_biowait(cbp); } - error = cluster_iodone(*cbp_head, callback_arg); + /* + * we've already waited on all of the I/Os in this transaction, + * so mark all of the buf_t's in this transaction as B_TDONE + * so that cluster_iodone sees the transaction as completed + */ + for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) + cbp->b_flags |= B_TDONE; + cbp = *cbp_head; + + if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) + isswapout = TRUE; + + error = cluster_iodone(cbp, callback_arg); if ( !(flags & CL_ASYNC) && error && *retval == 0) { - if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) - *retval = error; + if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) + *retval = error; + else if (isswapout == TRUE) + *retval = error; } *cbp_head = (buf_t)NULL; } @@ -876,21 +1170,30 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no max_iosize = PAGE_SIZE; if (flags & CL_THROTTLE) { - if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp, 1)) { - if (max_iosize > HARD_THROTTLE_MAXSIZE) - max_iosize = HARD_THROTTLE_MAXSIZE; - async_throttle = HARD_THROTTLE_MAXCNT; + if ( !(flags & CL_PAGEOUT) && 
cluster_is_throttled(vp)) { + if (max_iosize > THROTTLE_MAX_IOSIZE) + max_iosize = THROTTLE_MAX_IOSIZE; + async_throttle = THROTTLE_MAXCNT; } else { if ( (flags & CL_DEV_MEMORY) ) async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE); else { u_int max_cluster; u_int max_cluster_size; - u_int max_prefetch; - - max_cluster_size = MAX_CLUSTER_SIZE(vp); - max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ)); + u_int scale; + + if (vp->v_mount->mnt_minsaturationbytecount) { + max_cluster_size = vp->v_mount->mnt_minsaturationbytecount; + + scale = 1; + } else { + max_cluster_size = MAX_CLUSTER_SIZE(vp); + if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) + scale = WRITE_THROTTLE_SSD; + else + scale = WRITE_THROTTLE; + } if (max_iosize > max_cluster_size) max_cluster = max_cluster_size; else @@ -898,8 +1201,11 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (size < max_cluster) max_cluster = size; - - async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), (max_prefetch / max_cluster) - 1); + + if (flags & CL_CLOSE) + scale += MAX_CLUSTERS; + + async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1); } } } @@ -911,12 +1217,15 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no io_flags |= B_IOSTREAMING; if (flags & CL_COMMIT) io_flags |= B_COMMIT_UPL; - if (flags & CL_PRESERVE) + if (flags & CL_DIRECT_IO) io_flags |= B_PHYS; - if (flags & CL_KEEPCACHED) - io_flags |= B_CACHE; + if (flags & (CL_PRESERVE | CL_KEEPCACHED)) + io_flags |= B_CACHE; if (flags & CL_PASSIVE) io_flags |= B_PASSIVE; + if (flags & CL_ENCRYPTED) + io_flags |= B_ENCRYPTED_IO; + if (vp->v_flag & VSYSTEM) io_flags |= B_META; @@ -929,7 +1238,37 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no * read in from the file */ zero_offset = upl_offset + non_rounded_size; + } else if (!ISSET(flags, CL_READ) && ISSET(flags, 
CL_DIRECT_IO)) { + assert(ISSET(flags, CL_COMMIT)); + + // For a direct/uncached write, we need to lock pages... + + upl_t cached_upl; + + /* + * Create a UPL to lock the pages in the cache whilst the + * write is in progress. + */ + ubc_create_upl(vp, f_offset, non_rounded_size, &cached_upl, + NULL, UPL_SET_LITE); + + /* + * Attach this UPL to the other UPL so that we can find it + * later. + */ + upl_set_associated_upl(upl, cached_upl); + + if (upl_offset & PAGE_MASK) { + /* + * The two UPLs are not aligned, so mark the first page in + * @upl so that cluster_handle_associated_upl can handle + * it accordingly. + */ + upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl); + upl_page_set_mark(pl, 0, true); + } } + while (size) { daddr64_t blkno; daddr64_t lblkno; @@ -973,7 +1312,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no off_t e_offset; int pageout_flags; - if(upl_get_internal_vectorupl(upl)) + if (upl_get_internal_vectorupl(upl)) panic("Vector UPLs should not take this code-path\n"); /* * we're writing into a 'hole' @@ -1008,7 +1347,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no * * go direct to vnode_pageout so that we don't have to * unbusy the page from the UPL... we used to do this - * so that we could call ubc_sync_range, but that results + * so that we could call ubc_msync, but that results * in a potential deadlock if someone else races us to acquire * that page and wins and in addition needs one of the pages * we're continuing to hold in the UPL @@ -1021,47 +1360,69 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no pageout_flags |= UPL_NOCOMMIT; if (cbp_head) { - buf_t last_cbp; + buf_t prev_cbp; + int bytes_in_last_page; /* * first we have to wait for the the current outstanding I/Os * to complete... 
EOT hasn't been set yet on this transaction - * so the pages won't be released just because all of the current - * I/O linked to this transaction has completed... + * so the pages won't be released */ cluster_wait_IO(cbp_head, (flags & CL_ASYNC)); - /* - * we've got a transcation that - * includes the page we're about to push out through vnode_pageout... - * find the last bp in the list which will be the one that - * includes the head of this page and round it's iosize down - * to a page boundary... - */ - for (last_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) - last_cbp = cbp; - - cbp->b_bcount &= ~PAGE_MASK; - - if (cbp->b_bcount == 0) { - /* - * this buf no longer has any I/O associated with it + bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK; + for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) + bytes_in_last_page += cbp->b_bcount; + bytes_in_last_page &= PAGE_MASK; + + while (bytes_in_last_page) { + /* + * we've got a transcation that + * includes the page we're about to push out through vnode_pageout... + * find the bp's in the list which intersect this page and either + * remove them entirely from the transaction (there could be multiple bp's), or + * round it's iosize down to the page boundary (there can only be one)... + * + * find the last bp in the list and act on it */ - free_io_buf(cbp); + for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) + prev_cbp = cbp; - if (cbp == cbp_head) { - /* - * the buf we just freed was the only buf in - * this transaction... so there's no I/O to do + if (bytes_in_last_page >= cbp->b_bcount) { + /* + * this buf no longer has any I/O associated with it */ - cbp_head = NULL; + bytes_in_last_page -= cbp->b_bcount; + cbp->b_bcount = 0; + + free_io_buf(cbp); + + if (cbp == cbp_head) { + assert(bytes_in_last_page == 0); + /* + * the buf we just freed was the only buf in + * this transaction... 
so there's no I/O to do + */ + cbp_head = NULL; + cbp_tail = NULL; + } else { + /* + * remove the buf we just freed from + * the transaction list + */ + prev_cbp->b_trans_next = NULL; + cbp_tail = prev_cbp; + } } else { - /* - * remove the buf we just freed from - * the transaction list + /* + * this is the last bp that has I/O + * intersecting the page of interest + * only some of the I/O is in the intersection + * so clip the size but keep it in the transaction list */ - last_cbp->b_trans_next = NULL; - cbp_tail = last_cbp; + cbp->b_bcount -= bytes_in_last_page; + cbp_tail = cbp; + bytes_in_last_page = 0; } } if (cbp_head) { @@ -1080,7 +1441,6 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) { error = EINVAL; - break; } e_offset = round_page_64(f_offset + 1); io_size = e_offset - f_offset; @@ -1109,9 +1469,14 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no */ size = 0; } + if (error) { + if (size == 0) + flags &= ~CL_COMMIT; + break; + } continue; } - lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64); + lblkno = (daddr64_t)(f_offset / 0x1000); /* * we have now figured out how much I/O we can do - this is in 'io_size' * pg_offset is the starting point in the first page for the I/O @@ -1206,6 +1571,10 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no commit_offset = upl_offset & ~PAGE_MASK; } + + // Associated UPL is currently only used in the direct write path + assert(!upl_associated_upl(upl)); + if ( (flags & CL_COMMIT) && pg_count) { ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE, UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY); @@ -1302,9 +1671,13 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (flags & CL_PAGEOUT) { u_int i; - for (i = 0; i < pg_count; i++) { - if 
(buf_invalblkno(vp, lblkno + i, 0) == EBUSY) - panic("BUSY bp found in cluster_io"); + /* + * since blocks are in offsets of 0x1000, scale + * iteration to (PAGE_SIZE * pg_count) of blks. + */ + for (i = 0; i < (PAGE_SIZE * pg_count)/0x1000; i++) { + if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) + panic("BUSY bp found in cluster_io"); } } if (flags & CL_ASYNC) { @@ -1313,6 +1686,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } cbp->b_cliodone = (void *)callback; cbp->b_flags |= io_flags; + if (flags & CL_NOCACHE) + cbp->b_attr.ba_flags |= BA_NOCACHE; cbp->b_lblkno = lblkno; cbp->b_blkno = blkno; @@ -1320,7 +1695,9 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (buf_setupl(cbp, upl, upl_offset)) panic("buf_setupl failed\n"); - +#if CONFIG_IOSCHED + upl_set_blkno(upl, upl_offset, io_size, blkno); +#endif cbp->b_trans_next = (buf_t)NULL; if ((cbp->b_iostate = (void *)iostate)) @@ -1346,10 +1723,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no cbp_head = cbp; cbp_tail = cbp; - if ( (cbp_head->b_real_bp = real_bp) ) { - cbp_head->b_flags |= B_NEED_IODONE; + if ( (cbp_head->b_real_bp = real_bp) ) real_bp = (buf_t)NULL; - } } *(buf_t *)(&cbp->b_trans_head) = cbp_head; @@ -1407,6 +1782,14 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if ( !(io_flags & B_READ)) vnode_startwrite(vp); + if (flags & CL_RAW_ENCRYPTED) { + /* + * User requested raw encrypted bytes. 
+ * Twiddle the bit in the ba_flags for the buffer + */ + cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO; + } + (void) VNOP_STRATEGY(cbp); if (need_EOT == TRUE) { @@ -1419,34 +1802,41 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } } if (error) { - int abort_size; + int abort_size; io_size = 0; - + if (cbp_head) { - /* - * first wait until all of the outstanding I/O - * for this partial transaction has completed - */ - cluster_wait_IO(cbp_head, (flags & CL_ASYNC)); + /* + * Wait until all of the outstanding I/O + * for this partial transaction has completed + */ + cluster_wait_IO(cbp_head, (flags & CL_ASYNC)); /* * Rewind the upl offset to the beginning of the * transaction. */ upl_offset = cbp_head->b_uploffset; + } - for (cbp = cbp_head; cbp;) { - buf_t cbp_next; - - size += cbp->b_bcount; - io_size += cbp->b_bcount; + if (ISSET(flags, CL_COMMIT)) { + cluster_handle_associated_upl(iostate, upl, upl_offset, + upl_end_offset - upl_offset); + } - cbp_next = cbp->b_trans_next; - free_io_buf(cbp); - cbp = cbp_next; - } + // Free all the IO buffers in this transaction + for (cbp = cbp_head; cbp;) { + buf_t cbp_next; + + size += cbp->b_bcount; + io_size += cbp->b_bcount; + + cbp_next = cbp->b_trans_next; + free_io_buf(cbp); + cbp = cbp_next; } + if (iostate) { int need_wakeup = 0; @@ -1455,7 +1845,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no * since we never really issued the io * just go ahead and adjust it back */ - lck_mtx_lock_spin(cl_mtxp); + lck_mtx_lock_spin(&iostate->io_mtxp); if (iostate->io_error == 0) iostate->io_error = error; @@ -1469,18 +1859,19 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no iostate->io_wanted = 0; need_wakeup = 1; } - lck_mtx_unlock(cl_mtxp); + lck_mtx_unlock(&iostate->io_mtxp); if (need_wakeup) wakeup((caddr_t)&iostate->io_wanted); } + if (flags & CL_COMMIT) { int upl_flags; - pg_offset = upl_offset & PAGE_MASK; + pg_offset = 
upl_offset & PAGE_MASK; abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK; - - upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags); + + upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE, upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0); @@ -1580,10 +1971,18 @@ cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct return; } - max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ)); + max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD)); - if (extent->e_addr < rap->cl_maxra) { - if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) { + if (max_prefetch > speculative_prefetch_max) + max_prefetch = speculative_prefetch_max; + + if (max_prefetch <= PAGE_SIZE) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, + rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0); + return; + } + if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) { + if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0); @@ -1643,18 +2042,7 @@ cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offs off_t max_size; int local_flags; - if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) - /* - * if we know we're issuing this I/O to a virtual device (i.e. disk image) - * then we don't want to enforce this throttle... if we do, we can - * potentially deadlock since we're stalling the pageout thread at a time - * when the disk image might need additional memory (which won't be available - * if the pageout thread can't run)... 
instead we'll just depend on the throttle - * that the pageout thread now has in place to deal with external files - */ - local_flags = CL_PAGEOUT; - else - local_flags = CL_PAGEOUT | CL_THROTTLE; + local_flags = CL_PAGEOUT | CL_THROTTLE; if ((flags & UPL_IOSYNC) == 0) local_flags |= CL_ASYNC; @@ -1662,6 +2050,8 @@ cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offs local_flags |= CL_COMMIT; if ((flags & UPL_KEEPCACHED)) local_flags |= CL_KEEPCACHED; + if (flags & UPL_PAGING_ENCRYPTED) + local_flags |= CL_ENCRYPTED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE, @@ -1738,6 +2128,8 @@ cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offse local_flags |= CL_COMMIT; if (flags & UPL_IOSTREAMING) local_flags |= CL_IOSTREAMING; + if (flags & UPL_PAGING_ENCRYPTED) + local_flags |= CL_ENCRYPTED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE, @@ -1831,9 +2223,10 @@ cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t else bflag = 0; - if (vp->v_flag & VNOCACHE_DATA) + if (vp->v_flag & VNOCACHE_DATA){ flags |= IO_NOCACHE; - + bflag |= CL_NOCACHE; + } if (uio == NULL) { /* * no user data... @@ -1845,12 +2238,12 @@ cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t } /* * do a write through the cache if one of the following is true.... 
- * NOCACHE is not true and + * NOCACHE is not true or NODIRECT is true * the uio request doesn't target USERSPACE * otherwise, find out if we want the direct or contig variant for * the first vector in the uio request */ - if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) + if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) @@ -1965,7 +2358,7 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in upl_size_t upl_size, vector_upl_size = 0; vm_size_t upl_needed_size; mach_msg_type_number_t pages_in_pl; - int upl_flags; + upl_control_flags_t upl_flags; kern_return_t kret; mach_msg_type_number_t i; int force_data_sync; @@ -1975,7 +2368,11 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in user_addr_t iov_base; u_int32_t mem_alignment_mask; u_int32_t devblocksize; + u_int32_t max_io_size; u_int32_t max_upl_size; + u_int32_t max_vector_size; + u_int32_t bytes_outstanding_limit; + boolean_t io_throttled = FALSE; u_int32_t vector_upl_iosize = 0; int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); @@ -1997,12 +2394,20 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in if (flags & IO_PASSIVE) io_flag |= CL_PASSIVE; + + if (flags & IO_NOCACHE) + io_flag |= CL_NOCACHE; + + if (flags & IO_SKIP_ENCRYPTION) + io_flag |= CL_ENCRYPTED; iostate.io_completed = 0; iostate.io_issued = 0; iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; @@ -2043,17 +2448,45 @@ next_dwrite: goto wait_for_dwrites; } + task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), 
TASK_WRITE_IMMEDIATE, vp); while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { + int throttle_type; + + if ( (throttle_type = cluster_is_throttled(vp)) ) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ + if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with EAGAIN to indicate + * to the FS issuing the cluster_write call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + io_throttled = TRUE; + goto wait_for_dwrites; + } + max_vector_size = THROTTLE_MAX_IOSIZE; + max_io_size = THROTTLE_MAX_IOSIZE; + } else { + max_vector_size = MAX_VECTOR_UPL_SIZE; + max_io_size = max_upl_size; + } if (first_IO) { - cluster_syncup(vp, newEOF, callback, callback_arg); + cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0); first_IO = 0; } io_size = io_req_size & ~PAGE_MASK; iov_base = uio_curriovbase(uio); - if (io_size > max_upl_size) - io_size = max_upl_size; + if (io_size > max_io_size) + io_size = max_io_size; if(useVectorUPL && (iov_base & PAGE_MASK)) { /* @@ -2078,13 +2511,15 @@ next_dwrite: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START, (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); + vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? 
current_map() : kernel_map; for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { pages_in_pl = 0; upl_size = upl_needed_size; upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | - UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; + UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE + | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE); - kret = vm_map_get_upl(current_map(), + kret = vm_map_get_upl(map, (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), &upl_size, &upl, @@ -2169,37 +2604,19 @@ next_dwrite: */ } - /* - * Now look for pages already in the cache - * and throw them away. - * uio->uio_offset is page aligned within the file - * io_size is a multiple of PAGE_SIZE - */ - ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL); - /* * we want push out these writes asynchronously so that we can overlap * the preparation of the next I/O * if there are already too many outstanding writes * wait until some complete before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (max_upl_size * IO_SCALE(vp, 2))) { - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, max_upl_size * IO_SCALE(vp, 2), 0, 0); + if (vp->v_mount->mnt_minsaturationbytecount) + bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount; + else + bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2); - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); + cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, max_upl_size * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier writes we issued ran into a hard 
error @@ -2233,7 +2650,7 @@ next_dwrite: vector_upl_iosize += io_size; vector_upl_size += upl_size; - if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= MAX_VECTOR_UPL_SIZE) { + if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } @@ -2279,33 +2696,24 @@ next_dwrite: wait_for_dwrites: - if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { + if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } + /* + * make sure all async writes issued as part of this stream + * have completed before we return + */ + cluster_iostate_wait(&iostate, 0, "cluster_write_direct"); - if (iostate.io_issued > iostate.io_completed) { - /* - * make sure all async writes issued as part of this stream - * have completed before we return - */ - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) retval = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + + if (io_throttled == TRUE && retval == 0) + retval = EAGAIN; + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -2342,7 +2750,7 @@ 
cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, upl_size_t upl_size; vm_size_t upl_needed_size; mach_msg_type_number_t pages_in_pl; - int upl_flags; + upl_control_flags_t upl_flags; kern_return_t kret; struct clios iostate; int error = 0; @@ -2358,7 +2766,7 @@ cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, * -- the io_req_size will not exceed iov_len * -- the target address is physically contiguous */ - cluster_syncup(vp, newEOF, callback, callback_arg); + cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0); devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; @@ -2368,6 +2776,8 @@ cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + next_cwrite: io_size = *write_length; @@ -2379,9 +2789,11 @@ next_cwrite: pages_in_pl = 0; upl_size = upl_needed_size; upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | - UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; + UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE + | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE); - kret = vm_map_get_upl(current_map(), + vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? 
current_map() : kernel_map; + kret = vm_map_get_upl(map, (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); @@ -2406,7 +2818,7 @@ next_cwrite: } pl = ubc_upl_pageinfo(upl[cur_upl]); - src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset; + src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset; while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { u_int32_t head_size; @@ -2456,22 +2868,8 @@ next_cwrite: * if there are already too many outstanding writes * wait until some have completed before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2))) { - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); + cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier writes we issued ran into a hard error @@ -2515,25 +2913,13 @@ wait_for_cwrites: * make sure all async writes that are part of this stream * have completed before we proceed */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); + cluster_iostate_wait(&iostate, 0, "cluster_write_contig"); - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 
1, "cluster_write_contig", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (error == 0 && tail_size) error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg); @@ -2608,6 +2994,9 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old off_t zero_off; long long zero_cnt1; off_t zero_off1; + off_t write_off = 0; + int write_cnt = 0; + boolean_t first_pass = FALSE; struct cl_extent cl; struct cl_writebehind *wbp; int bflag; @@ -2629,6 +3018,11 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old bflag = CL_PASSIVE; else bflag = 0; + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + + if (flags & IO_SKIP_ENCRYPTION) + bflag |= CL_ENCRYPTED; zero_cnt = 0; zero_cnt1 = 0; @@ -2689,7 +3083,16 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old retval, 0, 0, 0, 0); return (0); } - + if (uio) { + write_off = uio->uio_offset; + write_cnt = uio_resid(uio); + /* + * delay updating the sequential write info + * in the control block until we've obtained + * the lock for it + */ + first_pass = TRUE; + } while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) { /* * for this iteration of the loop, figure out where our starting point is @@ -2718,7 +3121,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old * because IO_HEADZEROFILL and IO_TAILZEROFILL not set */ if ((start_offset + total_size) > max_io_size) - total_size -= start_offset; + total_size = max_io_size - start_offset; xfer_resid = total_size; retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1); @@ -2984,7 +3387,7 @@ check_cluster: */ wbp->cl_number = 0; - sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, callback, 
callback_arg); + sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg); /* * no clusters of either type present at this point * so just go directly to start_new_cluster since @@ -2993,7 +3396,17 @@ check_cluster: * to avoid the deadlock with sparse_cluster_push */ goto start_new_cluster; - } + } + if (first_pass) { + if (write_off == wbp->cl_last_write) + wbp->cl_seq_written += write_cnt; + else + wbp->cl_seq_written = write_cnt; + + wbp->cl_last_write = write_off + write_cnt; + + first_pass = FALSE; + } if (wbp->cl_number == 0) /* * no clusters currently present @@ -3108,14 +3521,36 @@ check_cluster: */ goto delay_io; - if (wbp->cl_number < MAX_CLUSTERS) + if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && + wbp->cl_number == MAX_CLUSTERS && + wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { + uint32_t n; + + if (vp->v_mount->mnt_minsaturationbytecount) { + n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp); + + if (n > MAX_CLUSTERS) + n = MAX_CLUSTERS; + } else + n = 0; + + if (n == 0) { + if (vp->v_mount->mnt_kern_flag & MNTK_SSD) + n = WRITE_BEHIND_SSD; + else + n = WRITE_BEHIND; + } + while (n--) + cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL); + } + if (wbp->cl_number < MAX_CLUSTERS) { /* * we didn't find an existing cluster to * merge into, but there's room to start * a new one */ goto start_new_cluster; - + } /* * no exisitng cluster to merge with and no * room to start a new one... we'll try @@ -3133,7 +3568,7 @@ check_cluster: */ if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { - ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, callback, callback_arg); + ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 
0 : PUSH_DELAY, 0, callback, callback_arg, NULL); } /* @@ -3152,18 +3587,6 @@ check_cluster: continue; } - /* - * we pushed one cluster successfully, so we must be sequentially writing this file - * otherwise, we would have failed and fallen into the sparse cluster support - * so let's take the opportunity to push out additional clusters... - * this will give us better I/O locality if we're in a copy loop - * (i.e. we won't jump back and forth between the read and write points - */ - if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { - while (wbp->cl_number) - cluster_try_push(wbp, vp, newEOF, 0, callback, callback_arg); - } - start_new_cluster: wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; @@ -3225,15 +3648,24 @@ cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (* if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) flags |= IO_RAOFF; - /* + if (flags & IO_SKIP_ENCRYPTION) + flags |= IO_ENCRYPTED; + + /* * do a read through the cache if one of the following is true.... * NOCACHE is not true * the uio request doesn't target USERSPACE + * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well. + * Reading encrypted data from a CP filesystem should never result in the data touching + * the UBC. 
+ * * otherwise, find out if we want the direct or contig variant for * the first vector in the uio request */ - if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) - retval = cluster_io_type(uio, &read_type, &read_length, 0); + if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED) ) { + + retval = cluster_io_type(uio, &read_type, &read_length, 0); + } while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) { @@ -3318,21 +3750,19 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file struct cl_extent extent; int bflag; int take_reference = 1; - struct uthread *ut; int policy = IOPOL_DEFAULT; - + boolean_t iolock_inited = FALSE; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); + + if (flags & IO_ENCRYPTED) { + panic ("encrypted blocks will hit UBC!"); + } - policy = current_proc()->p_iopol_disk; + policy = throttle_get_io_policy(NULL); - ut = get_bsdthread_info(current_thread()); - - if (ut->uu_iopol_disk != IOPOL_DEFAULT) - policy = ut->uu_iopol_disk; - - if (policy == IOPOL_THROTTLE || (flags & IO_NOCACHE)) + if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) take_reference = 0; if (flags & IO_PASSIVE) @@ -3340,8 +3770,14 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file else bflag = 0; + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + + if (flags & IO_SKIP_ENCRYPTION) + bflag |= CL_ENCRYPTED; + max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); - max_prefetch = MAX_PREFETCH(vp, max_io_size); + max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD)); max_rd_size = max_prefetch; last_request_offset = uio->uio_offset + io_req_size; @@ -3353,14 +3789,16 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file rd_ahead_enabled = 0; rap = NULL; } 
else { - if (cluster_hard_throttle_on(vp, 1)) { + if (cluster_is_throttled(vp)) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ rd_ahead_enabled = 0; prefetch_enabled = 0; - max_rd_size = HARD_THROTTLE_MAXSIZE; - } else if (policy == IOPOL_THROTTLE) { - rd_ahead_enabled = 0; - prefetch_enabled = 0; + max_rd_size = THROTTLE_MAX_IOSIZE; } if ((rap = cluster_get_rap(vp)) == NULL) rd_ahead_enabled = 0; @@ -3440,7 +3878,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file io_requested = io_resid; - retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, last_ioread_offset == 0 ? take_reference : 0); + retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference); xsize = io_requested - io_resid; @@ -3479,6 +3917,30 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file */ max_size = filesize - uio->uio_offset; } + + iostate.io_completed = 0; + iostate.io_issued = 0; + iostate.io_error = 0; + iostate.io_wanted = 0; + + if ( (flags & IO_RETURN_ON_THROTTLE) ) { + if (cluster_is_throttled(vp) == THROTTLE_NOW) { + if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with EAGAIN to indicate + * to the FS issuing the cluster_read call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + retval = EAGAIN; + break; + } + } + } + /* * compute the size of the upl needed to encompass * the requested read... 
limit each call to cluster_io @@ -3499,8 +3961,13 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file if (upl_size > max_io_size) upl_size = max_io_size; } else { - if (upl_size > max_io_size / 4) + if (upl_size > max_io_size / 4) { upl_size = max_io_size / 4; + upl_size &= ~PAGE_MASK; + + if (upl_size == 0) + upl_size = PAGE_SIZE; + } } pages_in_upl = upl_size / PAGE_SIZE; @@ -3540,10 +4007,6 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file if (upl_valid_page(pl, last_pg)) break; } - iostate.io_completed = 0; - iostate.io_issued = 0; - iostate.io_error = 0; - iostate.io_wanted = 0; if (start_pg < last_pg) { /* @@ -3552,6 +4015,11 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file * we may have to clip the size of it to keep from reading past * the end of the last physical block associated with the file */ + if (iolock_inited == FALSE) { + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + + iolock_inited = TRUE; + } upl_offset = start_pg * PAGE_SIZE; io_size = (last_pg - start_pg) * PAGE_SIZE; @@ -3564,6 +4032,18 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg); + + if (rap) { + if (extent.e_addr < rap->cl_maxra) { + /* + * we've just issued a read for a block that should have been + * in the cache courtesy of the read-ahead engine... 
something + * has gone wrong with the pipeline, so reset the read-ahead + * logic which will cause us to restart from scratch + */ + rap->cl_maxra = 0; + } + } } if (error == 0) { /* @@ -3642,22 +4122,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file rap->cl_lastr = extent.e_addr; } } - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); + if (iolock_inited == TRUE) + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_copy", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; else { @@ -3669,6 +4136,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file io_req_size -= (val_size - io_requested); } + } else { + if (iolock_inited == TRUE) + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); } if (start_pg < last_pg) { /* @@ -3728,17 +4198,21 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file retval = error; if (io_req_size) { - if (cluster_hard_throttle_on(vp, 1)) { + if (cluster_is_throttled(vp)) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ rd_ahead_enabled = 0; prefetch_enabled = 0; - - max_rd_size = HARD_THROTTLE_MAXSIZE; + max_rd_size = THROTTLE_MAX_IOSIZE; } else { - if (max_rd_size == HARD_THROTTLE_MAXSIZE) { + if (max_rd_size == THROTTLE_MAX_IOSIZE) { /* * coming out of throttled state */ - if (policy != IOPOL_THROTTLE) { + if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) { if (rap != NULL) rd_ahead_enabled = 1; 
prefetch_enabled = 1; @@ -3749,6 +4223,19 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file } } } + if (iolock_inited == TRUE) { + /* + * cluster_io returned an error after it + * had already issued some I/O. we need + * to wait for that I/O to complete before + * we can destroy the iostate mutex... + * 'retval' already contains the early error + * so no need to pick it up from iostate.io_error + */ + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); + + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + } if (rap != NULL) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0); @@ -3762,6 +4249,72 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file return (retval); } +/* + * We don't want another read/write lock for every vnode in the system + * so we keep a hash of them here. There should never be very many of + * these around at any point in time. + */ +cl_direct_read_lock_t *cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type) +{ + struct cl_direct_read_locks *head + = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp) + % CL_DIRECT_READ_LOCK_BUCKETS]; + + struct cl_direct_read_lock *lck, *new_lck = NULL; + + for (;;) { + lck_spin_lock(&cl_direct_read_spin_lock); + + LIST_FOREACH(lck, head, chain) { + if (lck->vp == vp) { + ++lck->ref_count; + lck_spin_unlock(&cl_direct_read_spin_lock); + if (new_lck) { + // Someone beat us to it, ditch the allocation + lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp); + FREE(new_lck, M_TEMP); + } + lck_rw_lock(&lck->rw_lock, type); + return lck; + } + } + + if (new_lck) { + // Use the lock we allocated + LIST_INSERT_HEAD(head, new_lck, chain); + lck_spin_unlock(&cl_direct_read_spin_lock); + lck_rw_lock(&new_lck->rw_lock, type); + return new_lck; + } + + lck_spin_unlock(&cl_direct_read_spin_lock); + + // Allocate a new lock + MALLOC(new_lck, cl_direct_read_lock_t *, sizeof(*new_lck), + M_TEMP, 
M_WAITOK); + lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr); + new_lck->vp = vp; + new_lck->ref_count = 1; + + // Got to go round again + } +} + +void cluster_unlock_direct_read(cl_direct_read_lock_t *lck) +{ + lck_rw_done(&lck->rw_lock); + + lck_spin_lock(&cl_direct_read_spin_lock); + if (lck->ref_count == 1) { + LIST_REMOVE(lck, chain); + lck_spin_unlock(&cl_direct_read_spin_lock); + lck_rw_destroy(&lck->rw_lock, cl_mtx_grp); + FREE(lck, M_TEMP); + } else { + --lck->ref_count; + lck_spin_unlock(&cl_direct_read_spin_lock); + } +} static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, @@ -3774,13 +4327,12 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, upl_size_t upl_size, vector_upl_size = 0; vm_size_t upl_needed_size; unsigned int pages_in_pl; - int upl_flags; + upl_control_flags_t upl_flags; kern_return_t kret; unsigned int i; int force_data_sync; int retval = 0; int no_zero_fill = 0; - int abort_flag = 0; int io_flag = 0; int misaligned = 0; struct clios iostate; @@ -3796,12 +4348,20 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t max_upl_size; u_int32_t max_rd_size; u_int32_t max_rd_ahead; + u_int32_t max_vector_size; + boolean_t strict_uncached_IO = FALSE; + boolean_t io_throttled = FALSE; u_int32_t vector_upl_iosize = 0; int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); off_t v_upl_uio_offset = 0; int vector_upl_index=0; upl_t vector_upl = NULL; + cl_direct_read_lock_t *lock = NULL; + + user_addr_t orig_iov_base = 0; + user_addr_t last_iov_base = 0; + user_addr_t next_iov_base = 0; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); @@ -3812,14 +4372,28 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); io_flag = CL_COMMIT | CL_READ | CL_ASYNC | 
CL_NOZERO | CL_DIRECT_IO; + if (flags & IO_PASSIVE) io_flag |= CL_PASSIVE; + if (flags & IO_ENCRYPTED) { + io_flag |= CL_RAW_ENCRYPTED; + } + + if (flags & IO_NOCACHE) { + io_flag |= CL_NOCACHE; + } + + if (flags & IO_SKIP_ENCRYPTION) + io_flag |= CL_ENCRYPTED; + iostate.io_completed = 0; iostate.io_issued = 0; iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; @@ -3839,15 +4413,16 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, */ devblocksize = PAGE_SIZE; } + + strict_uncached_IO = ubc_strict_uncached_IO(vp); + + orig_iov_base = uio_curriovbase(uio); + last_iov_base = orig_iov_base; + next_dread: io_req_size = *read_length; iov_base = uio_curriovbase(uio); - max_io_size = filesize - uio->uio_offset; - - if ((off_t)io_req_size > max_io_size) - io_req_size = max_io_size; - offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1); offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; @@ -3865,7 +4440,25 @@ next_dread: * I/O that ends on a page boundary in cluster_io */ misaligned = 1; - } + } + + max_io_size = filesize - uio->uio_offset; + + /* + * The user must request IO in aligned chunks. If the + * offset into the file is bad, or the userland pointer + * is non-aligned, then we cannot service the encrypted IO request. + */ + if (flags & IO_ENCRYPTED) { + if (misaligned || (io_req_size & (devblocksize - 1))) + retval = EINVAL; + + max_io_size = roundup(max_io_size, devblocksize); + } + + if ((off_t)io_req_size > max_io_size) + io_req_size = max_io_size; + /* * When we get to this point, we know... 
* -- the offset into the file is on a devblocksize boundary @@ -3874,24 +4467,35 @@ next_dread: while (io_req_size && retval == 0) { u_int32_t io_start; - if (cluster_hard_throttle_on(vp, 1)) { - max_rd_size = HARD_THROTTLE_MAXSIZE; - max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1; + if (cluster_is_throttled(vp)) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ + max_rd_size = THROTTLE_MAX_IOSIZE; + max_rd_ahead = THROTTLE_MAX_IOSIZE - 1; + max_vector_size = THROTTLE_MAX_IOSIZE; } else { max_rd_size = max_upl_size; max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); + max_vector_size = MAX_VECTOR_UPL_SIZE; } io_start = io_size = io_req_size; /* * First look for pages already in the cache - * and move them to user space. + * and move them to user space. But only do this + * check if we are not retrieving encrypted data directly + * from the filesystem; those blocks should never + * be in the UBC. * * cluster_copy_ubc_data returns the resid * in io_size */ - retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); - + if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { + retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); + } /* * calculate the number of bytes actually copied * starting size - residual @@ -3922,9 +4526,14 @@ next_dread: } /* - * check to see if we are finished with this request... + * check to see if we are finished with this request. + * + * If we satisfied this IO already, then io_req_size will be 0. + * Otherwise, see if the IO was mis-aligned and needs to go through + * the UBC to deal with the 'tail'. 
+ * */ - if (io_req_size == 0 || misaligned) { + if (io_req_size == 0 || (misaligned)) { /* * see if there's another uio vector to * process that's of type IO_DIRECT @@ -3951,11 +4560,12 @@ next_dread: * get at the overhang bytes */ if (io_size & (devblocksize - 1)) { - /* - * request does NOT end on a device block boundary - * so clip it back to a PAGE_SIZE boundary + assert(!(flags & IO_ENCRYPTED)); + /* + * Clip the request to the previous page size boundary + * since request does NOT end on a device block boundary */ - io_size &= ~PAGE_MASK; + io_size &= ~PAGE_MASK; io_min = PAGE_SIZE; } if (retval || io_size < io_min) { @@ -3968,21 +4578,62 @@ next_dread: */ goto wait_for_dreads; } - if ((xsize = io_size) > max_rd_size) - xsize = max_rd_size; - io_size = 0; + /* + * Don't re-check the UBC data if we are looking for uncached IO + * or asking for encrypted blocks. + */ + if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { - ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); + if ((xsize = io_size) > max_rd_size) + xsize = max_rd_size; - if (io_size == 0) { - /* - * a page must have just come into the cache - * since the first page in this range is no - * longer absent, go back and re-evaluate - */ - continue; + io_size = 0; + + if (!lock) { + /* + * We hold a lock here between the time we check the + * cache and the time we issue I/O. This saves us + * from having to lock the pages in the cache. Not + * all clients will care about this lock but some + * clients may want to guarantee stability between + * here and when the I/O is issued in which case they + * will take the lock exclusively. 
+ */ + lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED); + } + + ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); + + if (io_size == 0) { + /* + * a page must have just come into the cache + * since the first page in this range is no + * longer absent, go back and re-evaluate + */ + continue; + } + } + if ( (flags & IO_RETURN_ON_THROTTLE) ) { + if (cluster_is_throttled(vp) == THROTTLE_NOW) { + if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with EAGAIN to indicate + * to the FS issuing the cluster_read call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + io_throttled = TRUE; + goto wait_for_dreads; + } + } } + if (io_size > max_rd_size) + io_size = max_rd_size; + iov_base = uio_curriovbase(uio); upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); @@ -3991,24 +4642,23 @@ next_dread: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START, (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); - if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) { + if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) no_zero_fill = 1; - abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY; - } else { + else no_zero_fill = 0; - abort_flag = UPL_ABORT_FREE_ON_EMPTY; - } + + vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? 
current_map() : kernel_map; for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { pages_in_pl = 0; upl_size = upl_needed_size; - upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; - + upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE + | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE); if (no_zero_fill) upl_flags |= UPL_NOZEROFILL; if (force_data_sync) upl_flags |= UPL_FORCE_DATA_SYNC; - kret = vm_map_create_upl(current_map(), + kret = vm_map_create_upl(map, (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), &upl_size, &upl, NULL, &pages_in_pl, &upl_flags); @@ -4028,13 +4678,13 @@ next_dread: pl = UPL_GET_INTERNAL_PAGE_LIST(upl); for (i = 0; i < pages_in_pl; i++) { - if (!upl_valid_page(pl, i)) + if (!upl_page_present(pl, i)) break; } if (i == pages_in_pl) break; - ubc_upl_abort(upl, abort_flag); + ubc_upl_abort(upl, 0); } if (force_data_sync >= 3) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, @@ -4052,7 +4702,7 @@ next_dread: io_size = 0; } if (io_size == 0) { - ubc_upl_abort(upl, abort_flag); + ubc_upl_abort(upl, 0); goto wait_for_dreads; } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, @@ -4076,22 +4726,8 @@ next_dread: * if there are already too many outstanding reads * wait until some have completed before issuing the next read */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); + cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct"); - while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if 
(iostate.io_error) { /* * one of the earlier reads we issued ran into a hard error @@ -4100,14 +4736,13 @@ next_dread: * go wait for any other reads to complete before * returning the error to the caller */ - ubc_upl_abort(upl, abort_flag); + ubc_upl_abort(upl, 0); goto wait_for_dreads; } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START, upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0); - if(!useVectorUPL) { if (no_zero_fill) io_flag &= ~CL_PRESERVE; @@ -4130,15 +4765,28 @@ next_dread: vector_upl_size += upl_size; vector_upl_iosize += io_size; - if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= MAX_VECTOR_UPL_SIZE) { + if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } - } + } + last_iov_base = iov_base + io_size; + + if (lock) { + // We don't need to wait for the I/O to complete + cluster_unlock_direct_read(lock); + lock = NULL; + } + /* * update the uio structure */ - uio_update(uio, (user_size_t)io_size); + if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) { + uio_update(uio, (user_size_t)max_io_size); + } + else { + uio_update(uio, (user_size_t)io_size); + } io_req_size -= io_size; @@ -4166,28 +4814,33 @@ wait_for_dreads: retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } + + // We don't need to wait for the I/O to complete + if (lock) + cluster_unlock_direct_read(lock); + /* * make sure all async reads that are part of this stream * have completed before we return */ - if (iostate.io_issued > iostate.io_completed) { + cluster_iostate_wait(&iostate, 0, "cluster_read_direct"); - lck_mtx_lock(cl_mtxp); + if (iostate.io_error) + retval = 
iostate.io_error; - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); + if (io_throttled == TRUE && retval == 0) + retval = EAGAIN; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); + for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) { + /* + * This is specifically done for pmap accounting purposes. + * vm_pre_fault() will call vm_fault() to enter the page into + * the pmap if there isn't _a_ physical page for that VA already. + */ + vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK)); } - if (iostate.io_error) - retval = iostate.io_error; if (io_req_size && retval == 0) { /* @@ -4218,7 +4871,7 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, upl_size_t upl_size; vm_size_t upl_needed_size; mach_msg_type_number_t pages_in_pl; - int upl_flags; + upl_control_flags_t upl_flags; kern_return_t kret; struct clios iostate; int error= 0; @@ -4236,13 +4889,16 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, bflag = CL_PASSIVE; else bflag = 0; - + + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + /* * When we enter this routine, we know * -- the read_length will not exceed the current iov_len * -- the target address is physically contiguous for read_length */ - cluster_syncup(vp, filesize, callback, callback_arg); + cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC); devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; @@ -4252,6 +4908,8 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int 
*read_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + next_cread: io_size = *read_length; @@ -4267,13 +4925,15 @@ next_cread: pages_in_pl = 0; upl_size = upl_needed_size; - upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; + upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE + | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START, (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0); - kret = vm_map_get_upl(current_map(), + vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map; + kret = vm_map_get_upl(map, (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); @@ -4298,7 +4958,7 @@ next_cread: } pl = ubc_upl_pageinfo(upl[cur_upl]); - dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset; + dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset; while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { u_int32_t head_size; @@ -4349,21 +5009,8 @@ next_cread: * if there are already too many outstanding reads * wait until some have completed before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - lck_mtx_lock(cl_mtxp); + cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig"); - while ((iostate.io_issued - iostate.io_completed) > (MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2))) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, 
iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier reads we issued ran into a hard error @@ -4404,25 +5051,13 @@ wait_for_creads: * make sure all async reads that are part of this stream * have completed before we proceed */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); + cluster_iostate_wait(&iostate, 0, "cluster_read_contig"); - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (error == 0 && tail_size) error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg); @@ -4444,7 +5079,7 @@ cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t m user_addr_t iov_base = 0; upl_t upl; upl_size_t upl_size; - int upl_flags; + upl_control_flags_t upl_flags; int retval = 0; /* @@ -4468,9 +5103,10 @@ cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t m else upl_size = (u_int32_t)iov_len; - upl_flags = UPL_QUERY_OBJECT_TYPE; - - if ((vm_map_get_upl(current_map(), + upl_flags = UPL_QUERY_OBJECT_TYPE | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE); + + vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? 
current_map() : kernel_map; + if ((vm_map_get_upl(map, (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) { /* @@ -4541,6 +5177,11 @@ advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*c max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); + if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) { + if (max_io_size > speculative_prefetch_max_iosize) + max_io_size = speculative_prefetch_max_iosize; + } + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START, (int)f_offset, resid, (int)filesize, 0, 0); @@ -4696,13 +5337,23 @@ cluster_push(vnode_t vp, int flags) int cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg) +{ + return cluster_push_err(vp, flags, callback, callback_arg, NULL); +} + +/* write errors via err, but return the number of clusters written */ +int +cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err) { int retval; int my_sparse_wait = 0; struct cl_writebehind *wbp; + if (err) + *err = 0; + if ( !UBCINFOEXISTS(vp)) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -1, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0); return (0); } /* return if deferred write is set */ @@ -4710,13 +5361,13 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca return (0); } if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -2, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0); return (0); } - if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) { + if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) { lck_mtx_unlock(&wbp->cl_lockw); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -3, 0); + 
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0); return(0); } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START, @@ -4730,11 +5381,11 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca * in the sparse map case */ while (wbp->cl_sparse_wait) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, vp, 0, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0); msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, vp, 0, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0); } if (flags & IO_SYNC) { my_sparse_wait = 1; @@ -4747,11 +5398,11 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca * fsync actually get cleaned to the disk before this fsync returns */ while (wbp->cl_sparse_pushes) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, vp, 0, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0); msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, vp, 0, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0); } } if (wbp->cl_scmap) { @@ -4766,7 +5417,7 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca lck_mtx_unlock(&wbp->cl_lockw); - sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + retval = sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); lck_mtx_lock(&wbp->cl_lockw); @@ -4775,11 +5426,13 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) 
wakeup((caddr_t)&wbp->cl_sparse_pushes); } else { - sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + retval = sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); } + if (err) + *err = retval; retval = 1; - } else { - retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + } else { + retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, err); } lck_mtx_unlock(&wbp->cl_lockw); @@ -4840,7 +5493,7 @@ cluster_release(struct ubc_info *ubc) static int -cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) +cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err) { int cl_index; int cl_index1; @@ -4849,7 +5502,7 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla int cl_pushed = 0; struct cl_wextent l_clusters[MAX_CLUSTERS]; u_int max_cluster_pgcount; - + int error = 0; max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; /* @@ -4890,7 +5543,9 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla cl_len = cl_index; - if ( (push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) { + /* skip switching to the sparse cluster mechanism if on diskimage */ + if ( ((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) && + !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) ) { int i; /* @@ -4922,16 +5577,17 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla for (cl_index = 0; cl_index < cl_len; cl_index++) { int flags; struct cl_extent cl; + int retval; + + flags = io_flags & (IO_PASSIVE|IO_CLOSE); /* * try to push each cluster in turn... 
*/ if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) - flags = IO_NOCACHE; - else - flags = 0; + flags |= IO_NOCACHE; - if ((l_clusters[cl_index].io_flags & CLW_IOPASSIVE) || (push_flag & IO_PASSIVE)) + if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) flags |= IO_PASSIVE; if (push_flag & PUSH_SYNC) @@ -4940,7 +5596,10 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla cl.b_addr = l_clusters[cl_index].b_addr; cl.e_addr = l_clusters[cl_index].e_addr; - cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg); + retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg); + + if (error == 0 && retval) + error = retval; l_clusters[cl_index].b_addr = 0; l_clusters[cl_index].e_addr = 0; @@ -4950,6 +5609,9 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla if ( !(push_flag & PUSH_ALL) ) break; } + if (err) + *err = error; + dont_try: if (cl_len > cl_pushed) { /* @@ -5036,9 +5698,12 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c kern_return_t kret; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; + + if (flags & IO_SKIP_ENCRYPTION) + bflag |= CL_ENCRYPTED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); @@ -5165,6 +5830,12 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c if ( !(flags & IO_SYNC)) io_flags |= CL_ASYNC; + if (flags & IO_CLOSE) + io_flags |= CL_CLOSE; + + if (flags & IO_NOCACHE) + io_flags |= CL_NOCACHE; + retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); @@ -5187,7 +5858,7 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c { int cl_index; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, vp, wbp->cl_scmap, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 
78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0); for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { int flags; @@ -5206,7 +5877,7 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c } wbp->cl_number = 0; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, vp, wbp->cl_scmap, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0); } @@ -5215,31 +5886,37 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c * still associated with the write-behind context... however, if the scmap has been disassociated * from the write-behind context (the cluster_push case), the wb lock is not held */ -static void -sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) +static int +sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) { struct cl_extent cl; off_t offset; u_int length; + int error = 0; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, vp, (*scmap), 0, push_flag, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0); if (push_flag & PUSH_ALL) vfs_drt_control(scmap, 1); for (;;) { + int retval; if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) break; cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); - cluster_push_now(vp, &cl, EOF, push_flag & IO_PASSIVE, callback, callback_arg); + retval = cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg); + if (error == 0 && retval) + error = retval; if ( !(push_flag & PUSH_ALL) ) break; } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0); + 
+ return error; } @@ -5264,12 +5941,12 @@ sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, in * only a partial update was done * push out some pages and try again */ - sparse_cluster_push(scmap, vp, EOF, 0, callback, callback_arg); + sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg); offset += (new_dirty * PAGE_SIZE_64); length -= (new_dirty * PAGE_SIZE); } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0); } @@ -5287,9 +5964,12 @@ cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t int bflag; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; + + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; upl_flags = UPL_SET_LITE; @@ -5331,7 +6011,7 @@ cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t } did_read = 1; } - ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64); + ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64); /* * NOTE: There is no prototype for the following in BSD. 
It, and the definitions @@ -5366,8 +6046,6 @@ cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t return (error); } - - int cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) { @@ -5378,6 +6056,7 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) int retval = 0; int xsize; upl_page_info_t *pl; + int dirty_count; xsize = *io_resid; @@ -5414,10 +6093,13 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) pg_offset = upl_offset & PAGE_MASK; csize = min(PAGE_SIZE - pg_offset, xsize); + dirty_count = 0; while (xsize && retval == 0) { addr64_t paddr; - paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset; + paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset; + if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) + dirty_count++; retval = uiomove64(paddr, csize, uio); @@ -5430,9 +6112,10 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) uio->uio_segflg = segflg; + task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl)); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, (int)uio->uio_offset, xsize, retval, segflg, 0); - + return (retval); } @@ -5458,7 +6141,7 @@ cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int m io_size = *io_resid; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, - (int)uio->uio_offset, 0, io_size, 0, 0); + (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0); control = ubc_getobject(vp, UBC_FLAGS_NONE); @@ -5551,15 +6234,15 @@ is_file_clean(vnode_t vp, off_t filesize) * single hashtable entry. Each hashtable entry is aligned to this * size within the file. */ -#define DRT_BITVECTOR_PAGES 256 +#define DRT_BITVECTOR_PAGES ((1024 * 1024) / PAGE_SIZE) /* * File offset handling. 
* - * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES; - * the correct formula is (~(DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1) + * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES; + * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)) */ -#define DRT_ADDRESS_MASK (~((1 << 20) - 1)) +#define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)) #define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK) /* @@ -5614,6 +6297,14 @@ is_file_clean(vnode_t vp, off_t filesize) #define DRT_HASH_SMALL_MODULUS 23 #define DRT_HASH_LARGE_MODULUS 401 +/* + * Physical memory required before the large hash modulus is permitted. + * + * On small memory systems, the large hash modulus can lead to physical + * memory starvation, so we avoid using it there. + */ +#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */ + #define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */ #define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */ @@ -5649,7 +6340,15 @@ is_file_clean(vnode_t vp, off_t filesize) */ struct vfs_drt_hashentry { u_int64_t dhe_control; - u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32]; +/* +* dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32]; +* DRT_BITVECTOR_PAGES is defined as ((1024 * 1024) / PAGE_SIZE) +* Since PAGE_SIZE is only known at boot time, +* -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k) +* -declare dhe_bitvector array for largest possible length +*/ +#define MAX_DRT_BITVECTOR_PAGES (1024 * 1024)/( 4 * 1024) + u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES/32]; }; /* @@ -5756,8 +6455,12 @@ vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) * see whether we should grow to the large one. */ if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) { - /* if the ring is nearly full */ - if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) { + /* + * If the ring is nearly full and we are allowed to + * use the large modulus, upgrade.
+ */ + if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) && + (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) { nsize = DRT_HASH_LARGE_MODULUS; } else { nsize = DRT_HASH_SMALL_MODULUS; @@ -5781,7 +6484,7 @@ vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) */ kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, - (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); + (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION, VM_KERN_MEMORY_FILE); if (kret != KERN_SUCCESS) return(kret); cmap->scm_magic = DRT_SCM_MAGIC;