X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/b0d623f7f2ae71ed96e60569f61f9a9a27016e80..bd504ef0e0b883cdd7917b73b3574eb9ce669905:/bsd/vfs/vfs_cluster.c diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 5aec1498a..69dfdfda3 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -82,6 +83,7 @@ #include #include #include +#include #include #include @@ -90,6 +92,8 @@ #include #include +#include + #if 0 #undef KERNEL_DEBUG #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT @@ -111,6 +115,10 @@ #define CL_DIRECT_IO 0x1000 #define CL_PASSIVE 0x2000 #define CL_IOSTREAMING 0x4000 +#define CL_CLOSE 0x8000 +#define CL_ENCRYPTED 0x10000 +#define CL_RAW_ENCRYPTED 0x20000 +#define CL_NOCACHE 0x40000 #define MAX_VECTOR_UPL_ELEMENTS 8 #define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE) * PAGE_SIZE @@ -122,6 +130,7 @@ extern void vector_upl_set_pagelist(upl_t); extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t); struct clios { + lck_mtx_t io_mtxp; u_int io_completed; /* amount of io that has currently completed */ u_int io_issued; /* amount of io that was successfully issued */ int io_error; /* error code of first error encountered */ @@ -131,7 +140,7 @@ struct clios { static lck_grp_t *cl_mtx_grp; static lck_attr_t *cl_mtx_attr; static lck_grp_attr_t *cl_mtx_grp_attr; -static lck_mtx_t *cl_mtxp; +static lck_mtx_t *cl_transaction_mtxp; #define IO_UNKNOWN 0 @@ -156,6 +165,8 @@ static int cluster_iodone(buf_t bp, void *callback_arg); static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags); static int cluster_hard_throttle_on(vnode_t vp, uint32_t); +static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name); + static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg); static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference); @@ -182,10 +193,10 @@ static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t files static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg); -static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg); +static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg); static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg); -static void sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg); +static void sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg); static void sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg); static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp); @@ -193,6 +204,30 @@ static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *le static kern_return_t vfs_drt_control(void **cmapp, int op_type); +/* + * For throttled IO to check whether + * a block is cached by the boot cache + * and thus it can avoid delaying the IO. + * + * bootcache_contains_block is initially + * NULL. 
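The hook being described here reduces to a published function pointer that callers snapshot into a local before testing and calling, so a concurrent clear of the global cannot strand them between the NULL check and the call. A minimal, compilable userspace model of that pattern (all names here are illustrative, not from the patch):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t blkno_t;

/* NULL until the (hypothetical) cache component registers itself */
static int (*cache_contains_block)(blkno_t blkno) = NULL;

static int demo_cache_lookup(blkno_t blkno)
{
        return (blkno % 2) == 0;        /* pretend even blocks are cached */
}

static int io_present_in_cache(blkno_t blkno)
{
        /* snapshot: the global may be cleared at any moment */
        int (*check_fn)(blkno_t) = cache_contains_block;

        if (check_fn && check_fn(blkno))
                return 1;
        return 0;
}

int main(void)
{
        cache_contains_block = demo_cache_lookup;       /* cache activates */
        printf("blk 8 cached: %d\n", io_present_in_cache(8));
        cache_contains_block = NULL;                    /* cache jettisoned */
        printf("blk 8 cached: %d\n", io_present_in_cache(8));
        return 0;
}

This is why the comment below notes the function must remain callable after the cache is evicted: a caller's snapshot may outlive the clear.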
The BootCache will set it while + * the cache is active and clear it when + * the cache is jettisoned. + * + * Returns 0 if the block is not + * contained in the cache, 1 if it is + * contained. + * + * The function pointer remains valid + * after the cache has been evicted even + * if bootcache_contains_block has been + * cleared. + * + * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs + */ +int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL; + + /* * limit the internal I/O size so that we * can represent it in a 32 bit int @@ -202,12 +237,30 @@ static kern_return_t vfs_drt_control(void **cmapp, int op_type); #define MAX_VECTS 16 #define MIN_DIRECT_WRITE_SIZE (4 * PAGE_SIZE) -#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * base) -#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE)) -#define MAX_PREFETCH(vp, io_size) (io_size * IO_SCALE(vp, 3)) +#define WRITE_THROTTLE 6 +#define WRITE_THROTTLE_SSD 2 +#define WRITE_BEHIND 1 +#define WRITE_BEHIND_SSD 1 +#if CONFIG_EMBEDDED +#define PREFETCH 1 +#define PREFETCH_SSD 1 +uint32_t speculative_prefetch_max = 512; /* maximum number of pages to use for a specluative read-ahead */ +uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use for a specluative read-ahead */ +#else +#define PREFETCH 3 +#define PREFETCH_SSD 1 +uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3); +uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use for a specluative read-ahead on SSDs*/ +#endif + + +#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base)) +#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE)) +#define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH))) -int speculative_reads_disabled = 0; +int ignore_is_ssd = 0; +int speculative_reads_disabled = 0; /* * throttle the number of async writes that @@ -215,11 +268,25 @@ int speculative_reads_disabled = 0; * before we issue a synchronous write */ #define HARD_THROTTLE_MAXCNT 0 -#define HARD_THROTTLE_MAXSIZE (32 * 1024) +#define HARD_THROTTLE_MAX_IOSIZE (128 * 1024) +#define LEGACY_HARD_THROTTLE_MAX_IOSIZE (512 * 1024) +extern int32_t throttle_legacy_process_count; int hard_throttle_on_root = 0; +uint32_t hard_throttle_max_iosize = HARD_THROTTLE_MAX_IOSIZE; +uint32_t legacy_hard_throttle_max_iosize = LEGACY_HARD_THROTTLE_MAX_IOSIZE; struct timeval priority_IO_timestamp_for_root; +#if CONFIG_EMBEDDED +#define THROTTLE_MAX_IOSIZE (hard_throttle_max_iosize) +#else +#define THROTTLE_MAX_IOSIZE (throttle_legacy_process_count == 0 ? 
hard_throttle_max_iosize : legacy_hard_throttle_max_iosize) +#endif + + +SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &hard_throttle_max_iosize, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_hard_throttle_max_iosize, 0, ""); + void cluster_init(void) { @@ -234,14 +301,10 @@ cluster_init(void) { */ cl_mtx_attr = lck_attr_alloc_init(); - /* - * allocate and initialize mutex's used to protect updates and waits - * on the cluster_io context - */ - cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr); + cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr); - if (cl_mtxp == NULL) - panic("cluster_init: failed to allocate cl_mtxp"); + if (cl_transaction_mtxp == NULL) + panic("cluster_init: failed to allocate cl_transaction_mtxp"); } @@ -406,7 +469,7 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c if (wbp->cl_number) { lck_mtx_lock(&wbp->cl_lockw); - cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, callback, callback_arg); + cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, 0, callback, callback_arg); lck_mtx_unlock(&wbp->cl_lockw); } @@ -414,36 +477,73 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c } +static int +cluster_io_present_in_BC(vnode_t vp, off_t f_offset) +{ + daddr64_t blkno; + size_t io_size; + int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block; + + if (bootcache_check_fn) { + if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL)) + return(0); + + if (io_size == 0) + return (0); + + if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) + return(1); + } + return(0); +} + + static int cluster_hard_throttle_on(vnode_t vp, uint32_t hard_throttle) { - struct uthread *ut; + int throttle_type = 0; - if (hard_throttle) { - static struct timeval hard_throttle_maxelapsed = { 0, 200000 }; + if ( (throttle_type = throttle_io_will_be_throttled(-1, vp->v_mount)) ) + return(throttle_type); - if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) { - struct timeval elapsed; + if (hard_throttle && (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) { + static struct timeval hard_throttle_maxelapsed = { 0, 100000 }; + struct timeval elapsed; - if (hard_throttle_on_root) - return(1); + if (hard_throttle_on_root) + return(1); - microuptime(&elapsed); - timevalsub(&elapsed, &priority_IO_timestamp_for_root); + microuptime(&elapsed); + timevalsub(&elapsed, &priority_IO_timestamp_for_root); - if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <)) - return(1); - } - } - if (throttle_get_io_policy(&ut) == IOPOL_THROTTLE) { - if (throttle_io_will_be_throttled(-1, vp->v_mount)) { + if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <)) return(1); - } } return(0); } +static void +cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name) +{ + + lck_mtx_lock(&iostate->io_mtxp); + + while ((iostate->io_issued - iostate->io_completed) > target) { + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate->io_issued, iostate->io_completed, target, 0, 0); + + iostate->io_wanted = 1; + msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate->io_issued, iostate->io_completed, target, 0, 0); + } + lck_mtx_unlock(&iostate->io_mtxp); +} + + static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int 
io_flags) { @@ -451,7 +551,7 @@ cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_fla int page_in = 0; int page_out = 0; - if (io_flags & B_PHYS) + if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) /* * direct write of any flavor, or a direct read that wasn't aligned */ @@ -510,26 +610,47 @@ cluster_iodone(buf_t bp, void *callback_arg) KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START, cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); - for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { - /* - * all I/O requests that are part of this transaction - * have to complete before we can process it - */ - if ( !(cbp->b_flags & B_DONE)) { + if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) { + boolean_t need_wakeup = FALSE; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, - cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0); + lck_mtx_lock_spin(cl_transaction_mtxp); - return 0; + bp->b_flags |= B_TDONE; + + if (bp->b_flags & B_TWANTED) { + CLR(bp->b_flags, B_TWANTED); + need_wakeup = TRUE; } - if (cbp->b_flags & B_EOT) - transaction_complete = TRUE; - } - if (transaction_complete == FALSE) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, - cbp_head, 0, 0, 0, 0); + for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { + /* + * all I/O requests that are part of this transaction + * have to complete before we can process it + */ + if ( !(cbp->b_flags & B_TDONE)) { + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0); + + lck_mtx_unlock(cl_transaction_mtxp); + + if (need_wakeup == TRUE) + wakeup(bp); + + return 0; + } + if (cbp->b_flags & B_EOT) + transaction_complete = TRUE; + } + lck_mtx_unlock(cl_transaction_mtxp); - return 0; + if (need_wakeup == TRUE) + wakeup(bp); + + if (transaction_complete == FALSE) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + cbp_head, 0, 0, 0, 0); + return 0; + } } error = 0; total_size = 0; @@ -593,7 +714,7 @@ cluster_iodone(buf_t bp, void *callback_arg) * someone has issued multiple I/Os asynchrounsly * and is waiting for them to complete (streaming) */ - lck_mtx_lock_spin(cl_mtxp); + lck_mtx_lock_spin(&iostate->io_mtxp); if (error && iostate->io_error == 0) iostate->io_error = error; @@ -608,7 +729,7 @@ cluster_iodone(buf_t bp, void *callback_arg) iostate->io_wanted = 0; need_wakeup = 1; } - lck_mtx_unlock(cl_mtxp); + lck_mtx_unlock(&iostate->io_mtxp); if (need_wakeup) wakeup((caddr_t)&iostate->io_wanted); @@ -633,7 +754,7 @@ cluster_iodone(buf_t bp, void *callback_arg) ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags); } } - if ((b_flags & B_NEED_IODONE) && real_bp) { + if (real_bp) { if (error) { real_bp->b_flags |= B_ERROR; real_bp->b_error = error; @@ -653,7 +774,7 @@ uint32_t cluster_hard_throttle_limit(vnode_t vp, uint32_t *limit, uint32_t hard_throttle) { if (cluster_hard_throttle_on(vp, hard_throttle)) { - *limit = HARD_THROTTLE_MAXSIZE; + *limit = THROTTLE_MAX_IOSIZE; return 1; } return 0; @@ -719,27 +840,36 @@ cluster_wait_IO(buf_t cbp_head, int async) /* * async callback completion will not normally * generate a wakeup upon I/O completion... - * by setting BL_WANTED, we will force a wakeup + * by setting B_TWANTED, we will force a wakeup * to occur as any outstanding I/Os complete... - * I/Os already completed will have BL_CALLDONE already - * set and we won't block in buf_biowait_callback.. 
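The B_TDONE/B_TWANTED handshake that cluster_iodone now performs under cl_transaction_mtxp can be modeled in userspace with a mutex and condition variable: each sub-I/O completion marks itself done and wakes any waiter, and only the completion that observes the whole chain done goes on to process the transaction. A compilable sketch, with pthreads standing in for lck_mtx/msleep/wakeup (illustrative only, not part of the patch):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define NBUFS 4

struct txn {
        pthread_mutex_t mtx;            /* models cl_transaction_mtxp */
        pthread_cond_t  cv;             /* models wakeup()/msleep() */
        int             done[NBUFS];    /* models B_TDONE per buf_t */
};

static struct txn t = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, {0}
};

/* completion handler for sub-I/O 'idx'; models cluster_iodone */
static void *io_complete(void *arg)
{
        int idx = (int)(long)arg, i, all_done = 1;

        usleep((idx + 1) * 1000);       /* stagger completions */

        pthread_mutex_lock(&t.mtx);
        t.done[idx] = 1;                /* B_TDONE */
        pthread_cond_broadcast(&t.cv);  /* wakeup for B_TWANTED waiters */
        for (i = 0; i < NBUFS; i++)
                if (!t.done[i])
                        all_done = 0;
        pthread_mutex_unlock(&t.mtx);

        /* the flag is set and checked under one lock, so exactly one
         * completion observes the chain transition to fully done */
        if (all_done)
                printf("sub-I/O %d: transaction complete, processing\n", idx);
        return NULL;
}

int main(void)
{
        pthread_t th[NBUFS];
        int i;

        for (i = 0; i < NBUFS; i++)
                pthread_create(&th[i], NULL, io_complete, (void *)(long)i);
        for (i = 0; i < NBUFS; i++)
                pthread_join(th[i], NULL);
        return 0;
}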
+ * I/Os already completed will have B_TDONE already + * set and we won't cause us to block * note that we're actually waiting for the bp to have * completed the callback function... only then * can we safely take back ownership of the bp - * need the main buf mutex in order to safely - * update b_lflags */ - buf_list_lock(); + lck_mtx_lock_spin(cl_transaction_mtxp); for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) - cbp->b_lflags |= BL_WANTED; + cbp->b_flags |= B_TWANTED; - buf_list_unlock(); + lck_mtx_unlock(cl_transaction_mtxp); } for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { - if (async) - buf_biowait_callback(cbp); - else + + if (async) { + while (!ISSET(cbp->b_flags, B_TDONE)) { + + lck_mtx_lock_spin(cl_transaction_mtxp); + + if (!ISSET(cbp->b_flags, B_TDONE)) { + DTRACE_IO1(wait__start, buf_t, cbp); + (void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL); + DTRACE_IO1(wait__done, buf_t, cbp); + } else + lck_mtx_unlock(cl_transaction_mtxp); + } + } else buf_biowait(cbp); } } @@ -759,6 +889,14 @@ cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, i for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) buf_biowait(cbp); } + /* + * we've already waited on all of the I/Os in this transaction, + * so mark all of the buf_t's in this transaction as B_TDONE + * so that cluster_iodone sees the transaction as completed + */ + for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) + cbp->b_flags |= B_TDONE; + error = cluster_iodone(*cbp_head, callback_arg); if ( !(flags & CL_ASYNC) && error && *retval == 0) { @@ -877,8 +1015,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (flags & CL_THROTTLE) { if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp, 1)) { - if (max_iosize > HARD_THROTTLE_MAXSIZE) - max_iosize = HARD_THROTTLE_MAXSIZE; + if (max_iosize > THROTTLE_MAX_IOSIZE) + max_iosize = THROTTLE_MAX_IOSIZE; async_throttle = HARD_THROTTLE_MAXCNT; } else { if ( (flags & CL_DEV_MEMORY) ) @@ -886,10 +1024,9 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no else { u_int max_cluster; u_int max_cluster_size; - u_int max_prefetch; - + u_int scale; + max_cluster_size = MAX_CLUSTER_SIZE(vp); - max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ)); if (max_iosize > max_cluster_size) max_cluster = max_cluster_size; @@ -898,8 +1035,16 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (size < max_cluster) max_cluster = size; + + if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) + scale = WRITE_THROTTLE_SSD; + else + scale = WRITE_THROTTLE; + + if (flags & CL_CLOSE) + scale += MAX_CLUSTERS; - async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), (max_prefetch / max_cluster) - 1); + async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1); } } } @@ -911,12 +1056,14 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no io_flags |= B_IOSTREAMING; if (flags & CL_COMMIT) io_flags |= B_COMMIT_UPL; - if (flags & CL_PRESERVE) + if (flags & CL_DIRECT_IO) io_flags |= B_PHYS; - if (flags & CL_KEEPCACHED) - io_flags |= B_CACHE; + if (flags & (CL_PRESERVE | CL_KEEPCACHED)) + io_flags |= B_CACHE; if (flags & CL_PASSIVE) io_flags |= B_PASSIVE; + if (flags & CL_ENCRYPTED) + io_flags |= B_ENCRYPTED_IO; if (vp->v_flag & VSYSTEM) io_flags |= B_META; @@ -973,7 +1120,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t 
f_offset, int no off_t e_offset; int pageout_flags; - if(upl_get_internal_vectorupl(upl)) + if (upl_get_internal_vectorupl(upl)) panic("Vector UPLs should not take this code-path\n"); /* * we're writing into a 'hole' @@ -1080,7 +1227,6 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) { error = EINVAL; - break; } e_offset = round_page_64(f_offset + 1); io_size = e_offset - f_offset; @@ -1109,6 +1255,11 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no */ size = 0; } + if (error) { + if (size == 0) + flags &= ~CL_COMMIT; + break; + } continue; } lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64); @@ -1313,6 +1464,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } cbp->b_cliodone = (void *)callback; cbp->b_flags |= io_flags; + if (flags & CL_NOCACHE) + cbp->b_attr.ba_flags |= BA_NOCACHE; cbp->b_lblkno = lblkno; cbp->b_blkno = blkno; @@ -1346,10 +1499,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no cbp_head = cbp; cbp_tail = cbp; - if ( (cbp_head->b_real_bp = real_bp) ) { - cbp_head->b_flags |= B_NEED_IODONE; + if ( (cbp_head->b_real_bp = real_bp) ) real_bp = (buf_t)NULL; - } } *(buf_t *)(&cbp->b_trans_head) = cbp_head; @@ -1407,6 +1558,14 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if ( !(io_flags & B_READ)) vnode_startwrite(vp); + if (flags & CL_RAW_ENCRYPTED) { + /* + * User requested raw encrypted bytes. + * Twiddle the bit in the ba_flags for the buffer + */ + cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO; + } + (void) VNOP_STRATEGY(cbp); if (need_EOT == TRUE) { @@ -1455,7 +1614,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no * since we never really issued the io * just go ahead and adjust it back */ - lck_mtx_lock_spin(cl_mtxp); + lck_mtx_lock_spin(&iostate->io_mtxp); if (iostate->io_error == 0) iostate->io_error = error; @@ -1469,7 +1628,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no iostate->io_wanted = 0; need_wakeup = 1; } - lck_mtx_unlock(cl_mtxp); + lck_mtx_unlock(&iostate->io_mtxp); if (need_wakeup) wakeup((caddr_t)&iostate->io_wanted); @@ -1580,8 +1739,16 @@ cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct return; } - max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ)); + max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD)); + + if ((max_prefetch / PAGE_SIZE) > speculative_prefetch_max) + max_prefetch = (speculative_prefetch_max * PAGE_SIZE); + if (max_prefetch <= PAGE_SIZE) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, + rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0); + return; + } if (extent->e_addr < rap->cl_maxra) { if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) { @@ -1643,18 +1810,7 @@ cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offs off_t max_size; int local_flags; - if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) - /* - * if we know we're issuing this I/O to a virtual device (i.e. disk image) - * then we don't want to enforce this throttle... 
if we do, we can - * potentially deadlock since we're stalling the pageout thread at a time - * when the disk image might need additional memory (which won't be available - * if the pageout thread can't run)... instead we'll just depend on the throttle - * that the pageout thread now has in place to deal with external files - */ - local_flags = CL_PAGEOUT; - else - local_flags = CL_PAGEOUT | CL_THROTTLE; + local_flags = CL_PAGEOUT | CL_THROTTLE; if ((flags & UPL_IOSYNC) == 0) local_flags |= CL_ASYNC; @@ -1662,6 +1818,8 @@ cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offs local_flags |= CL_COMMIT; if ((flags & UPL_KEEPCACHED)) local_flags |= CL_KEEPCACHED; + if (flags & UPL_PAGING_ENCRYPTED) + local_flags |= CL_ENCRYPTED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE, @@ -1738,6 +1896,8 @@ cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offse local_flags |= CL_COMMIT; if (flags & UPL_IOSTREAMING) local_flags |= CL_IOSTREAMING; + if (flags & UPL_PAGING_ENCRYPTED) + local_flags |= CL_ENCRYPTED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE, @@ -1831,9 +1991,10 @@ cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t else bflag = 0; - if (vp->v_flag & VNOCACHE_DATA) + if (vp->v_flag & VNOCACHE_DATA){ flags |= IO_NOCACHE; - + bflag |= CL_NOCACHE; + } if (uio == NULL) { /* * no user data... @@ -1845,12 +2006,12 @@ cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t } /* * do a write through the cache if one of the following is true.... - * NOCACHE is not true and + * NOCACHE is not true or NODIRECT is true * the uio request doesn't target USERSPACE * otherwise, find out if we want the direct or contig variant for * the first vector in the uio request */ - if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) + if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) @@ -1975,7 +2136,10 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in user_addr_t iov_base; u_int32_t mem_alignment_mask; u_int32_t devblocksize; + u_int32_t max_io_size; u_int32_t max_upl_size; + u_int32_t max_vector_size; + boolean_t io_throttled = FALSE; u_int32_t vector_upl_iosize = 0; int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); @@ -1997,12 +2161,17 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in if (flags & IO_PASSIVE) io_flag |= CL_PASSIVE; - + + if (flags & IO_NOCACHE) + io_flag |= CL_NOCACHE; + iostate.io_completed = 0; iostate.io_issued = 0; iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; @@ -2044,6 +2213,33 @@ next_dwrite: } while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { + int throttle_type; + + if ( (throttle_type = cluster_hard_throttle_on(vp, 1)) ) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ + if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == 2) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with 
EAGAIN to indicate + * to the FS issuing the cluster_write call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + io_throttled = TRUE; + goto wait_for_dwrites; + } + max_vector_size = THROTTLE_MAX_IOSIZE; + max_io_size = THROTTLE_MAX_IOSIZE; + } else { + max_vector_size = MAX_VECTOR_UPL_SIZE; + max_io_size = max_upl_size; + } if (first_IO) { cluster_syncup(vp, newEOF, callback, callback_arg); @@ -2052,8 +2248,8 @@ next_dwrite: io_size = io_req_size & ~PAGE_MASK; iov_base = uio_curriovbase(uio); - if (io_size > max_upl_size) - io_size = max_upl_size; + if (io_size > max_io_size) + io_size = max_io_size; if(useVectorUPL && (iov_base & PAGE_MASK)) { /* @@ -2183,23 +2379,9 @@ next_dwrite: * if there are already too many outstanding writes * wait until some complete before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct"); - while ((iostate.io_issued - iostate.io_completed) > (max_upl_size * IO_SCALE(vp, 2))) { - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, max_upl_size * IO_SCALE(vp, 2), 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, max_upl_size * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier writes we issued ran into a hard error @@ -2233,7 +2415,7 @@ next_dwrite: vector_upl_iosize += io_size; vector_upl_size += upl_size; - if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= MAX_VECTOR_UPL_SIZE) { + if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } @@ -2279,7 +2461,7 @@ next_dwrite: wait_for_dwrites: - if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { + if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } @@ -2289,23 +2471,16 @@ wait_for_dwrites: * make sure all async writes issued as part of this stream * have completed before we return */ - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); + cluster_iostate_wait(&iostate, 0, "cluster_write_direct"); } if (iostate.io_error) retval = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + + if (io_throttled == TRUE && retval == 0) + retval = EAGAIN; + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -2368,6 +2543,8 @@ 
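The throttle-window logic added to cluster_write_direct above (and mirrored later in cluster_read_direct) makes a two-part decision: shrink the next I/O to THROTTLE_MAX_IOSIZE, or, when the caller passed IO_RETURN_ON_THROTTLE and another throttleable thread has already issued I/O in the window, fail fast with EAGAIN so the filesystem can drop its locks before throttling. A condensed, compilable restatement; the throttle_state() encoding mirrors the throttle_type values used above but is otherwise assumed:

#include <errno.h>
#include <stdio.h>

#define THROTTLE_MAX_IOSIZE   (128 * 1024)
#define MAX_UPL_IOSIZE        (1024 * 1024)
#define IO_RETURN_ON_THROTTLE 0x1

/* 0 = not throttled, 1 = throttle window open, 2 = window open and a
 * throttleable thread has already issued I/O in it */
static int throttle_state(void) { return 2; }

static int size_next_io(int flags, unsigned *max_io_size)
{
        int type = throttle_state();

        if (type) {
                if ((flags & IO_RETURN_ON_THROTTLE) && type == 2) {
                        /* let the caller drop locks and throttle itself */
                        return EAGAIN;
                }
                *max_io_size = THROTTLE_MAX_IOSIZE;     /* shrink the window */
        } else {
                *max_io_size = MAX_UPL_IOSIZE;
        }
        return 0;
}

int main(void)
{
        unsigned max = 0;
        int err = size_next_io(IO_RETURN_ON_THROTTLE, &max);

        printf("err=%d max=%u\n", err, max);
        return 0;
}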
cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + next_cwrite: io_size = *write_length; @@ -2456,22 +2633,9 @@ next_cwrite: * if there are already too many outstanding writes * wait until some have completed before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2))) { - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier writes we issued ran into a hard error @@ -2515,25 +2679,14 @@ wait_for_cwrites: * make sure all async writes that are part of this stream * have completed before we proceed */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_write_contig"); - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (error == 0 && tail_size) error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg); @@ -2608,6 +2761,9 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old off_t zero_off; long long zero_cnt1; off_t zero_off1; + off_t write_off = 0; + int write_cnt = 0; + boolean_t first_pass = FALSE; struct cl_extent cl; struct cl_writebehind *wbp; int bflag; @@ -2629,7 +2785,9 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old bflag = CL_PASSIVE; else bflag = 0; - + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + zero_cnt = 0; zero_cnt1 = 0; zero_off = 0; @@ -2689,7 +2847,16 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old retval, 0, 0, 0, 0); return (0); } - + if (uio) { + write_off = uio->uio_offset; + write_cnt = uio_resid(uio); + /* + * delay updating the sequential write info + * in the control block until we've obtained + * the lock for it + */ + first_pass = TRUE; + } while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) { /* * for this iteration of the loop, figure out where our starting point is @@ -2718,7 +2885,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old * because IO_HEADZEROFILL and IO_TAILZEROFILL not set */ if ((start_offset + total_size) > max_io_size) - total_size -= start_offset; + total_size = max_io_size - start_offset; xfer_resid = total_size; retval = 
cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1); @@ -2984,7 +3151,7 @@ check_cluster: */ wbp->cl_number = 0; - sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, callback, callback_arg); + sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg); /* * no clusters of either type present at this point * so just go directly to start_new_cluster since @@ -2993,7 +3160,17 @@ check_cluster: * to avoid the deadlock with sparse_cluster_push */ goto start_new_cluster; - } + } + if (first_pass) { + if (write_off == wbp->cl_last_write) + wbp->cl_seq_written += write_cnt; + else + wbp->cl_seq_written = write_cnt; + + wbp->cl_last_write = write_off + write_cnt; + + first_pass = FALSE; + } if (wbp->cl_number == 0) /* * no clusters currently present @@ -3108,14 +3285,27 @@ check_cluster: */ goto delay_io; - if (wbp->cl_number < MAX_CLUSTERS) + if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && + wbp->cl_number == MAX_CLUSTERS && + wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { + uint32_t n; + + if (vp->v_mount->mnt_kern_flag & MNTK_SSD) + n = WRITE_BEHIND_SSD; + else + n = WRITE_BEHIND; + + while (n--) + cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg); + } + if (wbp->cl_number < MAX_CLUSTERS) { /* * we didn't find an existing cluster to * merge into, but there's room to start * a new one */ goto start_new_cluster; - + } /* * no exisitng cluster to merge with and no * room to start a new one... we'll try @@ -3133,7 +3323,7 @@ check_cluster: */ if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { - ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, callback, callback_arg); + ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg); } /* @@ -3152,18 +3342,6 @@ check_cluster: continue; } - /* - * we pushed one cluster successfully, so we must be sequentially writing this file - * otherwise, we would have failed and fallen into the sparse cluster support - * so let's take the opportunity to push out additional clusters... - * this will give us better I/O locality if we're in a copy loop - * (i.e. we won't jump back and forth between the read and write points - */ - if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { - while (wbp->cl_number) - cluster_try_push(wbp, vp, newEOF, 0, callback, callback_arg); - } - start_new_cluster: wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; @@ -3224,17 +3402,34 @@ cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (* flags |= IO_NOCACHE; if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) flags |= IO_RAOFF; + + /* + * If we're doing an encrypted IO, then first check to see + * if the IO requested was page aligned. If not, then bail + * out immediately. + */ + if (flags & IO_ENCRYPTED) { + if (read_length & PAGE_MASK) { + retval = EINVAL; + return retval; + } + } - /* + /* * do a read through the cache if one of the following is true.... * NOCACHE is not true * the uio request doesn't target USERSPACE + * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well. + * Reading encrypted data from a CP filesystem should never result in the data touching + * the UBC. 
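The cl_last_write/cl_seq_written bookkeeping added to cluster_write_copy above detects sequential write runs: a write starting exactly where the previous one ended extends the run, anything else restarts it, and a long enough run triggers write-behind pushes. Reduced to a compilable model (struct and field names shortened for illustration):

#include <stdio.h>

/* models wbp->cl_last_write / wbp->cl_seq_written */
struct wb { long long last_write; long long seq_written; };

static void note_write(struct wb *wbp, long long off, long long cnt)
{
        if (off == wbp->last_write)
                wbp->seq_written += cnt;        /* extends the previous write */
        else
                wbp->seq_written = cnt;         /* run broken, start over */
        wbp->last_write = off + cnt;
}

int main(void)
{
        struct wb wb = { 0, 0 };

        note_write(&wb, 0, 4096);
        note_write(&wb, 4096, 4096);    /* sequential: run grows to 8192 */
        note_write(&wb, 65536, 4096);   /* seek: run resets to 4096 */
        printf("seq_written=%lld\n", wb.seq_written);
        return 0;
}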
+ * * otherwise, find out if we want the direct or contig variant for * the first vector in the uio request */ - if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) - retval = cluster_io_type(uio, &read_type, &read_length, 0); - + if (((flags & IO_NOCACHE) || (flags & IO_ENCRYPTED)) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) { + retval = cluster_io_type(uio, &read_type, &read_length, 0); + } + while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) { switch (read_type) { @@ -3318,21 +3513,19 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file struct cl_extent extent; int bflag; int take_reference = 1; - struct uthread *ut; int policy = IOPOL_DEFAULT; - + boolean_t iolock_inited = FALSE; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); + + if (flags & IO_ENCRYPTED) { + panic ("encrypted blocks will hit UBC!"); + } - policy = current_proc()->p_iopol_disk; - - ut = get_bsdthread_info(current_thread()); + policy = proc_get_task_selfdiskacc(); - if (ut->uu_iopol_disk != IOPOL_DEFAULT) - policy = ut->uu_iopol_disk; - - if (policy == IOPOL_THROTTLE || (flags & IO_NOCACHE)) + if (policy == IOPOL_THROTTLE || policy == IOPOL_UTILITY || (flags & IO_NOCACHE)) take_reference = 0; if (flags & IO_PASSIVE) @@ -3340,8 +3533,11 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file else bflag = 0; + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); - max_prefetch = MAX_PREFETCH(vp, max_io_size); + max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD)); max_rd_size = max_prefetch; last_request_offset = uio->uio_offset + io_req_size; @@ -3354,13 +3550,15 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file rap = NULL; } else { if (cluster_hard_throttle_on(vp, 1)) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ rd_ahead_enabled = 0; prefetch_enabled = 0; - max_rd_size = HARD_THROTTLE_MAXSIZE; - } else if (policy == IOPOL_THROTTLE) { - rd_ahead_enabled = 0; - prefetch_enabled = 0; + max_rd_size = THROTTLE_MAX_IOSIZE; } if ((rap = cluster_get_rap(vp)) == NULL) rd_ahead_enabled = 0; @@ -3440,7 +3638,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file io_requested = io_resid; - retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, last_ioread_offset == 0 ? 
take_reference : 0); + retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference); xsize = io_requested - io_resid; @@ -3479,6 +3677,30 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file */ max_size = filesize - uio->uio_offset; } + + iostate.io_completed = 0; + iostate.io_issued = 0; + iostate.io_error = 0; + iostate.io_wanted = 0; + + if ( (flags & IO_RETURN_ON_THROTTLE) ) { + if (cluster_hard_throttle_on(vp, 0) == 2) { + if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with EAGAIN to indicate + * to the FS issuing the cluster_read call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + retval = EAGAIN; + break; + } + } + } + /* * compute the size of the upl needed to encompass * the requested read... limit each call to cluster_io @@ -3540,10 +3762,6 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file if (upl_valid_page(pl, last_pg)) break; } - iostate.io_completed = 0; - iostate.io_issued = 0; - iostate.io_error = 0; - iostate.io_wanted = 0; if (start_pg < last_pg) { /* @@ -3552,6 +3770,11 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file * we may have to clip the size of it to keep from reading past * the end of the last physical block associated with the file */ + if (iolock_inited == FALSE) { + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + + iolock_inited = TRUE; + } upl_offset = start_pg * PAGE_SIZE; io_size = (last_pg - start_pg) * PAGE_SIZE; @@ -3564,6 +3787,18 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg); + + if (rap) { + if (extent.e_addr < rap->cl_maxra) { + /* + * we've just issued a read for a block that should have been + * in the cache courtesy of the read-ahead engine... 
something + * has gone wrong with the pipeline, so reset the read-ahead + * logic which will cause us to restart from scratch + */ + rap->cl_maxra = 0; + } + } } if (error == 0) { /* @@ -3642,22 +3877,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file rap->cl_lastr = extent.e_addr; } } - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_copy", NULL); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; else { @@ -3669,6 +3891,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file io_req_size -= (val_size - io_requested); } + } else { + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); } if (start_pg < last_pg) { /* @@ -3729,16 +3954,20 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file if (io_req_size) { if (cluster_hard_throttle_on(vp, 1)) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ rd_ahead_enabled = 0; prefetch_enabled = 0; - - max_rd_size = HARD_THROTTLE_MAXSIZE; + max_rd_size = THROTTLE_MAX_IOSIZE; } else { - if (max_rd_size == HARD_THROTTLE_MAXSIZE) { + if (max_rd_size == THROTTLE_MAX_IOSIZE) { /* * coming out of throttled state */ - if (policy != IOPOL_THROTTLE) { + if (policy != IOPOL_THROTTLE && policy != IOPOL_UTILITY) { if (rap != NULL) rd_ahead_enabled = 1; prefetch_enabled = 1; @@ -3749,6 +3978,20 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file } } } + if (iolock_inited == TRUE) { + if (iostate.io_issued > iostate.io_completed) { + /* + * cluster_io returned an error after it + * had already issued some I/O. we need + * to wait for that I/O to complete before + * we can destroy the iostate mutex... 
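cluster_iostate_wait, introduced earlier in this diff, is the helper all of these paths now share: sleep until the in-flight count (io_issued - io_completed) drains to a target, after which a target of 0 makes it safe to tear down the per-stream mutex. A compilable userspace model, with pthreads standing in for lck_mtx/msleep/wakeup (a sketch, not the kernel implementation):

#include <pthread.h>
#include <stdio.h>

/* models struct clios with its new per-stream io_mtxp */
struct iostate {
        pthread_mutex_t io_mtxp;
        pthread_cond_t  io_cv;          /* stands in for msleep/wakeup */
        unsigned        io_issued;
        unsigned        io_completed;
};

/* models cluster_iostate_wait: block until in-flight <= target */
static void iostate_wait(struct iostate *s, unsigned target)
{
        pthread_mutex_lock(&s->io_mtxp);
        while ((s->io_issued - s->io_completed) > target)
                pthread_cond_wait(&s->io_cv, &s->io_mtxp);
        pthread_mutex_unlock(&s->io_mtxp);
}

static void iostate_complete_one(struct iostate *s)
{
        pthread_mutex_lock(&s->io_mtxp);
        s->io_completed++;
        pthread_cond_broadcast(&s->io_cv);
        pthread_mutex_unlock(&s->io_mtxp);
}

static void *completer(void *arg)
{
        struct iostate *s = arg;
        int i;

        for (i = 0; i < 8; i++)
                iostate_complete_one(s);
        return NULL;
}

int main(void)
{
        struct iostate s = {
                PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 8, 0
        };
        pthread_t th;

        pthread_create(&th, NULL, completer, &s);
        iostate_wait(&s, 0);            /* drain to zero before teardown */
        pthread_join(th, NULL);
        pthread_mutex_destroy(&s.io_mtxp);  /* safe: nothing in flight */
        printf("drained: issued=%u completed=%u\n", s.io_issued, s.io_completed);
        return 0;
}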
+ * 'retval' already contains the early error + * so no need to pick it up from iostate.io_error + */ + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); + } + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + } if (rap != NULL) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0); @@ -3780,7 +4023,6 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, int force_data_sync; int retval = 0; int no_zero_fill = 0; - int abort_flag = 0; int io_flag = 0; int misaligned = 0; struct clios iostate; @@ -3796,6 +4038,9 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t max_upl_size; u_int32_t max_rd_size; u_int32_t max_rd_ahead; + u_int32_t max_vector_size; + boolean_t strict_uncached_IO = FALSE; + boolean_t io_throttled = FALSE; u_int32_t vector_upl_iosize = 0; int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); @@ -3812,14 +4057,25 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO; + if (flags & IO_PASSIVE) io_flag |= CL_PASSIVE; + if (flags & IO_ENCRYPTED) { + io_flag |= CL_RAW_ENCRYPTED; + } + + if (flags & IO_NOCACHE) { + io_flag |= CL_NOCACHE; + } + iostate.io_completed = 0; iostate.io_issued = 0; iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; @@ -3839,6 +4095,9 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, */ devblocksize = PAGE_SIZE; } + + strict_uncached_IO = ubc_strict_uncached_IO(vp); + next_dread: io_req_size = *read_length; iov_base = uio_curriovbase(uio); @@ -3865,7 +4124,17 @@ next_dread: * I/O that ends on a page boundary in cluster_io */ misaligned = 1; - } + } + + /* + * The user must request IO in aligned chunks. If the + * offset into the file is bad, or the userland pointer + * is non-aligned, then we cannot service the encrypted IO request. + */ + if ((flags & IO_ENCRYPTED) && (misaligned)) { + retval = EINVAL; + } + /* * When we get to this point, we know... * -- the offset into the file is on a devblocksize boundary @@ -3875,23 +4144,34 @@ next_dread: u_int32_t io_start; if (cluster_hard_throttle_on(vp, 1)) { - max_rd_size = HARD_THROTTLE_MAXSIZE; - max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1; + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ + max_rd_size = THROTTLE_MAX_IOSIZE; + max_rd_ahead = THROTTLE_MAX_IOSIZE - 1; + max_vector_size = THROTTLE_MAX_IOSIZE; } else { max_rd_size = max_upl_size; max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); + max_vector_size = MAX_VECTOR_UPL_SIZE; } io_start = io_size = io_req_size; /* * First look for pages already in the cache - * and move them to user space. + * and move them to user space. But only do this + * check if we are not retrieving encrypted data directly + * from the filesystem; those blocks should never + * be in the UBC. 
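The alignment screen that gates encrypted direct reads above can be pictured as two mask tests: the file offset against the device block size, and the user buffer address against the mount's alignment mask. A simplified, compilable sketch; the real path folds in more cases (for example, transfers that do not end on a page boundary), so treat this as an outline only:

#include <stdint.h>
#include <stdio.h>

static int io_misaligned(uint64_t f_offset, uintptr_t iov_base,
    uint32_t devblocksize, uint32_t mem_alignment_mask)
{
        if (f_offset & (devblocksize - 1))  /* offset not on a device block */
                return 1;
        if (iov_base & mem_alignment_mask)  /* user buffer under-aligned */
                return 1;
        return 0;
}

int main(void)
{
        /* devblocksize = 4096, 4-byte buffer alignment (illustrative) */
        printf("%d\n", io_misaligned(8192, 0x1000, 4096, 3));   /* 0: OK */
        printf("%d\n", io_misaligned(8193, 0x1000, 4096, 3));   /* 1: bad offset */
        return 0;
}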
* * cluster_copy_ubc_data returns the resid * in io_size */ - retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); - + if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { + retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); + } /* * calculate the number of bytes actually copied * starting size - residual @@ -3922,9 +4202,14 @@ next_dread: } /* - * check to see if we are finished with this request... + * check to see if we are finished with this request. + * + * If we satisfied this IO already, then io_req_size will be 0. + * Otherwise, see if the IO was mis-aligned and needs to go through + * the UBC to deal with the 'tail'. + * */ - if (io_req_size == 0 || misaligned) { + if (io_req_size == 0 || (misaligned)) { /* * see if there's another uio vector to * process that's of type IO_DIRECT @@ -3950,13 +4235,31 @@ next_dread: * (which overlaps the end of the direct read) in order to * get at the overhang bytes */ - if (io_size & (devblocksize - 1)) { - /* - * request does NOT end on a device block boundary - * so clip it back to a PAGE_SIZE boundary - */ - io_size &= ~PAGE_MASK; - io_min = PAGE_SIZE; + if (io_size & (devblocksize - 1)) { + if (flags & IO_ENCRYPTED) { + /* + * Normally, we'd round down to the previous page boundary to + * let the UBC manage the zero-filling of the file past the EOF. + * But if we're doing encrypted IO, we can't let any of + * the data hit the UBC. This means we have to do the full + * IO to the upper block boundary of the device block that + * contains the EOF. The user will be responsible for not + * interpreting data PAST the EOF in its buffer. + * + * So just bump the IO back up to a multiple of devblocksize + */ + io_size = ((io_size + devblocksize) & ~(devblocksize - 1)); + io_min = io_size; + } + else { + /* + * Clip the request to the previous page size boundary + * since request does NOT end on a device block boundary + */ + io_size &= ~PAGE_MASK; + io_min = PAGE_SIZE; + } + } if (retval || io_size < io_min) { /* @@ -3968,21 +4271,49 @@ next_dread: */ goto wait_for_dreads; } - if ((xsize = io_size) > max_rd_size) - xsize = max_rd_size; - io_size = 0; + /* + * Don't re-check the UBC data if we are looking for uncached IO + * or asking for encrypted blocks. 
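The two tail treatments contrasted above — clip down to a page boundary for the cached case versus grow to the next device block multiple for the encrypted case — reduce to a pair of mask operations. A compilable illustration (the round-up form matches the patch and is only applied when io_size is not already a multiple):

#include <stdio.h>

#define PAGE_MASK 4095u

/* cached path: clip a ragged tail back to a page boundary and let the
 * UBC handle the overhang */
static unsigned round_down_page(unsigned io_size)
{
        return io_size & ~PAGE_MASK;
}

/* encrypted path: the UBC must not see the data, so grow the transfer
 * to the next devblocksize multiple instead */
static unsigned round_up_devblock(unsigned io_size, unsigned devblocksize)
{
        return (io_size + devblocksize) & ~(devblocksize - 1);
}

int main(void)
{
        printf("%u\n", round_down_page(10000));          /* 8192 */
        printf("%u\n", round_up_devblock(10000, 4096));  /* 12288 */
        return 0;
}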
+ */ + if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { - ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); + if ((xsize = io_size) > max_rd_size) + xsize = max_rd_size; - if (io_size == 0) { - /* - * a page must have just come into the cache - * since the first page in this range is no - * longer absent, go back and re-evaluate - */ - continue; + io_size = 0; + + ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); + + if (io_size == 0) { + /* + * a page must have just come into the cache + * since the first page in this range is no + * longer absent, go back and re-evaluate + */ + continue; + } } + if ( (flags & IO_RETURN_ON_THROTTLE) ) { + if (cluster_hard_throttle_on(vp, 0) == 2) { + if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with EAGAIN to indicate + * to the FS issuing the cluster_read call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + io_throttled = TRUE; + goto wait_for_dreads; + } + } + } + if (io_size > max_rd_size) + io_size = max_rd_size; + iov_base = uio_curriovbase(uio); upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); @@ -3991,13 +4322,11 @@ next_dread: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START, (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); - if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) { + if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) no_zero_fill = 1; - abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY; - } else { + else no_zero_fill = 0; - abort_flag = UPL_ABORT_FREE_ON_EMPTY; - } + for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { pages_in_pl = 0; upl_size = upl_needed_size; @@ -4028,13 +4357,13 @@ next_dread: pl = UPL_GET_INTERNAL_PAGE_LIST(upl); for (i = 0; i < pages_in_pl; i++) { - if (!upl_valid_page(pl, i)) + if (!upl_page_present(pl, i)) break; } if (i == pages_in_pl) break; - ubc_upl_abort(upl, abort_flag); + ubc_upl_abort(upl, 0); } if (force_data_sync >= 3) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, @@ -4052,7 +4381,7 @@ next_dread: io_size = 0; } if (io_size == 0) { - ubc_upl_abort(upl, abort_flag); + ubc_upl_abort(upl, 0); goto wait_for_dreads; } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, @@ -4076,22 +4405,9 @@ next_dread: * if there are already too many outstanding reads * wait until some have completed before issuing the next read */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct"); - while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier reads we issued ran into a hard error @@ -4100,7 +4416,7 @@ next_dread: * go wait for any other reads to complete before * returning the error to the caller */ - ubc_upl_abort(upl, abort_flag); + 
ubc_upl_abort(upl, 0); goto wait_for_dreads; } @@ -4130,7 +4446,7 @@ next_dread: vector_upl_size += upl_size; vector_upl_iosize += io_size; - if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= MAX_VECTOR_UPL_SIZE) { + if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } @@ -4138,9 +4454,24 @@ next_dread: /* * update the uio structure */ - uio_update(uio, (user_size_t)io_size); - - io_req_size -= io_size; + if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) { + uio_update(uio, (user_size_t)max_io_size); + } + else { + uio_update(uio, (user_size_t)io_size); + } + /* + * Under normal circumstances, the io_size should not be + * bigger than the io_req_size, but we may have had to round up + * to the end of the page in the encrypted IO case. In that case only, + * ensure that we only decrement io_req_size to 0. + */ + if ((flags & IO_ENCRYPTED) && (io_size > io_req_size)) { + io_req_size = 0; + } + else { + io_req_size -= io_size; + } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, upl, (int)uio->uio_offset, io_req_size, retval, 0); @@ -4170,25 +4501,17 @@ wait_for_dreads: * make sure all async reads that are part of this stream * have completed before we return */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_direct"); - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) retval = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + + if (io_throttled == TRUE && retval == 0) + retval = EAGAIN; + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -4236,7 +4559,10 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, bflag = CL_PASSIVE; else bflag = 0; - + + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + /* * When we enter this routine, we know * -- the read_length will not exceed the current iov_len @@ -4252,6 +4578,8 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + next_cread: io_size = *read_length; @@ -4349,21 +4677,9 @@ next_cread: * if there are already too many outstanding reads * wait until some have completed before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2))) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig"); - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, 
cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier reads we issued ran into a hard error @@ -4404,25 +4720,14 @@ wait_for_creads: * make sure all async reads that are part of this stream * have completed before we proceed */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_contig"); - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (error == 0 && tail_size) error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg); @@ -4541,6 +4846,16 @@ advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*c max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); +#if CONFIG_EMBEDDED + if (max_io_size > speculative_prefetch_max_iosize) + max_io_size = speculative_prefetch_max_iosize; +#else + if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) { + if (max_io_size > speculative_prefetch_max_iosize) + max_io_size = speculative_prefetch_max_iosize; + } +#endif + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START, (int)f_offset, resid, (int)filesize, 0, 0); @@ -4766,7 +5081,7 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca lck_mtx_unlock(&wbp->cl_lockw); - sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); lck_mtx_lock(&wbp->cl_lockw); @@ -4775,11 +5090,11 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) wakeup((caddr_t)&wbp->cl_sparse_pushes); } else { - sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); } retval = 1; } else { - retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); } lck_mtx_unlock(&wbp->cl_lockw); @@ -4840,7 +5155,7 @@ cluster_release(struct ubc_info *ubc) static int -cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) +cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) { int cl_index; int cl_index1; @@ -4923,15 +5238,15 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla int flags; struct cl_extent cl; + flags = io_flags & (IO_PASSIVE|IO_CLOSE); + /* * try to push each cluster in 
turn... */ if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) - flags = IO_NOCACHE; - else - flags = 0; + flags |= IO_NOCACHE; - if ((l_clusters[cl_index].io_flags & CLW_IOPASSIVE) || (push_flag & IO_PASSIVE)) + if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) flags |= IO_PASSIVE; if (push_flag & PUSH_SYNC) @@ -5036,9 +5351,9 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c kern_return_t kret; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); @@ -5165,6 +5480,12 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c if ( !(flags & IO_SYNC)) io_flags |= CL_ASYNC; + if (flags & IO_CLOSE) + io_flags |= CL_CLOSE; + + if (flags & IO_NOCACHE) + io_flags |= CL_NOCACHE; + retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); @@ -5216,7 +5537,7 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c * from the write-behind context (the cluster_push case), the wb lock is not held */ static void -sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) +sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) { struct cl_extent cl; off_t offset; @@ -5234,7 +5555,7 @@ sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int (*ca cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); - cluster_push_now(vp, &cl, EOF, push_flag & IO_PASSIVE, callback, callback_arg); + cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg); if ( !(push_flag & PUSH_ALL) ) break; @@ -5264,7 +5585,7 @@ sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, in * only a partial update was done * push out some pages and try again */ - sparse_cluster_push(scmap, vp, EOF, 0, callback, callback_arg); + sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg); offset += (new_dirty * PAGE_SIZE_64); length -= (new_dirty * PAGE_SIZE); @@ -5287,9 +5608,12 @@ cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t int bflag; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; + + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; upl_flags = UPL_SET_LITE; @@ -5458,7 +5782,7 @@ cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int m io_size = *io_resid; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, - (int)uio->uio_offset, 0, io_size, 0, 0); + (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0); control = ubc_getobject(vp, UBC_FLAGS_NONE); @@ -5614,6 +5938,14 @@ is_file_clean(vnode_t vp, off_t filesize) #define DRT_HASH_SMALL_MODULUS 23 #define DRT_HASH_LARGE_MODULUS 401 +/* + * Physical memory required before the large hash modulus is permitted. + * + * On small memory systems, the large hash modulus can lead to phsyical + * memory starvation, so we avoid using it there. 
+ */
+#define DRT_HASH_LARGE_MEMORY_REQUIRED	(1024LL * 1024LL * 1024LL)	/* 1GiB */
+
 #define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
 #define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
 
@@ -5756,8 +6088,12 @@ vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
 	 * see whether we should grow to the large one.
 	 */
 	if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
-		/* if the ring is nearly full */
-		if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
+		/*
+		 * If the ring is nearly full and we are allowed to
+		 * use the large modulus, upgrade.
+		 */
+		if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
+		    (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
 			nsize = DRT_HASH_LARGE_MODULUS;
 		} else {
 			nsize = DRT_HASH_SMALL_MODULUS;
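The upgrade gate added to vfs_drt_alloc_map reduces to a pure function of ring occupancy and physical memory: the dirty-region hash only moves to the large modulus when the small ring is nearly full AND the machine has at least 1GiB, avoiding the starvation noted in the comment above. A compilable restatement (pick_modulus is a hypothetical name; the constants are the patch's own):

#include <stdio.h>
#include <stdint.h>

#define DRT_HASH_SMALL_MODULUS          23
#define DRT_HASH_LARGE_MODULUS          401
#define DRT_HASH_LARGE_MEMORY_REQUIRED  (1024LL * 1024LL * 1024LL)  /* 1GiB */

static int pick_modulus(int active_buckets, int64_t max_mem)
{
        if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5) &&
            max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)
                return DRT_HASH_LARGE_MODULUS;
        return DRT_HASH_SMALL_MODULUS;
}

int main(void)
{
        printf("%d\n", pick_modulus(20, 2048LL * 1024 * 1024));  /* 401 */
        printf("%d\n", pick_modulus(20,  512LL * 1024 * 1024));  /* 23: low RAM */
        printf("%d\n", pick_modulus(10, 2048LL * 1024 * 1024));  /* 23: ring not full */
        return 0;
}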