X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/593a1d5fd87cdf5b46dd5fcb84467b432cea0f91..bd504ef0e0b883cdd7917b73b3574eb9ce669905:/bsd/vfs/vfs_cluster.c diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 9b1a7af25..69dfdfda3 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -82,15 +83,25 @@ #include #include #include +#include #include #include #include #include +#include + +#include + +#if 0 +#undef KERNEL_DEBUG +#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT +#endif + #define CL_READ 0x01 -#define CL_WRITE 0x02 +#define CL_WRITE 0x02 #define CL_ASYNC 0x04 #define CL_COMMIT 0x08 #define CL_PAGEOUT 0x10 @@ -103,9 +114,23 @@ #define CL_KEEPCACHED 0x800 #define CL_DIRECT_IO 0x1000 #define CL_PASSIVE 0x2000 +#define CL_IOSTREAMING 0x4000 +#define CL_CLOSE 0x8000 +#define CL_ENCRYPTED 0x10000 +#define CL_RAW_ENCRYPTED 0x20000 +#define CL_NOCACHE 0x40000 + +#define MAX_VECTOR_UPL_ELEMENTS 8 +#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE) * PAGE_SIZE +extern upl_t vector_upl_create(vm_offset_t); +extern boolean_t vector_upl_is_valid(upl_t); +extern boolean_t vector_upl_set_subupl(upl_t,upl_t, u_int32_t); +extern void vector_upl_set_pagelist(upl_t); +extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t); struct clios { + lck_mtx_t io_mtxp; u_int io_completed; /* amount of io that has currently completed */ u_int io_issued; /* amount of io that was successfully issued */ int io_error; /* error code of first error encountered */ @@ -115,7 +140,7 @@ struct clios { static lck_grp_t *cl_mtx_grp; static lck_attr_t *cl_mtx_attr; static lck_grp_attr_t *cl_mtx_grp_attr; -static lck_mtx_t *cl_mtxp; +static lck_mtx_t *cl_transaction_mtxp; #define IO_UNKNOWN 0 
@@ -138,11 +163,13 @@ static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_off int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg); static int cluster_iodone(buf_t bp, void *callback_arg); static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags); -static int cluster_hard_throttle_on(vnode_t vp); +static int cluster_hard_throttle_on(vnode_t vp, uint32_t); + +static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name); static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg); -static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int flags); +static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference); static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference); static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, @@ -166,33 +193,74 @@ static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t files static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg); -static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg); +static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg); static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg); -static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg); -static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg); 
+static void sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg); +static void sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg); static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp); static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp); static kern_return_t vfs_drt_control(void **cmapp, int op_type); -int is_file_clean(vnode_t, off_t); + +/* + * For throttled IO to check whether + * a block is cached by the boot cache + * and thus it can avoid delaying the IO. + * + * bootcache_contains_block is initially + * NULL. The BootCache will set it while + * the cache is active and clear it when + * the cache is jettisoned. + * + * Returns 0 if the block is not + * contained in the cache, 1 if it is + * contained. + * + * The function pointer remains valid + * after the cache has been evicted even + * if bootcache_contains_block has been + * cleared. 
+ * + * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs + */ +int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL; + /* * limit the internal I/O size so that we * can represent it in a 32 bit int */ -#define MAX_IO_REQUEST_SIZE (1024 * 1024 * 256) -#define MAX_IO_CONTIG_SIZE (MAX_UPL_SIZE * PAGE_SIZE) -#define MAX_VECTS 16 +#define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512) +#define MAX_IO_CONTIG_SIZE (MAX_UPL_SIZE * PAGE_SIZE) +#define MAX_VECTS 16 #define MIN_DIRECT_WRITE_SIZE (4 * PAGE_SIZE) +#define WRITE_THROTTLE 6 +#define WRITE_THROTTLE_SSD 2 +#define WRITE_BEHIND 1 +#define WRITE_BEHIND_SSD 1 + +#if CONFIG_EMBEDDED +#define PREFETCH 1 +#define PREFETCH_SSD 1 +uint32_t speculative_prefetch_max = 512; /* maximum number of pages to use for a specluative read-ahead */ +uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use for a specluative read-ahead */ +#else +#define PREFETCH 3 +#define PREFETCH_SSD 1 +uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3); +uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use for a specluative read-ahead on SSDs*/ +#endif -#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE)) -#define MAX_PREFETCH(vp) (cluster_max_io_size(vp->v_mount, CL_READ) * 3); +#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base)) +#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE)) +#define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? 
PREFETCH_SSD : PREFETCH))) -int speculative_reads_disabled = 0; +int ignore_is_ssd = 0; +int speculative_reads_disabled = 0; /* * throttle the number of async writes that @@ -200,11 +268,25 @@ int speculative_reads_disabled = 0; * before we issue a synchronous write */ #define HARD_THROTTLE_MAXCNT 0 -#define HARD_THROTTLE_MAXSIZE (64 * 1024) +#define HARD_THROTTLE_MAX_IOSIZE (128 * 1024) +#define LEGACY_HARD_THROTTLE_MAX_IOSIZE (512 * 1024) +extern int32_t throttle_legacy_process_count; int hard_throttle_on_root = 0; +uint32_t hard_throttle_max_iosize = HARD_THROTTLE_MAX_IOSIZE; +uint32_t legacy_hard_throttle_max_iosize = LEGACY_HARD_THROTTLE_MAX_IOSIZE; struct timeval priority_IO_timestamp_for_root; +#if CONFIG_EMBEDDED +#define THROTTLE_MAX_IOSIZE (hard_throttle_max_iosize) +#else +#define THROTTLE_MAX_IOSIZE (throttle_legacy_process_count == 0 ? hard_throttle_max_iosize : legacy_hard_throttle_max_iosize) +#endif + + +SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &hard_throttle_max_iosize, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_hard_throttle_max_iosize, 0, ""); + void cluster_init(void) { @@ -219,39 +301,35 @@ cluster_init(void) { */ cl_mtx_attr = lck_attr_alloc_init(); - /* - * allocate and initialize mutex's used to protect updates and waits - * on the cluster_io context - */ - cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr); + cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr); - if (cl_mtxp == NULL) - panic("cluster_init: failed to allocate cl_mtxp"); + if (cl_transaction_mtxp == NULL) + panic("cluster_init: failed to allocate cl_transaction_mtxp"); } uint32_t cluster_max_io_size(mount_t mp, int type) { - uint32_t max_io_size; - uint32_t segcnt; - uint32_t maxcnt; - - switch(type) { - - case CL_READ: - segcnt = mp->mnt_segreadcnt; - maxcnt = mp->mnt_maxreadcnt; - break; - case CL_WRITE: - segcnt = mp->mnt_segwritecnt; - maxcnt = 
mp->mnt_maxwritecnt; - break; - default: - segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt); - maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt); - break; - } + uint32_t max_io_size; + uint32_t segcnt; + uint32_t maxcnt; + + switch(type) { + + case CL_READ: + segcnt = mp->mnt_segreadcnt; + maxcnt = mp->mnt_maxreadcnt; + break; + case CL_WRITE: + segcnt = mp->mnt_segwritecnt; + maxcnt = mp->mnt_maxwritecnt; + break; + default: + segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt); + maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt); + break; + } if (segcnt > MAX_UPL_SIZE) { /* * don't allow a size beyond the max UPL size we can create @@ -391,7 +469,7 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c if (wbp->cl_number) { lck_mtx_lock(&wbp->cl_lockw); - cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, callback, callback_arg); + cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, 0, callback, callback_arg); lck_mtx_unlock(&wbp->cl_lockw); } @@ -399,38 +477,73 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c } +static int +cluster_io_present_in_BC(vnode_t vp, off_t f_offset) +{ + daddr64_t blkno; + size_t io_size; + int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block; + + if (bootcache_check_fn) { + if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL)) + return(0); + + if (io_size == 0) + return (0); + + if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) + return(1); + } + return(0); +} + + static int -cluster_hard_throttle_on(vnode_t vp) +cluster_hard_throttle_on(vnode_t vp, uint32_t hard_throttle) { - static struct timeval hard_throttle_maxelapsed = { 0, 200000 }; + int throttle_type = 0; - if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) { - struct timeval elapsed; + if ( (throttle_type = throttle_io_will_be_throttled(-1, vp->v_mount)) ) + return(throttle_type); + + if (hard_throttle && 
(vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) { + static struct timeval hard_throttle_maxelapsed = { 0, 100000 }; + struct timeval elapsed; if (hard_throttle_on_root) - return(1); + return(1); microuptime(&elapsed); timevalsub(&elapsed, &priority_IO_timestamp_for_root); if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <)) - return(1); - } - struct uthread *ut; - if (throttle_get_io_policy(&ut) == IOPOL_THROTTLE) { - size_t devbsdunit; - if (vp->v_mount != NULL) - devbsdunit = vp->v_mount->mnt_devbsdunit; - else - devbsdunit = LOWPRI_MAX_NUM_DEV - 1; - if (throttle_io_will_be_throttled(-1, devbsdunit)) { return(1); - } } return(0); } +static void +cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name) +{ + + lck_mtx_lock(&iostate->io_mtxp); + + while ((iostate->io_issued - iostate->io_completed) > target) { + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate->io_issued, iostate->io_completed, target, 0, 0); + + iostate->io_wanted = 1; + msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate->io_issued, iostate->io_completed, target, 0, 0); + } + lck_mtx_unlock(&iostate->io_mtxp); +} + + static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags) { @@ -438,7 +551,7 @@ cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_fla int page_in = 0; int page_out = 0; - if (io_flags & B_PHYS) + if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) /* * direct write of any flavor, or a direct read that wasn't aligned */ @@ -495,28 +608,49 @@ cluster_iodone(buf_t bp, void *callback_arg) cbp_head = (buf_t)(bp->b_trans_head); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START, - (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); + cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); - for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { - /* - * all I/O 
requests that are part of this transaction - * have to complete before we can process it - */ - if ( !(cbp->b_flags & B_DONE)) { + if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) { + boolean_t need_wakeup = FALSE; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, - (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0); + lck_mtx_lock_spin(cl_transaction_mtxp); - return 0; + bp->b_flags |= B_TDONE; + + if (bp->b_flags & B_TWANTED) { + CLR(bp->b_flags, B_TWANTED); + need_wakeup = TRUE; } - if (cbp->b_flags & B_EOT) - transaction_complete = TRUE; - } - if (transaction_complete == FALSE) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, - (int)cbp_head, 0, 0, 0, 0); + for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { + /* + * all I/O requests that are part of this transaction + * have to complete before we can process it + */ + if ( !(cbp->b_flags & B_TDONE)) { + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0); + + lck_mtx_unlock(cl_transaction_mtxp); + + if (need_wakeup == TRUE) + wakeup(bp); + + return 0; + } + if (cbp->b_flags & B_EOT) + transaction_complete = TRUE; + } + lck_mtx_unlock(cl_transaction_mtxp); - return 0; + if (need_wakeup == TRUE) + wakeup(bp); + + if (transaction_complete == FALSE) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + cbp_head, 0, 0, 0, 0); + return 0; + } } error = 0; total_size = 0; @@ -580,7 +714,7 @@ cluster_iodone(buf_t bp, void *callback_arg) * someone has issued multiple I/Os asynchrounsly * and is waiting for them to complete (streaming) */ - lck_mtx_lock_spin(cl_mtxp); + lck_mtx_lock_spin(&iostate->io_mtxp); if (error && iostate->io_error == 0) iostate->io_error = error; @@ -595,7 +729,7 @@ cluster_iodone(buf_t bp, void *callback_arg) iostate->io_wanted = 0; need_wakeup = 1; } - lck_mtx_unlock(cl_mtxp); + lck_mtx_unlock(&iostate->io_mtxp); if (need_wakeup) wakeup((caddr_t)&iostate->io_wanted); @@ -620,7 +754,7 @@ 
cluster_iodone(buf_t bp, void *callback_arg) ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags); } } - if ((b_flags & B_NEED_IODONE) && real_bp) { + if (real_bp) { if (error) { real_bp->b_flags |= B_ERROR; real_bp->b_error = error; @@ -630,18 +764,29 @@ cluster_iodone(buf_t bp, void *callback_arg) buf_biodone(real_bp); } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, - (int)upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0); + upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0); return (error); } +uint32_t +cluster_hard_throttle_limit(vnode_t vp, uint32_t *limit, uint32_t hard_throttle) +{ + if (cluster_hard_throttle_on(vp, hard_throttle)) { + *limit = THROTTLE_MAX_IOSIZE; + return 1; + } + return 0; +} + + void -cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp) +cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START, - upl_offset, size, (int)bp, 0, 0); + upl_offset, size, bp, 0, 0); if (bp == NULL || bp->b_datap == 0) { upl_page_info_t *pl; @@ -695,27 +840,36 @@ cluster_wait_IO(buf_t cbp_head, int async) /* * async callback completion will not normally * generate a wakeup upon I/O completion... - * by setting BL_WANTED, we will force a wakeup + * by setting B_TWANTED, we will force a wakeup * to occur as any outstanding I/Os complete... - * I/Os already completed will have BL_CALLDONE already - * set and we won't block in buf_biowait_callback.. + * I/Os already completed will have B_TDONE already + * set and we won't cause us to block * note that we're actually waiting for the bp to have * completed the callback function... 
only then * can we safely take back ownership of the bp - * need the main buf mutex in order to safely - * update b_lflags */ - buf_list_lock(); + lck_mtx_lock_spin(cl_transaction_mtxp); for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) - cbp->b_lflags |= BL_WANTED; + cbp->b_flags |= B_TWANTED; - buf_list_unlock(); + lck_mtx_unlock(cl_transaction_mtxp); } for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { - if (async) - buf_biowait_callback(cbp); - else + + if (async) { + while (!ISSET(cbp->b_flags, B_TDONE)) { + + lck_mtx_lock_spin(cl_transaction_mtxp); + + if (!ISSET(cbp->b_flags, B_TDONE)) { + DTRACE_IO1(wait__start, buf_t, cbp); + (void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL); + DTRACE_IO1(wait__done, buf_t, cbp); + } else + lck_mtx_unlock(cl_transaction_mtxp); + } + } else buf_biowait(cbp); } } @@ -735,6 +889,14 @@ cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, i for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) buf_biowait(cbp); } + /* + * we've already waited on all of the I/Os in this transaction, + * so mark all of the buf_t's in this transaction as B_TDONE + * so that cluster_iodone sees the transaction as completed + */ + for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) + cbp->b_flags |= B_TDONE; + error = cluster_iodone(*cbp_head, callback_arg); if ( !(flags & CL_ASYNC) && error && *retval == 0) { @@ -852,30 +1014,37 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no max_iosize = PAGE_SIZE; if (flags & CL_THROTTLE) { - if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) { - if (max_iosize > HARD_THROTTLE_MAXSIZE) - max_iosize = HARD_THROTTLE_MAXSIZE; + if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp, 1)) { + if (max_iosize > THROTTLE_MAX_IOSIZE) + max_iosize = THROTTLE_MAX_IOSIZE; async_throttle = HARD_THROTTLE_MAXCNT; } else { if ( (flags & CL_DEV_MEMORY) ) - async_throttle = VNODE_ASYNC_THROTTLE; + async_throttle = 
IO_SCALE(vp, VNODE_ASYNC_THROTTLE); else { u_int max_cluster; u_int max_cluster_size; - u_int max_prefetch; + u_int scale; max_cluster_size = MAX_CLUSTER_SIZE(vp); - max_prefetch = MAX_PREFETCH(vp); - + if (max_iosize > max_cluster_size) - max_cluster = max_cluster_size; + max_cluster = max_cluster_size; else max_cluster = max_iosize; if (size < max_cluster) max_cluster = size; + + if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) + scale = WRITE_THROTTLE_SSD; + else + scale = WRITE_THROTTLE; + + if (flags & CL_CLOSE) + scale += MAX_CLUSTERS; - async_throttle = min(VNODE_ASYNC_THROTTLE, (max_prefetch / max_cluster) - 1); + async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1); } } } @@ -883,14 +1052,18 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no io_flags |= B_AGE; if (flags & (CL_PAGEIN | CL_PAGEOUT)) io_flags |= B_PAGEIO; + if (flags & (CL_IOSTREAMING)) + io_flags |= B_IOSTREAMING; if (flags & CL_COMMIT) io_flags |= B_COMMIT_UPL; - if (flags & CL_PRESERVE) + if (flags & CL_DIRECT_IO) io_flags |= B_PHYS; - if (flags & CL_KEEPCACHED) - io_flags |= B_CACHE; + if (flags & (CL_PRESERVE | CL_KEEPCACHED)) + io_flags |= B_CACHE; if (flags & CL_PASSIVE) io_flags |= B_PASSIVE; + if (flags & CL_ENCRYPTED) + io_flags |= B_ENCRYPTED_IO; if (vp->v_flag & VSYSTEM) io_flags |= B_META; @@ -908,6 +1081,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no daddr64_t blkno; daddr64_t lblkno; u_int io_size_wanted; + size_t io_size_tmp; if (size > max_iosize) io_size = max_iosize; @@ -915,12 +1089,15 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no io_size = size; io_size_wanted = io_size; + io_size_tmp = (size_t)io_size; - if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) + if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, 
NULL))) break; - if (io_size > io_size_wanted) + if (io_size_tmp > io_size_wanted) io_size = io_size_wanted; + else + io_size = (u_int)io_size_tmp; if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) real_bp->b_blkno = blkno; @@ -943,6 +1120,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no off_t e_offset; int pageout_flags; + if (upl_get_internal_vectorupl(upl)) + panic("Vector UPLs should not take this code-path\n"); /* * we're writing into a 'hole' */ @@ -1048,7 +1227,6 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) { error = EINVAL; - break; } e_offset = round_page_64(f_offset + 1); io_size = e_offset - f_offset; @@ -1077,6 +1255,11 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no */ size = 0; } + if (error) { + if (size == 0) + flags &= ~CL_COMMIT; + break; + } continue; } lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64); @@ -1281,6 +1464,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } cbp->b_cliodone = (void *)callback; cbp->b_flags |= io_flags; + if (flags & CL_NOCACHE) + cbp->b_attr.ba_flags |= BA_NOCACHE; cbp->b_lblkno = lblkno; cbp->b_blkno = blkno; @@ -1314,10 +1499,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no cbp_head = cbp; cbp_tail = cbp; - if ( (cbp_head->b_real_bp = real_bp) ) { - cbp_head->b_flags |= B_NEED_IODONE; + if ( (cbp_head->b_real_bp = real_bp) ) real_bp = (buf_t)NULL; - } } *(buf_t *)(&cbp->b_trans_head) = cbp_head; @@ -1375,6 +1558,14 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if ( !(io_flags & B_READ)) vnode_startwrite(vp); + if (flags & CL_RAW_ENCRYPTED) { + /* + * User requested raw encrypted bytes. 
+ * Twiddle the bit in the ba_flags for the buffer + */ + cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO; + } + (void) VNOP_STRATEGY(cbp); if (need_EOT == TRUE) { @@ -1423,7 +1614,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no * since we never really issued the io * just go ahead and adjust it back */ - lck_mtx_lock_spin(cl_mtxp); + lck_mtx_lock_spin(&iostate->io_mtxp); if (iostate->io_error == 0) iostate->io_error = error; @@ -1437,7 +1628,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no iostate->io_wanted = 0; need_wakeup = 1; } - lck_mtx_unlock(cl_mtxp); + lck_mtx_unlock(&iostate->io_mtxp); if (need_wakeup) wakeup((caddr_t)&iostate->io_wanted); @@ -1451,7 +1642,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE, - (int)upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0); + upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0); } if (retval == 0) retval = error; @@ -1475,6 +1666,24 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no return (retval); } +#define reset_vector_run_state() \ + issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0; + +static int +vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize, + int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg) +{ + vector_upl_set_pagelist(vector_upl); + + if(io_flag & CL_READ) { + if(vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK)==0)) + io_flag &= ~CL_PRESERVE; /*don't zero fill*/ + else + io_flag |= CL_PRESERVE; /*zero fill*/ + } + return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, 
io_flag, real_bp, iostate, callback, callback_arg)); + +} static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag) @@ -1510,7 +1719,7 @@ cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct daddr64_t r_addr; off_t f_offset; int size_of_prefetch; - u_int max_prefetch; + u_int max_prefetch; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START, @@ -1530,8 +1739,16 @@ cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct return; } - max_prefetch = MAX_PREFETCH(vp); + max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD)); + if ((max_prefetch / PAGE_SIZE) > speculative_prefetch_max) + max_prefetch = (speculative_prefetch_max * PAGE_SIZE); + + if (max_prefetch <= PAGE_SIZE) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, + rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0); + return; + } if (extent->e_addr < rap->cl_maxra) { if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) { @@ -1576,7 +1793,7 @@ cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct int -cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, +cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, int size, off_t filesize, int flags) { return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL); @@ -1585,7 +1802,7 @@ cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int -cluster_pageout_ext(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, +cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) { int io_size; @@ -1593,18 +1810,7 @@ cluster_pageout_ext(vnode_t vp, upl_t upl, vm_offset_t 
upl_offset, off_t f_offse off_t max_size; int local_flags; - if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) - /* - * if we know we're issuing this I/O to a virtual device (i.e. disk image) - * then we don't want to enforce this throttle... if we do, we can - * potentially deadlock since we're stalling the pageout thread at a time - * when the disk image might need additional memory (which won't be available - * if the pageout thread can't run)... instead we'll just depend on the throttle - * that the pageout thread now has in place to deal with external files - */ - local_flags = CL_PAGEOUT; - else - local_flags = CL_PAGEOUT | CL_THROTTLE; + local_flags = CL_PAGEOUT | CL_THROTTLE; if ((flags & UPL_IOSYNC) == 0) local_flags |= CL_ASYNC; @@ -1612,8 +1818,8 @@ cluster_pageout_ext(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offse local_flags |= CL_COMMIT; if ((flags & UPL_KEEPCACHED)) local_flags |= CL_KEEPCACHED; - if (flags & IO_PASSIVE) - local_flags |= CL_PASSIVE; + if (flags & UPL_PAGING_ENCRYPTED) + local_flags |= CL_ENCRYPTED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE, @@ -1664,7 +1870,7 @@ cluster_pageout_ext(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offse int -cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, +cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, int size, off_t filesize, int flags) { return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL); @@ -1672,7 +1878,7 @@ cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int -cluster_pagein_ext(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, +cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) { u_int io_size; @@ -1688,8 +1894,10 @@ cluster_pagein_ext(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset 
local_flags |= CL_ASYNC; if ((flags & UPL_NOCOMMIT) == 0) local_flags |= CL_COMMIT; - if (flags & IO_PASSIVE) - local_flags |= CL_PASSIVE; + if (flags & UPL_IOSTREAMING) + local_flags |= CL_IOSTREAMING; + if (flags & UPL_PAGING_ENCRYPTED) + local_flags |= CL_ENCRYPTED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE, @@ -1741,7 +1949,7 @@ cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg) int flags; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START, - (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); + bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); if (bp->b_flags & B_READ) flags = CL_ASYNC | CL_READ; @@ -1779,13 +1987,14 @@ cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t flags = xflags; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; - if (vp->v_flag & VNOCACHE_DATA) + if (vp->v_flag & VNOCACHE_DATA){ flags |= IO_NOCACHE; - + bflag |= CL_NOCACHE; + } if (uio == NULL) { /* * no user data... @@ -1797,12 +2006,12 @@ cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t } /* * do a write through the cache if one of the following is true.... 
- * NOCACHE is not true and + * NOCACHE is not true or NODIRECT is true * the uio request doesn't target USERSPACE * otherwise, find out if we want the direct or contig variant for * the first vector in the uio request */ - if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) + if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) @@ -1884,6 +2093,18 @@ cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); break; } + /* + * in case we end up calling cluster_write_copy (from cluster_write_direct) + * multiple times to service a multi-vector request that is not aligned properly + * we need to update the oldEOF so that we + * don't zero-fill the head of a page if we've successfully written + * data to that area... 'cluster_write_copy' will zero-fill the head of a + * page that is beyond the oldEOF if the write is unaligned... we only + * want that to happen for the very first page of the cluster_write, + * NOT the first page of each vector making up a multi-vector write. 
+ */ + if (uio->uio_offset > oldEOF) + oldEOF = uio->uio_offset; } return (retval); } @@ -1896,13 +2117,13 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in upl_t upl; upl_page_info_t *pl; vm_offset_t upl_offset; + vm_offset_t vector_upl_offset = 0; u_int32_t io_req_size; u_int32_t offset_in_file; u_int32_t offset_in_iovbase; - u_int32_t io_size; - int io_flag; - int bflag; - vm_size_t upl_size; + u_int32_t io_size; + int io_flag = 0; + upl_size_t upl_size, vector_upl_size = 0; vm_size_t upl_needed_size; mach_msg_type_number_t pages_in_pl; int upl_flags; @@ -1915,15 +2136,17 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in user_addr_t iov_base; u_int32_t mem_alignment_mask; u_int32_t devblocksize; - u_int32_t max_upl_size; + u_int32_t max_io_size; + u_int32_t max_upl_size; + u_int32_t max_vector_size; + boolean_t io_throttled = FALSE; + u_int32_t vector_upl_iosize = 0; + int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); + off_t v_upl_uio_offset = 0; + int vector_upl_index=0; + upl_t vector_upl = NULL; - max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE); - - if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; - else - bflag = 0; /* * When we enter this routine, we know @@ -1932,11 +2155,23 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START, (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0); + max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE); + + io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO; + + if (flags & IO_PASSIVE) + io_flag |= CL_PASSIVE; + + if (flags & IO_NOCACHE) + io_flag |= CL_NOCACHE; + iostate.io_completed = 0; iostate.io_issued = 0; iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; devblocksize = 
(u_int32_t)vp->v_mount->mnt_devblocksize; @@ -1978,6 +2213,33 @@ next_dwrite: } while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { + int throttle_type; + + if ( (throttle_type = cluster_hard_throttle_on(vp, 1)) ) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ + if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == 2) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with EAGAIN to indicate + * to the FS issuing the cluster_write call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + io_throttled = TRUE; + goto wait_for_dwrites; + } + max_vector_size = THROTTLE_MAX_IOSIZE; + max_io_size = THROTTLE_MAX_IOSIZE; + } else { + max_vector_size = MAX_VECTOR_UPL_SIZE; + max_io_size = max_upl_size; + } if (first_IO) { cluster_syncup(vp, newEOF, callback, callback_arg); @@ -1986,8 +2248,25 @@ next_dwrite: io_size = io_req_size & ~PAGE_MASK; iov_base = uio_curriovbase(uio); - if (io_size > max_upl_size) - io_size = max_upl_size; + if (io_size > max_io_size) + io_size = max_io_size; + + if(useVectorUPL && (iov_base & PAGE_MASK)) { + /* + * We have an iov_base that's not page-aligned. + * Issue all I/O's that have been collected within + * this Vectored UPL. + */ + if(vector_upl_index) { + retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); + reset_vector_run_state(); + } + + /* + * After this point, if we are using the Vector UPL path and the base is + * not page-aligned then the UPL with that base will be the first in the vector UPL. 
+ */ + } upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; @@ -2073,6 +2352,18 @@ next_dwrite: */ goto wait_for_dwrites; } + + if(useVectorUPL) { + vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK); + if(end_off) + issueVectorUPL = 1; + /* + * After this point, if we are using a vector UPL, then + * either all the UPL elements end on a page boundary OR + * this UPL is the last element because it does not end + * on a page boundary. + */ + } /* * Now look for pages already in the cache @@ -2088,20 +2379,8 @@ next_dwrite: * if there are already too many outstanding writes * wait until some complete before issuing the next */ - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (2 * max_upl_size)) { - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 2 * max_upl_size, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 2 * max_upl_size, 0, 0); - } - lck_mtx_unlock(cl_mtxp); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct"); if (iostate.io_error) { /* @@ -2115,20 +2394,51 @@ next_dwrite: goto wait_for_dwrites; } - io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO | bflag; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START, (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0); - retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, + if(!useVectorUPL) + retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); + else { + if(!vector_upl_index) { + vector_upl = vector_upl_create(upl_offset); + v_upl_uio_offset = uio->uio_offset; + 
vector_upl_offset = upl_offset; + } + + vector_upl_set_subupl(vector_upl,upl,upl_size); + vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size); + vector_upl_index++; + vector_upl_iosize += io_size; + vector_upl_size += upl_size; + + if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { + retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); + reset_vector_run_state(); + } + } + /* * update the uio structure to * reflect the I/O that we just issued */ uio_update(uio, (user_size_t)io_size); + /* + * in case we end up calling through to cluster_write_copy to finish + * the tail of this request, we need to update the oldEOF so that we + * don't zero-fill the head of a page if we've successfully written + * data to that area... 'cluster_write_copy' will zero-fill the head of a + * page that is beyond the oldEOF if the write is unaligned... we only + * want that to happen for the very first page of the cluster_write, + * NOT the first page of each vector making up a multi-vector write. 
+ */ + if (uio->uio_offset > oldEOF) + oldEOF = uio->uio_offset; + io_req_size -= io_size; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END, @@ -2150,28 +2460,27 @@ next_dwrite: } wait_for_dwrites: - if (iostate.io_issued) { + + if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { + retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); + reset_vector_run_state(); + } + + if (iostate.io_issued > iostate.io_completed) { /* * make sure all async writes issued as part of this stream * have completed before we return */ - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); + cluster_iostate_wait(&iostate, 0, "cluster_write_direct"); } if (iostate.io_error) retval = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + + if (io_throttled == TRUE && retval == 0) + retval = EAGAIN; + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -2180,6 +2489,9 @@ wait_for_dwrites: * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set * so we can just pass 0 in for the headOff and tailOff */ + if (uio->uio_offset > oldEOF) + oldEOF = uio->uio_offset; + retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg); *write_type = IO_UNKNOWN; @@ -2202,7 +2514,7 @@ cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t tail_size = 0; u_int32_t io_size; u_int32_t xsize; - vm_size_t upl_size; 
+ upl_size_t upl_size; vm_size_t upl_needed_size; mach_msg_type_number_t pages_in_pl; int upl_flags; @@ -2231,6 +2543,8 @@ cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + next_cwrite: io_size = *write_length; @@ -2319,22 +2633,9 @@ next_cwrite: * if there are already too many outstanding writes * wait until some have completed before issuing the next */ - if (iostate.io_issued) { - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_IO_CONTIG_SIZE)) { - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig"); - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier writes we issued ran into a hard error @@ -2378,23 +2679,14 @@ wait_for_cwrites: * make sure all async writes that are part of this stream * have completed before we proceed */ - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_write_contig"); if 
(iostate.io_error) error = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (error == 0 && tail_size) error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg); @@ -2409,6 +2701,42 @@ wait_for_cwrites: } +/* + * need to avoid a race between an msync of a range of pages dirtied via mmap + * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's + * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd + * + * we should never force-zero-fill pages that are already valid in the cache... + * the entire page contains valid data (either from disk, zero-filled or dirtied + * via an mmap) so we can only do damage by trying to zero-fill + * + */ +static int +cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero) +{ + int zero_pg_index; + boolean_t need_cluster_zero = TRUE; + + if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) { + + bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64)); + zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64); + + if (upl_valid_page(pl, zero_pg_index)) { + /* + * never force zero valid pages - dirty or clean + * we'll leave these in the UPL for cluster_write_copy to deal with + */ + need_cluster_zero = FALSE; + } + } + if (need_cluster_zero == TRUE) + cluster_zero(upl, io_offset, bytes_to_zero, NULL); + + return (bytes_to_zero); +} + + static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg) @@ -2433,16 +2761,14 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old off_t zero_off; long long zero_cnt1; off_t zero_off1; + off_t write_off = 0; + int write_cnt = 0; + boolean_t first_pass = FALSE; struct cl_extent cl; struct cl_writebehind *wbp; int bflag; 
- u_int max_cluster_pgcount; - u_int max_io_size; - - if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; - else - bflag = 0; + u_int max_cluster_pgcount; + u_int max_io_size; if (uio) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, @@ -2455,6 +2781,13 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old io_resid = 0; } + if (flags & IO_PASSIVE) + bflag = CL_PASSIVE; + else + bflag = 0; + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + zero_cnt = 0; zero_cnt1 = 0; zero_off = 0; @@ -2480,6 +2813,16 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old zero_cnt = newEOF - headOff; zero_off = headOff; } + } else { + if (uio && uio->uio_offset > oldEOF) { + zero_off = uio->uio_offset & ~PAGE_MASK_64; + + if (zero_off >= oldEOF) { + zero_cnt = uio->uio_offset - zero_off; + + flags |= IO_HEADZEROFILL; + } + } } if (flags & IO_TAILZEROFILL) { if (uio) { @@ -2488,13 +2831,32 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old if (zero_off1 < tailOff) zero_cnt1 = tailOff - zero_off1; } + } else { + if (uio && newEOF > oldEOF) { + zero_off1 = uio->uio_offset + io_req_size; + + if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) { + zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64); + + flags |= IO_TAILZEROFILL; + } + } } if (zero_cnt == 0 && uio == (struct uio *) 0) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, 0, 0, 0); return (0); } - + if (uio) { + write_off = uio->uio_offset; + write_cnt = uio_resid(uio); + /* + * delay updating the sequential write info + * in the control block until we've obtained + * the lock for it + */ + first_pass = TRUE; + } while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) { /* * for this iteration of the loop, figure out where our starting point is @@ -2523,11 +2885,11 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old * because IO_HEADZEROFILL and IO_TAILZEROFILL 
not set */ if ((start_offset + total_size) > max_io_size) - total_size -= start_offset; + total_size = max_io_size - start_offset; xfer_resid = total_size; retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1); - + if (retval) break; @@ -2585,14 +2947,14 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old upl_size, &upl, &pl, - UPL_SET_LITE | UPL_WILL_MODIFY); + UPL_SET_LITE | (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY)); if (kret != KERN_SUCCESS) panic("cluster_write_copy: failed to get pagelist"); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, - (int)upl, (int)upl_f_offset, start_offset, 0, 0); + upl, (int)upl_f_offset, start_offset, 0, 0); - if (start_offset && !upl_valid_page(pl, 0)) { + if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) { int read_size; /* @@ -2602,8 +2964,8 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old */ read_size = PAGE_SIZE; - if ((upl_f_offset + read_size) > newEOF) - read_size = newEOF - upl_f_offset; + if ((upl_f_offset + read_size) > oldEOF) + read_size = oldEOF - upl_f_offset; retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); @@ -2620,7 +2982,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, - (int)upl, 0, 0, retval, 0); + upl, 0, 0, retval, 0); break; } } @@ -2638,8 +3000,8 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old read_size = PAGE_SIZE; - if ((upl_f_offset + upl_offset + read_size) > newEOF) - read_size = newEOF - (upl_f_offset + upl_offset); + if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) + read_size = oldEOF - (upl_f_offset + upl_offset); retval = cluster_io(vp, upl, upl_offset, 
upl_f_offset + upl_offset, read_size, CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); @@ -2656,7 +3018,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, - (int)upl, 0, 0, retval, 0); + upl, 0, 0, retval, 0); break; } } @@ -2671,22 +3033,8 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old else bytes_to_zero = xfer_resid; - if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) { - cluster_zero(upl, io_offset, bytes_to_zero, NULL); - } else { - int zero_pg_index; - - bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64)); - zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64); - - if ( !upl_valid_page(pl, zero_pg_index)) { - cluster_zero(upl, io_offset, bytes_to_zero, NULL); + bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero); - } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY && - !upl_dirty_page(pl, zero_pg_index)) { - cluster_zero(upl, io_offset, bytes_to_zero, NULL); - } - } xfer_resid -= bytes_to_zero; zero_cnt -= bytes_to_zero; zero_off += bytes_to_zero; @@ -2701,11 +3049,10 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested); if (retval) { - ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, - (int)upl, 0, 0, retval, 0); + upl, 0, 0, retval, 0); } else { io_resid -= bytes_to_move; xfer_resid -= bytes_to_move; @@ -2719,27 +3066,13 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old else bytes_to_zero = xfer_resid; - if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) { - cluster_zero(upl, io_offset, bytes_to_zero, 
NULL); - } else { - int zero_pg_index; - - bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64)); - zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64); - - if ( !upl_valid_page(pl, zero_pg_index)) { - cluster_zero(upl, io_offset, bytes_to_zero, NULL); - } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY && - !upl_dirty_page(pl, zero_pg_index)) { - cluster_zero(upl, io_offset, bytes_to_zero, NULL); - } - } + bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero); + xfer_resid -= bytes_to_zero; zero_cnt1 -= bytes_to_zero; zero_off1 += bytes_to_zero; io_offset += bytes_to_zero; } - if (retval == 0) { int cl_index; int ret_cluster_try_push; @@ -2782,14 +3115,14 @@ check_cluster: */ cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64); - if (flags & IO_SYNC) + if (flags & IO_SYNC) { /* * if the IO_SYNC flag is set than we need to * bypass any clusters and immediately issue * the I/O */ goto issue_io; - + } /* * take the lock to protect our accesses * of the writebehind and sparse cluster state @@ -2803,7 +3136,7 @@ check_cluster: * we've fallen into the sparse * cluster method of delaying dirty pages */ - sparse_cluster_add(wbp, vp, &cl, newEOF, callback, callback_arg); + sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); lck_mtx_unlock(&wbp->cl_lockw); @@ -2818,7 +3151,7 @@ check_cluster: */ wbp->cl_number = 0; - sparse_cluster_push(wbp, vp, newEOF, PUSH_ALL, callback, callback_arg); + sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg); /* * no clusters of either type present at this point * so just go directly to start_new_cluster since @@ -2827,8 +3160,18 @@ check_cluster: * to avoid the deadlock with sparse_cluster_push */ goto start_new_cluster; - } - if (wbp->cl_number == 0) + } + if (first_pass) { + if (write_off == wbp->cl_last_write) + wbp->cl_seq_written += write_cnt; + 
else + wbp->cl_seq_written = write_cnt; + + wbp->cl_last_write = write_off + write_cnt; + + first_pass = FALSE; + } + if (wbp->cl_number == 0) /* * no clusters currently present */ @@ -2942,14 +3285,27 @@ check_cluster: */ goto delay_io; - if (wbp->cl_number < MAX_CLUSTERS) + if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && + wbp->cl_number == MAX_CLUSTERS && + wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { + uint32_t n; + + if (vp->v_mount->mnt_kern_flag & MNTK_SSD) + n = WRITE_BEHIND_SSD; + else + n = WRITE_BEHIND; + + while (n--) + cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg); + } + if (wbp->cl_number < MAX_CLUSTERS) { /* * we didn't find an existing cluster to * merge into, but there's room to start * a new one */ goto start_new_cluster; - + } /* * no exisitng cluster to merge with and no * room to start a new one... we'll try @@ -2967,7 +3323,7 @@ check_cluster: */ if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { - ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, callback, callback_arg); + ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg); } /* @@ -2980,24 +3336,12 @@ check_cluster: * sparse mechanism.... */ sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg); - sparse_cluster_add(wbp, vp, &cl, newEOF, callback, callback_arg); + sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); lck_mtx_unlock(&wbp->cl_lockw); continue; } - /* - * we pushed one cluster successfully, so we must be sequentially writing this file - * otherwise, we would have failed and fallen into the sparse cluster support - * so let's take the opportunity to push out additional clusters... - * this will give us better I/O locality if we're in a copy loop - * (i.e. 
we won't jump back and forth between the read and write points - */ - if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { - while (wbp->cl_number) - cluster_try_push(wbp, vp, newEOF, 0, callback, callback_arg); - } - start_new_cluster: wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; @@ -3058,17 +3402,34 @@ cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (* flags |= IO_NOCACHE; if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) flags |= IO_RAOFF; + + /* + * If we're doing an encrypted IO, then first check to see + * if the IO requested was page aligned. If not, then bail + * out immediately. + */ + if (flags & IO_ENCRYPTED) { + if (read_length & PAGE_MASK) { + retval = EINVAL; + return retval; + } + } - /* + /* * do a read through the cache if one of the following is true.... * NOCACHE is not true * the uio request doesn't target USERSPACE + * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well. + * Reading encrypted data from a CP filesystem should never result in the data touching + * the UBC. 
+ * * otherwise, find out if we want the direct or contig variant for * the first vector in the uio request */ - if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) - retval = cluster_io_type(uio, &read_type, &read_length, 0); - + if (((flags & IO_NOCACHE) || (flags & IO_ENCRYPTED)) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) { + retval = cluster_io_type(uio, &read_type, &read_length, 0); + } + while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) { switch (read_type) { @@ -3106,13 +3467,13 @@ cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (* static void -cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int flags) +cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference) { int range; int abort_flags = UPL_ABORT_FREE_ON_EMPTY; if ((range = last_pg - start_pg)) { - if ( !(flags & IO_NOCACHE)) + if (take_reference) abort_flags |= UPL_ABORT_REFERENCE; ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags); @@ -3126,7 +3487,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file upl_page_info_t *pl; upl_t upl; vm_offset_t upl_offset; - u_int32_t upl_size; + u_int32_t upl_size; off_t upl_f_offset; int start_offset; int start_pg; @@ -3143,8 +3504,8 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file u_int32_t xsize; u_int32_t io_size; u_int32_t max_rd_size; - u_int32_t max_io_size; - u_int32_t max_prefetch; + u_int32_t max_io_size; + u_int32_t max_prefetch; u_int rd_ahead_enabled = 1; u_int prefetch_enabled = 1; struct cl_readahead * rap; @@ -3152,51 +3513,60 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file struct cl_extent extent; int bflag; int take_reference = 1; - struct uthread *ut; int policy = IOPOL_DEFAULT; + boolean_t iolock_inited = FALSE; - policy = current_proc()->p_iopol_disk; - - ut = get_bsdthread_info(current_thread()); - - 
if (ut->uu_iopol_disk != IOPOL_DEFAULT) - policy = ut->uu_iopol_disk; + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, + (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); + + if (flags & IO_ENCRYPTED) { + panic ("encrypted blocks will hit UBC!"); + } + + policy = proc_get_task_selfdiskacc(); - if (policy == IOPOL_THROTTLE) + if (policy == IOPOL_THROTTLE || policy == IOPOL_UTILITY || (flags & IO_NOCACHE)) take_reference = 0; if (flags & IO_PASSIVE) bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; - max_prefetch = MAX_PREFETCH(vp); - max_rd_size = max_prefetch; - max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + + max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); + max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD)); + max_rd_size = max_prefetch; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, - (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); - last_request_offset = uio->uio_offset + io_req_size; + if (last_request_offset > filesize) + last_request_offset = filesize; + if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) { rd_ahead_enabled = 0; rap = NULL; } else { - if (cluster_hard_throttle_on(vp)) { + if (cluster_hard_throttle_on(vp, 1)) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ rd_ahead_enabled = 0; prefetch_enabled = 0; - max_rd_size = HARD_THROTTLE_MAXSIZE; + max_rd_size = THROTTLE_MAX_IOSIZE; } if ((rap = cluster_get_rap(vp)) == NULL) rd_ahead_enabled = 0; + else { + extent.b_addr = uio->uio_offset / PAGE_SIZE_64; + extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64; + } } - if (last_request_offset > filesize) - last_request_offset = filesize; - extent.b_addr = uio->uio_offset / PAGE_SIZE_64; - extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64; - if 
(rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) { /* * determine if we already have a read-ahead in the pipe courtesy of the @@ -3215,17 +3585,8 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file last_ioread_offset = (off_t)0; while (io_req_size && uio->uio_offset < filesize && retval == 0) { - /* - * compute the size of the upl needed to encompass - * the requested read... limit each call to cluster_io - * to the maximum UPL size... cluster_io will clip if - * this exceeds the maximum io_size for the device, - * make sure to account for - * a starting offset that's not page aligned - */ - start_offset = (int)(uio->uio_offset & PAGE_MASK_64); - upl_f_offset = uio->uio_offset - (off_t)start_offset; - max_size = filesize - uio->uio_offset; + + max_size = filesize - uio->uio_offset; if ((off_t)(io_req_size) < max_size) io_size = io_req_size; @@ -3292,7 +3653,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file */ break; - if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) { + if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) { /* * we're already finished the I/O for this read request * let's see if we should do a read-ahead @@ -3310,10 +3671,47 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file } break; } - start_offset = (int)(uio->uio_offset & PAGE_MASK_64); - upl_f_offset = uio->uio_offset - (off_t)start_offset; - max_size = filesize - uio->uio_offset; + /* + * recompute max_size since cluster_copy_ubc_data_internal + * may have advanced uio->uio_offset + */ + max_size = filesize - uio->uio_offset; } + + iostate.io_completed = 0; + iostate.io_issued = 0; + iostate.io_error = 0; + iostate.io_wanted = 0; + + if ( (flags & IO_RETURN_ON_THROTTLE) ) { + if (cluster_hard_throttle_on(vp, 0) == 2) { + if ( !cluster_io_present_in_BC(vp, 
uio->uio_offset)) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with EAGAIN to indicate + * to the FS issuing the cluster_read call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + retval = EAGAIN; + break; + } + } + } + + /* + * compute the size of the upl needed to encompass + * the requested read... limit each call to cluster_io + * to the maximum UPL size... cluster_io will clip if + * this exceeds the maximum io_size for the device, + * make sure to account for + * a starting offset that's not page aligned + */ + start_offset = (int)(uio->uio_offset & PAGE_MASK_64); + upl_f_offset = uio->uio_offset - (off_t)start_offset; + if (io_size > max_rd_size) io_size = max_rd_size; @@ -3329,7 +3727,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file pages_in_upl = upl_size / PAGE_SIZE; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START, - (int)upl, (int)upl_f_offset, upl_size, start_offset, 0); + upl, (int)upl_f_offset, upl_size, start_offset, 0); kret = ubc_create_upl(vp, upl_f_offset, @@ -3341,7 +3739,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file panic("cluster_read_copy: failed to get pagelist"); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END, - (int)upl, (int)upl_f_offset, upl_size, start_offset, 0); + upl, (int)upl_f_offset, upl_size, start_offset, 0); /* * scan from the beginning of the upl looking for the first @@ -3364,10 +3762,6 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file if (upl_valid_page(pl, last_pg)) break; } - iostate.io_completed = 0; - iostate.io_issued = 0; - iostate.io_error = 0; - iostate.io_wanted = 0; if (start_pg < last_pg) { /* @@ -3376,10 +3770,15 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file * we may have to clip the size of 
it to keep from reading past * the end of the last physical block associated with the file */ + if (iolock_inited == FALSE) { + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + + iolock_inited = TRUE; + } upl_offset = start_pg * PAGE_SIZE; io_size = (last_pg - start_pg) * PAGE_SIZE; - if ((upl_f_offset + upl_offset + io_size) > filesize) + if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) io_size = filesize - (upl_f_offset + upl_offset); /* @@ -3388,6 +3787,18 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg); + + if (rap) { + if (extent.e_addr < rap->cl_maxra) { + /* + * we've just issued a read for a block that should have been + * in the cache courtesy of the read-ahead engine... something + * has gone wrong with the pipeline, so reset the read-ahead + * logic which will cause us to restart from scratch + */ + rap->cl_maxra = 0; + } + } } if (error == 0) { /* @@ -3466,19 +3877,8 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file rap->cl_lastr = extent.e_addr; } } - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_copy", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); if (iostate.io_error) error = iostate.io_error; @@ -3491,6 +3891,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file io_req_size -= (val_size - io_requested); } + } else { + if 
(iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); } if (start_pg < last_pg) { /* @@ -3501,16 +3904,22 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file */ io_size = (last_pg - start_pg) * PAGE_SIZE; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, (int)upl, start_pg * PAGE_SIZE, io_size, error, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0); if (error || (flags & IO_NOCACHE)) ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); - else - ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, - UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY | UPL_COMMIT_INACTIVATE); + else { + int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY; + + if (take_reference) + commit_flags |= UPL_COMMIT_INACTIVATE; + else + commit_flags |= UPL_COMMIT_SPECULATE; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, (int)upl, start_pg * PAGE_SIZE, io_size, error, 0); + ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags); + } + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0); } if ((last_pg - start_pg) < pages_in_upl) { /* @@ -3523,47 +3932,66 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, - (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0); + upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0); /* * handle any valid pages at the beginning of * the upl... release these appropriately */ - cluster_read_upl_release(upl, 0, start_pg, flags); + cluster_read_upl_release(upl, 0, start_pg, take_reference); /* * handle any valid pages immediately after the * pages we issued I/O for... ... 
release these appropriately */ - cluster_read_upl_release(upl, last_pg, uio_last, flags); + cluster_read_upl_release(upl, last_pg, uio_last, take_reference); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, (int)upl, -1, -1, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0); } } if (retval == 0) retval = error; if (io_req_size) { - if (cluster_hard_throttle_on(vp)) { + if (cluster_hard_throttle_on(vp, 1)) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ rd_ahead_enabled = 0; prefetch_enabled = 0; - - max_rd_size = HARD_THROTTLE_MAXSIZE; + max_rd_size = THROTTLE_MAX_IOSIZE; } else { - if (max_rd_size == HARD_THROTTLE_MAXSIZE) { + if (max_rd_size == THROTTLE_MAX_IOSIZE) { /* * coming out of throttled state */ - if (rap != NULL) - rd_ahead_enabled = 1; - prefetch_enabled = 1; - + if (policy != IOPOL_THROTTLE && policy != IOPOL_UTILITY) { + if (rap != NULL) + rd_ahead_enabled = 1; + prefetch_enabled = 1; + } max_rd_size = max_prefetch; last_ioread_offset = 0; } } } } + if (iolock_inited == TRUE) { + if (iostate.io_issued > iostate.io_completed) { + /* + * cluster_io returned an error after it + * had already issued some I/O. we need + * to wait for that I/O to complete before + * we can destroy the iostate mutex... 
+ * 'retval' already contains the early error + * so no need to pick it up from iostate.io_error + */ + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); + } + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + } if (rap != NULL) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0); @@ -3585,18 +4013,16 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, upl_t upl; upl_page_info_t *pl; off_t max_io_size; - vm_offset_t upl_offset; - vm_size_t upl_size; + vm_offset_t upl_offset, vector_upl_offset = 0; + upl_size_t upl_size, vector_upl_size = 0; vm_size_t upl_needed_size; unsigned int pages_in_pl; int upl_flags; - int bflag; kern_return_t kret; unsigned int i; int force_data_sync; int retval = 0; int no_zero_fill = 0; - int abort_flag = 0; int io_flag = 0; int misaligned = 0; struct clios iostate; @@ -3609,30 +4035,47 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t xsize; u_int32_t devblocksize; u_int32_t mem_alignment_mask; - u_int32_t max_upl_size; - u_int32_t max_rd_size; - u_int32_t max_rd_ahead; - + u_int32_t max_upl_size; + u_int32_t max_rd_size; + u_int32_t max_rd_ahead; + u_int32_t max_vector_size; + boolean_t strict_uncached_IO = FALSE; + boolean_t io_throttled = FALSE; + + u_int32_t vector_upl_iosize = 0; + int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); + off_t v_upl_uio_offset = 0; + int vector_upl_index=0; + upl_t vector_upl = NULL; + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, + (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); - max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ); + max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ); - max_rd_size = max_upl_size; - max_rd_ahead = max_rd_size * 2; + max_rd_size = max_upl_size; + max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); + io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO; if (flags 
& IO_PASSIVE) - bflag = CL_PASSIVE; - else - bflag = 0; + io_flag |= CL_PASSIVE; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, - (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); + if (flags & IO_ENCRYPTED) { + io_flag |= CL_RAW_ENCRYPTED; + } + + if (flags & IO_NOCACHE) { + io_flag |= CL_NOCACHE; + } iostate.io_completed = 0; iostate.io_issued = 0; iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; @@ -3652,6 +4095,9 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, */ devblocksize = PAGE_SIZE; } + + strict_uncached_IO = ubc_strict_uncached_IO(vp); + next_dread: io_req_size = *read_length; iov_base = uio_curriovbase(uio); @@ -3678,7 +4124,17 @@ next_dread: * I/O that ends on a page boundary in cluster_io */ misaligned = 1; - } + } + + /* + * The user must request IO in aligned chunks. If the + * offset into the file is bad, or the userland pointer + * is non-aligned, then we cannot service the encrypted IO request. + */ + if ((flags & IO_ENCRYPTED) && (misaligned)) { + retval = EINVAL; + } + /* * When we get to this point, we know... 
* -- the offset into the file is on a devblocksize boundary @@ -3687,24 +4143,35 @@ next_dread: while (io_req_size && retval == 0) { u_int32_t io_start; - if (cluster_hard_throttle_on(vp)) { - max_rd_size = HARD_THROTTLE_MAXSIZE; - max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1; + if (cluster_hard_throttle_on(vp, 1)) { + /* + * we're in the throttle window, at the very least + * we want to limit the size of the I/O we're about + * to issue + */ + max_rd_size = THROTTLE_MAX_IOSIZE; + max_rd_ahead = THROTTLE_MAX_IOSIZE - 1; + max_vector_size = THROTTLE_MAX_IOSIZE; } else { max_rd_size = max_upl_size; - max_rd_ahead = max_rd_size * 2; + max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); + max_vector_size = MAX_VECTOR_UPL_SIZE; } io_start = io_size = io_req_size; /* * First look for pages already in the cache - * and move them to user space. + * and move them to user space. But only do this + * check if we are not retrieving encrypted data directly + * from the filesystem; those blocks should never + * be in the UBC. * * cluster_copy_ubc_data returns the resid * in io_size */ - retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); - + if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { + retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); + } /* * calculate the number of bytes actually copied * starting size - residual @@ -3713,10 +4180,36 @@ next_dread: io_req_size -= xsize; + if(useVectorUPL && (xsize || (iov_base & PAGE_MASK))) { + /* + * We found something in the cache or we have an iov_base that's not + * page-aligned. + * + * Issue all I/O's that have been collected within this Vectored UPL. 
+ */ + if(vector_upl_index) { + retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); + reset_vector_run_state(); + } + + if(xsize) + useVectorUPL = 0; + + /* + * After this point, if we are using the Vector UPL path and the base is + * not page-aligned then the UPL with that base will be the first in the vector UPL. + */ + } + /* - * check to see if we are finished with this request... + * check to see if we are finished with this request. + * + * If we satisfied this IO already, then io_req_size will be 0. + * Otherwise, see if the IO was mis-aligned and needs to go through + * the UBC to deal with the 'tail'. + * */ - if (io_req_size == 0 || misaligned) { + if (io_req_size == 0 || (misaligned)) { /* * see if there's another uio vector to * process that's of type IO_DIRECT @@ -3742,13 +4235,31 @@ next_dread: * (which overlaps the end of the direct read) in order to * get at the overhang bytes */ - if (io_size & (devblocksize - 1)) { - /* - * request does NOT end on a device block boundary - * so clip it back to a PAGE_SIZE boundary - */ - io_size &= ~PAGE_MASK; - io_min = PAGE_SIZE; + if (io_size & (devblocksize - 1)) { + if (flags & IO_ENCRYPTED) { + /* + * Normally, we'd round down to the previous page boundary to + * let the UBC manage the zero-filling of the file past the EOF. + * But if we're doing encrypted IO, we can't let any of + * the data hit the UBC. This means we have to do the full + * IO to the upper block boundary of the device block that + * contains the EOF. The user will be responsible for not + * interpreting data PAST the EOF in its buffer. 
+ * + * So just bump the IO back up to a multiple of devblocksize + */ + io_size = ((io_size + devblocksize) & ~(devblocksize - 1)); + io_min = io_size; + } + else { + /* + * Clip the request to the previous page size boundary + * since request does NOT end on a device block boundary + */ + io_size &= ~PAGE_MASK; + io_min = PAGE_SIZE; + } + } if (retval || io_size < io_min) { /* @@ -3760,21 +4271,49 @@ next_dread: */ goto wait_for_dreads; } - if ((xsize = io_size) > max_rd_size) - xsize = max_rd_size; - io_size = 0; + /* + * Don't re-check the UBC data if we are looking for uncached IO + * or asking for encrypted blocks. + */ + if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { - ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); + if ((xsize = io_size) > max_rd_size) + xsize = max_rd_size; - if (io_size == 0) { - /* - * a page must have just come into the cache - * since the first page in this range is no - * longer absent, go back and re-evaluate - */ - continue; + io_size = 0; + + ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); + + if (io_size == 0) { + /* + * a page must have just come into the cache + * since the first page in this range is no + * longer absent, go back and re-evaluate + */ + continue; + } } + if ( (flags & IO_RETURN_ON_THROTTLE) ) { + if (cluster_hard_throttle_on(vp, 0) == 2) { + if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { + /* + * we're in the throttle window and at least 1 I/O + * has already been issued by a throttleable thread + * in this window, so return with EAGAIN to indicate + * to the FS issuing the cluster_read call that it + * should now throttle after dropping any locks + */ + throttle_info_update_by_mount(vp->v_mount); + + io_throttled = TRUE; + goto wait_for_dreads; + } + } + } + if (io_size > max_rd_size) + io_size = max_rd_size; + iov_base = uio_curriovbase(uio); upl_offset = (vm_offset_t)((u_int32_t)iov_base & 
PAGE_MASK); @@ -3783,13 +4322,11 @@ next_dread: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START, (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); - if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) { + if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) no_zero_fill = 1; - abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY; - } else { + else no_zero_fill = 0; - abort_flag = UPL_ABORT_FREE_ON_EMPTY; - } + for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { pages_in_pl = 0; upl_size = upl_needed_size; @@ -3820,13 +4357,13 @@ next_dread: pl = UPL_GET_INTERNAL_PAGE_LIST(upl); for (i = 0; i < pages_in_pl; i++) { - if (!upl_valid_page(pl, i)) + if (!upl_page_present(pl, i)) break; } if (i == pages_in_pl) break; - ubc_upl_abort(upl, abort_flag); + ubc_upl_abort(upl, 0); } if (force_data_sync >= 3) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, @@ -3844,32 +4381,33 @@ next_dread: io_size = 0; } if (io_size == 0) { - ubc_upl_abort(upl, abort_flag); + ubc_upl_abort(upl, 0); goto wait_for_dreads; } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, (int)upl_offset, upl_size, io_size, kret, 0); + if(useVectorUPL) { + vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK); + if(end_off) + issueVectorUPL = 1; + /* + * After this point, if we are using a vector UPL, then + * either all the UPL elements end on a page boundary OR + * this UPL is the last element because it does not end + * on a page boundary. 
+ */ + } + /* * request asynchronously so that we can overlap * the preparation of the next I/O * if there are already too many outstanding reads * wait until some have completed before issuing the next read */ - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct"); - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - if (iostate.io_error) { /* * one of the earlier reads we issued ran into a hard error @@ -3878,29 +4416,65 @@ next_dread: * go wait for any other reads to complete before * returning the error to the caller */ - ubc_upl_abort(upl, abort_flag); + ubc_upl_abort(upl, 0); goto wait_for_dreads; } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START, - (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0); + upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0); - if (no_zero_fill) - io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO | bflag; - else - io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO | CL_PRESERVE | bflag; - retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); + if(!useVectorUPL) { + if (no_zero_fill) + io_flag &= ~CL_PRESERVE; + else + io_flag |= CL_PRESERVE; + + retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); + + } else { + + if(!vector_upl_index) { + vector_upl = vector_upl_create(upl_offset); + v_upl_uio_offset = uio->uio_offset; + vector_upl_offset = 
upl_offset; + } + vector_upl_set_subupl(vector_upl,upl, upl_size); + vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size); + vector_upl_index++; + vector_upl_size += upl_size; + vector_upl_iosize += io_size; + + if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { + retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); + reset_vector_run_state(); + } + } /* * update the uio structure */ - uio_update(uio, (user_size_t)io_size); - - io_req_size -= io_size; + if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) { + uio_update(uio, (user_size_t)max_io_size); + } + else { + uio_update(uio, (user_size_t)io_size); + } + /* + * Under normal circumstances, the io_size should not be + * bigger than the io_req_size, but we may have had to round up + * to the end of the page in the encrypted IO case. In that case only, + * ensure that we only decrement io_req_size to 0. 
+ */ + if ((flags & IO_ENCRYPTED) && (io_size > io_req_size)) { + io_req_size = 0; + } + else { + io_req_size -= io_size; + } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, - (int)upl, (int)uio->uio_offset, io_req_size, retval, 0); + upl, (int)uio->uio_offset, io_req_size, retval, 0); } /* end while */ @@ -3918,29 +4492,26 @@ next_dread: } wait_for_dreads: - if (iostate.io_issued) { - /* - * make sure all async reads that are part of this stream - * have completed before we return - */ - lck_mtx_lock(cl_mtxp); - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); + if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { + retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); + reset_vector_run_state(); } + /* + * make sure all async reads that are part of this stream + * have completed before we return + */ + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_direct"); if (iostate.io_error) retval = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + + if (io_throttled == TRUE && retval == 0) + retval = EAGAIN; + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -3967,7 +4538,7 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, addr64_t dst_paddr = 0; user_addr_t iov_base; off_t max_size; - vm_size_t upl_size; + upl_size_t upl_size; vm_size_t upl_needed_size; mach_msg_type_number_t pages_in_pl; int upl_flags; @@ -3985,10 
+4556,13 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, int bflag; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; - + bflag = 0; + + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; + /* * When we enter this routine, we know * -- the read_length will not exceed the current iov_len @@ -4004,6 +4578,8 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + next_cread: io_size = *read_length; @@ -4101,21 +4677,9 @@ next_cread: * if there are already too many outstanding reads * wait until some have completed before issuing the next */ - if (iostate.io_issued) { - lck_mtx_lock(cl_mtxp); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig"); - while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_IO_CONTIG_SIZE)) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier reads we issued ran into a hard error @@ -4156,23 +4720,14 @@ wait_for_creads: * make sure all async reads that are part of this stream * have completed before we proceed */ - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); - - 
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_contig"); if (iostate.io_error) error = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (error == 0 && tail_size) error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg); @@ -4193,7 +4748,7 @@ cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t m user_size_t iov_len; user_addr_t iov_base = 0; upl_t upl; - vm_size_t upl_size; + upl_size_t upl_size; int upl_flags; int retval = 0; @@ -4204,7 +4759,7 @@ cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t m iov_len = uio_curriovlen(uio); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, (int)uio, (int)iov_len, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0); if (iov_len) { iov_base = uio_curriovbase(uio); @@ -4246,7 +4801,7 @@ cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t m *io_length = 0; *io_type = IO_UNKNOWN; } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, (int)iov_base, *io_type, *io_length, retval, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0); return (retval); } @@ -4268,7 +4823,7 @@ advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*c upl_page_info_t *pl; upl_t upl; vm_offset_t upl_offset; - int upl_size; + int upl_size; off_t upl_f_offset; int start_offset; int start_pg; @@ -4280,9 +4835,9 @@ advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*c int retval = 0; int issued_io; int skip_range; - uint32_t max_io_size; - - + uint32_t max_io_size; + + if ( !UBCINFOEXISTS(vp)) return(EINVAL); @@ -4290,9 +4845,19 @@ advisory_read_ext(vnode_t vp, off_t 
filesize, off_t f_offset, int resid, int (*c return(EINVAL); max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); - + +#if CONFIG_EMBEDDED + if (max_io_size > speculative_prefetch_max_iosize) + max_io_size = speculative_prefetch_max_iosize; +#else + if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) { + if (max_io_size > speculative_prefetch_max_iosize) + max_io_size = speculative_prefetch_max_iosize; + } +#endif + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START, - (int)f_offset, resid, (int)filesize, 0, 0); + (int)f_offset, resid, (int)filesize, 0, 0); while (resid && f_offset < filesize && retval == 0) { /* @@ -4346,7 +4911,7 @@ advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*c pages_in_upl = upl_size / PAGE_SIZE; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START, - (int)upl, (int)upl_f_offset, upl_size, start_offset, 0); + upl, (int)upl_f_offset, upl_size, start_offset, 0); kret = ubc_create_upl(vp, upl_f_offset, @@ -4371,7 +4936,7 @@ advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*c KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END, - (int)upl, (int)upl_f_offset, upl_size, start_offset, 0); + upl, (int)upl_f_offset, upl_size, start_offset, 0); for (last_pg = 0; last_pg < pages_in_upl; ) { @@ -4407,7 +4972,7 @@ advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*c upl_offset = start_pg * PAGE_SIZE; io_size = (last_pg - start_pg) * PAGE_SIZE; - if ((upl_f_offset + upl_offset + io_size) > filesize) + if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) io_size = filesize - (upl_f_offset + upl_offset); /* @@ -4448,10 +5013,11 @@ int cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg) { int retval; + int my_sparse_wait = 0; struct cl_writebehind *wbp; if ( !UBCINFOEXISTS(vp)) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0); + 
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -1, 0); return (0); } /* return if deferred write is set */ @@ -4459,32 +5025,97 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca return (0); } if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -2, 0); return (0); } if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) { lck_mtx_unlock(&wbp->cl_lockw); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -3, 0); return(0); } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START, - (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0); + wbp->cl_scmap, wbp->cl_number, flags, 0, 0); + /* + * if we have an fsync in progress, we don't want to allow any additional + * sync/fsync/close(s) to occur until it finishes. + * note that its possible for writes to continue to occur to this file + * while we're waiting and also once the fsync starts to clean if we're + * in the sparse map case + */ + while (wbp->cl_sparse_wait) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, vp, 0, 0, 0, 0); + + msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, vp, 0, 0, 0, 0); + } + if (flags & IO_SYNC) { + my_sparse_wait = 1; + wbp->cl_sparse_wait = 1; + + /* + * this is an fsync (or equivalent)... we must wait for any existing async + * cleaning operations to complete before we evaulate the current state + * and finish cleaning... 
this insures that all writes issued before this + * fsync actually get cleaned to the disk before this fsync returns + */ + while (wbp->cl_sparse_pushes) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, vp, 0, 0, 0, 0); + + msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, vp, 0, 0, 0, 0); + } + } if (wbp->cl_scmap) { - sparse_cluster_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + void *scmap; - retval = 1; - } else - retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) { + + scmap = wbp->cl_scmap; + wbp->cl_scmap = NULL; + wbp->cl_sparse_pushes++; + + lck_mtx_unlock(&wbp->cl_lockw); + + sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); + + lck_mtx_lock(&wbp->cl_lockw); + + wbp->cl_sparse_pushes--; + + if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) + wakeup((caddr_t)&wbp->cl_sparse_pushes); + } else { + sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); + } + retval = 1; + } else { + retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); + } lck_mtx_unlock(&wbp->cl_lockw); if (flags & IO_SYNC) (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push"); + if (my_sparse_wait) { + /* + * I'm the owner of the serialization token + * clear it and wakeup anyone that is waiting + * for me to finish + */ + lck_mtx_lock(&wbp->cl_lockw); + + wbp->cl_sparse_wait = 0; + wakeup((caddr_t)&wbp->cl_sparse_wait); + + lck_mtx_unlock(&wbp->cl_lockw); + } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END, - (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0); + wbp->cl_scmap, wbp->cl_number, retval, 0, 0); return (retval); } @@ -4498,12 +5129,12 @@ 
cluster_release(struct ubc_info *ubc) if ((wbp = ubc->cl_wbehind)) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0); if (wbp->cl_scmap) vfs_drt_control(&(wbp->cl_scmap), 0); } else { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0); } rap = ubc->cl_rahead; @@ -4519,12 +5150,12 @@ cluster_release(struct ubc_info *ubc) ubc->cl_rahead = NULL; ubc->cl_wbehind = NULL; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0); } static int -cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) +cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) { int cl_index; int cl_index1; @@ -4532,10 +5163,10 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla int cl_len; int cl_pushed = 0; struct cl_wextent l_clusters[MAX_CLUSTERS]; - u_int max_cluster_pgcount; - - - max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; + u_int max_cluster_pgcount; + + + max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; /* * the write behind context exists and has * already been locked... 
@@ -4563,6 +5194,7 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla } if (min_index == -1) break; + l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr; l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr; l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags; @@ -4606,15 +5238,15 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla int flags; struct cl_extent cl; + flags = io_flags & (IO_PASSIVE|IO_CLOSE); + /* * try to push each cluster in turn... */ if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) - flags = IO_NOCACHE; - else - flags = 0; + flags |= IO_NOCACHE; - if ((l_clusters[cl_index].io_flags & CLW_IOPASSIVE) || (push_flag & IO_PASSIVE)) + if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) flags |= IO_PASSIVE; if (push_flag & PUSH_SYNC) @@ -4719,9 +5351,9 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c kern_return_t kret; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); @@ -4781,7 +5413,7 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c if (kret != KERN_SUCCESS) panic("cluster_push: failed to get pagelist"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0); /* * since we only asked for the dirty pages back @@ -4848,6 +5480,12 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c if ( !(flags & IO_SYNC)) io_flags |= CL_ASYNC; + if (flags & IO_CLOSE) + io_flags |= CL_CLOSE; + + if (flags & IO_NOCACHE) + io_flags |= CL_NOCACHE; + retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, io_flags, (buf_t)NULL, (struct clios *)NULL, callback, 
callback_arg); @@ -4870,10 +5508,7 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c { int cl_index; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); - - if (wbp->cl_scmap == NULL) - wbp->cl_scdirty = 0; + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, vp, wbp->cl_scmap, 0, 0, 0); for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { int flags; @@ -4885,47 +5520,47 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c if (flags & UPL_POP_DIRTY) { cl.e_addr = cl.b_addr + 1; - sparse_cluster_add(wbp, vp, &cl, EOF, callback, callback_arg); + sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg); } } } } wbp->cl_number = 0; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, vp, wbp->cl_scmap, 0, 0, 0); } /* - * sparse_cluster_push is called with the write behind lock held + * sparse_cluster_push must be called with the write-behind lock held if the scmap is + * still associated with the write-behind context... 
however, if the scmap has been disassociated + * from the write-behind context (the cluster_push case), the wb lock is not held */ static void -sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) +sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) { struct cl_extent cl; off_t offset; u_int length; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_flag, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, vp, (*scmap), 0, push_flag, 0); if (push_flag & PUSH_ALL) - vfs_drt_control(&(wbp->cl_scmap), 1); + vfs_drt_control(scmap, 1); for (;;) { - if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS) + if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) break; cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); - wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr); - - cluster_push_now(vp, &cl, EOF, push_flag & IO_PASSIVE, callback, callback_arg); + cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg); if ( !(push_flag & PUSH_ALL) ) break; } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); } @@ -4933,33 +5568,29 @@ sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_ * sparse_cluster_add is called with the write behind lock held */ static void -sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) +sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) { u_int new_dirty; u_int 
length; off_t offset; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0); offset = (off_t)(cl->b_addr * PAGE_SIZE_64); length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE; - while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) { + while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) { /* * no room left in the map * only a partial update was done * push out some pages and try again */ - wbp->cl_scdirty += new_dirty; - - sparse_cluster_push(wbp, vp, EOF, 0, callback, callback_arg); + sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg); offset += (new_dirty * PAGE_SIZE_64); length -= (new_dirty * PAGE_SIZE); } - wbp->cl_scdirty += new_dirty; - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); } @@ -4977,9 +5608,12 @@ cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t int bflag; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; + + if (flags & IO_NOCACHE) + bflag |= CL_NOCACHE; upl_flags = UPL_SET_LITE; @@ -5093,17 +5727,10 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) uio->uio_segflg = UIO_PHYS_USERSPACE64; break; - case UIO_SYSSPACE32: - uio->uio_segflg = UIO_PHYS_SYSSPACE32; - break; - case UIO_SYSSPACE: uio->uio_segflg = UIO_PHYS_SYSSPACE; break; - case UIO_SYSSPACE64: - uio->uio_segflg = UIO_PHYS_SYSSPACE64; - break; } pl = ubc_upl_pageinfo(upl); @@ -5155,7 +5782,7 @@ cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int m io_size = *io_resid; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, - 
(int)uio->uio_offset, 0, io_size, 0, 0); + (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0); control = ubc_getobject(vp, UBC_FLAGS_NONE); @@ -5179,14 +5806,6 @@ cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int m uio->uio_segflg = UIO_PHYS_USERSPACE64; break; - case UIO_SYSSPACE32: - uio->uio_segflg = UIO_PHYS_SYSSPACE32; - break; - - case UIO_SYSSPACE64: - uio->uio_segflg = UIO_PHYS_SYSSPACE64; - break; - case UIO_USERSPACE: case UIO_USERISPACE: uio->uio_segflg = UIO_PHYS_USERSPACE; @@ -5319,6 +5938,14 @@ is_file_clean(vnode_t vp, off_t filesize) #define DRT_HASH_SMALL_MODULUS 23 #define DRT_HASH_LARGE_MODULUS 401 +/* + * Physical memory required before the large hash modulus is permitted. + * + * On small memory systems, the large hash modulus can lead to physical + * memory starvation, so we avoid using it there. + */ +#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */ + #define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */ #define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */ @@ -5461,8 +6088,12 @@ vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) * see whether we should grow to the large one. */ if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) { - /* if the ring is nearly full */ - if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) { + /* + * If the ring is nearly full and we are allowed to + * use the large modulus, upgrade. + */ + if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) && + (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) { nsize = DRT_HASH_LARGE_MODULUS; } else { nsize = DRT_HASH_SMALL_MODULUS;