X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/2d21ac55c334faf3a56e5634905ed6987fc787d4..935ed37a5c468c8a1c07408573c08b8b7ef80e8b:/bsd/vfs/vfs_cluster.c?ds=sidebyside diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 8d3657909..3a34e1787 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -90,8 +90,9 @@ #include #define CL_READ 0x01 -#define CL_ASYNC 0x02 -#define CL_COMMIT 0x04 +#define CL_WRITE 0x02 +#define CL_ASYNC 0x04 +#define CL_COMMIT 0x08 #define CL_PAGEOUT 0x10 #define CL_AGE 0x20 #define CL_NOZERO 0x40 @@ -182,17 +183,15 @@ int is_file_clean(vnode_t, off_t); * can represent it in a 32 bit int */ #define MAX_IO_REQUEST_SIZE (1024 * 1024 * 256) -#define MAX_IO_CONTIG_SIZE (1024 * 1024 * 8) -#define MAX_VECTS 16 - -/* - * note: MAX_CLUSTER_SIZE CANNOT be larger than MAX_UPL_TRANSFER - */ -#define MAX_CLUSTER_SIZE (MAX_UPL_TRANSFER) -#define MAX_PREFETCH (MAX_CLUSTER_SIZE * PAGE_SIZE * 2) +#define MAX_IO_CONTIG_SIZE (MAX_UPL_SIZE * PAGE_SIZE) +#define MAX_VECTS 16 #define MIN_DIRECT_WRITE_SIZE (4 * PAGE_SIZE) +#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE)) +#define MAX_PREFETCH(vp) (cluster_max_io_size(vp->v_mount, CL_READ) * 3); + + int speculative_reads_disabled = 0; /* @@ -231,6 +230,52 @@ cluster_init(void) { } +uint32_t +cluster_max_io_size(mount_t mp, int type) +{ + uint32_t max_io_size; + uint32_t segcnt; + uint32_t maxcnt; + + switch(type) { + + case CL_READ: + segcnt = mp->mnt_segreadcnt; + maxcnt = mp->mnt_maxreadcnt; + break; + case CL_WRITE: + segcnt = mp->mnt_segwritecnt; + maxcnt = mp->mnt_maxwritecnt; + break; + default: + segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt); + maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt); + break; + } + if (segcnt > MAX_UPL_SIZE) { + /* + * don't allow a size beyond the max UPL size we can create + */ + segcnt = MAX_UPL_SIZE; + } + max_io_size = min((segcnt * PAGE_SIZE), maxcnt); + + if (max_io_size < (MAX_UPL_TRANSFER * PAGE_SIZE)) { + /* + * don't allow a size smaller than the old fixed limit + */ + max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE); + } else { + /* + * make sure the size specified is a multiple of PAGE_SIZE + */ + max_io_size &= ~PAGE_MASK; + } + return (max_io_size); +} + + + #define CLW_ALLOCATE 0x01 #define CLW_RETURNLOCKED 0x02 @@ -805,16 +850,21 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no async_throttle = VNODE_ASYNC_THROTTLE; else { u_int max_cluster; + u_int max_cluster_size; + u_int max_prefetch; + + max_cluster_size = MAX_CLUSTER_SIZE(vp); + max_prefetch = MAX_PREFETCH(vp); - if (max_iosize > (MAX_CLUSTER_SIZE * PAGE_SIZE)) - max_cluster = (MAX_CLUSTER_SIZE * PAGE_SIZE); + if (max_iosize > max_cluster_size) + max_cluster = max_cluster_size; else max_cluster = max_iosize; if (size < max_cluster) max_cluster = size; - async_throttle = min(VNODE_ASYNC_THROTTLE, (MAX_PREFETCH / max_cluster) - 1); + async_throttle = min(VNODE_ASYNC_THROTTLE, (max_prefetch / max_cluster) - 1); } } } @@ -1449,6 +1499,7 @@ cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct daddr64_t r_addr; off_t f_offset; int size_of_prefetch; + u_int max_prefetch; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START, @@ -1468,8 +1519,10 @@ cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct return; } + max_prefetch = MAX_PREFETCH(vp); + if (extent->e_addr < rap->cl_maxra) { - if ((rap->cl_maxra - extent->e_addr) > ((MAX_PREFETCH / PAGE_SIZE) / 4)) { + if ((rap->cl_maxra 
- extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0); @@ -1491,13 +1544,13 @@ cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct if (f_offset < filesize) { daddr64_t read_size; - rap->cl_ralen = rap->cl_ralen ? min(MAX_PREFETCH / PAGE_SIZE, rap->cl_ralen << 1) : 1; + rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1; read_size = (extent->e_addr + 1) - extent->b_addr; if (read_size > rap->cl_ralen) { - if (read_size > MAX_PREFETCH / PAGE_SIZE) - rap->cl_ralen = MAX_PREFETCH / PAGE_SIZE; + if (read_size > max_prefetch / PAGE_SIZE) + rap->cl_ralen = max_prefetch / PAGE_SIZE; else rap->cl_ralen = read_size; } @@ -1835,7 +1888,7 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in u_int32_t io_req_size; u_int32_t offset_in_file; u_int32_t offset_in_iovbase; - int io_size; + u_int32_t io_size; int io_flag; int bflag; vm_size_t upl_size; @@ -1851,11 +1904,15 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in user_addr_t iov_base; u_int32_t mem_alignment_mask; u_int32_t devblocksize; + u_int32_t max_upl_size; + + + max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE); if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; /* * When we enter this routine, we know @@ -1918,8 +1975,8 @@ next_dwrite: io_size = io_req_size & ~PAGE_MASK; iov_base = uio_curriovbase(uio); - if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) - io_size = MAX_UPL_TRANSFER * PAGE_SIZE; + if (io_size > max_upl_size) + io_size = max_upl_size; upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; @@ -2022,9 +2079,16 @@ next_dwrite: */ lck_mtx_lock(cl_mtxp); - while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) { + while ((iostate.io_issued - iostate.io_completed) > (2 * max_upl_size)) { + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate.io_issued, iostate.io_completed, 2 * max_upl_size, 0, 0); + iostate.io_wanted = 1; msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate.io_issued, iostate.io_completed, 2 * max_upl_size, 0, 0); } lck_mtx_unlock(cl_mtxp); @@ -2083,8 +2147,14 @@ wait_for_dwrites: lck_mtx_lock(cl_mtxp); while (iostate.io_issued != iostate.io_completed) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate.io_issued, iostate.io_completed, 0, 0, 0); + iostate.io_wanted = 1; msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate.io_issued, iostate.io_completed, 0, 0, 0); } lck_mtx_unlock(cl_mtxp); } @@ -2242,8 +2312,15 @@ next_cwrite: lck_mtx_lock(cl_mtxp); while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_IO_CONTIG_SIZE)) { + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); + iostate.io_wanted = 1; msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); } lck_mtx_unlock(cl_mtxp); } @@ -2274,7 +2351,7 @@ next_cwrite: 
io_size -= xsize; } } - if (error == 0 && iostate.io_error == 0 && tail_size == 0) { + if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) { error = cluster_io_type(uio, write_type, write_length, 0); @@ -2293,8 +2370,14 @@ wait_for_cwrites: lck_mtx_lock(cl_mtxp); while (iostate.io_issued != iostate.io_completed) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate.io_issued, iostate.io_completed, 0, 0, 0); + iostate.io_wanted = 1; msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate.io_issued, iostate.io_completed, 0, 0, 0); } lck_mtx_unlock(cl_mtxp); @@ -2340,14 +2423,15 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old long long zero_cnt1; off_t zero_off1; struct cl_extent cl; - int intersection; struct cl_writebehind *wbp; int bflag; + u_int max_cluster_pgcount; + u_int max_io_size; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; if (uio) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, @@ -2365,6 +2449,9 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old zero_off = 0; zero_off1 = 0; + max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; + max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE); + if (flags & IO_HEADZEROFILL) { /* * some filesystems (HFS is one) don't support unallocated holes within a file... @@ -2414,8 +2501,8 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE, (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0); - if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) - total_size = MAX_UPL_TRANSFER * PAGE_SIZE; + if (total_size > max_io_size) + total_size = max_io_size; cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64); @@ -2424,7 +2511,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old * assumption... 
total_size <= io_resid * because IO_HEADZEROFILL and IO_TAILZEROFILL not set */ - if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE)) + if ((start_offset + total_size) > max_io_size) total_size -= start_offset; xfer_resid = total_size; @@ -2465,8 +2552,8 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old */ upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; - if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) - upl_size = MAX_UPL_TRANSFER * PAGE_SIZE; + if (upl_size > max_io_size) + upl_size = max_io_size; pages_in_upl = upl_size / PAGE_SIZE; io_size = upl_size - start_offset; @@ -2516,7 +2603,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old * to release the rest of the pages in the upl without modifying * there state and mark the failed page in error */ - ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES); + ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); if (upl_size > PAGE_SIZE) ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); @@ -2552,7 +2639,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old * need to release the rest of the pages in the upl without * modifying there state and mark the failed page in error */ - ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES); + ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); if (upl_size > PAGE_SIZE) ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); @@ -2657,6 +2744,33 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old */ cluster_zero(upl, io_size, upl_size - io_size, NULL); } + /* + * release the upl now if we hold one since... + * 1) pages in it may be present in the sparse cluster map + * and may span 2 separate buckets there... if they do and + * we happen to have to flush a bucket to make room and it intersects + * this upl, a deadlock may result on page BUSY + * 2) we're delaying the I/O... from this point forward we're just updating + * the cluster state... no need to hold the pages, so commit them + * 3) IO_SYNC is set... + * because we had to ask for a UPL that provides currenty non-present pages, the + * UPL has been automatically set to clear the dirty flags (both software and hardware) + * upon committing it... this is not the behavior we want since it's possible for + * pages currently present as part of a mapped file to be dirtied while the I/O is in flight. + * we'll pick these pages back up later with the correct behavior specified. + * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush + * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages + * we hold since the flushing context is holding the cluster lock. 
+ */ + ubc_upl_commit_range(upl, 0, upl_size, + UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); +check_cluster: + /* + * calculate the last logical block number + * that this delayed I/O encompassed + */ + cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64); + if (flags & IO_SYNC) /* * if the IO_SYNC flag is set than we need to @@ -2664,35 +2778,20 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old * the I/O */ goto issue_io; -check_cluster: + /* * take the lock to protect our accesses * of the writebehind and sparse cluster state */ wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED); - /* - * calculate the last logical block number - * that this delayed I/O encompassed - */ - cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64); - if (wbp->cl_scmap) { if ( !(flags & IO_NOCACHE)) { /* * we've fallen into the sparse * cluster method of delaying dirty pages - * first, we need to release the upl if we hold one - * since pages in it may be present in the sparse cluster map - * and may span 2 separate buckets there... if they do and - * we happen to have to flush a bucket to make room and it intersects - * this upl, a deadlock may result on page BUSY */ - if (upl_size) - ubc_upl_commit_range(upl, 0, upl_size, - UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); - sparse_cluster_add(wbp, vp, &cl, newEOF, callback, callback_arg); lck_mtx_unlock(&wbp->cl_lockw); @@ -2705,21 +2804,10 @@ check_cluster: * to uncached writes on the file, so go ahead * and push whatever's in the sparse map * and switch back to normal clustering - * - * see the comment above concerning a possible deadlock... */ - if (upl_size) { - ubc_upl_commit_range(upl, 0, upl_size, - UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); - /* - * setting upl_size to 0 keeps us from committing a - * second time in the start_new_cluster path - */ - upl_size = 0; - } - sparse_cluster_push(wbp, vp, newEOF, PUSH_ALL, callback, callback_arg); - wbp->cl_number = 0; + + sparse_cluster_push(wbp, vp, newEOF, PUSH_ALL, callback, callback_arg); /* * no clusters of either type present at this point * so just go directly to start_new_cluster since @@ -2729,8 +2817,6 @@ check_cluster: */ goto start_new_cluster; } - upl_offset = 0; - if (wbp->cl_number == 0) /* * no clusters currently present @@ -2749,7 +2835,7 @@ check_cluster: /* * the current write starts at or after the current cluster */ - if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_CLUSTER_SIZE)) { + if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { /* * we have a write that fits entirely * within the existing cluster limits @@ -2761,7 +2847,7 @@ check_cluster: wbp->cl_clusters[cl_index].e_addr = cl.e_addr; break; } - if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_CLUSTER_SIZE)) { + if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { /* * we have a write that starts in the middle of the current cluster * but extends beyond the cluster's limit... 
we know this because @@ -2772,23 +2858,8 @@ check_cluster: * note that we'll always have a leftover tail in this case since * full absorbtion would have occurred in the clause above */ - wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_CLUSTER_SIZE; - - if (upl_size) { - daddr64_t start_pg_in_upl; - - start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64); - - if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) { - intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE); - - ubc_upl_commit_range(upl, upl_offset, intersection, - UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); - upl_f_offset += intersection; - upl_offset += intersection; - upl_size -= intersection; - } - } + wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount; + cl.b_addr = wbp->cl_clusters[cl_index].e_addr; } /* @@ -2803,7 +2874,7 @@ check_cluster: /* * the current write starts in front of the cluster we're currently considering */ - if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_CLUSTER_SIZE) { + if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) { /* * we can just merge the new request into * this cluster and leave it in the cache @@ -2816,7 +2887,7 @@ check_cluster: /* * the current write completely * envelops the existing cluster and since - * each write is limited to at most MAX_CLUSTER_SIZE pages + * each write is limited to at most max_cluster_pgcount pages * we can just use the start and last blocknos of the write * to generate the cluster limits */ @@ -2834,29 +2905,14 @@ check_cluster: * get an intersection with the current write * */ - if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_CLUSTER_SIZE) { + if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) { /* * the current write extends into the proposed cluster * clip the length of the current write after first combining it's * tail with the newly shaped cluster */ - wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_CLUSTER_SIZE; - - if (upl_size) { - intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE); - - if ((u_int)intersection > upl_size) - /* - * because the current write may consist of a number of pages found in the cache - * which are not part of the UPL, we may have an intersection that exceeds - * the size of the UPL that is also part of this write - */ - intersection = upl_size; - - ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection, - UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); - upl_size -= intersection; - } + wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount; + cl.e_addr = wbp->cl_clusters[cl_index].b_addr; } /* @@ -2911,16 +2967,7 @@ check_cluster: * no more room in the normal cluster mechanism * so let's switch to the more expansive but expensive * sparse mechanism.... - * first, we need to release the upl if we hold one - * since pages in it may be present in the sparse cluster map (after the cluster_switch) - * and may span 2 separate buckets there... 
if they do and - * we happen to have to flush a bucket to make room and it intersects - * this upl, a deadlock may result on page BUSY */ - if (upl_size) - ubc_upl_commit_range(upl, upl_offset, upl_size, - UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); - sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg); sparse_cluster_add(wbp, vp, &cl, newEOF, callback, callback_arg); @@ -2954,33 +3001,19 @@ start_new_cluster: wbp->cl_number++; delay_io: - if (upl_size) - ubc_upl_commit_range(upl, upl_offset, upl_size, - UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); - lck_mtx_unlock(&wbp->cl_lockw); continue; issue_io: /* - * we don't hold the vnode lock at this point + * we don't hold the lock at this point * - * because we had to ask for a UPL that provides currenty non-present pages, the - * UPL has been automatically set to clear the dirty flags (both software and hardware) - * upon committing it... this is not the behavior we want since it's possible for - * pages currently present as part of a mapped file to be dirtied while the I/O is in flight. - * in order to maintain some semblance of coherency with mapped writes - * we need to drop the current upl and pick it back up with COPYOUT_FROM set + * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set * so that we correctly deal with a change in state of the hardware modify bit... * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also * responsible for generating the correct sized I/O(s) */ - ubc_upl_commit_range(upl, 0, upl_size, - UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); - - cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64; - retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg); } } @@ -3082,7 +3115,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file upl_page_info_t *pl; upl_t upl; vm_offset_t upl_offset; - int upl_size; + u_int32_t upl_size; off_t upl_f_offset; int start_offset; int start_pg; @@ -3098,7 +3131,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file u_int32_t size_of_prefetch; u_int32_t xsize; u_int32_t io_size; - u_int32_t max_rd_size = MAX_PREFETCH; + u_int32_t max_rd_size; + u_int32_t max_io_size; + u_int32_t max_prefetch; u_int rd_ahead_enabled = 1; u_int prefetch_enabled = 1; struct cl_readahead * rap; @@ -3120,9 +3155,13 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file take_reference = 0; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; + + max_prefetch = MAX_PREFETCH(vp); + max_rd_size = max_prefetch; + max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); @@ -3220,8 +3259,8 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file * we can notice that our I/O pipe is running dry and * get the next I/O issued before it does go dry */ - if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4)) - io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4); + if (last_ioread_offset && io_size > (max_io_size / 4)) + io_resid = (max_io_size / 4); else io_resid = io_size; @@ -3270,11 +3309,11 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, 
off_t file upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; if (flags & IO_NOCACHE) { - if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) - upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE); + if (upl_size > max_io_size) + upl_size = max_io_size; } else { - if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4) - upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4; + if (upl_size > max_io_size / 4) + upl_size = max_io_size / 4; } pages_in_upl = upl_size / PAGE_SIZE; @@ -3419,8 +3458,14 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file lck_mtx_lock(cl_mtxp); while (iostate.io_issued != iostate.io_completed) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate.io_issued, iostate.io_completed, 0, 0, 0); + iostate.io_wanted = 1; msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_copy", NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate.io_issued, iostate.io_completed, 0, 0, 0); } lck_mtx_unlock(cl_mtxp); @@ -3502,7 +3547,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file rd_ahead_enabled = 1; prefetch_enabled = 1; - max_rd_size = MAX_PREFETCH; + max_rd_size = max_prefetch; last_ioread_offset = 0; } } @@ -3553,13 +3598,21 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t xsize; u_int32_t devblocksize; u_int32_t mem_alignment_mask; - u_int32_t max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE; - u_int32_t max_rd_ahead = MAX_PREFETCH; + u_int32_t max_upl_size; + u_int32_t max_rd_size; + u_int32_t max_rd_ahead; + + + max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ); + + max_rd_size = max_upl_size; + max_rd_ahead = max_rd_size * 2; + if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); @@ -3627,8 +3680,8 @@ next_dread: max_rd_size = HARD_THROTTLE_MAXSIZE; max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1; } else { - max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE; - max_rd_ahead = MAX_PREFETCH; + max_rd_size = max_upl_size; + max_rd_ahead = max_rd_size * 2; } io_start = io_size = io_req_size; @@ -3795,8 +3848,14 @@ next_dread: lck_mtx_lock(cl_mtxp); while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); + iostate.io_wanted = 1; msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); } lck_mtx_unlock(cl_mtxp); @@ -3856,8 +3915,14 @@ wait_for_dreads: lck_mtx_lock(cl_mtxp); while (iostate.io_issued != iostate.io_completed) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate.io_issued, iostate.io_completed, 0, 0, 0); + iostate.io_wanted = 1; msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate.io_issued, iostate.io_completed, 0, 0, 0); } lck_mtx_unlock(cl_mtxp); } @@ -3909,9 +3974,9 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, int bflag; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; /* * When we enter this routine, we know @@ -4028,9 +4093,15 @@ next_cread: if 
(iostate.io_issued) { lck_mtx_lock(cl_mtxp); - while ((iostate.io_issued - iostate.io_completed) > (3 * MAX_IO_CONTIG_SIZE)) { + while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_IO_CONTIG_SIZE)) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); + iostate.io_wanted = 1; msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate.io_issued, iostate.io_completed, 2 * MAX_IO_CONTIG_SIZE, 0, 0); } lck_mtx_unlock(cl_mtxp); } @@ -4077,8 +4148,14 @@ wait_for_creads: lck_mtx_lock(cl_mtxp); while (iostate.io_issued != iostate.io_completed) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate.io_issued, iostate.io_completed, 0, 0, 0); + iostate.io_wanted = 1; msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate.io_issued, iostate.io_completed, 0, 0, 0); } lck_mtx_unlock(cl_mtxp); @@ -4192,12 +4269,19 @@ advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*c int retval = 0; int issued_io; int skip_range; - + uint32_t max_io_size; + + if ( !UBCINFOEXISTS(vp)) return(EINVAL); + if (resid < 0) + return(EINVAL); + + max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START, - (int)f_offset, resid, (int)filesize, 0, 0); + (int)f_offset, resid, (int)filesize, 0, 0); while (resid && f_offset < filesize && retval == 0) { /* @@ -4218,8 +4302,8 @@ advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*c io_size = max_size; upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; - if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) - upl_size = MAX_UPL_TRANSFER * PAGE_SIZE; + if ((uint32_t)upl_size > max_io_size) + upl_size = max_io_size; skip_range = 0; /* @@ -4437,7 +4521,10 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla int cl_len; int cl_pushed = 0; struct cl_wextent l_clusters[MAX_CLUSTERS]; - + u_int max_cluster_pgcount; + + + max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; /* * the write behind context exists and has * already been locked... @@ -4498,25 +4585,12 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world... */ for (i = 0; i < MAX_CLUSTERS - 1; i++) { - if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_CLUSTER_SIZE) + if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) goto dont_try; if (l_clusters[i].e_addr != l_clusters[i+1].b_addr) goto dont_try; } } - /* - * drop the lock while we're firing off the I/Os... - * this is safe since I'm working off of a private sorted copy - * of the clusters, and I'm going to re-evaluate the public - * state after I retake the lock - * - * we need to drop it to avoid a lock inversion when trying to - * grab pages into the UPL... 
another thread in 'write' may - * have these pages in its UPL and be blocked trying to - * gain the write-behind lock for this vnode - */ - lck_mtx_unlock(&wbp->cl_lockw); - for (cl_index = 0; cl_index < cl_len; cl_index++) { int flags; struct cl_extent cl; @@ -4548,8 +4622,6 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla if ( !(push_flag & PUSH_ALL) ) break; } - lck_mtx_lock(&wbp->cl_lockw); - dont_try: if (cl_len > cl_pushed) { /* @@ -4837,23 +4909,8 @@ sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_ wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr); - /* - * drop the lock while we're firing off the I/Os... - * this is safe since I've already updated the state - * this lock is protecting and I'm going to re-evaluate - * the public state after I retake the lock - * - * we need to drop it to avoid a lock inversion when trying to - * grab pages into the UPL... another thread in 'write' may - * have these pages in its UPL and be blocked trying to - * gain the write-behind lock for this vnode - */ - lck_mtx_unlock(&wbp->cl_lockw); - cluster_push_now(vp, &cl, EOF, push_flag & IO_PASSIVE, callback, callback_arg); - lck_mtx_lock(&wbp->cl_lockw); - if ( !(push_flag & PUSH_ALL) ) break; }
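
For reference, the central change above replaces the old fixed limits (MAX_CLUSTER_SIZE pinned to MAX_UPL_TRANSFER pages, a fixed MAX_PREFETCH) with per-mount values derived from the device's segment and transfer limits: MAX_CLUSTER_SIZE(vp) and MAX_PREFETCH(vp) become thin wrappers around the new cluster_max_io_size(). What follows is a minimal user-space sketch of that clamping logic only; the ex_* constants and the ex_mount struct are illustrative stand-ins (assumed values, not the kernel's MAX_UPL_SIZE / MAX_UPL_TRANSFER / mount_t definitions).

#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SIZE        4096u
#define EX_PAGE_MASK        (EX_PAGE_SIZE - 1)
#define EX_MAX_UPL_SIZE     8192u      /* cap on UPL size, in pages (placeholder value) */
#define EX_MAX_UPL_TRANSFER 256u       /* old fixed limit, in pages (placeholder value) */

#define EX_CL_READ          0x01
#define EX_CL_WRITE         0x02

struct ex_mount {                      /* stand-in for the mount_t fields the kernel consults */
	uint32_t mnt_segreadcnt;
	uint32_t mnt_segwritecnt;
	uint32_t mnt_maxreadcnt;
	uint32_t mnt_maxwritecnt;
};

static uint32_t
ex_min(uint32_t a, uint32_t b)
{
	return (a < b) ? a : b;
}

static uint32_t
ex_cluster_max_io_size(struct ex_mount *mp, int type)
{
	uint32_t segcnt, maxcnt, max_io_size;

	switch (type) {
	case EX_CL_READ:
		segcnt = mp->mnt_segreadcnt;
		maxcnt = mp->mnt_maxreadcnt;
		break;
	case EX_CL_WRITE:
		segcnt = mp->mnt_segwritecnt;
		maxcnt = mp->mnt_maxwritecnt;
		break;
	default:
		segcnt = ex_min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
		maxcnt = ex_min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
		break;
	}
	if (segcnt > EX_MAX_UPL_SIZE) {
		/* don't allow a size beyond the largest UPL we can create */
		segcnt = EX_MAX_UPL_SIZE;
	}
	max_io_size = ex_min(segcnt * EX_PAGE_SIZE, maxcnt);

	if (max_io_size < (EX_MAX_UPL_TRANSFER * EX_PAGE_SIZE)) {
		/* don't allow a size smaller than the old fixed limit */
		max_io_size = EX_MAX_UPL_TRANSFER * EX_PAGE_SIZE;
	} else {
		/* otherwise round down to a multiple of the page size */
		max_io_size &= ~EX_PAGE_MASK;
	}
	return max_io_size;
}

int
main(void)
{
	/* a hypothetical device advertising 512 segments and a 2 MB transfer limit */
	struct ex_mount m = { 512, 512, 2 * 1024 * 1024, 2 * 1024 * 1024 };

	printf("read  limit: %u bytes\n", (unsigned)ex_cluster_max_io_size(&m, EX_CL_READ));
	printf("write limit: %u bytes\n", (unsigned)ex_cluster_max_io_size(&m, EX_CL_WRITE));
	return 0;
}

With the per-mount limit in place, the write-behind cluster size (MAX_CLUSTER_SIZE(vp), consumed above as max_cluster_pgcount) and the read-ahead window (MAX_PREFETCH(vp), three times the read limit) scale with the device's capability instead of being pinned to MAX_UPL_TRANSFER, while the floor case keeps slower devices at the previous fixed behavior.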