X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/39236c6e673c41db228275375ab7fdb0f837b292..7e41aa883dd258f888d0470250eead40a53ef1f5:/bsd/vfs/vfs_cluster.c?ds=inline diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 2f662c53c..c565a3df4 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2014 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -88,12 +88,15 @@ #include #include #include +#include #include #include #include +#include + #if 0 #undef KERNEL_DEBUG #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT @@ -121,7 +124,7 @@ #define CL_NOCACHE 0x40000 #define MAX_VECTOR_UPL_ELEMENTS 8 -#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE) * PAGE_SIZE +#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES) extern upl_t vector_upl_create(vm_offset_t); extern boolean_t vector_upl_is_valid(upl_t); @@ -137,12 +140,25 @@ struct clios { int io_wanted; /* someone is sleeping waiting for a change in state */ }; +struct cl_direct_read_lock { + LIST_ENTRY(cl_direct_read_lock) chain; + int32_t ref_count; + vnode_t vp; + lck_rw_t rw_lock; +}; + +#define CL_DIRECT_READ_LOCK_BUCKETS 61 + +static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock) + cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS]; + +static lck_spin_t cl_direct_read_spin_lock; + static lck_grp_t *cl_mtx_grp; static lck_attr_t *cl_mtx_attr; static lck_grp_attr_t *cl_mtx_grp_attr; static lck_mtx_t *cl_transaction_mtxp; - #define IO_UNKNOWN 0 #define IO_DIRECT 1 #define IO_CONTIG 2 @@ -167,7 +183,7 @@ static int cluster_is_throttled(vnode_t vp); static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name); -static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg); +static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags); static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference); static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference); @@ -233,9 +249,14 @@ int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL; * can represent it in a 32 bit int */ #define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512) -#define MAX_IO_CONTIG_SIZE (MAX_UPL_SIZE * PAGE_SIZE) +#define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES #define MAX_VECTS 16 -#define MIN_DIRECT_WRITE_SIZE (4 * PAGE_SIZE) +/* + * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider + * allowing the caller to bypass the buffer cache. For small I/Os (less than 16k), + * we have not historically allowed the write to bypass the UBC. + */ +#define MIN_DIRECT_WRITE_SIZE (16384) #define WRITE_THROTTLE 6 #define WRITE_THROTTLE_SSD 2 @@ -243,9 +264,9 @@ int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL; #define WRITE_BEHIND_SSD 1 #define PREFETCH 3 -#define PREFETCH_SSD 1 -uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3); -uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use for a specluative read-ahead on SSDs*/ +#define PREFETCH_SSD 2 +uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a specluative read-ahead */ +uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead on SSDs*/ #define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base)) @@ -286,6 +307,11 @@ cluster_init(void) { if (cl_transaction_mtxp == NULL) panic("cluster_init: failed to allocate cl_transaction_mtxp"); + + lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr); + + for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) + LIST_INIT(&cl_direct_read_locks[i]); } @@ -311,19 +337,19 @@ cluster_max_io_size(mount_t mp, int type) maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt); break; } - if (segcnt > MAX_UPL_SIZE) { + if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) { /* * don't allow a size beyond the max UPL size we can create */ - segcnt = MAX_UPL_SIZE; + segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT; } max_io_size = min((segcnt * PAGE_SIZE), maxcnt); - if (max_io_size < (MAX_UPL_TRANSFER * PAGE_SIZE)) { + if (max_io_size < MAX_UPL_TRANSFER_BYTES) { /* * don't allow a size smaller than the old fixed limit */ - max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE); + max_io_size = MAX_UPL_TRANSFER_BYTES; } else { /* * make sure the size specified is a multiple of PAGE_SIZE @@ -441,7 +467,7 @@ cluster_get_wbp(vnode_t vp, int flags) static void -cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg) +cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags) { struct cl_writebehind *wbp; @@ -450,7 +476,7 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c if (wbp->cl_number) { lck_mtx_lock(&wbp->cl_lockw); - cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, 0, callback, callback_arg); + cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg); lck_mtx_unlock(&wbp->cl_lockw); } @@ -506,6 +532,142 @@ cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name) lck_mtx_unlock(&iostate->io_mtxp); } +static void cluster_handle_associated_upl(struct clios *iostate, upl_t upl, + upl_offset_t upl_offset, upl_size_t size) +{ + if (!size) + return; + + upl_t associated_upl = upl_associated_upl(upl); + + if (!associated_upl) + return; + +#if 0 + printf("1: %d %d\n", upl_offset, upl_offset + size); +#endif + + /* + * The associated UPL is page aligned to file offsets whereas the + * UPL it's attached to has different alignment requirements. The + * upl_offset that we have refers to @upl. The code that follows + * has to deal with the first and last pages in this transaction + * which might straddle pages in the associated UPL. To keep + * track of these pages, we use the mark bits: if the mark bit is + * set, we know another transaction has completed its part of that + * page and so we can unlock that page here. + * + * The following illustrates what we have to deal with: + * + * MEM u <------------ 1 PAGE ------------> e + * +-------------+----------------------+----------------- + * | |######################|################# + * +-------------+----------------------+----------------- + * FILE | <--- a ---> o <------------ 1 PAGE ------------> + * + * So here we show a write to offset @o. The data that is to be + * written is in a buffer that is not page aligned; it has offset + * @a in the page. The upl that carries the data starts in memory + * at @u. The associated upl starts in the file at offset @o. A + * transaction will always end on a page boundary (like @e above) + * except for the very last transaction in the group. We cannot + * unlock the page at @o in the associated upl until both the + * transaction ending at @e and the following transaction (that + * starts at @e) has completed. + */ + + /* + * We record whether or not the two UPLs are aligned as the mark + * bit in the first page of @upl. + */ + upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl); + bool is_unaligned = upl_page_get_mark(pl, 0); + + if (is_unaligned) { + upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl); + + upl_offset_t upl_end = upl_offset + size; + assert(upl_end >= PAGE_SIZE); + + upl_size_t assoc_upl_size = upl_get_size(associated_upl); + + /* + * In the very first transaction in the group, upl_offset will + * not be page aligned, but after that it will be and in that + * case we want the preceding page in the associated UPL hence + * the minus one. + */ + assert(upl_offset); + if (upl_offset) + upl_offset = trunc_page_32(upl_offset - 1); + + lck_mtx_lock_spin(&iostate->io_mtxp); + + // Look at the first page... + if (upl_offset + && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) { + /* + * The first page isn't marked so let another transaction + * completion handle it. + */ + upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true); + upl_offset += PAGE_SIZE; + } + + // And now the last page... + + /* + * This needs to be > rather than >= because if it's equal, it + * means there's another transaction that is sharing the last + * page. + */ + if (upl_end > assoc_upl_size) + upl_end = assoc_upl_size; + else { + upl_end = trunc_page_32(upl_end); + const int last_pg = (upl_end >> PAGE_SHIFT) - 1; + + if (!upl_page_get_mark(assoc_pl, last_pg)) { + /* + * The last page isn't marked so mark the page and let another + * transaction completion handle it. + */ + upl_page_set_mark(assoc_pl, last_pg, true); + upl_end -= PAGE_SIZE; + } + } + + lck_mtx_unlock(&iostate->io_mtxp); + +#if 0 + printf("2: %d %d\n", upl_offset, upl_end); +#endif + + if (upl_end <= upl_offset) + return; + + size = upl_end - upl_offset; + } else { + assert(!(upl_offset & PAGE_MASK)); + assert(!(size & PAGE_MASK)); + } + + boolean_t empty; + + /* + * We can unlock these pages now and as this is for a + * direct/uncached write, we want to dump the pages too. + */ + kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size, + UPL_ABORT_DUMP_PAGES, &empty); + + assert(!kr); + + if (!kr && empty) { + upl_set_associated_upl(upl, NULL); + upl_deallocate(associated_upl); + } +} static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp) @@ -569,7 +731,7 @@ cluster_iodone(buf_t bp, void *callback_arg) struct clios *iostate; boolean_t transaction_complete = FALSE; - cbp_head = (buf_t)(bp->b_trans_head); + __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head)); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START, cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); @@ -580,7 +742,7 @@ cluster_iodone(buf_t bp, void *callback_arg) lck_mtx_lock_spin(cl_transaction_mtxp); bp->b_flags |= B_TDONE; - + if (bp->b_flags & B_TWANTED) { CLR(bp->b_flags, B_TWANTED); need_wakeup = TRUE; @@ -655,6 +817,14 @@ cluster_iodone(buf_t bp, void *callback_arg) cbp = cbp_next; } + + if (ISSET(b_flags, B_COMMIT_UPL)) { + cluster_handle_associated_upl(iostate, + cbp_head->b_upl, + upl_offset, + transaction_size); + } + if (error == 0 && total_resid) error = EIO; @@ -701,14 +871,13 @@ cluster_iodone(buf_t bp, void *callback_arg) } if (b_flags & B_COMMIT_UPL) { - - pg_offset = upl_offset & PAGE_MASK; + pg_offset = upl_offset & PAGE_MASK; commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; if (error) upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp); else { - upl_flags = UPL_COMMIT_FREE_ON_EMPTY; + upl_flags = UPL_COMMIT_FREE_ON_EMPTY; if ((b_flags & B_PHYS) && (b_flags & B_READ)) upl_flags |= UPL_COMMIT_SET_DIRTY; @@ -760,7 +929,7 @@ cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp) pl = ubc_upl_pageinfo(upl); if (upl_device_page(pl) == TRUE) { - zero_addr = ((addr64_t)upl_phys_page(pl, 0) << 12) + upl_offset; + zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset; bzero_phys_nc(zero_addr, size); } else { @@ -772,7 +941,7 @@ cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp) page_index = upl_offset / PAGE_SIZE; page_offset = upl_offset & PAGE_MASK; - zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset; + zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset; zero_cnt = min(PAGE_SIZE - page_offset, size); bzero_phys(zero_addr, zero_cnt); @@ -1036,6 +1205,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no io_flags |= B_PASSIVE; if (flags & CL_ENCRYPTED) io_flags |= B_ENCRYPTED_IO; + if (vp->v_flag & VSYSTEM) io_flags |= B_META; @@ -1048,7 +1218,37 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no * read in from the file */ zero_offset = upl_offset + non_rounded_size; + } else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) { + assert(ISSET(flags, CL_COMMIT)); + + // For a direct/uncached write, we need to lock pages... + + upl_t cached_upl; + + /* + * Create a UPL to lock the pages in the cache whilst the + * write is in progress. + */ + ubc_create_upl(vp, f_offset, non_rounded_size, &cached_upl, + NULL, UPL_SET_LITE); + + /* + * Attach this UPL to the other UPL so that we can find it + * later. + */ + upl_set_associated_upl(upl, cached_upl); + + if (upl_offset & PAGE_MASK) { + /* + * The two UPLs are not aligned, so mark the first page in + * @upl so that cluster_handle_associated_upl can handle + * it accordingly. + */ + upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl); + upl_page_set_mark(pl, 0, true); + } } + while (size) { daddr64_t blkno; daddr64_t lblkno; @@ -1127,7 +1327,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no * * go direct to vnode_pageout so that we don't have to * unbusy the page from the UPL... we used to do this - * so that we could call ubc_sync_range, but that results + * so that we could call ubc_msync, but that results * in a potential deadlock if someone else races us to acquire * that page and wins and in addition needs one of the pages * we're continuing to hold in the UPL @@ -1234,7 +1434,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } continue; } - lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64); + lblkno = (daddr64_t)(f_offset / 0x1000); /* * we have now figured out how much I/O we can do - this is in 'io_size' * pg_offset is the starting point in the first page for the I/O @@ -1329,6 +1529,10 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no commit_offset = upl_offset & ~PAGE_MASK; } + + // Associated UPL is currently only used in the direct write path + assert(!upl_associated_upl(upl)); + if ( (flags & CL_COMMIT) && pg_count) { ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE, UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY); @@ -1425,9 +1629,13 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (flags & CL_PAGEOUT) { u_int i; - for (i = 0; i < pg_count; i++) { - if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) - panic("BUSY bp found in cluster_io"); + /* + * since blocks are in offsets of 0x1000, scale + * iteration to (PAGE_SIZE * pg_count) of blks. + */ + for (i = 0; i < (PAGE_SIZE * pg_count)/0x1000; i++) { + if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) + panic("BUSY bp found in cluster_io"); } } if (flags & CL_ASYNC) { @@ -1445,7 +1653,9 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (buf_setupl(cbp, upl, upl_offset)) panic("buf_setupl failed\n"); - +#if CONFIG_IOSCHED + upl_set_blkno(upl, upl_offset, io_size, blkno); +#endif cbp->b_trans_next = (buf_t)NULL; if ((cbp->b_iostate = (void *)iostate)) @@ -1550,34 +1760,41 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } } if (error) { - int abort_size; + int abort_size; io_size = 0; - + if (cbp_head) { - /* - * first wait until all of the outstanding I/O - * for this partial transaction has completed - */ - cluster_wait_IO(cbp_head, (flags & CL_ASYNC)); + /* + * Wait until all of the outstanding I/O + * for this partial transaction has completed + */ + cluster_wait_IO(cbp_head, (flags & CL_ASYNC)); /* * Rewind the upl offset to the beginning of the * transaction. */ upl_offset = cbp_head->b_uploffset; + } - for (cbp = cbp_head; cbp;) { - buf_t cbp_next; - - size += cbp->b_bcount; - io_size += cbp->b_bcount; + if (ISSET(flags, CL_COMMIT)) { + cluster_handle_associated_upl(iostate, upl, upl_offset, + upl_end_offset - upl_offset); + } - cbp_next = cbp->b_trans_next; - free_io_buf(cbp); - cbp = cbp_next; - } + // Free all the IO buffers in this transaction + for (cbp = cbp_head; cbp;) { + buf_t cbp_next; + + size += cbp->b_bcount; + io_size += cbp->b_bcount; + + cbp_next = cbp->b_trans_next; + free_io_buf(cbp); + cbp = cbp_next; } + if (iostate) { int need_wakeup = 0; @@ -1605,12 +1822,13 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (need_wakeup) wakeup((caddr_t)&iostate->io_wanted); } + if (flags & CL_COMMIT) { int upl_flags; - pg_offset = upl_offset & PAGE_MASK; + pg_offset = upl_offset & PAGE_MASK; abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK; - + upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE, @@ -1713,16 +1931,16 @@ cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct } max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD)); - if ((max_prefetch / PAGE_SIZE) > speculative_prefetch_max) - max_prefetch = (speculative_prefetch_max * PAGE_SIZE); + if (max_prefetch > speculative_prefetch_max) + max_prefetch = speculative_prefetch_max; if (max_prefetch <= PAGE_SIZE) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0); return; } - if (extent->e_addr < rap->cl_maxra) { - if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) { + if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) { + if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0); @@ -2098,7 +2316,7 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in upl_size_t upl_size, vector_upl_size = 0; vm_size_t upl_needed_size; mach_msg_type_number_t pages_in_pl; - int upl_flags; + upl_control_flags_t upl_flags; kern_return_t kret; mach_msg_type_number_t i; int force_data_sync; @@ -2137,6 +2355,9 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in if (flags & IO_NOCACHE) io_flag |= CL_NOCACHE; + if (flags & IO_SKIP_ENCRYPTION) + io_flag |= CL_ENCRYPTED; + iostate.io_completed = 0; iostate.io_issued = 0; iostate.io_error = 0; @@ -2184,6 +2405,7 @@ next_dwrite: goto wait_for_dwrites; } + task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE); while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { int throttle_type; @@ -2214,7 +2436,7 @@ next_dwrite: } if (first_IO) { - cluster_syncup(vp, newEOF, callback, callback_arg); + cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0); first_IO = 0; } io_size = io_req_size & ~PAGE_MASK; @@ -2246,13 +2468,15 @@ next_dwrite: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START, (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); + vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map; for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { pages_in_pl = 0; upl_size = upl_needed_size; upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | - UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; + UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE + | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE); - kret = vm_map_get_upl(current_map(), + kret = vm_map_get_upl(map, (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), &upl_size, &upl, @@ -2337,22 +2561,13 @@ next_dwrite: */ } - /* - * Now look for pages already in the cache - * and throw them away. - * uio->uio_offset is page aligned within the file - * io_size is a multiple of PAGE_SIZE - */ - ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL); - /* * we want push out these writes asynchronously so that we can overlap * the preparation of the next I/O * if there are already too many outstanding writes * wait until some complete before issuing the next */ - if (iostate.io_issued > iostate.io_completed) - cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct"); + cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct"); if (iostate.io_error) { /* @@ -2437,14 +2652,12 @@ wait_for_dwrites: retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } + /* + * make sure all async writes issued as part of this stream + * have completed before we return + */ + cluster_iostate_wait(&iostate, 0, "cluster_write_direct"); - if (iostate.io_issued > iostate.io_completed) { - /* - * make sure all async writes issued as part of this stream - * have completed before we return - */ - cluster_iostate_wait(&iostate, 0, "cluster_write_direct"); - } if (iostate.io_error) retval = iostate.io_error; @@ -2489,7 +2702,7 @@ cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, upl_size_t upl_size; vm_size_t upl_needed_size; mach_msg_type_number_t pages_in_pl; - int upl_flags; + upl_control_flags_t upl_flags; kern_return_t kret; struct clios iostate; int error = 0; @@ -2505,7 +2718,7 @@ cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, * -- the io_req_size will not exceed iov_len * -- the target address is physically contiguous */ - cluster_syncup(vp, newEOF, callback, callback_arg); + cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0); devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; @@ -2528,9 +2741,11 @@ next_cwrite: pages_in_pl = 0; upl_size = upl_needed_size; upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | - UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; + UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE + | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE); - kret = vm_map_get_upl(current_map(), + vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map; + kret = vm_map_get_upl(map, (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); @@ -2555,7 +2770,7 @@ next_cwrite: } pl = ubc_upl_pageinfo(upl[cur_upl]); - src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset; + src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset; while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { u_int32_t head_size; @@ -2605,8 +2820,7 @@ next_cwrite: * if there are already too many outstanding writes * wait until some have completed before issuing the next */ - if (iostate.io_issued > iostate.io_completed) - cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig"); + cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig"); if (iostate.io_error) { /* @@ -2651,8 +2865,7 @@ wait_for_cwrites: * make sure all async writes that are part of this stream * have completed before we proceed */ - if (iostate.io_issued > iostate.io_completed) - cluster_iostate_wait(&iostate, 0, "cluster_write_contig"); + cluster_iostate_wait(&iostate, 0, "cluster_write_contig"); if (iostate.io_error) error = iostate.io_error; @@ -2760,6 +2973,9 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old if (flags & IO_NOCACHE) bflag |= CL_NOCACHE; + if (flags & IO_SKIP_ENCRYPTION) + bflag |= CL_ENCRYPTED; + zero_cnt = 0; zero_cnt1 = 0; zero_off = 0; @@ -3374,7 +3590,9 @@ cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (* flags |= IO_NOCACHE; if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) flags |= IO_RAOFF; - + + if (flags & IO_SKIP_ENCRYPTION) + flags |= IO_ENCRYPTED; /* * If we're doing an encrypted IO, then first check to see * if the IO requested was page aligned. If not, then bail @@ -3398,14 +3616,9 @@ cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (* * otherwise, find out if we want the direct or contig variant for * the first vector in the uio request */ - if (((flags & IO_NOCACHE) || (flags & IO_ENCRYPTED)) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) { - - boolean_t check_io_type = TRUE; + if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED) ) { - - if (check_io_type) { - retval = cluster_io_type(uio, &read_type, &read_length, 0); - } + retval = cluster_io_type(uio, &read_type, &read_length, 0); } while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) { @@ -3514,6 +3727,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file if (flags & IO_NOCACHE) bflag |= CL_NOCACHE; + if (flags & IO_SKIP_ENCRYPTION) + bflag |= CL_ENCRYPTED; + max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD)); max_rd_size = max_prefetch; @@ -3699,8 +3915,13 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file if (upl_size > max_io_size) upl_size = max_io_size; } else { - if (upl_size > max_io_size / 4) + if (upl_size > max_io_size / 4) { upl_size = max_io_size / 4; + upl_size &= ~PAGE_MASK; + + if (upl_size == 0) + upl_size = PAGE_SIZE; + } } pages_in_upl = upl_size / PAGE_SIZE; @@ -3855,7 +4076,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file rap->cl_lastr = extent.e_addr; } } - if (iostate.io_issued > iostate.io_completed) + if (iolock_inited == TRUE) cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); if (iostate.io_error) @@ -3870,7 +4091,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file io_req_size -= (val_size - io_requested); } } else { - if (iostate.io_issued > iostate.io_completed) + if (iolock_inited == TRUE) cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); } if (start_pg < last_pg) { @@ -3957,17 +4178,16 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file } } if (iolock_inited == TRUE) { - if (iostate.io_issued > iostate.io_completed) { - /* - * cluster_io returned an error after it - * had already issued some I/O. we need - * to wait for that I/O to complete before - * we can destroy the iostate mutex... - * 'retval' already contains the early error - * so no need to pick it up from iostate.io_error - */ - cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); - } + /* + * cluster_io returned an error after it + * had already issued some I/O. we need + * to wait for that I/O to complete before + * we can destroy the iostate mutex... + * 'retval' already contains the early error + * so no need to pick it up from iostate.io_error + */ + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); } if (rap != NULL) { @@ -3983,6 +4203,72 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file return (retval); } +/* + * We don't want another read/write lock for every vnode in the system + * so we keep a hash of them here. There should never be very many of + * these around at any point in time. + */ +cl_direct_read_lock_t *cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type) +{ + struct cl_direct_read_locks *head + = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp) + % CL_DIRECT_READ_LOCK_BUCKETS]; + + struct cl_direct_read_lock *lck, *new_lck = NULL; + + for (;;) { + lck_spin_lock(&cl_direct_read_spin_lock); + + LIST_FOREACH(lck, head, chain) { + if (lck->vp == vp) { + ++lck->ref_count; + lck_spin_unlock(&cl_direct_read_spin_lock); + if (new_lck) { + // Someone beat us to it, ditch the allocation + lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp); + FREE(new_lck, M_TEMP); + } + lck_rw_lock(&lck->rw_lock, type); + return lck; + } + } + + if (new_lck) { + // Use the lock we allocated + LIST_INSERT_HEAD(head, new_lck, chain); + lck_spin_unlock(&cl_direct_read_spin_lock); + lck_rw_lock(&new_lck->rw_lock, type); + return new_lck; + } + + lck_spin_unlock(&cl_direct_read_spin_lock); + + // Allocate a new lock + MALLOC(new_lck, cl_direct_read_lock_t *, sizeof(*new_lck), + M_TEMP, M_WAITOK); + lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr); + new_lck->vp = vp; + new_lck->ref_count = 1; + + // Got to go round again + } +} + +void cluster_unlock_direct_read(cl_direct_read_lock_t *lck) +{ + lck_rw_done(&lck->rw_lock); + + lck_spin_lock(&cl_direct_read_spin_lock); + if (lck->ref_count == 1) { + LIST_REMOVE(lck, chain); + lck_spin_unlock(&cl_direct_read_spin_lock); + lck_rw_destroy(&lck->rw_lock, cl_mtx_grp); + FREE(lck, M_TEMP); + } else { + --lck->ref_count; + lck_spin_unlock(&cl_direct_read_spin_lock); + } +} static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, @@ -3995,7 +4281,7 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, upl_size_t upl_size, vector_upl_size = 0; vm_size_t upl_needed_size; unsigned int pages_in_pl; - int upl_flags; + upl_control_flags_t upl_flags; kern_return_t kret; unsigned int i; int force_data_sync; @@ -4025,6 +4311,11 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, off_t v_upl_uio_offset = 0; int vector_upl_index=0; upl_t vector_upl = NULL; + cl_direct_read_lock_t *lock = NULL; + + user_addr_t orig_iov_base = 0; + user_addr_t last_iov_base = 0; + user_addr_t next_iov_base = 0; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); @@ -4047,6 +4338,9 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, io_flag |= CL_NOCACHE; } + if (flags & IO_SKIP_ENCRYPTION) + io_flag |= CL_ENCRYPTED; + iostate.io_completed = 0; iostate.io_issued = 0; iostate.io_error = 0; @@ -4076,6 +4370,9 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, strict_uncached_IO = ubc_strict_uncached_IO(vp); + orig_iov_base = uio_curriovbase(uio); + last_iov_base = orig_iov_base; + next_dread: io_req_size = *read_length; iov_base = uio_curriovbase(uio); @@ -4250,7 +4547,7 @@ next_dread: goto wait_for_dreads; } - /* + /* * Don't re-check the UBC data if we are looking for uncached IO * or asking for encrypted blocks. */ @@ -4261,6 +4558,19 @@ next_dread: io_size = 0; + if (!lock) { + /* + * We hold a lock here between the time we check the + * cache and the time we issue I/O. This saves us + * from having to lock the pages in the cache. Not + * all clients will care about this lock but some + * clients may want to guarantee stability between + * here and when the I/O is issued in which case they + * will take the lock exclusively. + */ + lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED); + } + ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); if (io_size == 0) { @@ -4305,17 +4615,18 @@ next_dread: else no_zero_fill = 0; + vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map; for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { pages_in_pl = 0; upl_size = upl_needed_size; - upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; - + upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE + | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE); if (no_zero_fill) upl_flags |= UPL_NOZEROFILL; if (force_data_sync) upl_flags |= UPL_FORCE_DATA_SYNC; - kret = vm_map_create_upl(current_map(), + kret = vm_map_create_upl(map, (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), &upl_size, &upl, NULL, &pages_in_pl, &upl_flags); @@ -4383,8 +4694,7 @@ next_dread: * if there are already too many outstanding reads * wait until some have completed before issuing the next read */ - if (iostate.io_issued > iostate.io_completed) - cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct"); + cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct"); if (iostate.io_error) { /* @@ -4401,7 +4711,6 @@ next_dread: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START, upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0); - if(!useVectorUPL) { if (no_zero_fill) io_flag &= ~CL_PRESERVE; @@ -4428,7 +4737,15 @@ next_dread: retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } - } + } + last_iov_base = iov_base + io_size; + + if (lock) { + // We don't need to wait for the I/O to complete + cluster_unlock_direct_read(lock); + lock = NULL; + } + /* * update the uio structure */ @@ -4475,12 +4792,16 @@ wait_for_dreads: retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } + + // We don't need to wait for the I/O to complete + if (lock) + cluster_unlock_direct_read(lock); + /* * make sure all async reads that are part of this stream * have completed before we return */ - if (iostate.io_issued > iostate.io_completed) - cluster_iostate_wait(&iostate, 0, "cluster_read_direct"); + cluster_iostate_wait(&iostate, 0, "cluster_read_direct"); if (iostate.io_error) retval = iostate.io_error; @@ -4490,6 +4811,15 @@ wait_for_dreads: if (io_throttled == TRUE && retval == 0) retval = EAGAIN; + for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) { + /* + * This is specifically done for pmap accounting purposes. + * vm_pre_fault() will call vm_fault() to enter the page into + * the pmap if there isn't _a_ physical page for that VA already. + */ + vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK)); + } + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -4519,7 +4849,7 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, upl_size_t upl_size; vm_size_t upl_needed_size; mach_msg_type_number_t pages_in_pl; - int upl_flags; + upl_control_flags_t upl_flags; kern_return_t kret; struct clios iostate; int error= 0; @@ -4546,7 +4876,7 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, * -- the read_length will not exceed the current iov_len * -- the target address is physically contiguous for read_length */ - cluster_syncup(vp, filesize, callback, callback_arg); + cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC); devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; @@ -4573,13 +4903,15 @@ next_cread: pages_in_pl = 0; upl_size = upl_needed_size; - upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; + upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE + | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START, (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0); - kret = vm_map_get_upl(current_map(), + vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map; + kret = vm_map_get_upl(map, (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); @@ -4604,7 +4936,7 @@ next_cread: } pl = ubc_upl_pageinfo(upl[cur_upl]); - dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset; + dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset; while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { u_int32_t head_size; @@ -4655,8 +4987,7 @@ next_cread: * if there are already too many outstanding reads * wait until some have completed before issuing the next */ - if (iostate.io_issued > iostate.io_completed) - cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig"); + cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig"); if (iostate.io_error) { /* @@ -4698,8 +5029,7 @@ wait_for_creads: * make sure all async reads that are part of this stream * have completed before we proceed */ - if (iostate.io_issued > iostate.io_completed) - cluster_iostate_wait(&iostate, 0, "cluster_read_contig"); + cluster_iostate_wait(&iostate, 0, "cluster_read_contig"); if (iostate.io_error) error = iostate.io_error; @@ -4727,7 +5057,7 @@ cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t m user_addr_t iov_base = 0; upl_t upl; upl_size_t upl_size; - int upl_flags; + upl_control_flags_t upl_flags; int retval = 0; /* @@ -4751,9 +5081,10 @@ cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t m else upl_size = (u_int32_t)iov_len; - upl_flags = UPL_QUERY_OBJECT_TYPE; - - if ((vm_map_get_upl(current_map(), + upl_flags = UPL_QUERY_OBJECT_TYPE | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE); + + vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map; + if ((vm_map_get_upl(map, (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) { /* @@ -5001,7 +5332,7 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -2, 0); return (0); } - if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) { + if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) { lck_mtx_unlock(&wbp->cl_lockw); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -3, 0); @@ -5328,6 +5659,9 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c else bflag = 0; + if (flags & IO_SKIP_ENCRYPTION) + bflag |= CL_ENCRYPTED; + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); @@ -5628,7 +5962,7 @@ cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t } did_read = 1; } - ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64); + ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64); /* * NOTE: There is no prototype for the following in BSD. It, and the definitions @@ -5675,6 +6009,7 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) int retval = 0; int xsize; upl_page_info_t *pl; + int dirty_count; xsize = *io_resid; @@ -5711,10 +6046,13 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) pg_offset = upl_offset & PAGE_MASK; csize = min(PAGE_SIZE - pg_offset, xsize); + dirty_count = 0; while (xsize && retval == 0) { addr64_t paddr; - paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset; + paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset; + if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) + dirty_count++; retval = uiomove64(paddr, csize, uio); @@ -5727,6 +6065,7 @@ cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) uio->uio_segflg = segflg; + task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, (int)uio->uio_offset, xsize, retval, segflg, 0); @@ -5848,15 +6187,15 @@ is_file_clean(vnode_t vp, off_t filesize) * single hashtable entry. Each hashtable entry is aligned to this * size within the file. */ -#define DRT_BITVECTOR_PAGES 256 +#define DRT_BITVECTOR_PAGES ((1024 * 1024) / PAGE_SIZE) /* * File offset handling. * - * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES; - * the correct formula is (~(DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1) + * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES; + * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)) */ -#define DRT_ADDRESS_MASK (~((1 << 20) - 1)) +#define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)) #define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK) /* @@ -5954,7 +6293,15 @@ is_file_clean(vnode_t vp, off_t filesize) */ struct vfs_drt_hashentry { u_int64_t dhe_control; - u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32]; +/* +* dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32]; +* DRT_BITVECTOR_PAGES is defined as ((1024 * 1024) / PAGE_SIZE) +* Since PAGE_SIZE is only known at boot time, +* -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k) +* -declare dhe_bitvector array for largest possible length +*/ +#define MAX_DRT_BITVECTOR_PAGES (1024 * 1024)/( 4 * 1024) + u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES/32]; }; /* @@ -6090,7 +6437,7 @@ vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) */ kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, - (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); + (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION, VM_KERN_MEMORY_FILE); if (kret != KERN_SUCCESS) return(kret); cmap->scm_magic = DRT_SCM_MAGIC;