/*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
+#include <vm/vm_fault.h>
#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>
#define CL_NOCACHE 0x40000
#define MAX_VECTOR_UPL_ELEMENTS 8
-#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE) * PAGE_SIZE
+#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
-static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg);
+static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
* can represent it in a 32 bit int
*/
#define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
-#define MAX_IO_CONTIG_SIZE (MAX_UPL_SIZE * PAGE_SIZE)
+#define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
#define MAX_VECTS 16
#define MIN_DIRECT_WRITE_SIZE (4 * PAGE_SIZE)
#define WRITE_BEHIND_SSD 1
#define PREFETCH 3
-#define PREFETCH_SSD 1
-uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3);
-uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use for a specluative read-ahead on SSDs*/
+#define PREFETCH_SSD 2
+uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a speculative read-ahead */
+uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a speculative read-ahead on SSDs */
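/*
 * Editorial note, not part of the patch: speculative_prefetch_max changes
 * units here, from pages (MAX_UPL_SIZE * 3) to bytes (MAX_UPL_SIZE_BYTES * 3).
 * The comparison in the read-ahead path later in this change is adjusted to
 * match -- it now compares max_prefetch against speculative_prefetch_max
 * directly instead of dividing by PAGE_SIZE first.
 */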
#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
break;
}
- if (segcnt > MAX_UPL_SIZE) {
+ if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
/*
* don't allow a size beyond the max UPL size we can create
*/
- segcnt = MAX_UPL_SIZE;
+ segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
}
max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
- if (max_io_size < (MAX_UPL_TRANSFER * PAGE_SIZE)) {
+ if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
/*
* don't allow a size smaller than the old fixed limit
*/
- max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE);
+ max_io_size = MAX_UPL_TRANSFER_BYTES;
} else {
/*
* make sure the size specified is a multiple of PAGE_SIZE
static void
-cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg)
+cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
struct cl_writebehind *wbp;
if (wbp->cl_number) {
lck_mtx_lock(&wbp->cl_lockw);
- cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, 0, callback, callback_arg);
+ cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg);
lck_mtx_unlock(&wbp->cl_lockw);
}
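/*
 * Editorial note, not part of the patch: cluster_syncup() now takes its
 * push flags from the caller instead of hard-coding PUSH_SYNC.  The call
 * sites updated later in this change show the intent; the direct and
 * contiguous write paths use
 *
 *	cluster_syncup(vp, newEOF, callback, callback_arg,
 *	               callback ? PUSH_SYNC : 0);
 *
 * so the push is only forced synchronous when a completion callback is
 * supplied, while cluster_read_contig still passes PUSH_SYNC.
 */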
pl = ubc_upl_pageinfo(upl);
if (upl_device_page(pl) == TRUE) {
- zero_addr = ((addr64_t)upl_phys_page(pl, 0) << 12) + upl_offset;
+ zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
bzero_phys_nc(zero_addr, size);
} else {
page_index = upl_offset / PAGE_SIZE;
page_offset = upl_offset & PAGE_MASK;
- zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
+ zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
zero_cnt = min(PAGE_SIZE - page_offset, size);
bzero_phys(zero_addr, zero_cnt);
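/*
 * Editorial note, not part of the patch: the two branches above differ
 * because a device-backed UPL describes a single physically contiguous
 * range, so the whole span can be zeroed in one bzero_phys_nc() from the
 * base physical page, whereas an ordinary UPL must be zeroed page by page
 * since each page may sit at a different physical address (hence the
 * per-page upl_phys_page() lookup and the min() against the remainder of
 * the page).  This is the editor's reading of the code, not a statement
 * from the patch author.
 */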
*
* go direct to vnode_pageout so that we don't have to
* unbusy the page from the UPL... we used to do this
- * so that we could call ubc_sync_range, but that results
+ * so that we could call ubc_msync, but that results
* in a potential deadlock if someone else races us to acquire
* that page and wins and in addition needs one of the pages
* we're continuing to hold in the UPL
}
continue;
}
- lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
+ lblkno = (daddr64_t)(f_offset / 0x1000);
/*
* we have now figured out how much I/O we can do - this is in 'io_size'
* pg_offset is the starting point in the first page for the I/O
if (buf_setupl(cbp, upl, upl_offset))
panic("buf_setupl failed\n");
-
+#if CONFIG_IOSCHED
+ upl_set_blkno(upl, upl_offset, io_size, blkno);
+#endif
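/*
 * Editorial note, not part of the patch: when the kernel is built with the
 * I/O scheduler (CONFIG_IOSCHED), the new upl_set_blkno() call records the
 * device block number backing this UPL range alongside the
 * upl_offset/io_size pair; presumably this lets the scheduler associate
 * the pages with the on-disk location of the I/O being issued.  The
 * "presumably" is the editor's hedge -- the patch itself does not explain
 * the call.
 */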
cbp->b_trans_next = (buf_t)NULL;
if ((cbp->b_iostate = (void *)iostate))
}
max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD));
- if ((max_prefetch / PAGE_SIZE) > speculative_prefetch_max)
- max_prefetch = (speculative_prefetch_max * PAGE_SIZE);
+ if (max_prefetch > speculative_prefetch_max)
+ max_prefetch = speculative_prefetch_max;
if (max_prefetch <= PAGE_SIZE) {
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
return;
}
- if (extent->e_addr < rap->cl_maxra) {
- if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) {
+ if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
+ if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
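/*
 * Editorial note, not part of the patch: the bail-out test above changes
 * its threshold.  Previously read-ahead was skipped when the distance
 * already prefetched beyond the current request (cl_maxra - e_addr)
 * exceeded a quarter of the global prefetch window (max_prefetch /
 * PAGE_SIZE / 4); now it is skipped when that distance exceeds a quarter
 * of the last read-ahead actually issued (cl_ralen / 4), and only once
 * cl_ralen has reached at least 4 pages.  The "far enough ahead already"
 * decision therefore scales with the stream's own read-ahead size rather
 * than the device-wide maximum.
 */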
if (flags & IO_NOCACHE)
io_flag |= CL_NOCACHE;
+ if (flags & IO_SKIP_ENCRYPTION)
+ io_flag |= CL_ENCRYPTED;
+
iostate.io_completed = 0;
iostate.io_issued = 0;
iostate.io_error = 0;
}
if (first_IO) {
- cluster_syncup(vp, newEOF, callback, callback_arg);
+ cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
first_IO = 0;
}
io_size = io_req_size & ~PAGE_MASK;
* if there are already too many outstanding writes
* wait until some complete before issuing the next
*/
- if (iostate.io_issued > iostate.io_completed)
- cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct");
+ cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct");
if (iostate.io_error) {
/*
retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
reset_vector_run_state();
}
- if (iostate.io_issued > iostate.io_completed) {
- /*
- * make sure all async writes issued as part of this stream
- * have completed before we return
- */
- cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
- }
+ /*
+ * make sure all async writes issued as part of this stream
+ * have completed before we return
+ */
+ cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
if (iostate.io_error)
retval = iostate.io_error;
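/*
 * Editorial note, not part of the patch: the "if (iostate.io_issued >
 * iostate.io_completed)" guards in front of cluster_iostate_wait() calls
 * are dropped throughout this change (here and in the contig/direct read
 * and write paths below).  A minimal paraphrase of the wait loop, based on
 * the cluster_iostate_wait() defined elsewhere in this file (the io_wanted
 * field is assumed), shows why this is behavior-preserving: when nothing
 * is outstanding the loop body never runs and the call costs only a mutex
 * lock/unlock.
 *
 *	lck_mtx_lock(&iostate->io_mtxp);
 *	while ((iostate->io_issued - iostate->io_completed) > target) {
 *		iostate->io_wanted = 1;
 *		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp,
 *		       PRIBIO + 1, wait_name, NULL);
 *	}
 *	lck_mtx_unlock(&iostate->io_mtxp);
 */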
* -- the io_req_size will not exceed iov_len
* -- the target address is physically contiguous
*/
- cluster_syncup(vp, newEOF, callback, callback_arg);
+ cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
}
pl = ubc_upl_pageinfo(upl[cur_upl]);
- src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
+ src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
u_int32_t head_size;
* if there are already too many outstanding writes
* wait until some have completed before issuing the next
*/
- if (iostate.io_issued > iostate.io_completed)
- cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
+ cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
if (iostate.io_error) {
/*
* make sure all async writes that are part of this stream
* have completed before we proceed
*/
- if (iostate.io_issued > iostate.io_completed)
- cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
+ cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
if (iostate.io_error)
error = iostate.io_error;
if (flags & IO_NOCACHE)
bflag |= CL_NOCACHE;
+ if (flags & IO_SKIP_ENCRYPTION)
+ bflag |= CL_ENCRYPTED;
+
zero_cnt = 0;
zero_cnt1 = 0;
zero_off = 0;
if ((vp->v_flag & VRAOFF) || speculative_reads_disabled)
flags |= IO_RAOFF;
+ if (flags & IO_SKIP_ENCRYPTION)
+ flags |= IO_ENCRYPTED;
/*
* If we're doing an encrypted IO, then first check to see
* if the IO requested was page aligned. If not, then bail
* otherwise, find out if we want the direct or contig variant for
* the first vector in the uio request
*/
- if (((flags & IO_NOCACHE) || (flags & IO_ENCRYPTED)) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
-
- boolean_t check_io_type = TRUE;
-
- if (check_io_type) {
- retval = cluster_io_type(uio, &read_type, &read_length, 0);
- }
+ if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED) ) {
+ retval = cluster_io_type(uio, &read_type, &read_length, 0);
}
while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
if (flags & IO_NOCACHE)
bflag |= CL_NOCACHE;
+ if (flags & IO_SKIP_ENCRYPTION)
+ bflag |= CL_ENCRYPTED;
+
max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD));
max_rd_size = max_prefetch;
if (upl_size > max_io_size)
upl_size = max_io_size;
} else {
- if (upl_size > max_io_size / 4)
+ if (upl_size > max_io_size / 4) {
upl_size = max_io_size / 4;
+ upl_size &= ~PAGE_MASK;
+
+ if (upl_size == 0)
+ upl_size = PAGE_SIZE;
+ }
}
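/*
 * Editorial note, not part of the patch: the added rounding matters because
 * max_io_size / 4 need not be a multiple of PAGE_SIZE.  The UPL built from
 * this size (via ubc_create_upl() in the surrounding code) must be
 * page-aligned, and clamping a sub-page result back up to PAGE_SIZE keeps
 * the pages_in_upl = upl_size / PAGE_SIZE computation below from going to
 * zero.
 */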
pages_in_upl = upl_size / PAGE_SIZE;
rap->cl_lastr = extent.e_addr;
}
}
- if (iostate.io_issued > iostate.io_completed)
+ if (iolock_inited == TRUE)
cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
if (iostate.io_error)
io_req_size -= (val_size - io_requested);
}
} else {
- if (iostate.io_issued > iostate.io_completed)
+ if (iolock_inited == TRUE)
cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
}
if (start_pg < last_pg) {
}
}
if (iolock_inited == TRUE) {
- if (iostate.io_issued > iostate.io_completed) {
- /*
- * cluster_io returned an error after it
- * had already issued some I/O. we need
- * to wait for that I/O to complete before
- * we can destroy the iostate mutex...
- * 'retval' already contains the early error
- * so no need to pick it up from iostate.io_error
- */
- cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
- }
+ /*
+ * cluster_io returned an error after it
+ * had already issued some I/O. we need
+ * to wait for that I/O to complete before
+ * we can destroy the iostate mutex...
+ * 'retval' already contains the early error
+ * so no need to pick it up from iostate.io_error
+ */
+ cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
+
lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
}
if (rap != NULL) {
return (retval);
}
-
static int
cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
int flags, int (*callback)(buf_t, void *), void *callback_arg)
int vector_upl_index=0;
upl_t vector_upl = NULL;
+ user_addr_t orig_iov_base = 0;
+ user_addr_t last_iov_base = 0;
+ user_addr_t next_iov_base = 0;
+
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
(int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
io_flag |= CL_NOCACHE;
}
+ if (flags & IO_SKIP_ENCRYPTION)
+ io_flag |= CL_ENCRYPTED;
+
iostate.io_completed = 0;
iostate.io_issued = 0;
iostate.io_error = 0;
strict_uncached_IO = ubc_strict_uncached_IO(vp);
+ orig_iov_base = uio_curriovbase(uio);
+ last_iov_base = orig_iov_base;
+
next_dread:
io_req_size = *read_length;
iov_base = uio_curriovbase(uio);
* if there are already too many outstanding reads
* wait until some have completed before issuing the next read
*/
- if (iostate.io_issued > iostate.io_completed)
- cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
+ cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
if (iostate.io_error) {
/*
retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
reset_vector_run_state();
}
- }
+ }
+ last_iov_base = iov_base + io_size;
+
/*
* update the uio structure
*/
* make sure all async reads that are part of this stream
* have completed before we return
*/
- if (iostate.io_issued > iostate.io_completed)
- cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
+ cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
if (iostate.io_error)
retval = iostate.io_error;
if (io_throttled == TRUE && retval == 0)
retval = EAGAIN;
+ for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) {
+ /*
+ * This is specifically done for pmap accounting purposes.
+ * vm_pre_fault() will call vm_fault() to enter the page into
+ * the pmap if there isn't _a_ physical page for that VA already.
+ */
+ vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK));
+ }
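/*
 * Editorial note, not part of the patch: orig_iov_base is captured before
 * the first direct read and last_iov_base is advanced to iov_base + io_size
 * after each issued I/O (see above), so this loop walks exactly the
 * user-buffer range the direct reads filled, one page at a time,
 * pre-faulting each page into the pmap for the accounting reason given in
 * the preceding comment.
 */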
+
if (io_req_size && retval == 0) {
/*
* we couldn't handle the tail of this request in DIRECT mode
* -- the read_length will not exceed the current iov_len
* -- the target address is physically contiguous for read_length
*/
- cluster_syncup(vp, filesize, callback, callback_arg);
+ cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
}
pl = ubc_upl_pageinfo(upl[cur_upl]);
- dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
+ dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
u_int32_t head_size;
* if there are already too many outstanding reads
* wait until some have completed before issuing the next
*/
- if (iostate.io_issued > iostate.io_completed)
- cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
+ cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
if (iostate.io_error) {
/*
* make sure all async reads that are part of this stream
* have completed before we proceed
*/
- if (iostate.io_issued > iostate.io_completed)
- cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
+ cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
if (iostate.io_error)
error = iostate.io_error;
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -2, 0);
return (0);
}
- if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
+ if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
lck_mtx_unlock(&wbp->cl_lockw);
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -3, 0);
else
bflag = 0;
+ if (flags & IO_SKIP_ENCRYPTION)
+ bflag |= CL_ENCRYPTED;
+
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
(int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
}
did_read = 1;
}
- ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
+ ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
/*
* NOTE: There is no prototype for the following in BSD. It, and the definitions
while (xsize && retval == 0) {
addr64_t paddr;
- paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
+ paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
retval = uiomove64(paddr, csize, uio);