X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/b4c24cb9d3df001f2892dc4ed451bc769ff28a9f..d7e50217d7adf6e52786a38bcaa4cd698cb9a79e:/bsd/vfs/vfs_cluster.c

diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c
index 49b0938bb..ec2eaf7f4 100644
--- a/bsd/vfs/vfs_cluster.c
+++ b/bsd/vfs/vfs_cluster.c
@@ -3,19 +3,22 @@
  *
  * @APPLE_LICENSE_HEADER_START@
  *
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License"). You may not use this file except in compliance with the
- * License. Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
+ * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
  *
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
- * License for the specific language governing rights and limitations
- * under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
* * @APPLE_LICENSE_HEADER_END@ */ @@ -81,12 +84,12 @@ #define CL_DEV_MEMORY 0x200 #define CL_PRESERVE 0x400 + struct clios { - u_int io_completed; - u_int io_issued; - off_t io_offset; - int io_error; - int io_wanted; + u_int io_completed; /* amount of io that has currently completed */ + u_int io_issued; /* amount of io that was successfully issued */ + int io_error; /* error code of first error encountered */ + int io_wanted; /* someone is sleeping waiting for a change in state */ }; @@ -106,7 +109,7 @@ static int cluster_phys_read(struct vnode *vp, struct uio *uio, static int cluster_phys_write(struct vnode *vp, struct uio *uio, off_t newEOF, int devblocksize, int flags); static int cluster_align_phys_io(struct vnode *vp, struct uio *uio, - vm_offset_t usr_paddr, int xsize, int devblocksize, int flags); + addr64_t usr_paddr, int xsize, int devblocksize, int flags); static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay); static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all); @@ -128,7 +131,6 @@ cluster_iodone(bp) int total_resid; int upl_offset; int zero_offset; - int l_blkno; upl_t upl; struct buf *cbp; struct buf *cbp_head; @@ -169,13 +171,9 @@ cluster_iodone(bp) real_bp = cbp->b_real_bp; vp = cbp->b_vp; zero_offset= cbp->b_validend; - l_blkno = cbp->b_lblkno; iostate = (struct clios *)cbp->b_iostate; while (cbp) { - if (cbp->b_vectorcount > 1) - _FREE(cbp->b_vectorlist, M_SEGMENT); - if ((cbp->b_flags & B_ERROR) && error == 0) error = cbp->b_error; @@ -196,22 +194,20 @@ cluster_iodone(bp) wakeup((caddr_t)&vp->v_numoutput); } if (iostate) { - if (error) { - off_t error_offset; - - error_offset = (off_t)l_blkno * PAGE_SIZE_64; + /* + * someone has issued multiple I/Os asynchrounsly + * and is waiting for them to complete (streaming) + */ + if (error && iostate->io_error == 0) + iostate->io_error = error; - if (iostate->io_error == 0) { - iostate->io_error = error; - iostate->io_offset = error_offset; - } else { - if (error_offset < iostate->io_offset) - iostate->io_offset = error_offset; - } - } iostate->io_completed += total_size; if (iostate->io_wanted) { + /* + * someone is waiting for the state of + * this io stream to change + */ iostate->io_wanted = 0; wakeup((caddr_t)&iostate->io_wanted); } @@ -324,7 +320,6 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, struct clios *iostate; { struct buf *cbp; - struct iovec *iovp; u_int size; u_int io_size; int io_flags; @@ -386,8 +381,7 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, zero_offset = upl_offset + non_rounded_size; } while (size) { - int vsize; - int i; + int i; int pl_index; int pg_resid; int num_contig; @@ -531,31 +525,14 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp->b_blkno = blkno; } - if (pg_count > 1) { - if (pg_count > max_vectors) { - io_size -= (pg_count - max_vectors) * PAGE_SIZE; + if (pg_count > max_vectors) { + io_size -= (pg_count - max_vectors) * PAGE_SIZE; - if (io_size < 0) { - io_size = PAGE_SIZE - pg_offset; - pg_count = 1; - } else - pg_count = max_vectors; - } - /* - * we need to allocate space for the vector list - */ - if (pg_count > 1) { - iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count, - M_SEGMENT, M_NOWAIT); - - if (iovp == (struct iovec *) 0) { - /* - * if the allocation fails, then throttle down to a single page - */ - io_size = PAGE_SIZE - pg_offset; - pg_count = 1; - } - } + if (io_size < 
0) { + io_size = PAGE_SIZE - pg_offset; + pg_count = 1; + } else + pg_count = max_vectors; } /* Throttle the speculative IO */ @@ -566,53 +543,9 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, cbp = alloc_io_buf(vp, priv); - if (pg_count == 1) - /* - * we use the io vector that's reserved in the buffer header - * this insures we can always issue an I/O even in a low memory - * condition that prevents the _MALLOC from succeeding... this - * is necessary to prevent deadlocks with the pager - */ - iovp = (struct iovec *)(&cbp->b_vects[0]); - - cbp->b_vectorlist = (void *)iovp; - cbp->b_vectorcount = pg_count; - - if (flags & CL_DEV_MEMORY) { - - iovp->iov_len = io_size; - iovp->iov_base = (caddr_t)upl_phys_page(pl, 0); - - if (iovp->iov_base == (caddr_t) 0) { - free_io_buf(cbp); - error = EINVAL; - } else - iovp->iov_base += upl_offset; - } else { - - for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) { - int psize; - - psize = PAGE_SIZE - pg_offset; - - if (psize > vsize) - psize = vsize; - iovp->iov_len = psize; - iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i); - - if (iovp->iov_base == (caddr_t) 0) { - if (pg_count > 1) - _FREE(cbp->b_vectorlist, M_SEGMENT); - free_io_buf(cbp); - - error = EINVAL; - break; - } - iovp->iov_base += pg_offset; - pg_offset = 0; - - if (flags & CL_PAGEOUT) { + if (flags & CL_PAGEOUT) { + for (i = 0; i < pg_count; i++) { int s; struct buf *bp; @@ -628,12 +561,7 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, } splx(s); } - vsize -= psize; - } } - if (error) - break; - if (flags & CL_ASYNC) { cbp->b_flags |= (B_CALL | B_ASYNC); cbp->b_iodone = (void *)cluster_iodone; @@ -648,6 +576,10 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, cbp->b_trans_next = (struct buf *)0; if (cbp->b_iostate = (void *)iostate) + /* + * caller wants to track the state of this + * io... 
bump the amount issued against this stream + */ iostate->io_issued += io_size; if (flags & CL_READ) @@ -737,8 +669,6 @@ start_io: for (cbp = cbp_head; cbp;) { struct buf * cbp_next; - if (cbp->b_vectorcount > 1) - _FREE(cbp->b_vectorlist, M_SEGMENT); upl_offset -= cbp->b_bcount; size += cbp->b_bcount; io_size += cbp->b_bcount; @@ -748,13 +678,20 @@ start_io: cbp = cbp_next; } if (iostate) { - if (iostate->io_error == 0) { + /* + * update the error condition for this stream + * since we never really issued the io + * just go ahead and adjust it back + */ + if (iostate->io_error == 0) iostate->io_error = error; - iostate->io_offset = f_offset - (off_t)io_size; - } iostate->io_issued -= io_size; if (iostate->io_wanted) { + /* + * someone is waiting for the state of + * this io stream to change + */ iostate->io_wanted = 0; wakeup((caddr_t)&iostate->io_wanted); } @@ -1241,6 +1178,7 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) vm_offset_t upl_offset; off_t max_io_size; int io_size; + int io_flag; int upl_size; int upl_needed_size; int pages_in_pl; @@ -1248,8 +1186,10 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) kern_return_t kret; struct iovec *iov; int i; + int first = 1; int force_data_sync; int error = 0; + struct clios iostate; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START, (int)uio->uio_offset, (int)uio->uio_resid, @@ -1263,152 +1203,182 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) */ cluster_try_push(vp, newEOF, 0, 1); + iostate.io_completed = 0; + iostate.io_issued = 0; + iostate.io_error = 0; + iostate.io_wanted = 0; + iov = uio->uio_iov; while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) { - io_size = uio->uio_resid; - - if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) - io_size = MAX_UPL_TRANSFER * PAGE_SIZE; + io_size = uio->uio_resid; - upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64; - upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START, - (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0); - - for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) - { - pages_in_pl = 0; - upl_size = upl_needed_size; - upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | - UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL; + if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) + io_size = MAX_UPL_TRANSFER * PAGE_SIZE; - kret = vm_map_get_upl(current_map(), - (vm_offset_t)iov->iov_base & ~PAGE_MASK, - &upl_size, - &upl, - NULL, - &pages_in_pl, - &upl_flags, - force_data_sync); + if (first) { + if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4) + io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8; + first = 0; + } + upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64; + upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START, + (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0); + + for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { + pages_in_pl = 0; + upl_size = upl_needed_size; + upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | + UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL; + + kret = vm_map_get_upl(current_map(), + (vm_offset_t)iov->iov_base & ~PAGE_MASK, + &upl_size, + &upl, + NULL, + &pages_in_pl, + &upl_flags, + force_data_sync); + + if (kret != KERN_SUCCESS) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, + 0, 0, 0, kret, 0); - if (kret != KERN_SUCCESS) - { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | 
DBG_FUNC_END, - 0, 0, 0, kret, 0); + /* + * cluster_nocopy_write: failed to get pagelist + * + * we may have already spun some portion of this request + * off as async requests... we need to wait for the I/O + * to complete before returning + */ + goto wait_for_writes; + } + pl = UPL_GET_INTERNAL_PAGE_LIST(upl); + pages_in_pl = upl_size / PAGE_SIZE; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END, - (int)uio->uio_offset, (int)uio->uio_resid, kret, 1, 0); + for (i = 0; i < pages_in_pl; i++) { + if (!upl_valid_page(pl, i)) + break; + } + if (i == pages_in_pl) + break; - /* cluster_nocopy_write: failed to get pagelist */ - /* do not return kret here */ - return(0); + /* + * didn't get all the pages back that we + * needed... release this upl and try again + */ + ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, + UPL_ABORT_FREE_ON_EMPTY); } + if (force_data_sync >= 3) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, + i, pages_in_pl, upl_size, kret, 0); - pl = UPL_GET_INTERNAL_PAGE_LIST(upl); - pages_in_pl = upl_size / PAGE_SIZE; - - for(i=0; i < pages_in_pl; i++) - { - if (!upl_valid_page(pl, i)) - break; + /* + * for some reason, we couldn't acquire a hold on all + * the pages needed in the user's address space + * + * we may have already spun some portion of this request + * off as async requests... we need to wait for the I/O + * to complete before returning + */ + goto wait_for_writes; } - if (i == pages_in_pl) - break; - - ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, - UPL_ABORT_FREE_ON_EMPTY); - } - - if (force_data_sync >= 3) - { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, - i, pages_in_pl, upl_size, kret, 0); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END, - (int)uio->uio_offset, (int)uio->uio_resid, kret, 2, 0); - return(0); - } - - /* - * Consider the possibility that upl_size wasn't satisfied. - */ - if (upl_size != upl_needed_size) - io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK; - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, - (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0); - - if (io_size == 0) - { - ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, - UPL_ABORT_FREE_ON_EMPTY); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, 0, 3, 0); + /* + * Consider the possibility that upl_size wasn't satisfied. + */ + if (upl_size != upl_needed_size) + io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK; - return(0); - } + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, + (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0); - /* - * Now look for pages already in the cache - * and throw them away. - */ + if (io_size == 0) { + ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, + UPL_ABORT_FREE_ON_EMPTY); - upl_f_offset = uio->uio_offset; /* this is page aligned in the file */ - max_io_size = io_size; + /* + * we may have already spun some portion of this request + * off as async requests... we need to wait for the I/O + * to complete before returning + */ + goto wait_for_writes; + } + /* + * Now look for pages already in the cache + * and throw them away. + */ - while (max_io_size) { + upl_f_offset = uio->uio_offset; /* this is page aligned in the file */ + max_io_size = io_size; - /* - * Flag UPL_POP_DUMP says if the page is found - * in the page cache it must be thrown away. 
- */ - ubc_page_op(vp, - upl_f_offset, - UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP, - 0, 0); - max_io_size -= PAGE_SIZE; - upl_f_offset += PAGE_SIZE; - } - - /* - * issue a synchronous write to cluster_io - */ + while (max_io_size) { + /* + * Flag UPL_POP_DUMP says if the page is found + * in the page cache it must be thrown away. + */ + ubc_page_op(vp, + upl_f_offset, + UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP, + 0, 0); + max_io_size -= PAGE_SIZE_64; + upl_f_offset += PAGE_SIZE_64; + } + /* + * we want push out these writes asynchronously so that we can overlap + * the preparation of the next I/O + * if there are already too many outstanding writes + * wait until some complete before issuing the next + */ + while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) { + iostate.io_wanted = 1; + tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0); + } + if (iostate.io_error) { + /* + * one of the earlier writes we issued ran into a hard error + * don't issue any more writes, cleanup the UPL + * that was just created but not used, then + * go wait for all writes that are part of this stream + * to complete before returning the error to the caller + */ + ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, + UPL_ABORT_FREE_ON_EMPTY); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START, - (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0); + goto wait_for_writes; + } + io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT; - error = cluster_io(vp, upl, upl_offset, uio->uio_offset, - io_size, devblocksize, 0, (struct buf *)0, (struct clios *)0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START, + (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0); - if (error == 0) { - /* - * The cluster_io write completed successfully, - * update the uio structure. - */ - iov->iov_base += io_size; - iov->iov_len -= io_size; - uio->uio_resid -= io_size; - uio->uio_offset += io_size; - } - /* - * always 'commit' the I/O via the abort primitive whether the I/O - * succeeded cleanly or not... this is necessary to insure that - * we preserve the state of the DIRTY flag on the pages used to - * provide the data for the I/O... 
the state of this flag SHOULD - * NOT be changed by a write - */ - ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, - UPL_ABORT_FREE_ON_EMPTY); + error = cluster_io(vp, upl, upl_offset, uio->uio_offset, + io_size, devblocksize, io_flag, (struct buf *)0, &iostate); + iov->iov_len -= io_size; + iov->iov_base += io_size; + uio->uio_resid -= io_size; + uio->uio_offset += io_size; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END, - (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END, + (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0); } /* end while */ +wait_for_writes: + /* + * make sure all async writes issued as part of this stream + * have completed before we return + */ + while (iostate.io_issued != iostate.io_completed) { + iostate.io_wanted = 1; + tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0); + } + if (iostate.io_error) + error = iostate.io_error; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END, (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0); @@ -1426,7 +1396,7 @@ cluster_phys_write(vp, uio, newEOF, devblocksize, flags) int flags; { upl_page_info_t *pl; - vm_offset_t src_paddr; + addr64_t src_paddr; upl_t upl; vm_offset_t upl_offset; int tail_size; @@ -1477,7 +1447,7 @@ cluster_phys_write(vp, uio, newEOF, devblocksize, flags) } pl = ubc_upl_pageinfo(upl); - src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK); + src_paddr = (((addr64_t)(int)upl_phys_page(pl, 0)) << 12) + ((addr64_t)iov->iov_base & PAGE_MASK); while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { int head_size; @@ -2267,8 +2237,8 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0); while (io_size && retval == 0) { - int xsize; - vm_offset_t paddr; + int xsize; + ppnum_t paddr; if (ubc_page_op(vp, upl_f_offset, @@ -2281,7 +2251,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) if (xsize > io_size) xsize = io_size; - retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio); + retval = uiomove64((addr64_t)(((addr64_t)paddr << 12) + start_offset), xsize, uio); ubc_page_op(vp, upl_f_offset, UPL_POP_CLR | UPL_POP_BUSY, 0, 0); @@ -2442,14 +2412,14 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) while (val_size && retval == 0) { int csize; int i; - caddr_t paddr; + addr64_t paddr; i = offset / PAGE_SIZE; csize = min(PAGE_SIZE - start_offset, val_size); - paddr = (caddr_t)upl_phys_page(pl, i) + start_offset; + paddr = ((addr64_t)upl_phys_page(pl, i) << 12) + start_offset; - retval = uiomove(paddr, csize, uio); + retval = uiomove64(paddr, csize, uio); val_size -= csize; offset += csize; @@ -2591,15 +2561,16 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) int upl_size; int upl_needed_size; int pages_in_pl; - vm_offset_t paddr; + ppnum_t paddr; int upl_flags; kern_return_t kret; int segflg; struct iovec *iov; int i; int force_data_sync; - int error = 0; int retval = 0; + int first = 1; + struct clios iostate; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0); @@ -2611,203 +2582,225 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) * -- the resid will not exceed iov_len */ + iostate.io_completed = 0; + iostate.io_issued = 0; + iostate.io_error = 0; + iostate.io_wanted = 0; + iov = uio->uio_iov; + while (uio->uio_resid 
&& uio->uio_offset < filesize && retval == 0) { - max_io_size = filesize - uio->uio_offset; + max_io_size = filesize - uio->uio_offset; - if (max_io_size < (off_t)((unsigned int)uio->uio_resid)) - io_size = max_io_size; - else - io_size = uio->uio_resid; + if (max_io_size < (off_t)((unsigned int)uio->uio_resid)) + io_size = max_io_size; + else + io_size = uio->uio_resid; - /* - * We don't come into this routine unless - * UIO_USERSPACE is set. - */ - segflg = uio->uio_segflg; + /* + * We don't come into this routine unless + * UIO_USERSPACE is set. + */ + segflg = uio->uio_segflg; - uio->uio_segflg = UIO_PHYS_USERSPACE; + uio->uio_segflg = UIO_PHYS_USERSPACE; - /* - * First look for pages already in the cache - * and move them to user space. - */ - while (io_size && (retval == 0)) { - upl_f_offset = uio->uio_offset; + /* + * First look for pages already in the cache + * and move them to user space. + */ + while (io_size && (retval == 0)) { + upl_f_offset = uio->uio_offset; - /* - * If this call fails, it means the page is not - * in the page cache. - */ - if (ubc_page_op(vp, upl_f_offset, - UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS) - break; + /* + * If this call fails, it means the page is not + * in the page cache. + */ + if (ubc_page_op(vp, upl_f_offset, + UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS) + break; - retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio); + retval = uiomove64((addr64_t)paddr << 12, PAGE_SIZE, uio); - ubc_page_op(vp, upl_f_offset, - UPL_POP_CLR | UPL_POP_BUSY, 0, 0); + ubc_page_op(vp, upl_f_offset, + UPL_POP_CLR | UPL_POP_BUSY, 0, 0); - io_size -= PAGE_SIZE; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE, - (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0); - } - - uio->uio_segflg = segflg; + io_size -= PAGE_SIZE; + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE, + (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0); + } + uio->uio_segflg = segflg; - if (retval) - { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, 2, retval, 0); - return(retval); - } - - /* If we are already finished with this read, then return */ - if (io_size == 0) - { - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, 3, io_size, 0); - return(0); - } - - max_io_size = io_size; - if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) - max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE; - - start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */ - upl_f_offset = start_upl_f_offset; - io_size = 0; - - while(io_size < max_io_size) - { - - if(ubc_page_op(vp, upl_f_offset, - UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS) - { - ubc_page_op(vp, upl_f_offset, - UPL_POP_CLR | UPL_POP_BUSY, 0, 0); - break; - } - - /* - * Build up the io request parameters. - */ - - io_size += PAGE_SIZE; - upl_f_offset += PAGE_SIZE; + if (retval) { + /* + * we may have already spun some portion of this request + * off as async requests... we need to wait for the I/O + * to complete before returning + */ + goto wait_for_reads; } + /* + * If we are already finished with this read, then return + */ + if (io_size == 0) { + /* + * we may have already spun some portion of this request + * off as async requests... 
we need to wait for the I/O + * to complete before returning + */ + goto wait_for_reads; + } + max_io_size = io_size; + + if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) + max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE; + if (first) { + if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4) + max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8; + first = 0; + } + start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */ + upl_f_offset = start_upl_f_offset; + io_size = 0; - if (io_size == 0) - return(retval); + while (io_size < max_io_size) { + if (ubc_page_op(vp, upl_f_offset, + UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS) { + ubc_page_op(vp, upl_f_offset, + UPL_POP_CLR | UPL_POP_BUSY, 0, 0); + break; + } + /* + * Build up the io request parameters. + */ + io_size += PAGE_SIZE_64; + upl_f_offset += PAGE_SIZE_64; + } + if (io_size == 0) + /* + * we may have already spun some portion of this request + * off as async requests... we need to wait for the I/O + * to complete before returning + */ + goto wait_for_reads; - upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64; - upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; + upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64; + upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START, - (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START, + (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0); - for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) - { - pages_in_pl = 0; - upl_size = upl_needed_size; - upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL; + for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { + pages_in_pl = 0; + upl_size = upl_needed_size; + upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL; - kret = vm_map_get_upl(current_map(), - (vm_offset_t)iov->iov_base & ~PAGE_MASK, - &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync); + kret = vm_map_get_upl(current_map(), + (vm_offset_t)iov->iov_base & ~PAGE_MASK, + &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync); - if (kret != KERN_SUCCESS) - { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, - (int)upl_offset, upl_size, io_size, kret, 0); + if (kret != KERN_SUCCESS) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, + (int)upl_offset, upl_size, io_size, kret, 0); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, 4, retval, 0); - - /* cluster_nocopy_read: failed to get pagelist */ - /* do not return kret here */ - return(retval); - } + /* + * cluster_nocopy_read: failed to get pagelist + * + * we may have already spun some portion of this request + * off as async requests... 
we need to wait for the I/O + * to complete before returning + */ + goto wait_for_reads; + } + pages_in_pl = upl_size / PAGE_SIZE; + pl = UPL_GET_INTERNAL_PAGE_LIST(upl); - pages_in_pl = upl_size / PAGE_SIZE; - pl = UPL_GET_INTERNAL_PAGE_LIST(upl); + for (i = 0; i < pages_in_pl; i++) { + if (!upl_valid_page(pl, i)) + break; + } + if (i == pages_in_pl) + break; - for(i=0; i < pages_in_pl; i++) - { - if (!upl_valid_page(pl, i)) - break; + ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, + UPL_ABORT_FREE_ON_EMPTY); } - if (i == pages_in_pl) - break; - - ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, - UPL_ABORT_FREE_ON_EMPTY); - } - - if (force_data_sync >= 3) - { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, - (int)upl_offset, upl_size, io_size, kret, 0); + if (force_data_sync >= 3) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, + (int)upl_offset, upl_size, io_size, kret, 0); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END, - (int)uio->uio_offset, uio->uio_resid, 5, retval, 0); - return(retval); - } - /* - * Consider the possibility that upl_size wasn't satisfied. - */ - if (upl_size != upl_needed_size) - io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK; - - if (io_size == 0) - { - ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, - UPL_ABORT_FREE_ON_EMPTY); - return(retval); - } - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, - (int)upl_offset, upl_size, io_size, kret, 0); - - /* - * issue a synchronous read to cluster_io - */ + goto wait_for_reads; + } + /* + * Consider the possibility that upl_size wasn't satisfied. + */ + if (upl_size != upl_needed_size) + io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START, - (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0); + if (io_size == 0) { + ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, + UPL_ABORT_FREE_ON_EMPTY); + goto wait_for_reads; + } + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, + (int)upl_offset, upl_size, io_size, kret, 0); - error = cluster_io(vp, upl, upl_offset, start_upl_f_offset, - io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0, (struct clios *)0); + /* + * request asynchronously so that we can overlap + * the preparation of the next I/O + * if there are already too many outstanding reads + * wait until some have completed before issuing the next read + */ + while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) { + iostate.io_wanted = 1; + tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0); + } + if (iostate.io_error) { + /* + * one of the earlier reads we issued ran into a hard error + * don't issue any more reads, cleanup the UPL + * that was just created but not used, then + * go wait for any other reads to complete before + * returning the error to the caller + */ + ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, + UPL_ABORT_FREE_ON_EMPTY); - if (error == 0) { - /* - * The cluster_io read completed successfully, - * update the uio structure and commit. 
- */ + goto wait_for_reads; + } + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START, + (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0); - ubc_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size, - UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY); - - iov->iov_base += io_size; - iov->iov_len -= io_size; - uio->uio_resid -= io_size; - uio->uio_offset += io_size; - } - else { - ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, - UPL_ABORT_FREE_ON_EMPTY); - } + retval = cluster_io(vp, upl, upl_offset, start_upl_f_offset, + io_size, devblocksize, + CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO, + (struct buf *)0, &iostate); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, - (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, error, 0); + /* + * update the uio structure + */ + iov->iov_base += io_size; + iov->iov_len -= io_size; + uio->uio_resid -= io_size; + uio->uio_offset += io_size; - if (retval == 0) - retval = error; + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, + (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0); } /* end while */ +wait_for_reads: + /* + * make sure all async reads that are part of this stream + * have completed before we return + */ + while (iostate.io_issued != iostate.io_completed) { + iostate.io_wanted = 1; + tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0); + } + if (iostate.io_error) + retval = iostate.io_error; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END, (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0); @@ -2816,7 +2809,6 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) } - static int cluster_phys_read(vp, uio, filesize, devblocksize, flags) struct vnode *vp; @@ -2828,7 +2820,7 @@ cluster_phys_read(vp, uio, filesize, devblocksize, flags) upl_page_info_t *pl; upl_t upl; vm_offset_t upl_offset; - vm_offset_t dst_paddr; + addr64_t dst_paddr; off_t max_size; int io_size; int tail_size; @@ -2884,7 +2876,7 @@ cluster_phys_read(vp, uio, filesize, devblocksize, flags) } pl = ubc_upl_pageinfo(upl); - dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK); + dst_paddr = (((addr64_t)(int)upl_phys_page(pl, 0)) << 12) + ((addr64_t)iov->iov_base & PAGE_MASK); while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { int head_size; @@ -2926,7 +2918,7 @@ cluster_phys_read(vp, uio, filesize, devblocksize, flags) * the commit after all the I/O has completed * since its all issued against the same UPL * if there are already too many outstanding reads - * throttle back until we reach a more reasonable level + * wait until some have completed before issuing the next */ while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) { iostate.io_wanted = 1; @@ -2951,8 +2943,8 @@ cluster_phys_read(vp, uio, filesize, devblocksize, flags) } } /* - * make sure any async reads have completed before - * we proceed + * make sure all async reads that are part of this stream + * have completed before we proceed */ while (iostate.io_issued != iostate.io_completed) { iostate.io_wanted = 1; @@ -3404,12 +3396,12 @@ cluster_push_x(vp, EOF, first, last, can_delay) static int -cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags) +cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags) { struct iovec *iov; upl_page_info_t *pl; upl_t upl; - 
vm_offset_t ubc_paddr; + addr64_t ubc_paddr; kern_return_t kret; int error = 0; @@ -3437,27 +3429,35 @@ cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, return(error); } } - ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64); - - if (flags & CL_READ) - copyp2p(ubc_paddr, usr_paddr, xsize, 2); - else - copyp2p(usr_paddr, ubc_paddr, xsize, 1); - - if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) { - /* - * issue a synchronous write to cluster_io - */ - error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize, - 0, (struct buf *)0, (struct clios *)0); - } - if (error == 0) { - uio->uio_offset += xsize; - iov->iov_base += xsize; - iov->iov_len -= xsize; - uio->uio_resid -= xsize; - } - ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); + ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64); - return (error); +/* + * NOTE: There is no prototype for the following in BSD. It, and the definitions + * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in + * osfmk/ppc/mappings.h. They are not included here because there appears to be no + * way to do so without exporting them to kexts as well. + */ + if (flags & CL_READ) +// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */ + copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */ + else +// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */ + copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */ + + if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) { + /* + * issue a synchronous write to cluster_io + */ + error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize, + 0, (struct buf *)0, (struct clios *)0); + } + if (error == 0) { + uio->uio_offset += xsize; + iov->iov_base += xsize; + iov->iov_len -= xsize; + uio->uio_resid -= xsize; + } + ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); + + return (error); }
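
Editor's note: the central change in this diff is that cluster_nocopy_read() and cluster_nocopy_write() stop issuing one synchronous cluster_io() per chunk and instead stream CL_ASYNC requests tracked by the new struct clios (io_issued, io_completed, io_error, io_wanted), throttling themselves while (io_issued - io_completed) exceeds 2 * MAX_UPL_TRANSFER * PAGE_SIZE and draining the stream at wait_for_reads/wait_for_writes. Below is a minimal user-space sketch of that bookkeeping pattern only, not kernel code: pthread mutex/condition variables stand in for the kernel's tsleep()/wakeup() on &iostate->io_wanted, and CHUNK, MAX_INFLIGHT, chunk_done and the byte counts are invented names and values used purely for illustration.

/*
 * User-space sketch of the "struct clios" streaming pattern introduced
 * by this diff: issue fixed-size chunks asynchronously, throttle when
 * too much I/O is outstanding, record only the first error, and drain
 * the stream before returning.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define CHUNK        (64u * 1024u)      /* bytes issued per async request */
#define MAX_INFLIGHT (8u * CHUNK)       /* analogue of 2 * MAX_UPL_TRANSFER * PAGE_SIZE */

struct clios {
	unsigned int    io_issued;      /* amount of io successfully issued    */
	unsigned int    io_completed;   /* amount of io that has completed     */
	int             io_error;       /* first error encountered, 0 if none  */
	pthread_mutex_t lock;
	pthread_cond_t  change;         /* plays the role of io_wanted/wakeup  */
};

struct chunk {
	struct clios   *ios;
	unsigned int    size;
};

static void *chunk_done(void *arg)      /* what cluster_iodone() does per buffer */
{
	struct chunk *c = arg;

	usleep(2000);                   /* pretend the transfer took some time */

	pthread_mutex_lock(&c->ios->lock);
	c->ios->io_completed += c->size;        /* this fake I/O never fails */
	pthread_cond_broadcast(&c->ios->change);
	pthread_mutex_unlock(&c->ios->lock);

	free(c);
	return NULL;
}

int main(void)
{
	struct clios ios = { 0, 0, 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER };
	unsigned int resid = 1024u * 1024u;     /* total bytes to "write" */

	while (resid && ios.io_error == 0) {
		unsigned int size = resid < CHUNK ? resid : CHUNK;
		struct chunk *c;
		pthread_t tid;

		pthread_mutex_lock(&ios.lock);
		/* throttle: don't run too far ahead of the completions */
		while ((ios.io_issued - ios.io_completed) > MAX_INFLIGHT)
			pthread_cond_wait(&ios.change, &ios.lock);
		ios.io_issued += size;
		pthread_mutex_unlock(&ios.lock);

		c = malloc(sizeof(*c));
		if (c == NULL)
			break;
		c->ios = &ios;
		c->size = size;
		pthread_create(&tid, NULL, chunk_done, c);      /* "issue" the chunk */
		pthread_detach(tid);

		resid -= size;
	}
	/* drain: wait for every chunk issued as part of this stream */
	pthread_mutex_lock(&ios.lock);
	while (ios.io_issued != ios.io_completed)
		pthread_cond_wait(&ios.change, &ios.lock);
	pthread_mutex_unlock(&ios.lock);

	printf("issued %u, completed %u, error %d\n",
	    ios.io_issued, ios.io_completed, ios.io_error);
	return ios.io_error;
}

Issuing ahead of completion is what lets the preparation of the next UPL overlap the device I/O of the previous one, which is exactly the rationale the added comments in cluster_nocopy_write() and cluster_nocopy_read() give for the change.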
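Editor's note: the other systematic change in this diff is the move from vm_offset_t physical addresses to ppnum_t page numbers and addr64_t byte addresses: every caller of upl_phys_page() now shifts the returned page number left by 12 (4 KB pages) and adds the offset within the page, and uiomove()/copyp2p() become uiomove64()/copypv(). A small sketch of that conversion follows; the helper name ptoa64 and the local typedefs are illustrative stand-ins, not APIs taken from the diff.

#include <stdint.h>

typedef uint32_t ppnum_t;   /* physical page number, stand-in for the kernel type */
typedef uint64_t addr64_t;  /* 64-bit physical byte address, stand-in             */

/*
 * Pattern used throughout this diff: upl_phys_page() now returns a page
 * *number*, so a physical byte address is formed by shifting it by the
 * page shift and adding the offset within the page, e.g.
 *   src_paddr = ptoa64(upl_phys_page(pl, 0), (vm_offset_t)iov->iov_base & PAGE_MASK);
 */
static inline addr64_t
ptoa64(ppnum_t pn, uint32_t offset_in_page)
{
	return ((addr64_t)pn << 12) + offset_in_page;
}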