+static int
+cluster_iodone(buf_t bp, void *callback_arg)
+{
+ int b_flags;
+ int error;
+ int total_size;
+ int total_resid;
+ int upl_offset;
+ int zero_offset;
+ int pg_offset = 0;
+ int commit_size = 0;
+ int upl_flags = 0;
+ int transaction_size = 0;
+ upl_t upl;
+ buf_t cbp;
+ buf_t cbp_head;
+ buf_t cbp_next;
+ buf_t real_bp;
+ vnode_t vp;
+ struct clios *iostate;
+ boolean_t transaction_complete = FALSE;
+
+ __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
+ cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
+
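+ /*
+ * this buf is part of a transaction that is either made up of
+ * multiple bufs or hasn't been marked complete (B_EOT) yet...
+ * mark this buf done and only fall through to process the
+ * transaction if every buf in the chain has completed and the
+ * end-of-transaction buf has been issued... if cluster_wait_IO
+ * is sleeping on this chain (CLUSTER_IO_WAITING), wake it instead
+ */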
+ if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
+ lck_mtx_lock_spin(cl_transaction_mtxp);
+
+ bp->b_flags |= B_TDONE;
+
+ for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
+ /*
+ * all I/O requests that are part of this transaction
+ * have to complete before we can process it
+ */
+ if (!(cbp->b_flags & B_TDONE)) {
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
+ cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
+
+ lck_mtx_unlock(cl_transaction_mtxp);
+
+ return 0;
+ }
+
+ if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
+ cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
+
+ lck_mtx_unlock(cl_transaction_mtxp);
+ wakeup(cbp);
+
+ return 0;
+ }
+
+ if (cbp->b_flags & B_EOT) {
+ transaction_complete = TRUE;
+ }
+ }
+ lck_mtx_unlock(cl_transaction_mtxp);
+
+ if (transaction_complete == FALSE) {
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
+ cbp_head, 0, 0, 0, 0);
+ return 0;
+ }
+ }
+ error = 0;
+ total_size = 0;
+ total_resid = 0;
+
+ cbp = cbp_head;
+ vp = cbp->b_vp;
+ upl_offset = cbp->b_uploffset;
+ upl = cbp->b_upl;
+ b_flags = cbp->b_flags;
+ real_bp = cbp->b_real_bp;
+ zero_offset = cbp->b_validend;
+ iostate = (struct clios *)cbp->b_iostate;
+
+ if (real_bp) {
+ real_bp->b_dev = cbp->b_dev;
+ }
+
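+ /*
+ * walk the chain of bufs that make up this transaction...
+ * accumulate the error, resid and byte counts, compute the
+ * overall span of the transaction within the UPL, and free
+ * every buf except the head (which is freed further below)
+ */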
+ while (cbp) {
+ if ((cbp->b_flags & B_ERROR) && error == 0) {
+ error = cbp->b_error;
+ }
+
+ total_resid += cbp->b_resid;
+ total_size += cbp->b_bcount;
+
+ cbp_next = cbp->b_trans_next;
+
+ if (cbp_next == NULL) {
+ /*
+ * compute the overall size of the transaction
+ * in case we created one that has 'holes' in it
+ * 'total_size' represents the amount of I/O we
+ * did, not the span of the transaction with respect to the UPL
+ */
+ transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
+ }
+
+ if (cbp != cbp_head) {
+ free_io_buf(cbp);
+ }
+
+ cbp = cbp_next;
+ }
+
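+ /*
+ * give cluster_handle_associated_upl a chance to deal with the
+ * UPL that may have been attached to this one by the uncached
+ * write path (see the CL_DIRECT_IO setup in cluster_io)
+ */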
+ if (ISSET(b_flags, B_COMMIT_UPL)) {
+ cluster_handle_associated_upl(iostate,
+ cbp_head->b_upl,
+ upl_offset,
+ transaction_size);
+ }
+
+ if (error == 0 && total_resid) {
+ error = EIO;
+ }
+
+ if (error == 0) {
+ int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
+
+ if (cliodone_func != NULL) {
+ cbp_head->b_bcount = transaction_size;
+
+ error = (*cliodone_func)(cbp_head, callback_arg);
+ }
+ }
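+ /*
+ * 'zero_offset' (captured from b_validend above) is non-zero when
+ * a read didn't fill out the last page of the UPL... zero from
+ * that offset to the end of the page
+ */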
+ if (zero_offset) {
+ cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
+ }
+
+ free_io_buf(cbp_head);
+
+ if (iostate) {
+ int need_wakeup = 0;
+
+ /*
+ * someone has issued multiple I/Os asynchronously
+ * and is waiting for them to complete (streaming)
+ */
+ lck_mtx_lock_spin(&iostate->io_mtxp);
+
+ if (error && iostate->io_error == 0) {
+ iostate->io_error = error;
+ }
+
+ iostate->io_completed += total_size;
+
+ if (iostate->io_wanted) {
+ /*
+ * someone is waiting for the state of
+ * this io stream to change
+ */
+ iostate->io_wanted = 0;
+ need_wakeup = 1;
+ }
+ lck_mtx_unlock(&iostate->io_mtxp);
+
+ if (need_wakeup) {
+ wakeup((caddr_t)&iostate->io_wanted);
+ }
+ }
+
+ if (b_flags & B_COMMIT_UPL) {
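+ /*
+ * commit or abort the pages covered by this transaction...
+ * the range has to start on the page containing 'upl_offset'
+ * and extend through the last page the transaction touched,
+ * so round 'commit_size' out to whole pages
+ */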
+ pg_offset = upl_offset & PAGE_MASK;
+ commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
+
+ if (error) {
+ upl_set_iodone_error(upl, error);
+
+ upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
+ } else {
+ upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
+
+ if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
+ upl_flags |= UPL_COMMIT_SET_DIRTY;
+ }
+
+ if (b_flags & B_AGE) {
+ upl_flags |= UPL_COMMIT_INACTIVATE;
+ }
+
+ ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
+ }
+ }
+ if (real_bp) {
+ if (error) {
+ real_bp->b_flags |= B_ERROR;
+ real_bp->b_error = error;
+ }
+ real_bp->b_resid = total_resid;
+
+ buf_biodone(real_bp);
+ }
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
+ upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
+
+ return error;
+}
+
+
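+/*
+ * report whether I/O to this vnode is currently being throttled and,
+ * if so, the maximum I/O size the caller should use
+ */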
+uint32_t
+cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
+{
+ if (cluster_is_throttled(vp)) {
+ *limit = THROTTLE_MAX_IOSIZE;
+ return 1;
+ }
+ return 0;
+}
+
+
+void
+cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
+{
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
+ upl_offset, size, bp, 0, 0);
+
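+ /*
+ * if there's no buf or no mapped kernel address for the data,
+ * zero the physical pages behind the UPL directly... otherwise
+ * just bzero the mapped buffer
+ */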
+ if (bp == NULL || bp->b_datap == 0) {
+ upl_page_info_t *pl;
+ addr64_t zero_addr;
+
+ pl = ubc_upl_pageinfo(upl);
+
+ if (upl_device_page(pl) == TRUE) {
+ zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
+
+ bzero_phys_nc(zero_addr, size);
+ } else {
+ while (size) {
+ int page_offset;
+ int page_index;
+ int zero_cnt;
+
+ page_index = upl_offset / PAGE_SIZE;
+ page_offset = upl_offset & PAGE_MASK;
+
+ zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
+ zero_cnt = min(PAGE_SIZE - page_offset, size);
+
+ bzero_phys(zero_addr, zero_cnt);
+
+ size -= zero_cnt;
+ upl_offset += zero_cnt;
+ }
+ }
+ } else {
+ bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
+ }
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
+ upl_offset, size, 0, 0, 0);
+}
+
+
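+/*
+ * mark a completed chain: record where any trailing zero-fill should
+ * begin (b_validend on the head) and flag the last buf as the end of
+ * the transaction
+ */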
+static void
+cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
+{
+ cbp_head->b_validend = zero_offset;
+ cbp_tail->b_flags |= B_EOT;
+}
+
+static void
+cluster_wait_IO(buf_t cbp_head, int async)
+{
+ buf_t cbp;
+
+ if (async) {
+ /*
+ * Async callback completion will not normally generate a
+ * wakeup upon I/O completion. To get woken up, we set
+ * b_trans_next (which is safe for us to modify) on the last
+ * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
+ * to wake us up when all buffers as part of this transaction
+ * are completed. This is done under the umbrella of
+ * cl_transaction_mtxp which is also taken in cluster_iodone.
+ */
+ bool done = true;
+ buf_t last = NULL;
+
+ lck_mtx_lock_spin(cl_transaction_mtxp);
+
+ for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
+ if (!ISSET(cbp->b_flags, B_TDONE)) {
+ done = false;
+ }
+ }
+
+ if (!done) {
+ last->b_trans_next = CLUSTER_IO_WAITING;
+
+ DTRACE_IO1(wait__start, buf_t, last);
+ do {
+ msleep(last, cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);
+
+ /*
+ * We should only have been woken up if all the
+ * buffers are completed, but just in case...
+ */
+ done = true;
+ for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
+ if (!ISSET(cbp->b_flags, B_TDONE)) {
+ done = false;
+ break;
+ }
+ }
+ } while (!done);
+ DTRACE_IO1(wait__done, buf_t, last);
+
+ last->b_trans_next = NULL;
+ }
+
+ lck_mtx_unlock(cl_transaction_mtxp);
+ } else { // !async
+ for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
+ buf_biowait(cbp);
+ }
+ }
+}
+
+static void
+cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
+{
+ buf_t cbp;
+ int error;
+ boolean_t isswapout = FALSE;
+
+ /*
+ * cluster_complete_transaction will
+ * only be called if we've issued a complete chain in synchronous mode
+ * or we've already done a cluster_wait_IO on an incomplete chain
+ */
+ if (needwait) {
+ for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
+ buf_biowait(cbp);
+ }
+ }
+ /*
+ * we've already waited on all of the I/Os in this transaction,
+ * so mark all of the buf_t's in this transaction as B_TDONE
+ * so that cluster_iodone sees the transaction as completed
+ */
+ for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
+ cbp->b_flags |= B_TDONE;
+ }
+ cbp = *cbp_head;
+
+ if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
+ isswapout = TRUE;
+ }
+
+ error = cluster_iodone(cbp, callback_arg);
+
+ if (!(flags & CL_ASYNC) && error && *retval == 0) {
+ if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
+ *retval = error;
+ } else if (isswapout == TRUE) {
+ *retval = error;
+ }
+ }
+ *cbp_head = (buf_t)NULL;
+}
+
+
+static int
+cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
+ int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
+{
+ buf_t cbp;
+ u_int size;
+ u_int io_size;
+ int io_flags;
+ int bmap_flags;
+ int error = 0;
+ int retval = 0;
+ buf_t cbp_head = NULL;
+ buf_t cbp_tail = NULL;
+ int trans_count = 0;
+ int max_trans_count;
+ u_int pg_count;
+ int pg_offset;
+ u_int max_iosize;
+ u_int max_vectors;
+ int priv;
+ int zero_offset = 0;
+ int async_throttle = 0;
+ mount_t mp;
+ vm_offset_t upl_end_offset;
+ boolean_t need_EOT = FALSE;
+
+ /*
+ * we currently don't support buffers larger than a page
+ */
+ if (real_bp && non_rounded_size > PAGE_SIZE) {
+ panic("%s(): Called with real buffer of size %d bytes which "
+ "is greater than the maximum allowed size of "
+ "%d bytes (the system PAGE_SIZE).\n",
+ __FUNCTION__, non_rounded_size, PAGE_SIZE);
+ }
+
+ mp = vp->v_mount;
+
+ /*
+ * we don't want to do any funny rounding of the size for IO requests
+ * coming through the DIRECT or CONTIGUOUS paths... those pages don't
+ * belong to us... we can't extend (nor do we need to) the I/O to fill
+ * out a page
+ */
+ if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
+ /*
+ * round the requested size up so that this I/O ends on a
+ * page boundary in case this is a 'write'... if the filesystem
+ * has blocks allocated to back the page beyond the EOF, we want to
+ * make sure to write out the zero's that are sitting beyond the EOF
+ * so that in case the filesystem doesn't explicitly zero this area
+ * if a hole is created via a lseek/write beyond the current EOF,
+ * it will return zeros when it's read back from the disk. If the
+ * physical allocation doesn't extend for the whole page, we'll
+ * only write/read from the disk up to the end of this allocation
+ * via the extent info returned from the VNOP_BLOCKMAP call.
+ */
+ pg_offset = upl_offset & PAGE_MASK;
+
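+ /*
+ * for example (purely illustrative numbers): with a 4K page, a
+ * pg_offset of 0x200 and a non_rounded_size of 100, 'size' comes
+ * out to 0xE00, so that upl_offset + size lands on the next
+ * page boundary
+ */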
+ size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
+ } else {
+ /*
+ * anyone advertising a blocksize of 1 byte probably
+ * can't deal with us rounding up the request size...
+ * AFP is one such filesystem/device
+ */
+ size = non_rounded_size;
+ }
+ upl_end_offset = upl_offset + size;
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
+
+ /*
+ * Set the maximum transaction size to the maximum desired number of
+ * buffers.
+ */
+ max_trans_count = 8;
+ if (flags & CL_DEV_MEMORY) {
+ max_trans_count = 16;
+ }
+
+ if (flags & CL_READ) {
+ io_flags = B_READ;
+ bmap_flags = VNODE_READ;
+
+ max_iosize = mp->mnt_maxreadcnt;
+ max_vectors = mp->mnt_segreadcnt;
+ } else {
+ io_flags = B_WRITE;
+ bmap_flags = VNODE_WRITE;
+
+ max_iosize = mp->mnt_maxwritecnt;
+ max_vectors = mp->mnt_segwritecnt;
+ }
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
+
+ /*
+ * make sure the maximum iosize is a
+ * multiple of the page size
+ */
+ max_iosize &= ~PAGE_MASK;
+
+ /*
+ * Ensure the maximum iosize is sensible.
+ */
+ if (!max_iosize) {
+ max_iosize = PAGE_SIZE;
+ }
+
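+ /*
+ * CL_THROTTLE: decide how many async I/Os may be kept in flight...
+ * a throttled vnode gets a small fixed cap (and a clipped
+ * max_iosize), otherwise the limit is scaled from the cluster size
+ */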
+ if (flags & CL_THROTTLE) {
+ if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
+ if (max_iosize > THROTTLE_MAX_IOSIZE) {
+ max_iosize = THROTTLE_MAX_IOSIZE;
+ }
+ async_throttle = THROTTLE_MAXCNT;
+ } else {
+ if ((flags & CL_DEV_MEMORY)) {
+ async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
+ } else {
+ u_int max_cluster;
+ u_int max_cluster_size;
+ u_int scale;
+
+ if (vp->v_mount->mnt_minsaturationbytecount) {
+ max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
+
+ scale = 1;
+ } else {
+ max_cluster_size = MAX_CLUSTER_SIZE(vp);
+
+ if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
+ scale = WRITE_THROTTLE_SSD;
+ } else {
+ scale = WRITE_THROTTLE;
+ }
+ }
+ if (max_iosize > max_cluster_size) {
+ max_cluster = max_cluster_size;
+ } else {
+ max_cluster = max_iosize;
+ }
+
+ if (size < max_cluster) {
+ max_cluster = size;
+ }
+
+ if (flags & CL_CLOSE) {
+ scale += MAX_CLUSTERS;
+ }
+
+ async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
+ }
+ }
+ }
+ if (flags & CL_AGE) {
+ io_flags |= B_AGE;
+ }
+ if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
+ io_flags |= B_PAGEIO;
+ }
+ if (flags & (CL_IOSTREAMING)) {
+ io_flags |= B_IOSTREAMING;
+ }
+ if (flags & CL_COMMIT) {
+ io_flags |= B_COMMIT_UPL;
+ }
+ if (flags & CL_DIRECT_IO) {
+ io_flags |= B_PHYS;
+ }
+ if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
+ io_flags |= B_CACHE;
+ }
+ if (flags & CL_PASSIVE) {
+ io_flags |= B_PASSIVE;
+ }
+ if (flags & CL_ENCRYPTED) {
+ io_flags |= B_ENCRYPTED_IO;
+ }
+
+ if (vp->v_flag & VSYSTEM) {
+ io_flags |= B_META;
+ }
+
+ if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
+ /*
+ * we are going to end up with a page that we can't complete
+ * (the file size wasn't a multiple of PAGE_SIZE and we're
+ * trying to read to the end of the file), so remember where
+ * the valid data will end and zero out the portion of the
+ * page we can't read in from the file
+ */
+ zero_offset = upl_offset + non_rounded_size;
+ } else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
+ assert(ISSET(flags, CL_COMMIT));
+
+ // For a direct/uncached write, we need to lock pages...
+
+ upl_t cached_upl;
+
+ /*
+ * Create a UPL to lock the pages in the cache whilst the
+ * write is in progress.
+ */
+ ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
+ NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
+
+ /*
+ * Attach this UPL to the other UPL so that we can find it
+ * later.
+ */
+ upl_set_associated_upl(upl, cached_upl);
+
+ if (upl_offset & PAGE_MASK) {
+ /*
+ * The two UPLs are not aligned, so mark the first page in
+ * @upl so that cluster_handle_associated_upl can handle
+ * it accordingly.
+ */
+ upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
+ upl_page_set_mark(pl, 0, true);
+ }
+ }
+
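+ /*
+ * main loop: carve the request into chunks the device can take...
+ * each pass maps 'f_offset' to a physical extent via VNOP_BLOCKMAP
+ * and then either zero-fills (a read over a hole), pages the page
+ * out individually (a write over a hole), or builds buf_t's that
+ * are chained into transactions
+ */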
+ while (size) {
+ daddr64_t blkno;
+ daddr64_t lblkno;
+ u_int io_size_wanted;
+ size_t io_size_tmp;
+
+ if (size > max_iosize) {
+ io_size = max_iosize;
+ } else {
+ io_size = size;
+ }
+
+ io_size_wanted = io_size;
+ io_size_tmp = (size_t)io_size;
+
+ if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
+ break;
+ }
+
+ if (io_size_tmp > io_size_wanted) {
+ io_size = io_size_wanted;
+ } else {
+ io_size = (u_int)io_size_tmp;
+ }
+
+ if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
+ real_bp->b_blkno = blkno;
+ }
+
+ KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
+ (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);
+
+ if (io_size == 0) {
+ /*
+ * vnop_blockmap didn't return an error... however, it did
+ * return an extent size of 0 which means we can't
+ * make forward progress on this I/O... a hole in the
+ * file would be returned as a blkno of -1 with a non-zero io_size...
+ * a real extent is returned with a blkno != -1 and a non-zero io_size
+ */
+ error = EINVAL;
+ break;
+ }
+ if (!(flags & CL_READ) && blkno == -1) {
+ off_t e_offset;
+ int pageout_flags;
+
+ if (upl_get_internal_vectorupl(upl)) {
+ panic("Vector UPLs should not take this code-path\n");
+ }
+ /*
+ * we're writing into a 'hole'
+ */
+ if (flags & CL_PAGEOUT) {
+ /*
+ * if we got here via cluster_pageout
+ * then just error the request and return...
+ * the 'hole' should already have been covered
+ */
+ error = EINVAL;
+ break;
+ }
+ /*
+ * we can get here if the cluster code happens to
+ * pick up a page that was dirtied via mmap vs
+ * a 'write' and the page targets a 'hole'...
+ * i.e. the writes to the cluster were sparse
+ * and the file was being written for the first time
+ *
+ * we can also get here if the filesystem supports
+ * 'holes' that are less than PAGE_SIZE.... because
+ * we can't know if the range in the page that covers
+ * the 'hole' has been dirtied via an mmap or not,
+ * we have to assume the worst and try to push the
+ * entire page to storage.
+ *
+ * Try paging out the page individually before
+ * giving up entirely and dumping it (the pageout
+ * path will ensure that the zero extent accounting
+ * has been taken care of before we get back into cluster_io)
+ *
+ * go direct to vnode_pageout so that we don't have to
+ * unbusy the page from the UPL... we used to do this
+ * so that we could call ubc_msync, but that results
+ * in a potential deadlock if someone else races us to acquire
+ * that page and wins and in addition needs one of the pages
+ * we're continuing to hold in the UPL
+ */
+ pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
+
+ if (!(flags & CL_ASYNC)) {
+ pageout_flags |= UPL_IOSYNC;
+ }
+ if (!(flags & CL_COMMIT)) {
+ pageout_flags |= UPL_NOCOMMIT;
+ }
+
+ if (cbp_head) {
+ buf_t prev_cbp;
+ int bytes_in_last_page;
+
+ /*
+ * first we have to wait for the current outstanding I/Os
+ * to complete... EOT hasn't been set yet on this transaction
+ * so the pages won't be released
+ */
+ cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
+
+ bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
+ for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
+ bytes_in_last_page += cbp->b_bcount;
+ }
+ bytes_in_last_page &= PAGE_MASK;
+
+ while (bytes_in_last_page) {
+ /*
+ * we've got a transaction that
+ * includes the page we're about to push out through vnode_pageout...
+ * find the bp's in the list which intersect this page and either
+ * remove them entirely from the transaction (there could be multiple bp's), or
+ * round its iosize down to the page boundary (there can only be one)...
+ *
+ * find the last bp in the list and act on it
+ */
+ for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
+ prev_cbp = cbp;
+ }
+
+ if (bytes_in_last_page >= cbp->b_bcount) {
+ /*
+ * this buf no longer has any I/O associated with it
+ */
+ bytes_in_last_page -= cbp->b_bcount;
+ cbp->b_bcount = 0;
+
+ free_io_buf(cbp);
+
+ if (cbp == cbp_head) {
+ assert(bytes_in_last_page == 0);
+ /*
+ * the buf we just freed was the only buf in
+ * this transaction... so there's no I/O to do
+ */
+ cbp_head = NULL;
+ cbp_tail = NULL;
+ } else {
+ /*
+ * remove the buf we just freed from
+ * the transaction list
+ */
+ prev_cbp->b_trans_next = NULL;
+ cbp_tail = prev_cbp;
+ }
+ } else {
+ /*
+ * this is the last bp that has I/O
+ * intersecting the page of interest
+ * only some of the I/O is in the intersection
+ * so clip the size but keep it in the transaction list
+ */
+ cbp->b_bcount -= bytes_in_last_page;
+ cbp_tail = cbp;
+ bytes_in_last_page = 0;
+ }
+ }
+ if (cbp_head) {
+ /*
+ * there was more to the current transaction
+ * than just the page we are pushing out via vnode_pageout...
+ * mark it as finished and complete it... we've already
+ * waited for the I/Os to complete above in the call to cluster_wait_IO
+ */
+ cluster_EOT(cbp_head, cbp_tail, 0);
+
+ cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
+
+ trans_count = 0;
+ }
+ }
+ if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
+ error = EINVAL;
+ }
+ e_offset = round_page_64(f_offset + 1);
+ io_size = e_offset - f_offset;
+
+ f_offset += io_size;
+ upl_offset += io_size;
+
+ if (size >= io_size) {
+ size -= io_size;
+ } else {
+ size = 0;
+ }
+ /*
+ * keep track of how much of the original request
+ * we've actually completed... non_rounded_size
+ * may go negative due to us rounding the request
+ * to a page size multiple (i.e. size > non_rounded_size)
+ */
+ non_rounded_size -= io_size;
+
+ if (non_rounded_size <= 0) {
+ /*
+ * we've transferred all of the data in the original
+ * request, but we were unable to complete the tail
+ * of the last page because the file didn't have
+ * an allocation to back that portion... this is ok.
+ */
+ size = 0;
+ }
+ if (error) {
+ if (size == 0) {
+ flags &= ~CL_COMMIT;
+ }
+ break;
+ }
+ continue;
+ }
+ lblkno = (daddr64_t)(f_offset / 0x1000);
+ /*
+ * we have now figured out how much I/O we can do - this is in 'io_size'
+ * pg_offset is the starting point in the first page for the I/O
+ * pg_count is the number of full and partial pages that 'io_size' encompasses
+ */
+ pg_offset = upl_offset & PAGE_MASK;
+
+ if (flags & CL_DEV_MEMORY) {
+ /*
+ * treat physical requests as one 'giant' page
+ */
+ pg_count = 1;
+ } else {
+ pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
+ }
+
+ if ((flags & CL_READ) && blkno == -1) {
+ vm_offset_t commit_offset;
+ int bytes_to_zero;
+ int complete_transaction_now = 0;
+
+ /*
+ * if we're reading and blkno == -1, then we've got a
+ * 'hole' in the file that we need to deal with by zeroing
+ * out the affected area in the upl
+ */
+ if (io_size >= (u_int)non_rounded_size) {
+ /*
+ * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
+ * then 'zero_offset' will be non-zero
+ * if the 'hole' returned by vnop_blockmap extends all the way to the eof
+ * (indicated by the io_size finishing off the I/O request for this UPL)
+ * then we're not going to issue an I/O for the
+ * last page in this upl... we need to zero both the hole and the tail
+ * of the page beyond the EOF, since the delayed zero-fill won't kick in
+ */
+ bytes_to_zero = non_rounded_size;
+ if (!(flags & CL_NOZERO)) {
+ bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
+ }
+
+ zero_offset = 0;
+ } else {
+ bytes_to_zero = io_size;
+ }
+
+ pg_count = 0;
+
+ cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
+
+ if (cbp_head) {
+ int pg_resid;
+
+ /*
+ * if there is a current I/O chain pending
+ * then the first page of the group we just zero'd
+ * will be handled by the I/O completion if the zero
+ * fill started in the middle of the page
+ */
+ commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
+
+ pg_resid = commit_offset - upl_offset;
+
+ if (bytes_to_zero >= pg_resid) {
+ /*
+ * the last page of the current I/O
+ * has been completed...
+ * compute the number of fully zero'd
+ * pages that are beyond it
+ * plus the last page if it's partial
+ * and we have no more I/O to issue...
+ * otherwise a partial page is left
+ * to begin the next I/O
+ */
+ if ((int)io_size >= non_rounded_size) {
+ pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
+ } else {
+ pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
+ }
+
+ complete_transaction_now = 1;
+ }
+ } else {
+ /*
+ * no pending I/O to deal with
+ * so, commit all of the fully zero'd pages
+ * plus the last page if it's partial
+ * and we have no more I/O to issue...
+ * otherwise a partial page is left
+ * to begin the next I/O
+ */
+ if ((int)io_size >= non_rounded_size) {
+ pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
+ } else {
+ pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
+ }
+
+ commit_offset = upl_offset & ~PAGE_MASK;
+ }
+
+ // Associated UPL is currently only used in the direct write path
+ assert(!upl_associated_upl(upl));
+
+ if ((flags & CL_COMMIT) && pg_count) {
+ ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
+ UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
+ }
+ upl_offset += io_size;
+ f_offset += io_size;
+ size -= io_size;
+
+ /*
+ * keep track of how much of the original request
+ * we've actually completed... non_rounded_size
+ * may go negative due to us rounding the request
+ * to a page size multiple (i.e. size > non_rounded_size)
+ */
+ non_rounded_size -= io_size;
+
+ if (non_rounded_size <= 0) {
+ /*
+ * we've transferred all of the data in the original
+ * request, but we were unable to complete the tail
+ * of the last page because the file didn't have
+ * an allocation to back that portion... this is ok.
+ */
+ size = 0;
+ }
+ if (cbp_head && (complete_transaction_now || size == 0)) {
+ cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
+
+ cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
+
+ cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
+
+ trans_count = 0;
+ }
+ continue;
+ }
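+ /*
+ * if this chunk spans more pages than the device can handle in a
+ * single scatter/gather list (max_vectors), clip the I/O back so
+ * it fits... worst case we fall back to finishing out a single page
+ */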
+ if (pg_count > max_vectors) {
+ if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
+ io_size = PAGE_SIZE - pg_offset;
+ pg_count = 1;
+ } else {
+ io_size -= (pg_count - max_vectors) * PAGE_SIZE;
+ pg_count = max_vectors;
+ }
+ }
+ /*
+ * If the transaction is going to reach the maximum number of
+ * desired elements, truncate the i/o to the nearest page so
+ * that the actual i/o is initiated after this buffer is
+ * created and added to the i/o chain.
+ *
+ * I/O directed to physically contiguous memory
+ * doesn't have a requirement to make sure we 'fill' a page
+ */
+ if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
+ ((upl_offset + io_size) & PAGE_MASK)) {
+ vm_offset_t aligned_ofs;
+
+ aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
+ /*
+ * If the io_size does not actually finish off even a
+ * single page we have to keep adding buffers to the
+ * transaction despite having reached the desired limit.
+ *
+ * Eventually we get here with the page being finished
+ * off (and exceeded) and then we truncate the size of
+ * this i/o request so that it is page aligned so that
+ * we can finally issue the i/o on the transaction.
+ */
+ if (aligned_ofs > upl_offset) {
+ io_size = aligned_ofs - upl_offset;
+ pg_count--;
+ }
+ }
+
+ if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
+ /*
+ * if we're not targeting a virtual device i.e. a disk image
+ * it's safe to dip into the reserve pool since real devices
+ * can complete this I/O request without requiring additional
+ * bufs from the alloc_io_buf pool
+ */
+ priv = 1;
+ } else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT)) {
+ /*
+ * Throttle the speculative IO
+ */
+ priv = 0;
+ } else {
+ priv = 1;
+ }
+
+ cbp = alloc_io_buf(vp, priv);
+
+ if (flags & CL_PAGEOUT) {
+ u_int i;
+
+ /*
+ * since blocks are in offsets of 0x1000, scale
+ * iteration to (PAGE_SIZE * pg_count) of blks.
+ */
+ for (i = 0; i < (PAGE_SIZE * pg_count) / 0x1000; i++) {
+ if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) {
+ panic("BUSY bp found in cluster_io");
+ }