+/*
+ * Set the data pointer backing this buffer.
+ */
+void
+buf_setdataptr(buf_t bp, uintptr_t data)
+{
+	bp->b_datap = data;
+}
+
+/*
+ * Return the vnode associated with this buffer.
+ */
+vnode_t
+buf_vnode(buf_t bp)
+{
+	return bp->b_vp;
+}
+
+/*
+ * Associate vp with this buffer.
+ */
+void
+buf_setvnode(buf_t bp, vnode_t vp)
+{
+	bp->b_vp = vp;
+}
+
+
+/*
+ * Return the I/O completion callback armed on this buffer,
+ * or NULL if none is armed (B_CALL clear).
+ */
+void *
+buf_callback(buf_t bp)
+{
+	if (bp->b_flags & B_CALL)
+		return ((void *)bp->b_iodone);
+
+	return ((void *) NULL);
+}
+
+
+/*
+ * Arm (or, with a NULL callback, disarm) an I/O completion callback
+ * on this buffer.  Arming also forces the I/O asynchronous, since the
+ * callback replaces the synchronous wait.  Always returns 0.
+ */
+errno_t
+buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
+{
+	if (callback == NULL)
+		bp->b_flags &= ~B_CALL;
+	else
+		bp->b_flags |= (B_CALL | B_ASYNC);
+
+	bp->b_transaction = transaction;
+	bp->b_iodone = callback;
+
+	return (0);
+}
+
+/*
+ * Attach (or, with a NULL upl, detach) a UPL to this buffer at the
+ * given offset.  Only permitted on dedicated I/O buffers (BL_IOBUF);
+ * returns EINVAL for ordinary cache buffers, 0 on success.
+ */
+errno_t
+buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
+{
+	if ((bp->b_lflags & BL_IOBUF) == 0)
+		return (EINVAL);
+
+	if (upl == NULL)
+		bp->b_flags &= ~B_CLUSTER;
+	else
+		bp->b_flags |= B_CLUSTER;
+
+	bp->b_upl = upl;
+	bp->b_uploffset = offset;
+
+	return (0);
+}
+
+/*
+ * Clone a sub-range [io_offset, io_offset + io_size) of an existing
+ * buffer into a freshly allocated I/O buffer.  Returns NULL if the
+ * range is invalid, or (for UPL-backed buffers) if the range is not
+ * suitably page aligned within the UPL.  The caller owns the returned
+ * buf_t and is responsible for freeing it (e.g. via buf_free).
+ */
+buf_t
+buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
+{
+	buf_t	io_bp;
+
+	if (io_offset < 0 || io_size < 0)
+		return (NULL);
+
+	/*
+	 * cast each operand before the add so the sum is computed in
+	 * unsigned arithmetic... the original signed add could overflow
+	 * (undefined behavior) for large in-range offsets and sizes
+	 */
+	if (((unsigned)io_offset + (unsigned)io_size) > (unsigned)bp->b_bcount)
+		return (NULL);
+
+	if (bp->b_flags & B_CLUSTER) {
+		/* a non-zero start offset must fall on a page boundary in the UPL */
+		if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
+			return (NULL);
+
+		/* the end must be page aligned unless it coincides with the end of bp */
+		if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
+			return (NULL);
+	}
+	io_bp = alloc_io_buf(bp->b_vp, 0);
+
+	/* inherit only the I/O-shaping flags from the original buffer */
+	io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);
+
+	if (iodone) {
+		io_bp->b_transaction = arg;
+		io_bp->b_iodone = iodone;
+		io_bp->b_flags |= B_CALL;
+	}
+	if (bp->b_flags & B_CLUSTER) {
+		io_bp->b_upl = bp->b_upl;
+		io_bp->b_uploffset = bp->b_uploffset + io_offset;
+	} else {
+		io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
+	}
+	io_bp->b_bcount = io_size;
+
+	return (io_bp);
+}
+
+
+
+/*
+ * Install a completion filter on this buffer, optionally returning
+ * the previously installed iodone routine and transaction through
+ * old_iodone / old_transaction.  A NULL filter removes any existing
+ * filter.
+ */
+void
+buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
+	      void **old_iodone, void **old_transaction)
+{
+	if (old_iodone)
+		*old_iodone = (void *)(bp->b_iodone);
+	if (old_transaction)
+		*old_transaction = (void *)(bp->b_transaction);
+
+	bp->b_transaction = transaction;
+	bp->b_iodone = filter;
+
+	if (filter == NULL)
+		bp->b_flags &= ~B_FILTER;
+	else
+		bp->b_flags |= B_FILTER;
+}
+
+
+/*
+ * Accessors for the physical / logical block numbers and the
+ * device associated with a buffer.
+ */
+daddr64_t
+buf_blkno(buf_t bp)
+{
+	return bp->b_blkno;
+}
+
+daddr64_t
+buf_lblkno(buf_t bp)
+{
+	return bp->b_lblkno;
+}
+
+void
+buf_setblkno(buf_t bp, daddr64_t blkno)
+{
+	bp->b_blkno = blkno;
+}
+
+void
+buf_setlblkno(buf_t bp, daddr64_t lblkno)
+{
+	bp->b_lblkno = lblkno;
+}
+
+dev_t
+buf_device(buf_t bp)
+{
+	return bp->b_dev;
+}
+
+/*
+ * Take the buffer's device identity from vp, which must be a block
+ * or character special vnode; returns EINVAL otherwise, 0 on success.
+ */
+errno_t
+buf_setdevice(buf_t bp, vnode_t vp)
+{
+	if (vp->v_type != VBLK && vp->v_type != VCHR)
+		return EINVAL;
+	bp->b_dev = vp->v_rdev;
+
+	return 0;
+}
+
+
+/*
+ * Accessors for the driver-private and filesystem-private fields,
+ * read/write credentials, UPL state, and originating process of a
+ * buffer.
+ */
+void *
+buf_drvdata(buf_t bp)
+{
+	return bp->b_drvdata;
+}
+
+void
+buf_setdrvdata(buf_t bp, void *drvdata)
+{
+	bp->b_drvdata = drvdata;
+}
+
+void *
+buf_fsprivate(buf_t bp)
+{
+	return bp->b_fsprivate;
+}
+
+void
+buf_setfsprivate(buf_t bp, void *fsprivate)
+{
+	bp->b_fsprivate = fsprivate;
+}
+
+ucred_t
+buf_rcred(buf_t bp)
+{
+	return bp->b_rcred;
+}
+
+ucred_t
+buf_wcred(buf_t bp)
+{
+	return bp->b_wcred;
+}
+
+void *
+buf_upl(buf_t bp)
+{
+	return bp->b_upl;
+}
+
+uint32_t
+buf_uploffset(buf_t bp)
+{
+	return (uint32_t)bp->b_uploffset;
+}
+
+proc_t
+buf_proc(buf_t bp)
+{
+	return bp->b_proc;
+}
+
+
+/*
+ * Map the buffer's data into kernel address space, returning the
+ * mapped address through io_addr.
+ *
+ * Non-cluster buffers are already mapped (b_datap), so that is handed
+ * back directly.  Cluster (UPL-backed) buffers are mapped here via
+ * ubc_upl_map, unless the 'real' buffer underlying a cluster_bp call
+ * already carries a mapping.  Returns 0 on success, ENOMEM if the UPL
+ * could not be mapped.
+ */
+errno_t
+buf_map(buf_t bp, caddr_t *io_addr)
+{
+ buf_t real_bp;
+ vm_offset_t vaddr;
+ kern_return_t kret;
+
+ if ( !(bp->b_flags & B_CLUSTER)) {
+ *io_addr = (caddr_t)bp->b_datap;
+ return (0);
+ }
+ real_bp = (buf_t)(bp->b_real_bp);
+
+ if (real_bp && real_bp->b_datap) {
+ /*
+ * b_real_bp is only valid if B_CLUSTER is SET
+ * if it's non-zero, then someone did a cluster_bp call
+ * if the backing physical pages were already mapped
+ * in before the call to cluster_bp (non-zero b_datap),
+ * then we just use that mapping
+ */
+ *io_addr = (caddr_t)real_bp->b_datap;
+ return (0);
+ }
+ kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */
+
+ if (kret != KERN_SUCCESS) {
+ *io_addr = NULL;
+
+ return(ENOMEM);
+ }
+ /* step past any leading portion of the UPL not belonging to this buffer */
+ vaddr += bp->b_uploffset;
+
+ *io_addr = (caddr_t)vaddr;
+
+ return (0);
+}
+
+/*
+ * Undo a buf_map: unmap the UPL backing a cluster buffer.
+ * Non-cluster buffers, and cluster buffers whose mapping belongs to
+ * an underlying 'real' buffer, require no work.  Returns 0 on
+ * success, EINVAL if ubc_upl_unmap fails.
+ */
+errno_t
+buf_unmap(buf_t bp)
+{
+ buf_t real_bp;
+ kern_return_t kret;
+
+ if ( !(bp->b_flags & B_CLUSTER))
+ return (0);
+ /*
+ * see buf_map for the explanation
+ */
+ real_bp = (buf_t)(bp->b_real_bp);
+
+ if (real_bp && real_bp->b_datap)
+ return (0);
+
+ if ((bp->b_lflags & BL_IOBUF) &&
+ ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
+ /*
+ * ignore pageins... the 'right' thing will
+ * happen due to the way we handle speculative
+ * clusters...
+ *
+ * when we commit these pages, we'll hit
+ * it with UPL_COMMIT_INACTIVE which
+ * will clear the reference bit that got
+ * turned on when we touched the mapping
+ */
+ bp->b_flags |= B_AGE;
+ }
+ kret = ubc_upl_unmap(bp->b_upl);
+
+ if (kret != KERN_SUCCESS)
+ return (EINVAL);
+ return (0);
+}
+
+
+/*
+ * Zero the buffer's data area and clear its residual count.
+ * If the data cannot be mapped, only b_resid is cleared.
+ */
+void
+buf_clear(buf_t bp)
+{
+	caddr_t	baddr;
+
+	if (buf_map(bp, &baddr) == 0) {
+		bzero(baddr, bp->b_bcount);
+		buf_unmap(bp);
+	}
+	bp->b_resid = 0;
+}
+
+
+
+/*
+ * Read or write a buffer that is not contiguous on disk: carry out
+ * the I/O one contiguous extent at a time through a private io
+ * buffer, remapping the file offset to a new physical block number
+ * (VNOP_BLOCKMAP) between extents.  The original buffer is marked
+ * done (or in error) at the conclusion via buf_biodone.
+ */
+static int
+buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
+{
+ vnode_t vp = buf_vnode(bp);
+ buf_t io_bp; /* For reading or writing a single block */
+ int io_direction;
+ int io_resid;
+ size_t io_contig_bytes;
+ daddr64_t io_blkno;
+ int error = 0;
+ int bmap_flags;
+
+ /*
+ * save our starting point... the bp was already mapped
+ * in buf_strategy before we got called
+ * no sense doing it again.
+ */
+ io_blkno = bp->b_blkno;
+ /*
+ * Make sure we redo this mapping for the next I/O
+ * i.e. this can never be a 'permanent' mapping
+ */
+ bp->b_blkno = bp->b_lblkno;
+
+ /*
+ * Get an io buffer to do the deblocking
+ */
+ io_bp = alloc_io_buf(devvp, 0);
+
+ io_bp->b_lblkno = bp->b_lblkno;
+ io_bp->b_datap = bp->b_datap;
+ io_resid = bp->b_bcount;
+ io_direction = bp->b_flags & B_READ;
+ io_contig_bytes = contig_bytes;
+
+ if (bp->b_flags & B_READ)
+ bmap_flags = VNODE_READ;
+ else
+ bmap_flags = VNODE_WRITE;
+
+ for (;;) {
+ if (io_blkno == -1)
+ /*
+ * this is unexpected, but we'll allow for it
+ * (a hole: no backing block, so just zero-fill)
+ */
+ bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
+ else {
+ io_bp->b_bcount = io_contig_bytes;
+ io_bp->b_bufsize = io_contig_bytes;
+ io_bp->b_resid = io_contig_bytes;
+ io_bp->b_blkno = io_blkno;
+
+ buf_reset(io_bp, io_direction);
+
+ /*
+ * Call the device to do the I/O and wait for it. Make sure the appropriate party is charged for write
+ */
+
+ if (!ISSET(bp->b_flags, B_READ))
+ OSAddAtomic(1, &devvp->v_numoutput);
+
+ if ((error = VNOP_STRATEGY(io_bp)))
+ break;
+ if ((error = (int)buf_biowait(io_bp)))
+ break;
+ if (io_bp->b_resid) {
+ /* short transfer: account for what did complete, then bail */
+ io_resid -= (io_contig_bytes - io_bp->b_resid);
+ break;
+ }
+ }
+ if ((io_resid -= io_contig_bytes) == 0)
+ break;
+ f_offset += io_contig_bytes;
+ io_bp->b_datap += io_contig_bytes;
+
+ /*
+ * Map the current position to a physical block number
+ */
+ if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
+ break;
+ }
+ buf_free(io_bp);
+
+ if (error)
+ buf_seterror(bp, error);
+ bp->b_resid = io_resid;
+ /*
+ * This I/O is now complete
+ */
+ buf_biodone(bp);
+
+ return error;
+}
+
+
+/*
+ * Generic strategy entry point used by filesystems: resolve the
+ * buffer's logical block to a physical block (if not already done),
+ * handle holes and fragmented extents, then pass the I/O down to the
+ * device vnode's strategy routine.
+ *
+ * struct vnop_strategy_args {
+ * struct buf *a_bp;
+ * } *ap;
+ */
+errno_t
+buf_strategy(vnode_t devvp, void *ap)
+{
+ buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp;
+ vnode_t vp = bp->b_vp;
+ int bmap_flags;
+ errno_t error;
+
+ if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
+ panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
+ /*
+ * associate the physical device with
+ * with this buf_t even if we don't
+ * end up issuing the I/O...
+ */
+ bp->b_dev = devvp->v_rdev;
+ DTRACE_IO1(start, buf_t, bp);
+
+ if (bp->b_flags & B_READ)
+ bmap_flags = VNODE_READ;
+ else
+ bmap_flags = VNODE_WRITE;
+
+ if ( !(bp->b_flags & B_CLUSTER)) {
+
+ if ( (bp->b_upl) ) {
+ /*
+ * we have a UPL associated with this bp
+ * go through cluster_bp which knows how
+ * to deal with filesystem block sizes
+ * that aren't equal to the page size
+ */
+ return (cluster_bp(bp));
+ }
+ /* b_blkno == b_lblkno means the physical mapping hasn't been done yet */
+ if (bp->b_blkno == bp->b_lblkno) {
+ off_t f_offset;
+ size_t contig_bytes;
+
+ if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
+ buf_seterror(bp, error);
+ buf_biodone(bp);
+
+ return (error);
+ }
+ if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
+ buf_seterror(bp, error);
+ buf_biodone(bp);
+
+ return (error);
+ }
+ if (bp->b_blkno == -1)
+ /* a hole in the file... zero-fill instead of doing I/O */
+ buf_clear(bp);
+ else if ((long)contig_bytes < bp->b_bcount)
+ return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
+ }
+ if (bp->b_blkno == -1) {
+ buf_biodone(bp);
+ return (0);
+ }
+ }
+ /*
+ * we can issue the I/O because...
+ * either B_CLUSTER is set which
+ * means that the I/O is properly set
+ * up to be a multiple of the page size, or
+ * we were able to successfully set up the
+ * physical block mapping
+ */
+ return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap));
+}
+
+
+
+/*
+ * Allocate an I/O buffer associated with vp.
+ */
+buf_t
+buf_alloc(vnode_t vp)
+{
+	return (alloc_io_buf(vp, 0));
+}
+
+/*
+ * Release an I/O buffer obtained from buf_alloc.
+ */
+void
+buf_free(buf_t bp)
+{
+	free_io_buf(bp);
+}
+
+
+/*
+ * iterate buffers for the specified vp.
+ * if BUF_SCAN_DIRTY is set, do the dirty list
+ * if BUF_SCAN_CLEAN is set, do the clean list
+ * if neither flag is set, default to BUF_SCAN_DIRTY
+ * if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy buffers
+ *
+ * The callout's return value (BUF_RETURNED / BUF_CLAIMED /
+ * BUF_RETURNED_DONE / BUF_CLAIMED_DONE) dictates whether the buffer
+ * is released here and whether iteration continues.
+ */
+
+/* per-list descriptor: flag for buf_iterprepare and the list to refill */
+struct buf_iterate_info_t {
+ int flag;
+ struct buflists *listhead;
+};
+
+void
+buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
+{
+ buf_t bp;
+ int retval;
+ struct buflists local_iterblkhd;
+ int lock_flags = BAC_NOWAIT | BAC_REMOVE;
+ int notify_busy = flags & BUF_NOTIFY_BUSY;
+ struct buf_iterate_info_t list[2];
+ int num_lists, i;
+
+ if (flags & BUF_SKIP_LOCKED)
+ lock_flags |= BAC_SKIP_LOCKED;
+ if (flags & BUF_SKIP_NONLOCKED)
+ lock_flags |= BAC_SKIP_NONLOCKED;
+
+ if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN)))
+ flags |= BUF_SCAN_DIRTY;
+
+ num_lists = 0;
+
+ if (flags & BUF_SCAN_DIRTY) {
+ list[num_lists].flag = VBI_DIRTY;
+ list[num_lists].listhead = &vp->v_dirtyblkhd;
+ num_lists++;
+ }
+ if (flags & BUF_SCAN_CLEAN) {
+ list[num_lists].flag = VBI_CLEAN;
+ list[num_lists].listhead = &vp->v_cleanblkhd;
+ num_lists++;
+ }
+
+ for (i = 0; i < num_lists; i++) {
+ lck_mtx_lock(buf_mtxp);
+
+ if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) {
+ lck_mtx_unlock(buf_mtxp);
+ continue;
+ }
+ while (!LIST_EMPTY(&local_iterblkhd)) {
+ /* return the buffer to the vnode list before working on it */
+ bp = LIST_FIRST(&local_iterblkhd);
+ LIST_REMOVE(bp, b_vnbufs);
+ LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
+
+ if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
+ if (notify_busy) {
+ /* couldn't acquire... notify the callout with a NULL bp */
+ bp = NULL;
+ } else {
+ continue;
+ }
+ }
+
+ /* drop the mutex across the callout... it may block */
+ lck_mtx_unlock(buf_mtxp);
+
+ retval = callout(bp, arg);
+
+ switch (retval) {
+ case BUF_RETURNED:
+ if (bp)
+ buf_brelse(bp);
+ break;
+ case BUF_CLAIMED:
+ /* callout took ownership of the buffer */
+ break;
+ case BUF_RETURNED_DONE:
+ if (bp)
+ buf_brelse(bp);
+ lck_mtx_lock(buf_mtxp);
+ goto out;
+ case BUF_CLAIMED_DONE:
+ lck_mtx_lock(buf_mtxp);
+ goto out;
+ }
+ lck_mtx_lock(buf_mtxp);
+ } /* while list has more nodes */
+ out:
+ buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
+ lck_mtx_unlock(buf_mtxp);
+ } /* for each list */
+} /* buf_iterate */
+
+
+/*
+ * Flush out and invalidate all buffers associated with a vnode.
+ *
+ * Both the clean and dirty lists are walked; each buffer acquired is
+ * marked B_INVAL and released (dirty buffers are optionally pushed to
+ * disk first when BUF_WRITE_DATA is set).  Because buf_mtxp must be
+ * dropped around brelse/bwrite, new buffers can appear on the lists
+ * while we work, so the scan repeats until a full pass completes
+ * without ever dropping the mutex.  Returns 0, or the first 'real'
+ * error from buf_acquire_locked.
+ */
+int
+buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
+{
+ buf_t bp;
+ int error = 0;
+ int must_rescan = 1;
+ struct buflists local_iterblkhd;
+
+ lck_mtx_lock(buf_mtxp);
+
+ for (;;) {
+ if (must_rescan == 0)
+ /*
+ * the lists may not be empty, but all that's left at this
+ * point are metadata or B_LOCKED buffers which are being
+ * skipped... we know this because we made it through both
+ * the clean and dirty lists without dropping buf_mtxp...
+ * each time we drop buf_mtxp we bump "must_rescan"
+ */
+ break;
+ if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
+ break;
+ must_rescan = 0;
+ /*
+ * iterate the clean list
+ */
+ if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
+ goto try_dirty_list;
+ }
+ while (!LIST_EMPTY(&local_iterblkhd)) {
+ bp = LIST_FIRST(&local_iterblkhd);
+
+ LIST_REMOVE(bp, b_vnbufs);
+ LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
+
+ /*
+ * some filesystems distinguish meta data blocks with a negative logical block #
+ */
+ if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
+ continue;
+
+ if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
+ if (error == EDEADLK)
+ /*
+ * this buffer was marked B_LOCKED...
+ * we didn't drop buf_mtxp, so we
+ * we don't need to rescan
+ */
+ continue;
+ if (error == EAGAIN) {
+ /*
+ * found a busy buffer... we blocked and
+ * dropped buf_mtxp, so we're going to
+ * need to rescan after this pass is completed
+ */
+ must_rescan++;
+ continue;
+ }
+ /*
+ * got some kind of 'real' error out of the msleep
+ * in buf_acquire_locked, terminate the scan and return the error
+ */
+ buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
+
+ lck_mtx_unlock(buf_mtxp);
+ return (error);
+ }
+ lck_mtx_unlock(buf_mtxp);
+
+ SET(bp->b_flags, B_INVAL);
+ buf_brelse(bp);
+
+ lck_mtx_lock(buf_mtxp);
+
+ /*
+ * by dropping buf_mtxp, we allow new
+ * buffers to be added to the vnode list(s)
+ * we'll have to rescan at least once more
+ * if the queues aren't empty
+ */
+ must_rescan++;
+ }
+ buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
+
+try_dirty_list:
+ /*
+ * Now iterate on dirty blks
+ */
+ if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
+ continue;
+ }
+ while (!LIST_EMPTY(&local_iterblkhd)) {
+ bp = LIST_FIRST(&local_iterblkhd);
+
+ LIST_REMOVE(bp, b_vnbufs);
+ LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
+
+ /*
+ * some filesystems distinguish meta data blocks with a negative logical block #
+ */
+ if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
+ continue;
+
+ if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
+ if (error == EDEADLK)
+ /*
+ * this buffer was marked B_LOCKED...
+ * we didn't drop buf_mtxp, so we
+ * we don't need to rescan
+ */
+ continue;
+ if (error == EAGAIN) {
+ /*
+ * found a busy buffer... we blocked and
+ * dropped buf_mtxp, so we're going to
+ * need to rescan after this pass is completed
+ */
+ must_rescan++;
+ continue;
+ }
+ /*
+ * got some kind of 'real' error out of the msleep
+ * in buf_acquire_locked, terminate the scan and return the error
+ */
+ buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
+
+ lck_mtx_unlock(buf_mtxp);
+ return (error);
+ }
+ lck_mtx_unlock(buf_mtxp);
+
+ SET(bp->b_flags, B_INVAL);
+
+ /* optionally push dirty data to disk before invalidating */
+ if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
+ (void) VNOP_BWRITE(bp);
+ else
+ buf_brelse(bp);
+
+ lck_mtx_lock(buf_mtxp);
+ /*
+ * by dropping buf_mtxp, we allow new
+ * buffers to be added to the vnode list(s)
+ * we'll have to rescan at least once more
+ * if the queues aren't empty
+ */
+ must_rescan++;
+ }
+ buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
+ }
+ lck_mtx_unlock(buf_mtxp);
+
+ return (0);
+}
+
+/*
+ * Push all dirty buffers for vp to disk.
+ *
+ * Buffers belonging to vp itself (or anything when wait == 0) are
+ * issued asynchronously via buf_bawrite; buffers owned by another
+ * vnode (e.g. indirect blocks) go through VNOP_BWRITE.  When 'wait'
+ * is set, block until outstanding writes drain and retry while busy
+ * buffers remain on the dirty list; 'msg' names the wait channel
+ * passed to vnode_waitforwrites.
+ */
+void
+buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {
+ buf_t bp;
+ int writes_issued = 0;
+ errno_t error;
+ int busy = 0;
+ struct buflists local_iterblkhd;
+ int lock_flags = BAC_NOWAIT | BAC_REMOVE;
+
+ if (flags & BUF_SKIP_LOCKED)
+ lock_flags |= BAC_SKIP_LOCKED;
+ if (flags & BUF_SKIP_NONLOCKED)
+ lock_flags |= BAC_SKIP_NONLOCKED;
+loop:
+ lck_mtx_lock(buf_mtxp);
+
+ if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
+ while (!LIST_EMPTY(&local_iterblkhd)) {
+ bp = LIST_FIRST(&local_iterblkhd);
+ LIST_REMOVE(bp, b_vnbufs);
+ LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
+
+ /* remember busy buffers so the wait loop below knows to retry */
+ if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY)
+ busy++;
+ if (error)
+ continue;
+ lck_mtx_unlock(buf_mtxp);
+
+ bp->b_flags &= ~B_LOCKED;
+
+ /*
+ * Wait for I/O associated with indirect blocks to complete,
+ * since there is no way to quickly wait for them below.
+ */
+ if ((bp->b_vp == vp) || (wait == 0))
+ (void) buf_bawrite(bp);
+ else
+ (void) VNOP_BWRITE(bp);
+ writes_issued++;
+
+ lck_mtx_lock(buf_mtxp);
+ }
+ buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
+ }
+ lck_mtx_unlock(buf_mtxp);
+
+ if (wait) {
+ (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
+
+ if (vp->v_dirtyblkhd.lh_first && busy) {
+ /*
+ * we had one or more BUSY buffers on
+ * the dirtyblock list... most likely
+ * these are due to delayed writes that
+ * were moved to the bclean queue but
+ * have not yet been 'written'.
+ * if we issued some writes on the
+ * previous pass, we try again immediately
+ * if we didn't, we'll sleep for some time
+ * to allow the state to change...
+ */
+ if (writes_issued == 0) {
+ (void)tsleep((caddr_t)&vp->v_numoutput,
+ PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
+ }
+ writes_issued = 0;
+ busy = 0;
+
+ goto loop;
+ }
+ }
+}
+
+
+/*
+ * called with buf_mtxp held...
+ * this lock protects the queue manipulation
+ *
+ * Detach the vnode's clean (or dirty, per VBI_DIRTY) buffer list into
+ * *iterheadp so the caller can walk a private snapshot, and mark the
+ * vnode VBI_ITER so only one iterator runs at a time.  Returns EINVAL
+ * (with an empty iterheadp) if the list is empty, 0 otherwise.
+ */
+static int
+buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
+{
+ struct buflists * listheadp;
+
+ if (flags & VBI_DIRTY)
+ listheadp = &vp->v_dirtyblkhd;
+ else
+ listheadp = &vp->v_cleanblkhd;
+
+ /* wait out any iterator already active on this vnode */
+ while (vp->v_iterblkflags & VBI_ITER) {
+ vp->v_iterblkflags |= VBI_ITERWANT;
+ msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
+ }
+ if (LIST_EMPTY(listheadp)) {
+ LIST_INIT(iterheadp);
+ return(EINVAL);
+ }
+ vp->v_iterblkflags |= VBI_ITER;
+
+ /* steal the entire list in O(1)... move the head, patch the back-pointer */
+ iterheadp->lh_first = listheadp->lh_first;
+ listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
+ LIST_INIT(listheadp);
+
+ return(0);
+}
+
+/*
+ * called with buf_mtxp held...
+ * this lock protects the queue manipulation
+ *
+ * End an iteration started by buf_iterprepare: return any buffers
+ * still on the private iteration list to the vnode's own list, clear
+ * VBI_ITER, and wake anyone waiting to start iterating.
+ */
+static void
+buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
+{
+ struct buflists * listheadp;
+ buf_t bp;
+
+ if (flags & VBI_DIRTY)
+ listheadp = &vp->v_dirtyblkhd;
+ else
+ listheadp = &vp->v_cleanblkhd;
+
+ while (!LIST_EMPTY(iterheadp)) {
+ bp = LIST_FIRST(iterheadp);
+ LIST_REMOVE(bp, b_vnbufs);
+ LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
+ }
+ vp->v_iterblkflags &= ~VBI_ITER;
+
+ if (vp->v_iterblkflags & VBI_ITERWANT) {
+ vp->v_iterblkflags &= ~VBI_ITERWANT;
+ wakeup(&vp->v_iterblkflags);
+ }
+}
+
+
+/*
+ * Remove a buffer from whichever free queue it currently occupies.
+ * Called with buf_mtxp held.
+ */
+static void
+bremfree_locked(buf_t bp)
+{
+ struct bqueues *dp = NULL;
+ int whichq;
+ /*
+ * We only calculate the head of the freelist when removing
+ * the last element of the list as that is the only time that
+ * it is needed (e.g. to reset the tail pointer).
+ *
+ * NB: This makes an assumption about how tailq's are implemented:
+ * TAILQ_REMOVE only dereferences the head (dp) when the element
+ * being removed is the last one, so dp may safely remain NULL in
+ * every other case.
+ */
+ whichq = bp->b_whichq;
+
+ if (bp->b_freelist.tqe_next == NULL) {
+ dp = &bufqueues[whichq];
+
+ if (dp->tqh_last != &bp->b_freelist.tqe_next)
+ panic("bremfree: lost tail");
+ }
+ TAILQ_REMOVE(dp, bp, b_freelist);
+
+#if BALANCE_QUEUES
+ bufqdec(whichq);
+#endif
+ if (whichq == BQ_LAUNDRY)
+ blaundrycnt--;
+
+ /* the buffer is now on no queue */
+ bp->b_whichq = -1;
+ bp->b_timestamp = 0;
+}
+
+/*
+ * Associate a buffer with a vnode.
+ * buf_mtxp must be locked on entry
+ *
+ * NOTE(review): the caller appears expected to have already stored vp
+ * in bp->b_vp; the panic below enforces that precondition rather than
+ * setting the field here -- confirm against callers.
+ */
+static void
+bgetvp_locked(vnode_t vp, buf_t bp)
+{
+
+ if (bp->b_vp != vp)
+ panic("bgetvp_locked: not free");
+
+ /* pick up the device identity for special vnodes */
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ bp->b_dev = vp->v_rdev;
+ else
+ bp->b_dev = NODEV;
+ /*
+ * Insert onto list for new vnode.
+ */
+ bufinsvn(bp, &vp->v_cleanblkhd);
+}
+
+/*
+ * Disassociate a buffer from its vnode.
+ * buf_mtxp must be locked on entry
+ */
+static void
+brelvp_locked(buf_t bp)
+{
+	/*
+	 * If the buffer sits on a vnode's buffer list, take it off first.
+	 */
+	if (bp->b_vnbufs.le_next != NOLIST)
+		bufremvn(bp);
+
+	bp->b_vp = (vnode_t)NULL;
+}
+
+/*
+ * Reassign a buffer from one vnode to another.
+ * Used to assign file specific control information
+ * (indirect blocks) to the vnode to which they belong.
+ */
+static void
+buf_reassign(buf_t bp, vnode_t newvp)
+{
+	register struct buflists *listheadp;
+
+	if (newvp == NULL) {
+		printf("buf_reassign: NULL");
+		return;
+	}
+	lck_mtx_lock_spin(buf_mtxp);
+
+	/*
+	 * Take the buffer off any vnode list it currently occupies.
+	 */
+	if (bp->b_vnbufs.le_next != NOLIST)
+		bufremvn(bp);
+
+	/*
+	 * Dirty buffers go on the new vnode's dirty list,
+	 * clean ones on its clean list.
+	 */
+	listheadp = ISSET(bp->b_flags, B_DELWRI) ? &newvp->v_dirtyblkhd : &newvp->v_cleanblkhd;
+	bufinsvn(bp, listheadp);
+
+	lck_mtx_unlock(buf_mtxp);
+}
+
+/*
+ * Reset a buffer header to a pristine, invalid state:
+ * no device, no credentials, on no vnode list.
+ */
+static __inline__ void
+bufhdrinit(buf_t bp)
+{
+	bzero((char *)bp, sizeof *bp);
+
+	bp->b_dev = NODEV;
+	bp->b_rcred = NOCRED;
+	bp->b_wcred = NOCRED;
+	bp->b_vnbufs.le_next = NOLIST;
+	bp->b_flags = B_INVAL;
+}
+
+/*
+ * Initialize buffers and hash links for buffers.
+ *
+ * Sets up the free queues and buffer hash, carves buf_headers[] into
+ * nbuf_headers cache buffers (parked on BQ_EMPTY) followed by
+ * niobuf_headers dedicated I/O buffers (parked on iobufqueue), then
+ * creates the locks, zones and worker threads the buffer cache
+ * depends on.  Called once at boot.
+ */
+__private_extern__ void
+bufinit(void)
+{
+ buf_t bp;
+ struct bqueues *dp;
+ int i;
+
+ nbuf_headers = 0;
+ /* Initialize the buffer queues ('freelists') and the hash table */
+ for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
+ TAILQ_INIT(dp);
+ bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
+
+ /* Initialize the buffer headers */
+ for (i = 0; i < max_nbuf_headers; i++) {
+ nbuf_headers++;
+ bp = &buf_headers[i];
+ bufhdrinit(bp);
+
+ BLISTNONE(bp);
+ dp = &bufqueues[BQ_EMPTY];
+ bp->b_whichq = BQ_EMPTY;
+ bp->b_timestamp = buf_timestamp();
+ binsheadfree(bp, dp, BQ_EMPTY);
+ binshash(bp, &invalhash);
+ }
+
+ boot_nbuf_headers = nbuf_headers;
+ /*
+ * the headers that follow are dedicated I/O buffers... they live
+ * on iobufqueue and are not entered into the hash
+ */
+ for (; i < nbuf_headers + niobuf_headers; i++) {
+ bp = &buf_headers[i];
+ bufhdrinit(bp);
+ bp->b_whichq = -1;
+ binsheadfree(bp, &iobufqueue, -1);
+ }
+
+ /*
+ * allocate lock group attribute and group
+ */
+ buf_mtx_grp_attr = lck_grp_attr_alloc_init();
+ buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
+
+ /*
+ * allocate the lock attribute
+ */
+ buf_mtx_attr = lck_attr_alloc_init();
+
+ /*
+ * allocate and initialize mutex's for the buffer and iobuffer pools
+ */
+ buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
+ iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
+
+ if (iobuffer_mtxp == NULL)
+ panic("couldn't create iobuffer mutex");
+
+ if (buf_mtxp == NULL)
+ panic("couldn't create buf mutex");
+
+ /*
+ * allocate and initialize cluster specific global locks...
+ */
+ cluster_init();
+
+ printf("using %d buffer headers and %d cluster IO buffer headers\n",
+ nbuf_headers, niobuf_headers);
+
+ /* Set up zones used by the buffer cache */
+ bufzoneinit();
+
+ /* start the bcleanbuf() thread */
+ bcleanbuf_thread_init();
+
+#if BALANCE_QUEUES
+ {
+ static void bufq_balance_thread_init(void) __attribute__((section("__TEXT, initcode")));
+ /* create a thread to do dynamic buffer queue balancing */
+ bufq_balance_thread_init();
+ }
+#endif /* notyet */
+}
+
+
+
+/*
+ * Zones for the meta data buffers
+ */
+
+/* smallest and largest zone-backed metadata buffer sizes, in bytes */
+#define MINMETA 512
+#define MAXMETA 8192
+
+/* one entry per supported metadata buffer size */
+struct meta_zone_entry {
+ zone_t mz_zone; /* backing zone, filled in by bufzoneinit() */
+ vm_size_t mz_size; /* element size in bytes */
+ vm_size_t mz_max; /* maximum total bytes for the zone */
+ const char *mz_name; /* zone name for diagnostics */
+};
+
+/* ordered by ascending size; terminated by a zero mz_size entry */
+struct meta_zone_entry meta_zones[] = {
+ {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
+ {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
+ {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
+ {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
+ {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
+ {NULL, 0, 0, "" } /* End */
+};
+
+/*
+ * Create the zones backing each metadata buffer size in meta_zones[],
+ * plus the zone used for buffer headers themselves.
+ */
+static void
+bufzoneinit(void)
+{
+	struct meta_zone_entry *mzp;
+
+	for (mzp = meta_zones; mzp->mz_size != 0; mzp++)
+		mzp->mz_zone = zinit(mzp->mz_size,
+				     mzp->mz_max,
+				     PAGE_SIZE,
+				     mzp->mz_name);
+
+	buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
+}
+
+/*
+ * Return the metadata zone appropriate for a buffer of 'size' bytes.
+ * size must be a multiple of 512 within [MINMETA, MAXMETA]; anything
+ * else is a caller bug and panics.
+ */
+static __inline__ zone_t
+getbufzone(size_t size)
+{
+	int i;
+
+	if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
+		panic("getbufzone: incorrect size = %lu", size);
+
+	/* table is sorted by ascending size... pick the first zone big enough */
+	for (i = 0; meta_zones[i].mz_size != 0; i++) {
+		if (meta_zones[i].mz_size >= size)
+			break;
+	}
+
+	return (meta_zones[i].mz_zone);
+}
+
+
+
+/*
+ * Common helper for the bread/breadn families: get the block
+ * (possibly from cache) and, if its contents are not already valid,
+ * start a read for it.
+ *
+ * For async requests the buffer is released here (or will be by
+ * biodone), so NULL is returned; callers only receive a bp they own
+ * on synchronous requests.
+ */
+static struct buf *
+bio_doread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, int async, int queuetype)
+{
+ buf_t bp;
+
+ bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
+
+ /*
+ * If buffer does not have data valid, start a read.
+ * Note that if buffer is B_INVAL, buf_getblk() won't return it.
+ * Therefore, it's valid if it's I/O has completed or been delayed.
+ */
+ if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
+ struct proc *p;
+
+ p = current_proc();
+
+ /* Start I/O for the buffer (keeping credentials). */
+ SET(bp->b_flags, B_READ | async);
+ if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
+ kauth_cred_ref(cred);
+ bp->b_rcred = cred;
+ }
+
+ VNOP_STRATEGY(bp);
+
+ trace(TR_BREADMISS, pack(vp, size), blkno);
+
+ /* Pay for the read. */
+ if (p && p->p_stats)
+ OSIncrementAtomic(&p->p_stats->p_ru.ru_inblock); /* XXX */
+
+ if (async) {
+ /*
+ * since we asked for an ASYNC I/O
+ * the biodone will do the brelse
+ * we don't want to pass back a bp
+ * that we don't 'own'
+ */
+ bp = NULL;
+ }
+ } else if (async) {
+ /* cache hit on an async request... nothing to hand back */
+ buf_brelse(bp);
+ bp = NULL;
+ }
+
+ trace(TR_BREADHIT, pack(vp, size), blkno);
+
+ return (bp);
+}
+
+/*
+ * Perform the reads for buf_breadn() and buf_meta_breadn().
+ * Trivial modification to the breada algorithm presented in Bach (p.55).
+ */
+static errno_t
+do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
+ int nrablks, ucred_t cred, buf_t *bpp, int queuetype)
+{
+ buf_t bp;
+ int i;
+
+ bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
+
+ /*
+ * For each of the read-ahead blocks, start a read, if necessary.