+void
+bufattr_markisochronous(bufattr_t bap) {
+ SET(bap->ba_flags, BA_ISOCHRONOUS);
+}
+
+int
+bufattr_isochronous(bufattr_t bap) {
+ if ( (bap->ba_flags & BA_ISOCHRONOUS) )
+ return 1;
+ return 0;
+}
+
+void
+bufattr_markquickcomplete(bufattr_t bap) {
+ SET(bap->ba_flags, BA_QUICK_COMPLETE);
+}
+
+int
+bufattr_quickcomplete(bufattr_t bap) {
+ if ( (bap->ba_flags & BA_QUICK_COMPLETE) )
+ return 1;
+ return 0;
+}
+
+errno_t
+buf_error(buf_t bp) {
+
+ return (bp->b_error);
+}
+
+void
+buf_seterror(buf_t bp, errno_t error) {
+
+ if ((bp->b_error = error))
+ SET(bp->b_flags, B_ERROR);
+ else
+ CLR(bp->b_flags, B_ERROR);
+}
+
+void
+buf_setflags(buf_t bp, int32_t flags) {
+
+ SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
+}
+
+void
+buf_clearflags(buf_t bp, int32_t flags) {
+
+ CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
+}
+
+int32_t
+buf_flags(buf_t bp) {
+
+ return ((bp->b_flags & BUF_X_RDFLAGS));
+}
+
+void
+buf_reset(buf_t bp, int32_t io_flags) {
+
+ CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
+ SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
+
+ bp->b_error = 0;
+}
+
+uint32_t
+buf_count(buf_t bp) {
+
+ return (bp->b_bcount);
+}
+
+void
+buf_setcount(buf_t bp, uint32_t bcount) {
+
+ bp->b_bcount = bcount;
+}
+
+uint32_t
+buf_size(buf_t bp) {
+
+ return (bp->b_bufsize);
+}
+
+void
+buf_setsize(buf_t bp, uint32_t bufsize) {
+
+ bp->b_bufsize = bufsize;
+}
+
+uint32_t
+buf_resid(buf_t bp) {
+
+ return (bp->b_resid);
+}
+
+void
+buf_setresid(buf_t bp, uint32_t resid) {
+
+ bp->b_resid = resid;
+}
+
+uint32_t
+buf_dirtyoff(buf_t bp) {
+
+ return (bp->b_dirtyoff);
+}
+
+uint32_t
+buf_dirtyend(buf_t bp) {
+
+ return (bp->b_dirtyend);
+}
+
+void
+buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
+
+ bp->b_dirtyoff = dirtyoff;
+}
+
+void
+buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
+
+ bp->b_dirtyend = dirtyend;
+}
+
+uintptr_t
+buf_dataptr(buf_t bp) {
+
+ return (bp->b_datap);
+}
+
+void
+buf_setdataptr(buf_t bp, uintptr_t data) {
+
+ bp->b_datap = data;
+}
+
+vnode_t
+buf_vnode(buf_t bp) {
+
+ return (bp->b_vp);
+}
+
+void
+buf_setvnode(buf_t bp, vnode_t vp) {
+
+ bp->b_vp = vp;
+}
+
+
+void *
+buf_callback(buf_t bp)
+{
+ if ( !(bp->b_flags & B_CALL) )
+ return ((void *) NULL);
+
+ return ((void *)bp->b_iodone);
+}
+
+
+errno_t
+buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
+{
+ if (callback)
+ bp->b_flags |= (B_CALL | B_ASYNC);
+ else
+ bp->b_flags &= ~B_CALL;
+ bp->b_transaction = transaction;
+ bp->b_iodone = callback;
+
+ return (0);
+}
+
+errno_t
+buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
+{
+
+ if ( !(bp->b_lflags & BL_IOBUF) )
+ return (EINVAL);
+
+ if (upl)
+ bp->b_flags |= B_CLUSTER;
+ else
+ bp->b_flags &= ~B_CLUSTER;
+ bp->b_upl = upl;
+ bp->b_uploffset = offset;
+
+ return (0);
+}
+
+buf_t
+buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
+{
+ buf_t io_bp;
+
+ if (io_offset < 0 || io_size < 0)
+ return (NULL);
+
+ if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
+ return (NULL);
+
+ if (bp->b_flags & B_CLUSTER) {
+ if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
+ return (NULL);
+
+ if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
+ return (NULL);
+ }
+ io_bp = alloc_io_buf(bp->b_vp, 0);
+
+ io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);
+
+ if (iodone) {
+ io_bp->b_transaction = arg;
+ io_bp->b_iodone = iodone;
+ io_bp->b_flags |= B_CALL;
+ }
+ if (bp->b_flags & B_CLUSTER) {
+ io_bp->b_upl = bp->b_upl;
+ io_bp->b_uploffset = bp->b_uploffset + io_offset;
+ } else {
+ io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
+ }
+ io_bp->b_bcount = io_size;
+
+ return (io_bp);
+}
+
+
+int
+buf_shadow(buf_t bp)
+{
+ if (bp->b_lflags & BL_SHADOW)
+ return 1;
+ return 0;
+}
+
+
+buf_t
+buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
+{
+ return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1));
+}
+
+buf_t
+buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
+{
+ return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0));
+}
+
+
+static buf_t
+buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv)
+{
+ buf_t io_bp;
+
+ KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0);
+
+ if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) {
+
+ KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0);
+ return (NULL);
+ }
+#ifdef BUF_MAKE_PRIVATE
+ if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0)
+ panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref);
+#endif
+ io_bp = alloc_io_buf(bp->b_vp, priv);
+
+ io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA);
+ io_bp->b_blkno = bp->b_blkno;
+ io_bp->b_lblkno = bp->b_lblkno;
+
+ if (iodone) {
+ io_bp->b_transaction = arg;
+ io_bp->b_iodone = iodone;
+ io_bp->b_flags |= B_CALL;
+ }
+ if (force_copy == FALSE) {
+ io_bp->b_bcount = bp->b_bcount;
+ io_bp->b_bufsize = bp->b_bufsize;
+
+ if (external_storage) {
+ io_bp->b_datap = external_storage;
+#ifdef BUF_MAKE_PRIVATE
+ io_bp->b_data_store = NULL;
+#endif
+ } else {
+ io_bp->b_datap = bp->b_datap;
+#ifdef BUF_MAKE_PRIVATE
+ io_bp->b_data_store = bp;
+#endif
+ }
+ *(buf_t *)(&io_bp->b_orig) = bp;
+
+ lck_mtx_lock_spin(buf_mtxp);
+
+ io_bp->b_lflags |= BL_SHADOW;
+ io_bp->b_shadow = bp->b_shadow;
+ bp->b_shadow = io_bp;
+ bp->b_shadow_ref++;
+
+#ifdef BUF_MAKE_PRIVATE
+ if (external_storage)
+ io_bp->b_lflags |= BL_EXTERNAL;
+ else
+ bp->b_data_ref++;
+#endif
+ lck_mtx_unlock(buf_mtxp);
+ } else {
+ if (external_storage) {
+#ifdef BUF_MAKE_PRIVATE
+ io_bp->b_lflags |= BL_EXTERNAL;
+#endif
+ io_bp->b_bcount = bp->b_bcount;
+ io_bp->b_bufsize = bp->b_bufsize;
+ io_bp->b_datap = external_storage;
+ } else {
+ allocbuf(io_bp, bp->b_bcount);
+
+ io_bp->b_lflags |= BL_IOBUF_ALLOC;
+ }
+ bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount);
+
+#ifdef BUF_MAKE_PRIVATE
+ io_bp->b_data_store = NULL;
+#endif
+ }
+ KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0);
+
+ return (io_bp);
+}
+
+
+#ifdef BUF_MAKE_PRIVATE
+errno_t
+buf_make_private(buf_t bp)
+{
+ buf_t ds_bp;
+ buf_t t_bp;
+ struct buf my_buf;
+
+ KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0);
+
+ if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) {
+
+ KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
+ return (EINVAL);
+ }
+ my_buf.b_flags = B_META;
+ my_buf.b_datap = (uintptr_t)NULL;
+ allocbuf(&my_buf, bp->b_bcount);
+
+ bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);
+
+ lck_mtx_lock_spin(buf_mtxp);
+
+ for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
+ if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
+ break;
+ }
+ ds_bp = t_bp;
+
+ if (ds_bp == NULL && bp->b_data_ref)
+ panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL");
+
+ if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0))
+ panic("buf_make_private: ref_count == 0 && ds_bp != NULL");
+
+ if (ds_bp == NULL) {
+ lck_mtx_unlock(buf_mtxp);
+
+ buf_free_meta_store(&my_buf);
+
+ KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
+ return (EINVAL);
+ }
+ for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
+ if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
+ t_bp->b_data_store = ds_bp;
+ }
+ ds_bp->b_data_ref = bp->b_data_ref;
+
+ bp->b_data_ref = 0;
+ bp->b_datap = my_buf.b_datap;
+
+ lck_mtx_unlock(buf_mtxp);
+
+ KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
+ return (0);
+}
+#endif
+
+
+void
+buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
+ void (**old_iodone)(buf_t, void *), void **old_transaction)
+{
+ if (old_iodone)
+ *old_iodone = bp->b_iodone;
+ if (old_transaction)
+ *old_transaction = bp->b_transaction;
+
+ bp->b_transaction = transaction;
+ bp->b_iodone = filter;
+ if (filter)
+ bp->b_flags |= B_FILTER;
+ else
+ bp->b_flags &= ~B_FILTER;
+}
+
+
+daddr64_t
+buf_blkno(buf_t bp) {
+
+ return (bp->b_blkno);
+}
+
+daddr64_t
+buf_lblkno(buf_t bp) {
+
+ return (bp->b_lblkno);
+}
+
+void
+buf_setblkno(buf_t bp, daddr64_t blkno) {
+
+ bp->b_blkno = blkno;
+}
+
+void
+buf_setlblkno(buf_t bp, daddr64_t lblkno) {
+
+ bp->b_lblkno = lblkno;
+}
+
+dev_t
+buf_device(buf_t bp) {
+
+ return (bp->b_dev);
+}
+
+errno_t
+buf_setdevice(buf_t bp, vnode_t vp) {
+
+ if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
+ return EINVAL;
+ bp->b_dev = vp->v_rdev;
+
+ return 0;
+}
+
+
+void *
+buf_drvdata(buf_t bp) {
+
+ return (bp->b_drvdata);
+}
+
+void
+buf_setdrvdata(buf_t bp, void *drvdata) {
+
+ bp->b_drvdata = drvdata;
+}
+
+void *
+buf_fsprivate(buf_t bp) {
+
+ return (bp->b_fsprivate);
+}
+
+void
+buf_setfsprivate(buf_t bp, void *fsprivate) {
+
+ bp->b_fsprivate = fsprivate;
+}
+
+kauth_cred_t
+buf_rcred(buf_t bp) {
+
+ return (bp->b_rcred);
+}
+
+kauth_cred_t
+buf_wcred(buf_t bp) {
+
+ return (bp->b_wcred);
+}
+
+void *
+buf_upl(buf_t bp) {
+
+ return (bp->b_upl);
+}
+
+uint32_t
+buf_uploffset(buf_t bp) {
+
+ return ((uint32_t)(bp->b_uploffset));
+}
+
+proc_t
+buf_proc(buf_t bp) {
+
+ return (bp->b_proc);
+}
+
+
+errno_t
+buf_map(buf_t bp, caddr_t *io_addr)
+{
+ buf_t real_bp;
+ vm_offset_t vaddr;
+ kern_return_t kret;
+
+ if ( !(bp->b_flags & B_CLUSTER)) {
+ *io_addr = (caddr_t)bp->b_datap;
+ return (0);
+ }
+ real_bp = (buf_t)(bp->b_real_bp);
+
+ if (real_bp && real_bp->b_datap) {
+ /*
+ * b_real_bp is only valid if B_CLUSTER is SET
+ * if it's non-zero, then someone did a cluster_bp call
+ * if the backing physical pages were already mapped
+ * in before the call to cluster_bp (non-zero b_datap),
+ * then we just use that mapping
+ */
+ *io_addr = (caddr_t)real_bp->b_datap;
+ return (0);
+ }
+ kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */
+
+ if (kret != KERN_SUCCESS) {
+ *io_addr = NULL;
+
+ return(ENOMEM);
+ }
+ vaddr += bp->b_uploffset;
+
+ *io_addr = (caddr_t)vaddr;
+
+ return (0);
+}
+
+errno_t
+buf_unmap(buf_t bp)
+{
+ buf_t real_bp;
+ kern_return_t kret;
+
+ if ( !(bp->b_flags & B_CLUSTER))
+ return (0);
+ /*
+ * see buf_map for the explanation
+ */
+ real_bp = (buf_t)(bp->b_real_bp);
+
+ if (real_bp && real_bp->b_datap)
+ return (0);
+
+ if ((bp->b_lflags & BL_IOBUF) &&
+ ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
+ /*
+ * ignore pageins... the 'right' thing will
+ * happen due to the way we handle speculative
+ * clusters...
+ *
+ * when we commit these pages, we'll hit
+ * it with UPL_COMMIT_INACTIVE which
+ * will clear the reference bit that got
+ * turned on when we touched the mapping
+ */
+ bp->b_flags |= B_AGE;
+ }
+ kret = ubc_upl_unmap(bp->b_upl);
+
+ if (kret != KERN_SUCCESS)
+ return (EINVAL);
+ return (0);
+}
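+
+/*
+ * Illustrative sketch (not part of the original change): a typical
+ * buf_map()/buf_unmap() pairing as a filesystem might use it to touch
+ * the data of a buffer that may be backed by a UPL.  'fs_checksum_buffer'
+ * and the XOR checksum are hypothetical; only the buf KPI calls are real,
+ * and the file's existing includes are assumed to provide their prototypes.
+ */
+static errno_t
+fs_checksum_buffer(buf_t bp, uint8_t *csum)
+{
+	caddr_t		data;
+	errno_t		error;
+	uint32_t	n;
+	uint8_t		sum = 0;
+
+	/* map the UPL (or reuse b_datap for non-cluster buffers) */
+	if ((error = buf_map(bp, &data)))
+		return (error);
+
+	/* touch the mapped data only while the mapping is valid */
+	for (n = 0; n < buf_count(bp); n++)
+		sum ^= (uint8_t)data[n];
+	*csum = sum;
+
+	/* drop the mapping established above (no-op for non-cluster bufs) */
+	return (buf_unmap(bp));
+}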
+
+
+void
+buf_clear(buf_t bp) {
+ caddr_t baddr;
+
+ if (buf_map(bp, &baddr) == 0) {
+ bzero(baddr, bp->b_bcount);
+ buf_unmap(bp);
+ }
+ bp->b_resid = 0;
+}
+
+/*
+ * Read or write a buffer that is not contiguous on disk.
+ * The buffer is marked done/error at the conclusion.
+ */
+static int
+buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
+{
+ vnode_t vp = buf_vnode(bp);
+ buf_t io_bp; /* For reading or writing a single block */
+ int io_direction;
+ int io_resid;
+ size_t io_contig_bytes;
+ daddr64_t io_blkno;
+ int error = 0;
+ int bmap_flags;
+
+ /*
+ * save our starting point... the bp was already mapped
+ * in buf_strategy before we got called
+ * no sense doing it again.
+ */
+ io_blkno = bp->b_blkno;
+ /*
+ * Make sure we redo this mapping for the next I/O
+ * i.e. this can never be a 'permanent' mapping
+ */
+ bp->b_blkno = bp->b_lblkno;
+
+ /*
+ * Get an io buffer to do the deblocking
+ */
+ io_bp = alloc_io_buf(devvp, 0);
+
+ io_bp->b_lblkno = bp->b_lblkno;
+ io_bp->b_datap = bp->b_datap;
+ io_resid = bp->b_bcount;
+ io_direction = bp->b_flags & B_READ;
+ io_contig_bytes = contig_bytes;
+
+ if (bp->b_flags & B_READ)
+ bmap_flags = VNODE_READ;
+ else
+ bmap_flags = VNODE_WRITE;
+
+ for (;;) {
+ if (io_blkno == -1)
+ /*
+ * this is unexpected, but we'll allow for it
+ */
+ bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
+ else {
+ io_bp->b_bcount = io_contig_bytes;
+ io_bp->b_bufsize = io_contig_bytes;
+ io_bp->b_resid = io_contig_bytes;
+ io_bp->b_blkno = io_blkno;
+
+ buf_reset(io_bp, io_direction);
+
+ /*
+ * Call the device to do the I/O and wait for it. Make sure the appropriate party is charged for write
+ */
+
+ if (!ISSET(bp->b_flags, B_READ))
+ OSAddAtomic(1, &devvp->v_numoutput);
+
+ if ((error = VNOP_STRATEGY(io_bp)))
+ break;
+ if ((error = (int)buf_biowait(io_bp)))
+ break;
+ if (io_bp->b_resid) {
+ io_resid -= (io_contig_bytes - io_bp->b_resid);
+ break;
+ }
+ }
+ if ((io_resid -= io_contig_bytes) == 0)
+ break;
+ f_offset += io_contig_bytes;
+ io_bp->b_datap += io_contig_bytes;
+
+ /*
+ * Map the current position to a physical block number
+ */
+ if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
+ break;
+ }
+ buf_free(io_bp);
+
+ if (error)
+ buf_seterror(bp, error);
+ bp->b_resid = io_resid;
+ /*
+ * This I/O is now complete
+ */
+ buf_biodone(bp);
+
+ return error;
+}
+
+
+/*
+ * struct vnop_strategy_args {
+ * struct buf *a_bp;
+ * } *ap;
+ */
+errno_t
+buf_strategy(vnode_t devvp, void *ap)
+{
+ buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp;
+ vnode_t vp = bp->b_vp;
+ int bmap_flags;
+ errno_t error;
+#if CONFIG_DTRACE
+ int dtrace_io_start_flag = 0; /* We only want to trip the io:::start
+ * probe once, with the true physical
+ * block in place (b_blkno)
+ */
+
+#endif
+
+ if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
+ panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
+ /*
+ * associate the physical device with
+ * with this buf_t even if we don't
+ * end up issuing the I/O...
+ */
+ bp->b_dev = devvp->v_rdev;
+
+ if (bp->b_flags & B_READ)
+ bmap_flags = VNODE_READ;
+ else
+ bmap_flags = VNODE_WRITE;
+
+ if ( !(bp->b_flags & B_CLUSTER)) {
+
+ if ( (bp->b_upl) ) {
+ /*
+ * we have a UPL associated with this bp
+ * go through cluster_bp which knows how
+ * to deal with filesystem block sizes
+ * that aren't equal to the page size
+ */
+ DTRACE_IO1(start, buf_t, bp);
+ return (cluster_bp(bp));
+ }
+ if (bp->b_blkno == bp->b_lblkno) {
+ off_t f_offset;
+ size_t contig_bytes;
+
+ if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
+ DTRACE_IO1(start, buf_t, bp);
+ buf_seterror(bp, error);
+ buf_biodone(bp);
+
+ return (error);
+ }
+
+ if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
+ DTRACE_IO1(start, buf_t, bp);
+ buf_seterror(bp, error);
+ buf_biodone(bp);
+
+ return (error);
+ }
+
+ DTRACE_IO1(start, buf_t, bp);
+#if CONFIG_DTRACE
+ dtrace_io_start_flag = 1;
+#endif /* CONFIG_DTRACE */
+
+ if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
+ /* Set block number to force biodone later */
+ bp->b_blkno = -1;
+ buf_clear(bp);
+ }
+ else if ((long)contig_bytes < bp->b_bcount) {
+ return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
+ }
+ }
+
+#if CONFIG_DTRACE
+ if (dtrace_io_start_flag == 0) {
+ DTRACE_IO1(start, buf_t, bp);
+ dtrace_io_start_flag = 1;
+ }
+#endif /* CONFIG_DTRACE */
+
+ if (bp->b_blkno == -1) {
+ buf_biodone(bp);
+ return (0);
+ }
+ }
+
+#if CONFIG_DTRACE
+ if (dtrace_io_start_flag == 0)
+ DTRACE_IO1(start, buf_t, bp);
+#endif /* CONFIG_DTRACE */
+
+#if CONFIG_PROTECT
+ /* Capture f_offset in the bufattr*/
+ if (bp->b_attr.ba_cpentry != 0) {
+ /* No need to go here for older EAs */
+ if(bp->b_attr.ba_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
+ off_t f_offset;
+ if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
+ return error;
+
+ /*
+ * Attach the file offset to this buffer. The
+ * bufattr attributes will be passed down the stack
+ * until they reach IOFlashStorage. IOFlashStorage
+ * will retain the offset in a local variable when it
+ * issues its I/Os to the NAND controller.
+ *
+ * Note that LwVM may end up splitting this I/O
+ * into sub-I/Os if it crosses a chunk boundary. In this
+ * case, LwVM will update this field when it dispatches
+ * each I/O to IOFlashStorage. But from our perspective
+ * we have only issued a single I/O.
+ */
+ bufattr_setcpoff (&(bp->b_attr), (u_int64_t)f_offset);
+ CP_DEBUG((CPDBG_OFFSET_IO | DBG_FUNC_NONE), (uint32_t) f_offset, (uint32_t) bp->b_lblkno, (uint32_t) bp->b_blkno, (uint32_t) bp->b_bcount, 0);
+ }
+ }
+#endif
+
+ /*
+ * we can issue the I/O because...
+ * either B_CLUSTER is set which
+ * means that the I/O is properly set
+ * up to be a multiple of the page size, or
+ * we were able to successfully set up the
+ * physical block mapping
+ */
+ error = VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap);
+ DTRACE_FSINFO(strategy, vnode_t, vp);
+ return (error);
+}
+
+
+
+buf_t
+buf_alloc(vnode_t vp)
+{
+ return(alloc_io_buf(vp, 0));
+}
+
+void
+buf_free(buf_t bp) {
+
+ free_io_buf(bp);
+}
+
+
+/*
+ * iterate buffers for the specified vp.
+ * if BUF_SCAN_DIRTY is set, do the dirty list
+ * if BUF_SCAN_CLEAN is set, do the clean list
+ * if neither flag is set, default to BUF_SCAN_DIRTY
+ * if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
+ */
+
+struct buf_iterate_info_t {
+ int flag;
+ struct buflists *listhead;
+};
+
+void
+buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
+{
+ buf_t bp;
+ int retval;
+ struct buflists local_iterblkhd;
+ int lock_flags = BAC_NOWAIT | BAC_REMOVE;
+ int notify_busy = flags & BUF_NOTIFY_BUSY;
+ struct buf_iterate_info_t list[2];
+ int num_lists, i;
+
+ if (flags & BUF_SKIP_LOCKED)
+ lock_flags |= BAC_SKIP_LOCKED;
+ if (flags & BUF_SKIP_NONLOCKED)
+ lock_flags |= BAC_SKIP_NONLOCKED;
+
+ if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN)))
+ flags |= BUF_SCAN_DIRTY;
+
+ num_lists = 0;
+
+ if (flags & BUF_SCAN_DIRTY) {
+ list[num_lists].flag = VBI_DIRTY;
+ list[num_lists].listhead = &vp->v_dirtyblkhd;
+ num_lists++;
+ }
+ if (flags & BUF_SCAN_CLEAN) {
+ list[num_lists].flag = VBI_CLEAN;
+ list[num_lists].listhead = &vp->v_cleanblkhd;
+ num_lists++;
+ }
+
+ for (i = 0; i < num_lists; i++) {
+ lck_mtx_lock(buf_mtxp);
+
+ if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) {
+ lck_mtx_unlock(buf_mtxp);
+ continue;
+ }
+ while (!LIST_EMPTY(&local_iterblkhd)) {
+ bp = LIST_FIRST(&local_iterblkhd);
+ LIST_REMOVE(bp, b_vnbufs);
+ LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
+
+ if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
+ if (notify_busy) {
+ bp = NULL;
+ } else {
+ continue;
+ }
+ }
+
+ lck_mtx_unlock(buf_mtxp);
+
+ retval = callout(bp, arg);
+
+ switch (retval) {
+ case BUF_RETURNED:
+ if (bp)
+ buf_brelse(bp);
+ break;
+ case BUF_CLAIMED:
+ break;
+ case BUF_RETURNED_DONE:
+ if (bp)
+ buf_brelse(bp);
+ lck_mtx_lock(buf_mtxp);
+ goto out;
+ case BUF_CLAIMED_DONE:
+ lck_mtx_lock(buf_mtxp);
+ goto out;
+ }
+ lck_mtx_lock(buf_mtxp);
+ } /* while list has more nodes */
+ out:
+ buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
+ lck_mtx_unlock(buf_mtxp);
+ } /* for each list */
+} /* buf_iterate */
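+
+/*
+ * Illustrative sketch (not part of the original change): a callout and a
+ * buf_iterate() call that a filesystem might use to push its dirty buffers.
+ * 'fs_push_dirty' and 'fs_push_all_dirty' are hypothetical; the flag and
+ * return values are the ones handled by buf_iterate() above.
+ */
+static int
+fs_push_dirty(buf_t bp, __unused void *arg)
+{
+	if (bp == NULL)			/* only seen with BUF_NOTIFY_BUSY */
+		return (BUF_CLAIMED);
+
+	(void) buf_bawrite(bp);		/* async write; consumes the buffer */
+	return (BUF_CLAIMED);		/* so don't let buf_iterate() brelse it */
+}
+
+static void
+fs_push_all_dirty(vnode_t vp)
+{
+	buf_iterate(vp, fs_push_dirty, BUF_SCAN_DIRTY, NULL);
+}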
+
+
+/*
+ * Flush out and invalidate all buffers associated with a vnode.
+ */
+int
+buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
+{
+ buf_t bp;
+ int aflags;
+ int error = 0;
+ int must_rescan = 1;
+ struct buflists local_iterblkhd;
+
+
+ if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
+ return (0);
+
+ lck_mtx_lock(buf_mtxp);
+
+ for (;;) {
+ if (must_rescan == 0)
+ /*
+ * the lists may not be empty, but all that's left at this
+ * point are metadata or B_LOCKED buffers which are being
+ * skipped... we know this because we made it through both
+ * the clean and dirty lists without dropping buf_mtxp...
+ * each time we drop buf_mtxp we bump "must_rescan"
+ */
+ break;
+ if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
+ break;
+ must_rescan = 0;
+ /*
+ * iterate the clean list
+ */
+ if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
+ goto try_dirty_list;
+ }
+ while (!LIST_EMPTY(&local_iterblkhd)) {
+
+ bp = LIST_FIRST(&local_iterblkhd);
+
+ LIST_REMOVE(bp, b_vnbufs);
+ LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
+
+ /*
+ * some filesystems distinguish meta data blocks with a negative logical block #
+ */
+ if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
+ continue;
+
+ aflags = BAC_REMOVE;
+
+ if ( !(flags & BUF_INVALIDATE_LOCKED) )
+ aflags |= BAC_SKIP_LOCKED;
+
+ if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
+ if (error == EDEADLK)
+ /*
+ * this buffer was marked B_LOCKED...
+ * we didn't drop buf_mtxp, so we
+ * don't need to rescan
+ */
+ continue;
+ if (error == EAGAIN) {
+ /*
+ * found a busy buffer... we blocked and
+ * dropped buf_mtxp, so we're going to
+ * need to rescan after this pass is completed
+ */
+ must_rescan++;
+ continue;
+ }
+ /*
+ * got some kind of 'real' error out of the msleep
+ * in buf_acquire_locked, terminate the scan and return the error
+ */
+ buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
+
+ lck_mtx_unlock(buf_mtxp);
+ return (error);
+ }
+ lck_mtx_unlock(buf_mtxp);
+
+ if (bp->b_flags & B_LOCKED)
+ KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);
+
+ CLR(bp->b_flags, B_LOCKED);
+ SET(bp->b_flags, B_INVAL);
+ buf_brelse(bp);
+
+ lck_mtx_lock(buf_mtxp);
+
+ /*
+ * by dropping buf_mtxp, we allow new
+ * buffers to be added to the vnode list(s)
+ * we'll have to rescan at least once more
+ * if the queues aren't empty
+ */
+ must_rescan++;
+ }
+ buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
+
+try_dirty_list:
+ /*
+ * Now iterate on dirty blks
+ */
+ if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
+ continue;
+ }
+ while (!LIST_EMPTY(&local_iterblkhd)) {
+ bp = LIST_FIRST(&local_iterblkhd);
+
+ LIST_REMOVE(bp, b_vnbufs);
+ LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
+
+ /*
+ * some filesystems distinguish meta data blocks with a negative logical block #
+ */
+ if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
+ continue;
+
+ aflags = BAC_REMOVE;
+
+ if ( !(flags & BUF_INVALIDATE_LOCKED) )
+ aflags |= BAC_SKIP_LOCKED;
+
+ if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
+ if (error == EDEADLK)
+ /*
+ * this buffer was marked B_LOCKED...
+ * we didn't drop buf_mtxp, so we
+ * don't need to rescan
+ */
+ continue;
+ if (error == EAGAIN) {
+ /*
+ * found a busy buffer... we blocked and
+ * dropped buf_mtxp, so we're going to
+ * need to rescan after this pass is completed
+ */
+ must_rescan++;
+ continue;
+ }
+ /*
+ * got some kind of 'real' error out of the msleep
+ * in buf_acquire_locked, terminate the scan and return the error
+ */
+ buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
+
+ lck_mtx_unlock(buf_mtxp);
+ return (error);
+ }
+ lck_mtx_unlock(buf_mtxp);
+
+ if (bp->b_flags & B_LOCKED)
+ KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);
+
+ CLR(bp->b_flags, B_LOCKED);
+ SET(bp->b_flags, B_INVAL);
+
+ if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
+ (void) VNOP_BWRITE(bp);
+ else
+ buf_brelse(bp);
+
+ lck_mtx_lock(buf_mtxp);
+ /*
+ * by dropping buf_mtxp, we allow new
+ * buffers to be added to the vnode list(s)
+ * we'll have to rescan at least once more
+ * if the queues aren't empty
+ */
+ must_rescan++;
+ }
+ buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
+ }
+ lck_mtx_unlock(buf_mtxp);
+
+ return (0);
+}
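+
+/*
+ * Illustrative sketch (not part of the original change): how a filesystem's
+ * reclaim/truncate path might flush and invalidate everything cached
+ * against a vnode.  'fs_reclaim_bufs' is hypothetical.
+ */
+static int
+fs_reclaim_bufs(vnode_t vp)
+{
+	/* write out delayed-write data first, then toss clean and dirty buffers */
+	return (buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0));
+}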
+
+void
+buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {
+
+ (void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg);
+ return;
+}
+
+int
+buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) {
+ buf_t bp;
+ int writes_issued = 0;
+ errno_t error;
+ int busy = 0;
+ struct buflists local_iterblkhd;
+ int lock_flags = BAC_NOWAIT | BAC_REMOVE;
+ int any_locked = 0;
+
+ if (flags & BUF_SKIP_LOCKED)
+ lock_flags |= BAC_SKIP_LOCKED;
+ if (flags & BUF_SKIP_NONLOCKED)
+ lock_flags |= BAC_SKIP_NONLOCKED;
+loop:
+ lck_mtx_lock(buf_mtxp);
+
+ if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
+ while (!LIST_EMPTY(&local_iterblkhd)) {
+ bp = LIST_FIRST(&local_iterblkhd);
+ LIST_REMOVE(bp, b_vnbufs);
+ LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
+
+ if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) {
+ busy++;
+ }
+ if (error) {
+ /*
+ * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED,
+ * we may want to do something differently if a locked or unlocked
+ * buffer was encountered (depending on the arg specified).
+ * In this case, we know that one of those two was set, and the
+ * buf acquisition failed above.
+ *
+ * If it failed with EDEADLK, then save state which can be emitted
+ * later on to the caller. Most callers should not care.
+ */
+ if (error == EDEADLK) {
+ any_locked++;
+ }
+ continue;
+ }
+ lck_mtx_unlock(buf_mtxp);
+
+ bp->b_flags &= ~B_LOCKED;
+
+ /*
+ * Wait for I/O associated with indirect blocks to complete,
+ * since there is no way to quickly wait for them below.
+ */
+ if ((bp->b_vp == vp) || (wait == 0))
+ (void) buf_bawrite(bp);
+ else
+ (void) VNOP_BWRITE(bp);
+ writes_issued++;
+
+ lck_mtx_lock(buf_mtxp);
+ }
+ buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
+ }
+ lck_mtx_unlock(buf_mtxp);
+
+ if (wait) {
+ (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
+
+ if (vp->v_dirtyblkhd.lh_first && busy) {
+ /*
+ * we had one or more BUSY buffers on
+ * the dirtyblock list... most likely
+ * these are due to delayed writes that
+ * were moved to the bclean queue but
+ * have not yet been 'written'.
+ * if we issued some writes on the
+ * previous pass, we try again immediately;
+ * if we didn't, we'll sleep for some time
+ * to allow the state to change...
+ */
+ if (writes_issued == 0) {
+ (void)tsleep((caddr_t)&vp->v_numoutput,
+ PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
+ }
+ writes_issued = 0;
+ busy = 0;
+
+ goto loop;
+ }
+ }
+
+ return any_locked;
+}
+
+
+/*
+ * called with buf_mtxp held...
+ * this lock protects the queue manipulation
+ */
+static int
+buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
+{
+ struct buflists * listheadp;
+
+ if (flags & VBI_DIRTY)
+ listheadp = &vp->v_dirtyblkhd;
+ else
+ listheadp = &vp->v_cleanblkhd;
+
+ while (vp->v_iterblkflags & VBI_ITER) {
+ vp->v_iterblkflags |= VBI_ITERWANT;
+ msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
+ }
+ if (LIST_EMPTY(listheadp)) {
+ LIST_INIT(iterheadp);
+ return(EINVAL);
+ }
+ vp->v_iterblkflags |= VBI_ITER;
+
+ iterheadp->lh_first = listheadp->lh_first;
+ listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
+ LIST_INIT(listheadp);
+
+ return(0);
+}
+
+/*
+ * called with buf_mtxp held...
+ * this lock protects the queue manipulation
+ */
+static void
+buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
+{
+ struct buflists * listheadp;
+ buf_t bp;
+
+ if (flags & VBI_DIRTY)
+ listheadp = &vp->v_dirtyblkhd;
+ else
+ listheadp = &vp->v_cleanblkhd;
+
+ while (!LIST_EMPTY(iterheadp)) {
+ bp = LIST_FIRST(iterheadp);
+ LIST_REMOVE(bp, b_vnbufs);
+ LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
+ }
+ vp->v_iterblkflags &= ~VBI_ITER;
+
+ if (vp->v_iterblkflags & VBI_ITERWANT) {
+ vp->v_iterblkflags &= ~VBI_ITERWANT;
+ wakeup(&vp->v_iterblkflags);
+ }
+}
+
+
+static void
+bremfree_locked(buf_t bp)
+{
+ struct bqueues *dp = NULL;
+ int whichq;
+
+ whichq = bp->b_whichq;
+
+ if (whichq == -1) {
+ if (bp->b_shadow_ref == 0)
+ panic("bremfree_locked: %p not on freelist", bp);
+ /*
+ * there are clones pointing to 'bp'...
+ * therefore, it was not put on a freelist
+ * when buf_brelse was last called on 'bp'
+ */
+ return;
+ }
+ /*
+ * We only calculate the head of the freelist when removing
+ * the last element of the list as that is the only time that
+ * it is needed (e.g. to reset the tail pointer).
+ *
+ * NB: This makes an assumption about how tailq's are implemented.
+ */
+ if (bp->b_freelist.tqe_next == NULL) {
+ dp = &bufqueues[whichq];
+
+ if (dp->tqh_last != &bp->b_freelist.tqe_next)
+ panic("bremfree: lost tail");
+ }
+ TAILQ_REMOVE(dp, bp, b_freelist);
+
+ if (whichq == BQ_LAUNDRY)
+ blaundrycnt--;
+
+ bp->b_whichq = -1;
+ bp->b_timestamp = 0;
+ bp->b_shadow = 0;
+}
+
+/*
+ * Associate a buffer with a vnode.
+ * buf_mtxp must be locked on entry
+ */
+static void
+bgetvp_locked(vnode_t vp, buf_t bp)
+{
+
+ if (bp->b_vp != vp)
+ panic("bgetvp_locked: not free");
+
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ bp->b_dev = vp->v_rdev;
+ else
+ bp->b_dev = NODEV;
+ /*
+ * Insert onto list for new vnode.
+ */
+ bufinsvn(bp, &vp->v_cleanblkhd);
+}
+
+/*
+ * Disassociate a buffer from a vnode.
+ * buf_mtxp must be locked on entry
+ */
+static void
+brelvp_locked(buf_t bp)
+{
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ if (bp->b_vnbufs.le_next != NOLIST)
+ bufremvn(bp);
+
+ bp->b_vp = (vnode_t)NULL;
+}
+
+/*
+ * Reassign a buffer from one vnode to another.
+ * Used to assign file specific control information
+ * (indirect blocks) to the vnode to which they belong.
+ */
+static void
+buf_reassign(buf_t bp, vnode_t newvp)
+{
+ struct buflists *listheadp;
+
+ if (newvp == NULL) {
+ printf("buf_reassign: NULL");
+ return;
+ }
+ lck_mtx_lock_spin(buf_mtxp);
+
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ if (bp->b_vnbufs.le_next != NOLIST)
+ bufremvn(bp);
+ /*
+ * If dirty, put on list of dirty buffers;
+ * otherwise insert onto list of clean buffers.
+ */
+ if (ISSET(bp->b_flags, B_DELWRI))
+ listheadp = &newvp->v_dirtyblkhd;
+ else
+ listheadp = &newvp->v_cleanblkhd;
+ bufinsvn(bp, listheadp);
+
+ lck_mtx_unlock(buf_mtxp);
+}
+
+static __inline__ void
+bufhdrinit(buf_t bp)
+{
+ bzero((char *)bp, sizeof *bp);
+ bp->b_dev = NODEV;
+ bp->b_rcred = NOCRED;
+ bp->b_wcred = NOCRED;
+ bp->b_vnbufs.le_next = NOLIST;
+ bp->b_flags = B_INVAL;
+
+ return;
+}
+
+/*
+ * Initialize buffers and hash links for buffers.
+ */
+__private_extern__ void
+bufinit(void)
+{
+ buf_t bp;
+ struct bqueues *dp;
+ int i;
+
+ nbuf_headers = 0;
+ /* Initialize the buffer queues ('freelists') and the hash table */
+ for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
+ TAILQ_INIT(dp);
+ bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
+
+ buf_busycount = 0;
+
+ /* Initialize the buffer headers */
+ for (i = 0; i < max_nbuf_headers; i++) {
+ nbuf_headers++;
+ bp = &buf_headers[i];
+ bufhdrinit(bp);
+
+ BLISTNONE(bp);
+ dp = &bufqueues[BQ_EMPTY];
+ bp->b_whichq = BQ_EMPTY;
+ bp->b_timestamp = buf_timestamp();
+ binsheadfree(bp, dp, BQ_EMPTY);
+ binshash(bp, &invalhash);
+ }
+ boot_nbuf_headers = nbuf_headers;
+
+ TAILQ_INIT(&iobufqueue);
+ TAILQ_INIT(&delaybufqueue);
+
+ for (; i < nbuf_headers + niobuf_headers; i++) {
+ bp = &buf_headers[i];
+ bufhdrinit(bp);
+ bp->b_whichq = -1;
+ binsheadfree(bp, &iobufqueue, -1);
+ }
+
+ /*
+ * allocate lock group attribute and group
+ */
+ buf_mtx_grp_attr = lck_grp_attr_alloc_init();
+ buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
+
+ /*
+ * allocate the lock attribute
+ */
+ buf_mtx_attr = lck_attr_alloc_init();
+
+ /*
+ * allocate and initialize mutex's for the buffer and iobuffer pools
+ */
+ buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
+ iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
+
+ if (iobuffer_mtxp == NULL)
+ panic("couldn't create iobuffer mutex");
+
+ if (buf_mtxp == NULL)
+ panic("couldn't create buf mutex");
+
+ /*
+ * allocate and initialize cluster specific global locks...
+ */
+ cluster_init();
+
+ printf("using %d buffer headers and %d cluster IO buffer headers\n",
+ nbuf_headers, niobuf_headers);
+
+ /* Set up zones used by the buffer cache */
+ bufzoneinit();
+
+ /* start the bcleanbuf() thread */
+ bcleanbuf_thread_init();
+
+ /* Register a callout for relieving vm pressure */
+ if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
+ panic("Couldn't register buffer cache callout for vm pressure!\n");
+ }
+
+}
+
+/*
+ * Zones for the meta data buffers
+ */
+
+#define MINMETA 512
+#define MAXMETA 8192
+
+struct meta_zone_entry {
+ zone_t mz_zone;
+ vm_size_t mz_size;
+ vm_size_t mz_max;
+ const char *mz_name;
+};
+
+struct meta_zone_entry meta_zones[] = {
+ {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
+ {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
+ {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
+ {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
+ {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
+ {NULL, 0, 0, "" } /* End */
+};
+
+/*
+ * Initialize the meta data zones
+ */
+static void
+bufzoneinit(void)
+{
+ int i;
+
+ for (i = 0; meta_zones[i].mz_size != 0; i++) {
+ meta_zones[i].mz_zone =
+ zinit(meta_zones[i].mz_size,
+ meta_zones[i].mz_max,
+ PAGE_SIZE,
+ meta_zones[i].mz_name);
+ zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE);
+ }
+ buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
+ zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE);
+}
+
+static __inline__ zone_t
+getbufzone(size_t size)
+{
+ int i;
+
+ if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
+ panic("getbufzone: incorect size = %lu", size);
+
+ for (i = 0; meta_zones[i].mz_size != 0; i++) {
+ if (meta_zones[i].mz_size >= size)
+ break;
+ }
+
+ return (meta_zones[i].mz_zone);
+}
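+
+/*
+ * Illustrative sketch (not part of the original change): the zone a given
+ * metadata size lands in.  Sizes must be multiples of 512 in the
+ * [MINMETA, MAXMETA] range; getbufzone() picks the first zone whose element
+ * size is >= the request, so a 1536-byte request is served from "buf.2048".
+ * 'example_alloc_meta' is hypothetical and assumes <kern/zalloc.h> is
+ * already included by this file.
+ */
+static void *
+example_alloc_meta(size_t size)
+{
+	zone_t	z = getbufzone(size);	/* e.g. size == 1536 -> buf.2048 */
+
+	return (zalloc(z));
+}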
+
+
+
+static struct buf *
+bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
+{
+ buf_t bp;
+
+ bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
+
+ /*
+ * If buffer does not have data valid, start a read.
+ * Note that if buffer is B_INVAL, buf_getblk() won't return it.
+ * Therefore, it's valid if its I/O has completed or been delayed.
+ */
+ if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
+ struct proc *p;
+
+ p = current_proc();
+
+ /* Start I/O for the buffer (keeping credentials). */
+ SET(bp->b_flags, B_READ | async);
+ if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
+ kauth_cred_ref(cred);
+ bp->b_rcred = cred;
+ }
+
+ VNOP_STRATEGY(bp);
+
+ trace(TR_BREADMISS, pack(vp, size), blkno);
+
+ /* Pay for the read. */
+ if (p && p->p_stats) {
+ OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock); /* XXX */
+ }
+
+ if (async) {
+ /*
+ * since we asked for an ASYNC I/O,
+ * the biodone will do the brelse;
+ * we don't want to pass back a bp
+ * that we don't 'own'
+ */
+ bp = NULL;
+ }
+ } else if (async) {
+ buf_brelse(bp);
+ bp = NULL;
+ }
+
+ trace(TR_BREADHIT, pack(vp, size), blkno);
+
+ return (bp);
+}
+
+/*
+ * Perform the reads for buf_breadn() and buf_meta_breadn().
+ * Trivial modification to the breada algorithm presented in Bach (p.55).
+ */
+static errno_t
+do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
+ int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
+{
+ buf_t bp;
+ int i;
+
+ bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
+
+ /*
+ * For each of the read-ahead blocks, start a read, if necessary.
+ */
+ for (i = 0; i < nrablks; i++) {
+ /* If it's in the cache, just go on to next one. */
+ if (incore(vp, rablks[i]))
+ continue;
+
+ /* Get a buffer for the read-ahead block */
+ (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
+ }
+
+ /* Otherwise, we had to start a read for it; wait until it's valid. */
+ return (buf_biowait(bp));
+}
+
+
+/*
+ * Read a disk block.
+ * This algorithm described in Bach (p.54).
+ */
+errno_t
+buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
+{
+ buf_t bp;
+
+ /* Get buffer for block. */
+ bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
+
+ /* Wait for the read to complete, and return result. */
+ return (buf_biowait(bp));
+}
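+
+/*
+ * Illustrative sketch (not part of the original change): reading one block
+ * synchronously and releasing it.  'fs_read_sb' and the block number are
+ * hypothetical; passing NOCRED and releasing the buffer even on error is
+ * the usual calling pattern for buf_bread().
+ */
+static errno_t
+fs_read_sb(vnode_t vp, void *out, int size)
+{
+	buf_t	bp;
+	errno_t	error;
+
+	if ((error = buf_bread(vp, (daddr64_t)0, size, NOCRED, &bp)) == 0)
+		bcopy((void *)buf_dataptr(bp), out, size);
+
+	/* buf_bread() always hands back a buffer; release it either way */
+	buf_brelse(bp);
+	return (error);
+}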
+
+/*
+ * Read a disk block. [bread() for meta-data]
+ * This algorithm described in Bach (p.54).
+ */
+errno_t
+buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
+{
+ buf_t bp;
+
+ /* Get buffer for block. */
+ bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
+
+ /* Wait for the read to complete, and return result. */
+ return (buf_biowait(bp));
+}
+
+/*
+ * Read-ahead multiple disk blocks. The first is sync, the rest async.
+ */
+errno_t
+buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
+{
+ return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
+}
+
+/*
+ * Read-ahead multiple disk blocks. The first is sync, the rest async.
+ * [buf_breadn() for meta-data]
+ */
+errno_t
+buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
+{
+ return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
+}
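+
+/*
+ * Illustrative sketch (not part of the original change): reading block
+ * 'blkno' synchronously while hinting the next two blocks for asynchronous
+ * read-ahead.  'fs_read_with_readahead' and the 4096-byte block size are
+ * hypothetical.
+ */
+static errno_t
+fs_read_with_readahead(vnode_t vp, daddr64_t blkno, buf_t *bpp)
+{
+	daddr64_t	rablks[2] = { blkno + 1, blkno + 2 };
+	int		rasizes[2] = { 4096, 4096 };
+
+	/* the first block is read synchronously, the read-aheads asynchronously */
+	return (buf_breadn(vp, blkno, 4096, rablks, rasizes, 2, NOCRED, bpp));
+}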
+
+/*
+ * Block write. Described in Bach (p.56)
+ */
+errno_t
+buf_bwrite(buf_t bp)
+{
+ int sync, wasdelayed;
+ errno_t rv;
+ proc_t p = current_proc();
+ vnode_t vp = bp->b_vp;
+
+ if (bp->b_datap == 0) {
+ if (brecover_data(bp) == 0)
+ return (0);
+ }
+ /* Remember buffer type, to switch on it later. */
+ sync = !ISSET(bp->b_flags, B_ASYNC);
+ wasdelayed = ISSET(bp->b_flags, B_DELWRI);
+ CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
+
+ if (wasdelayed)
+ OSAddAtomicLong(-1, &nbdwrite);
+
+ if (!sync) {
+ /*
+ * If not synchronous, pay for the I/O operation and make
+ * sure the buf is on the correct vnode queue. We have
+ * to do this now, because if we don't, the vnode may not
+ * be properly notified that its I/O has completed.
+ */
+ if (wasdelayed)
+ buf_reassign(bp, vp);
+ else
+ if (p && p->p_stats) {
+ OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
+ }
+ }
+ trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
+
+ /* Initiate disk write. Make sure the appropriate party is charged. */
+
+ OSAddAtomic(1, &vp->v_numoutput);
+
+ VNOP_STRATEGY(bp);
+
+ if (sync) {
+ /*
+ * If I/O was synchronous, wait for it to complete.
+ */
+ rv = buf_biowait(bp);
+
+ /*
+ * Pay for the I/O operation, if it's not been paid for, and
+ * make sure it's on the correct vnode queue. (async operations
+ * were paid for above.)
+ */
+ if (wasdelayed)
+ buf_reassign(bp, vp);
+ else
+ if (p && p->p_stats) {
+ OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
+ }
+
+ /* Release the buffer. */
+ // XXXdbg - only if the unused bit is set
+ if (!ISSET(bp->b_flags, B_NORELSE)) {
+ buf_brelse(bp);
+ } else {
+ CLR(bp->b_flags, B_NORELSE);
+ }
+
+ return (rv);
+ } else {
+ return (0);
+ }
+}
+
+int
+vn_bwrite(struct vnop_bwrite_args *ap)
+{
+ return (buf_bwrite(ap->a_bp));
+}
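+
+/*
+ * Illustrative sketch (not part of the original change): issuing a
+ * synchronous vs. an asynchronous write of a modified buffer.
+ * 'fs_write_block' is hypothetical.
+ */
+static errno_t
+fs_write_block(buf_t bp, int async)
+{
+	if (async) {
+		/* fire and forget; biodone/brelse happen at I/O completion */
+		return (buf_bawrite(bp));
+	}
+	/* waits for the I/O and normally releases the buffer before returning */
+	return (buf_bwrite(bp));
+}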
+
+/*
+ * Delayed write.
+ *