/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
extern int niobuf; /* The number of IO buffer headers for cluster IO */
int blaundrycnt;
+/* zone allocated buffer headers */
+static zone_t buf_hdr_zone;
+static int buf_hdr_count;
+
#if TRACE
struct proc *traceproc;
int tracewhich, tracebuf[TRCSIZ];
/* Definitions for the buffer stats. */
struct bufstats bufstats;
+/* Number of delayed write buffers */
+int nbdwrite = 0;
+
/*
* Insq/Remq for the buffer hash lists.
*/
TAILQ_HEAD(ioqueue, buf) iobufqueue;
TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
-int needbuffer;
-int need_iobuffer;
+static int needbuffer;
+static int need_iobuffer;
/*
* Insq/Remq for the buffer free lists.
simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
+/* limit on the number of "in flight" buffer writes per vnode */
+#define BUFWRITE_THROTTLE 9
+
/*
* Time in seconds before a buffer on a list is
* considered as a stale buffer
sync = !ISSET(bp->b_flags, B_ASYNC);
wasdelayed = ISSET(bp->b_flags, B_DELWRI);
CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
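+ /* the delayed write is being issued now, so it no longer counts as delayed */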
+ if (wasdelayed)
+ nbdwrite--;
if (!sync) {
/*
p->p_stats->p_ru.ru_oublock++; /* XXX */
}
- trace(TR_BWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
+ trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
/* Initiate disk write. Make sure the appropriate party is charged. */
SET(bp->b_flags, B_WRITEINPROG);
* written in the order that the writes are requested.
*
* Described in Leffler, et al. (pp. 208-213).
+ *
+ * Note: With the ability to allocate additional buffer
+ * headers, we can get into a situation where "too many"
+ * bdwrite()s allow the kernel to create buffers faster
+ * than the disks can service them. Doing a bawrite() in
+ * cases where we have "too many" outstanding bdwrite()s
+ * avoids that.
*/
void
bdwrite(bp)
struct buf *bp;
{
struct proc *p = current_proc();
- kern_return_t kret;
- upl_t upl;
- upl_page_info_t *pl;
+ struct vnode *vp = bp->b_vp;
/*
* If the block hasn't been seen before:
SET(bp->b_flags, B_DELWRI);
if (p && p->p_stats)
p->p_stats->p_ru.ru_oublock++; /* XXX */
-
- reassignbuf(bp, bp->b_vp);
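+ /* one more delayed write buffer is now outstanding */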
+ nbdwrite++;
+ reassignbuf(bp, vp);
}
return;
}
+ /*
+ * If the vnode has "too many" write operations in progress,
+ * wait for them to finish their I/O
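+ * (biodone() will wake us up once enough of them have completed)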
+ */
+ while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
+ vp->v_flag |= VTHROTTLED;
+ (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
+ }
+
+ if (nbdwrite < 0)
+ panic("bdwrite: Negative nbdwrite");
+
+ /*
+ * If we have too many delayed write buffers,
+ * more than we can "safely" handle, just fall back to
+ * doing the async write
+ */
+ if (nbdwrite > ((nbuf/4)*3)) {
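+ /* more than 3/4 of the configured buffer headers (nbuf) are delayed writes */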
+ bawrite(bp);
+ return;
+ }
+
/* Otherwise, the "write" is done, so mark and release the buffer. */
SET(bp->b_flags, B_DONE);
brelse(bp);
/*
* Asynchronous block write; just an asynchronous bwrite().
+ *
+ * Note: With the ability to allocate additional buffer
+ * headers, we can get into a situation where "too many"
+ * bawrite()s allow the kernel to create buffers faster
+ * than the disks can service.
+ * We limit the number of "in flight" writes a vnode can have
+ * to avoid this.
*/
void
bawrite(bp)
struct buf *bp;
{
+ struct vnode *vp = bp->b_vp;
+
+ if (vp) {
+ /*
+ * If the vnode has "too many" write operations in progress,
+ * wait for them to finish their I/O
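+ * (biodone() will wake us up once enough of them have completed)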
+ */
+ while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
+ vp->v_flag |= VTHROTTLED;
+ (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bawrite", 0);
+ }
+ }
SET(bp->b_flags, B_ASYNC);
VOP_BWRITE(bp);
*/
if (bp->b_vp)
brelvp(bp);
- CLR(bp->b_flags, B_DELWRI);
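+ /* the buffer is being invalidated; its pending delayed write no longer counts */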
+ if (ISSET(bp->b_flags, B_DELWRI)) {
+ CLR(bp->b_flags, B_DELWRI);
+ nbdwrite--;
+ }
if (bp->b_bufsize <= 0)
whichq = BQ_EMPTY; /* no data */
else
};
#endif /* ZALLOC_METADATA */
-zone_t buf_hdr_zone;
-int buf_hdr_count;
-
/*
* Initialize the meta data zones
*/
{
int s;
struct ucred *cred;
+ int hdralloc = 0;
s = splbio();
/* Buffer is no longer on free lists. */
SET(bp->b_flags, B_BUSY);
+ /* Check whether the buffer header was zone allocated (B_HDRALLOC) */
+ if (ISSET(bp->b_flags, B_HDRALLOC))
+ hdralloc = 1;
+
if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
panic("bcleanbuf: le_prev is deadbeef");
bp->b_bufsize = 0;
bp->b_data = 0;
bp->b_flags = B_BUSY;
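+ /* preserve the zone allocated header flag across the flag reset above */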
+ if (hdralloc)
+ SET(bp->b_flags, B_HDRALLOC);
bp->b_dev = NODEV;
bp->b_blkno = bp->b_lblkno = 0;
bp->b_iodone = 0;
struct buf *bp;
{
boolean_t funnel_state;
- int s;
+ struct vnode *vp;
funnel_state = thread_funnel_set(kernel_flock, TRUE);
if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
vwakeup(bp); /* wake up reader */
+ /* Wake up throttled write operations once enough of this vnode's writes have drained */
+ vp = bp->b_vp;
+ if (vp
+ && (vp->v_flag & VTHROTTLED)
+ && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
+ vp->v_flag &= ~VTHROTTLED;
+ wakeup((caddr_t)&vp->v_numoutput);
+ }
+
if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
CLR(bp->b_flags, B_CALL); /* but note callout done */
(*bp->b_iodone)(bp);