1c79356b 1/*
d52fe63f 2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
1c79356b
A
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23/*-
24 * Copyright (c) 1994 Christopher G. Demetriou
25 * Copyright (c) 1982, 1986, 1989, 1993
26 * The Regents of the University of California. All rights reserved.
27 * (c) UNIX System Laboratories, Inc.
28 * All or some portions of this file are derived from material licensed
29 * to the University of California by American Telephone and Telegraph
30 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
31 * the permission of UNIX System Laboratories, Inc.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * The NEXTSTEP Software License Agreement specifies the terms
62 * and conditions for redistribution.
63 *
64 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
65 */
66
67/*
68 * Some references:
69 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
70 * Leffler, et al.: The Design and Implementation of the 4.3BSD
71 * UNIX Operating System (Addison-Wesley, 1989)
72 */
1c79356b
A
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/proc.h>
77#include <sys/buf.h>
78#include <sys/vnode.h>
79#include <sys/mount.h>
80#include <sys/trace.h>
81#include <sys/malloc.h>
82#include <sys/resourcevar.h>
83#include <miscfs/specfs/specdev.h>
84#include <sys/ubc.h>
85#include <vm/vm_pageout.h>
86#if DIAGNOSTIC
87#include <kern/assert.h>
88#endif /* DIAGNOSTIC */
89#include <kern/task.h>
90#include <kern/zalloc.h>
91
92#include <sys/kdebug.h>
9bccf70c 93#include <machine/spl.h>
1c79356b 94
9bccf70c
A
95static __inline__ void bufqinc(int q);
96static __inline__ void bufqdec(int q);
1c79356b 97
1c79356b 98static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
9bccf70c
A
99static int bcleanbuf(struct buf *bp);
100extern void vwakeup();
1c79356b 101
fa4905b1 102extern int niobuf; /* The number of IO buffer headers for cluster IO */
765c9de3 103int blaundrycnt;
1c79356b 104
d52fe63f
A
105/* zone allocated buffer headers */
106static zone_t buf_hdr_zone;
107static int buf_hdr_count;
108
1c79356b
A
109#if TRACE
110struct proc *traceproc;
111int tracewhich, tracebuf[TRCSIZ];
112u_int tracex;
113char traceflags[TR_NFLAGS];
114#endif /* TRACE */
115
116/*
117 * Definitions for the buffer hash lists.
118 */
119#define BUFHASH(dvp, lbn) \
120 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
121LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
122u_long bufhash;
123
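/*
 * Illustrative sketch only (kept out of the build): how BUFHASH() is used
 * to walk a hash chain for a (vnode, logical block) pair; this mirrors the
 * lookup loop in incore() below.
 */
#if 0 /* example only */
static struct buf *
example_lookup(struct vnode *vp, daddr_t lbn)
{
	struct buf *bp;

	for (bp = BUFHASH(vp, lbn)->lh_first; bp; bp = bp->b_hash.le_next) {
		if (bp->b_lblkno == lbn && bp->b_vp == vp &&
		    !ISSET(bp->b_flags, B_INVAL))
			return (bp);	/* cached and still valid */
	}
	return (NULL);			/* not in the cache */
}
#endif /* example only */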
124/* Definitions for the buffer stats. */
125struct bufstats bufstats;
126
d52fe63f
A
127/* Number of delayed write buffers */
128int nbdwrite = 0;
129
1c79356b
A
130/*
131 * Insq/Remq for the buffer hash lists.
132 */
133#if 0
134#define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
135#define bremhash(bp) LIST_REMOVE(bp, b_hash)
136#endif /* 0 */
137
138
139TAILQ_HEAD(ioqueue, buf) iobufqueue;
140TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
d52fe63f
A
141static int needbuffer;
142static int need_iobuffer;
1c79356b
A
143
144/*
145 * Insq/Remq for the buffer free lists.
146 */
147#define binsheadfree(bp, dp, whichq) do { \
148 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
149 bufqinc((whichq)); \
150 (bp)->b_whichq = whichq; \
151 (bp)->b_timestamp = time.tv_sec; \
152 } while (0)
153
154#define binstailfree(bp, dp, whichq) do { \
155 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
156 bufqinc((whichq)); \
157 (bp)->b_whichq = whichq; \
158 (bp)->b_timestamp = time.tv_sec; \
159 } while (0)
160
161#define BHASHENTCHECK(bp) \
162 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
765c9de3 163 panic("%x: b_hash.le_prev is not deadbeef", (bp));
1c79356b
A
164
165#define BLISTNONE(bp) \
166 (bp)->b_hash.le_next = (struct buf *)0; \
167 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
168
9bccf70c
A
169/*
170 * Insq/Remq for the vnode usage lists.
171 */
172#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
173#define bufremvn(bp) { \
174 LIST_REMOVE(bp, b_vnbufs); \
175 (bp)->b_vnbufs.le_next = NOLIST; \
176}
177
1c79356b
A
178simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
179
d52fe63f
A
180 /* maximum number of per-vnode "in flight" buffer writes */
181#define BUFWRITE_THROTTLE 9
182
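/*
 * Condensed sketch only (kept out of the build) of how the throttle is
 * applied: writers sleep on &vp->v_numoutput once the limit is reached
 * (see bdwrite_internal() and bawrite_internal()), and biodone() wakes
 * them once the count drains to a third of the limit. The wchan string
 * varies per caller ("bdwrite", "bawrite").
 */
#if 0 /* example only */
	/* writer side */
	while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
		vp->v_flag |= VTHROTTLED;
		(void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
	}
	/* I/O completion side */
	if ((vp->v_flag & VTHROTTLED) &&
	    (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
		vp->v_flag &= ~VTHROTTLED;
		wakeup((caddr_t)&vp->v_numoutput);
	}
#endif /* example only */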
b4c24cb9 183
1c79356b
A
184/*
185 * Time in seconds before a buffer on a list is
186 * considered as a stale buffer
187 */
188#define LRU_IS_STALE 120 /* default value for the LRU */
189#define AGE_IS_STALE 60 /* default value for the AGE */
190#define META_IS_STALE 180 /* default value for the BQ_META */
191
192int lru_is_stale = LRU_IS_STALE;
193int age_is_stale = AGE_IS_STALE;
194int meta_is_stale = META_IS_STALE;
195
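/*
 * Sketch only (kept out of the build): a buffer's age is its queue
 * residency time, time.tv_sec - bp->b_timestamp, compared against the
 * per-queue staleness threshold; getnewbuf() prefers the stalest candidate.
 */
#if 0 /* example only */
	struct buf *lru_bp = bufqueues[BQ_LRU].tqh_first;
	int lru_time = time.tv_sec - lru_bp->b_timestamp;

	if (lru_time >= lru_is_stale) {
		/* the oldest LRU buffer is stale and a good reclaim candidate */
	}
#endif /* example only */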
9bccf70c
A
196/* LIST_INSERT_HEAD() with assertions */
197static __inline__ void
1c79356b
A
198blistenterhead(struct bufhashhdr * head, struct buf * bp)
199{
200 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
201 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
202 (head)->lh_first = bp;
203 bp->b_hash.le_prev = &(head)->lh_first;
204 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
205 panic("blistenterhead: le_prev is deadbeef");
1c79356b 206}
1c79356b 207
9bccf70c 208static __inline__ void
1c79356b
A
209binshash(struct buf *bp, struct bufhashhdr *dp)
210{
9bccf70c 211 struct buf *nbp;
1c79356b
A
212
213 simple_lock(&bufhashlist_slock);
9bccf70c 214
b4c24cb9
A
215#if 0
216 if((bad = incore(bp->b_vp, bp->b_lblkno)))
217 panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);
1c79356b 218#endif /* 0 */
9bccf70c 219
1c79356b 220 BHASHENTCHECK(bp);
9bccf70c 221
1c79356b
A
222 nbp = dp->lh_first;
223 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
224 if(nbp == bp)
225 panic("buf already in hashlist");
226 }
227
1c79356b 228 blistenterhead(dp, bp);
1c79356b
A
229 simple_unlock(&bufhashlist_slock);
230}
231
9bccf70c 232static __inline__ void
1c79356b
A
233bremhash(struct buf *bp)
234{
1c79356b
A
235 simple_lock(&bufhashlist_slock);
236 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
237 panic("bremhash le_prev is deadbeef");
238 if (bp->b_hash.le_next == bp)
239 panic("bremhash: next points to self");
240
241 if (bp->b_hash.le_next != NULL)
242 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
243 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
244 simple_unlock(&bufhashlist_slock);
245}
246
1c79356b
A
247/*
248 * Remove a buffer from the free list it's on
249 */
250void
251bremfree(bp)
252 struct buf *bp;
253{
254 struct bqueues *dp = NULL;
255 int whichq = -1;
256
257 /*
258 * We only calculate the head of the freelist when removing
259 * the last element of the list as that is the only time that
260 * it is needed (e.g. to reset the tail pointer).
261 *
262 * NB: This makes an assumption about how tailq's are implemented.
263 */
264 if (bp->b_freelist.tqe_next == NULL) {
265 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
266 if (dp->tqh_last == &bp->b_freelist.tqe_next)
267 break;
268 if (dp == &bufqueues[BQUEUES])
269 panic("bremfree: lost tail");
270 }
271 TAILQ_REMOVE(dp, bp, b_freelist);
272 whichq = bp->b_whichq;
273 bufqdec(whichq);
274 bp->b_whichq = -1;
275 bp->b_timestamp = 0;
276}
277
9bccf70c
A
278/*
279 * Associate a buffer with a vnode.
280 */
281static void
282bgetvp(vp, bp)
283 register struct vnode *vp;
284 register struct buf *bp;
285{
286
287 if (bp->b_vp != vp)
288 panic("bgetvp: not free");
289 VHOLD(vp);
290 bp->b_vp = vp;
291 if (vp->v_type == VBLK || vp->v_type == VCHR)
292 bp->b_dev = vp->v_rdev;
293 else
294 bp->b_dev = NODEV;
295 /*
296 * Insert onto list for new vnode.
297 */
298 bufinsvn(bp, &vp->v_cleanblkhd);
299}
300
301/*
302 * Disassociate a buffer from a vnode.
303 */
304static void
305brelvp(bp)
306 register struct buf *bp;
307{
308 struct vnode *vp;
309
310 if (bp->b_vp == (struct vnode *) 0)
311 panic("brelvp: NULL vp");
312 /*
313 * Delete from old vnode list, if on one.
314 */
315 if (bp->b_vnbufs.le_next != NOLIST)
316 bufremvn(bp);
317 vp = bp->b_vp;
318 bp->b_vp = (struct vnode *) 0;
319 HOLDRELE(vp);
320}
321
322/*
323 * Reassign a buffer from one vnode to another.
324 * Used to assign file specific control information
325 * (indirect blocks) to the vnode to which they belong.
326 */
327void
328reassignbuf(bp, newvp)
329 register struct buf *bp;
330 register struct vnode *newvp;
331{
332 register struct buflists *listheadp;
333
334 if (newvp == NULL) {
335 printf("reassignbuf: NULL");
336 return;
337 }
338 /*
339 * Delete from old vnode list, if on one.
340 */
341 if (bp->b_vnbufs.le_next != NOLIST)
342 bufremvn(bp);
343 /*
344 * If dirty, put on list of dirty buffers;
345 * otherwise insert onto list of clean buffers.
346 */
347 if (ISSET(bp->b_flags, B_DELWRI))
348 listheadp = &newvp->v_dirtyblkhd;
349 else
350 listheadp = &newvp->v_cleanblkhd;
351 bufinsvn(bp, listheadp);
352}
353
765c9de3
A
354static __inline__ void
355bufhdrinit(struct buf *bp)
356{
357 bzero((char *)bp, sizeof *bp);
358 bp->b_dev = NODEV;
359 bp->b_rcred = NOCRED;
360 bp->b_wcred = NOCRED;
361 bp->b_vnbufs.le_next = NOLIST;
362 bp->b_flags = B_INVAL;
363
364 return;
365}
366
1c79356b
A
367/*
368 * Initialize buffers and hash links for buffers.
369 */
9bccf70c 370__private_extern__ void
1c79356b
A
371bufinit()
372{
373 register struct buf *bp;
374 register struct bqueues *dp;
375 register int i;
376 int metabuf;
377 long whichq;
1c79356b 378 static void bufzoneinit();
765c9de3 379 static void bcleanbuf_thread_init();
1c79356b
A
380
381 /* Initialize the buffer queues ('freelists') and the hash table */
382 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
383 TAILQ_INIT(dp);
384 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
385
386 simple_lock_init(&bufhashlist_slock );
387
388 metabuf = nbuf/8; /* reserved for meta buf */
389
390 /* Initialize the buffer headers */
391 for (i = 0; i < nbuf; i++) {
392 bp = &buf[i];
765c9de3
A
393 bufhdrinit(bp);
394
1c79356b
A
395 /*
396 * put metabuf buffer headers on the meta-data list and
397 * the rest of the buffer headers on the empty list
398 */
765c9de3 399 if (--metabuf)
1c79356b
A
400 whichq = BQ_META;
401 else
402 whichq = BQ_EMPTY;
403
404 BLISTNONE(bp);
405 dp = &bufqueues[whichq];
406 binsheadfree(bp, dp, whichq);
407 binshash(bp, &invalhash);
408 }
409
410 for (; i < nbuf + niobuf; i++) {
411 bp = &buf[i];
765c9de3 412 bufhdrinit(bp);
1c79356b
A
413 binsheadfree(bp, &iobufqueue, -1);
414 }
415
416 printf("using %d buffer headers and %d cluster IO buffer headers\n",
417 nbuf, niobuf);
418
765c9de3 419 /* Set up zones used by the buffer cache */
1c79356b 420 bufzoneinit();
1c79356b 421
765c9de3
A
422 /* start the bcleanbuf() thread */
423 bcleanbuf_thread_init();
424
425#if 0 /* notyet */
9bccf70c
A
426 {
427 static void bufq_balance_thread_init();
1c79356b
A
428 /* create a thread to do dynamic buffer queue balancing */
429 bufq_balance_thread_init();
9bccf70c
A
430 }
431#endif /* notyet */
1c79356b
A
432}
433
9bccf70c 434static struct buf *
1c79356b
A
435bio_doread(vp, blkno, size, cred, async, queuetype)
436 struct vnode *vp;
437 daddr_t blkno;
438 int size;
439 struct ucred *cred;
440 int async;
441 int queuetype;
442{
443 register struct buf *bp;
444 struct proc *p = current_proc();
445
446 bp = getblk(vp, blkno, size, 0, 0, queuetype);
447
448 /*
449 * If the buffer does not have valid data, start a read.
450 * Note that if the buffer is B_INVAL, getblk() won't return it.
451 * Therefore, it's valid if its I/O has completed or been delayed.
452 */
453 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
454 /* Start I/O for the buffer (keeping credentials). */
455 SET(bp->b_flags, B_READ | async);
456 if (cred != NOCRED && bp->b_rcred == NOCRED) {
0b4e3aa0
A
457 /*
458 * NFS has embedded ucred.
459 * Can not crhold() here as that causes zone corruption
460 */
461 bp->b_rcred = crdup(cred);
1c79356b 462 }
b4c24cb9 463
1c79356b
A
464 VOP_STRATEGY(bp);
465
466 trace(TR_BREADMISS, pack(vp, size), blkno);
467
468 /* Pay for the read. */
469 if (p && p->p_stats)
470 p->p_stats->p_ru.ru_inblock++; /* XXX */
471 } else if (async) {
472 brelse(bp);
473 }
474
475 trace(TR_BREADHIT, pack(vp, size), blkno);
476
477 return (bp);
478}
479/*
480 * Read a disk block.
481 * This algorithm described in Bach (p.54).
482 */
483int
484bread(vp, blkno, size, cred, bpp)
485 struct vnode *vp;
486 daddr_t blkno;
487 int size;
488 struct ucred *cred;
489 struct buf **bpp;
490{
491 register struct buf *bp;
492
493 /* Get buffer for block. */
494 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
495
496 /* Wait for the read to complete, and return result. */
497 return (biowait(bp));
498}
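/*
 * Illustrative sketch only (kept out of the build): the typical
 * bread()/brelse() cycle a filesystem read path would use. The function
 * and variable names below are hypothetical, not part of this file.
 */
#if 0 /* example only */
static int
example_read_block(struct vnode *vp, daddr_t lbn, int bsize, struct ucred *cred)
{
	struct buf *bp;
	int error;

	error = bread(vp, lbn, bsize, cred, &bp);
	if (error) {
		brelse(bp);	/* bread() hands back the buffer even on error */
		return (error);
	}
	/* ... examine bp->b_data ... */
	brelse(bp);		/* back on a free list, but still cached */
	return (0);
}
#endif /* example only */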
499
500/*
501 * Read a disk block. [bread() for meta-data]
502 * This algorithm described in Bach (p.54).
503 */
504int
505meta_bread(vp, blkno, size, cred, bpp)
506 struct vnode *vp;
507 daddr_t blkno;
508 int size;
509 struct ucred *cred;
510 struct buf **bpp;
511{
512 register struct buf *bp;
513
514 /* Get buffer for block. */
515 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
516
517 /* Wait for the read to complete, and return result. */
518 return (biowait(bp));
519}
520
521/*
522 * Read-ahead multiple disk blocks. The first is sync, the rest async.
523 * Trivial modification to the breada algorithm presented in Bach (p.55).
524 */
525int
526breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
527 struct vnode *vp;
528 daddr_t blkno; int size;
529 daddr_t rablks[]; int rasizes[];
530 int nrablks;
531 struct ucred *cred;
532 struct buf **bpp;
533{
534 register struct buf *bp;
535 int i;
536
537 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
538
539 /*
540 * For each of the read-ahead blocks, start a read, if necessary.
541 */
542 for (i = 0; i < nrablks; i++) {
543 /* If it's in the cache, just go on to next one. */
544 if (incore(vp, rablks[i]))
545 continue;
546
547 /* Get a buffer for the read-ahead block */
548 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
549 }
550
551 /* Otherwise, we had to start a read for it; wait until it's valid. */
552 return (biowait(bp));
553}
554
555/*
556 * Read with single-block read-ahead. Defined in Bach (p.55), but
557 * implemented as a call to breadn().
558 * XXX for compatibility with old file systems.
559 */
560int
561breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
562 struct vnode *vp;
563 daddr_t blkno; int size;
564 daddr_t rablkno; int rabsize;
565 struct ucred *cred;
566 struct buf **bpp;
567{
568
569 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
570}
571
572/*
573 * Block write. Described in Bach (p.56)
574 */
575int
576bwrite(bp)
577 struct buf *bp;
578{
579 int rv, sync, wasdelayed;
580 struct proc *p = current_proc();
1c79356b
A
581 struct vnode *vp = bp->b_vp;
582
583 /* Remember buffer type, to switch on it later. */
584 sync = !ISSET(bp->b_flags, B_ASYNC);
585 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
586 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
9bccf70c 587 if (wasdelayed) {
d52fe63f 588 nbdwrite--;
9bccf70c
A
589 wakeup((caddr_t)&nbdwrite);
590 }
1c79356b
A
591
592 if (!sync) {
593 /*
594 * If not synchronous, pay for the I/O operation and make
595 * sure the buf is on the correct vnode queue. We have
596 * to do this now, because if we don't, the vnode may not
597 * be properly notified that its I/O has completed.
598 */
599 if (wasdelayed)
600 reassignbuf(bp, vp);
601 else
602 if (p && p->p_stats)
603 p->p_stats->p_ru.ru_oublock++; /* XXX */
604 }
605
d52fe63f 606 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
1c79356b
A
607
608 /* Initiate disk write. Make sure the appropriate party is charged. */
609 SET(bp->b_flags, B_WRITEINPROG);
610 vp->v_numoutput++;
611
612 VOP_STRATEGY(bp);
613
614 if (sync) {
615 /*
616 * If I/O was synchronous, wait for it to complete.
617 */
618 rv = biowait(bp);
619
620 /*
621 * Pay for the I/O operation, if it hasn't been paid for, and
622 * make sure it's on the correct vnode queue. (async operations
623 * were paid for above.)
624 */
625 if (wasdelayed)
626 reassignbuf(bp, vp);
627 else
628 if (p && p->p_stats)
629 p->p_stats->p_ru.ru_oublock++; /* XXX */
630
631 /* Release the buffer. */
b4c24cb9
A
632 // XXXdbg - only if the unused bit is set
633 if (!ISSET(bp->b_flags, B_NORELSE)) {
634 brelse(bp);
635 } else {
636 CLR(bp->b_flags, B_NORELSE);
637 }
1c79356b
A
638
639 return (rv);
640 } else {
641 return (0);
642 }
643}
644
645int
646vn_bwrite(ap)
647 struct vop_bwrite_args *ap;
648{
649 return (bwrite(ap->a_bp));
650}
651
652/*
653 * Delayed write.
654 *
655 * The buffer is marked dirty, but is not queued for I/O.
656 * This routine should be used when the buffer is expected
657 * to be modified again soon, typically a small write that
658 * partially fills a buffer.
659 *
660 * NB: magnetic tapes cannot be delayed; they must be
661 * written in the order that the writes are requested.
662 *
663 * Described in Leffler, et al. (pp. 208-213).
d52fe63f
A
664 *
665 * Note: With the ability to allocate additional buffer
666 * headers, we can get into a situation where "too" many
667 * bdwrite()s can create dirty buffers faster than the
668 * disks can service them. Doing a bawrite() in cases where
669 * we have "too many" outstanding bdwrite()s avoids that.
1c79356b 670 */
9bccf70c
A
671__private_extern__ int
672bdwrite_internal(bp, return_error)
1c79356b 673 struct buf *bp;
9bccf70c 674 int return_error;
1c79356b
A
675{
676 struct proc *p = current_proc();
d52fe63f 677 struct vnode *vp = bp->b_vp;
1c79356b
A
678
679 /*
680 * If the block hasn't been seen before:
681 * (1) Mark it as having been seen,
682 * (2) Charge for the write.
683 * (3) Make sure it's on its vnode's correct block list,
684 */
685 if (!ISSET(bp->b_flags, B_DELWRI)) {
686 SET(bp->b_flags, B_DELWRI);
687 if (p && p->p_stats)
688 p->p_stats->p_ru.ru_oublock++; /* XXX */
d52fe63f
A
689 nbdwrite ++;
690 reassignbuf(bp, vp);
1c79356b
A
691 }
692
1c79356b
A
693 /* If this is a tape block, write the block now. */
694 if (ISSET(bp->b_flags, B_TAPE)) {
695 /* bwrite(bp); */
9bccf70c
A
696 VOP_BWRITE(bp);
697 return (0);
1c79356b
A
698 }
699
d52fe63f
A
700 /*
701 * If the vnode has "too many" write operations in progress,
702 * wait for them to finish their I/O.
703 */
704 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
705 vp->v_flag |= VTHROTTLED;
706 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
707 }
708
709 /*
710 * If we have too many delayed write buffers,
711 * more than we can "safely" handle, just fall back to
712 * doing the async write
713 */
714 if (nbdwrite < 0)
715 panic("bdwrite: Negative nbdwrite");
716
b4c24cb9
A
717 // can't do a bawrite() if the LOCKED bit is set because the
718 // buffer is part of a transaction and can't go to disk until
719 // the LOCKED bit is cleared.
720 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
9bccf70c
A
721 if (return_error)
722 return (EAGAIN);
723 else
724 bawrite(bp);
725 return (0);
d52fe63f
A
726 }
727
1c79356b
A
728 /* Otherwise, the "write" is done, so mark and release the buffer. */
729 SET(bp->b_flags, B_DONE);
730 brelse(bp);
9bccf70c 731 return (0);
1c79356b
A
732}
733
9bccf70c
A
734void
735bdwrite(bp)
736 struct buf *bp;
737{
738 (void) bdwrite_internal(bp, 0);
739}
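/*
 * Illustrative sketch only (kept out of the build): delayed write versus
 * asynchronous write. A caller that expects to dirty the same block again
 * soon uses bdwrite(); one that is finished with the block but need not
 * wait for the write uses bawrite(). Names below are hypothetical.
 */
#if 0 /* example only */
	struct buf *bp;
	int error;

	if ((error = bread(vp, lbn, bsize, cred, &bp))) {
		brelse(bp);
		return (error);
	}
	/* ... modify part of bp->b_data ... */
	if (expect_more_updates_soon)
		bdwrite(bp);	/* mark B_DELWRI; the write happens later */
	else
		bawrite(bp);	/* start the write now, but do not wait */
#endif /* example only */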
740
741
1c79356b
A
742/*
743 * Asynchronous block write; just an asynchronous bwrite().
d52fe63f
A
744 *
745 * Note: With the ability to allocate additional buffer
746 * headers, we can get into a situation where "too" many
747 * bawrite()s can create buffers faster than the disks
748 * can service them.
749 * We limit the number of "in flight" writes a vnode can have to
750 * avoid this.
1c79356b 751 */
9bccf70c
A
752static int
753bawrite_internal(bp, throttle)
1c79356b 754 struct buf *bp;
9bccf70c 755 int throttle;
1c79356b 756{
d52fe63f
A
757 struct vnode *vp = bp->b_vp;
758
759 if (vp) {
760 /*
761 * If the vnode has "too many" write operations in progress,
762 * wait for them to finish their I/O.
763 */
764 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
9bccf70c
A
765 if (throttle) {
766 vp->v_flag |= VTHROTTLED;
767 (void)tsleep((caddr_t)&vp->v_numoutput,
768 PRIBIO + 1, "bawrite", 0);
769 } else
770 return (EWOULDBLOCK);
d52fe63f
A
771 }
772 }
1c79356b
A
773
774 SET(bp->b_flags, B_ASYNC);
775 VOP_BWRITE(bp);
9bccf70c
A
776 return (0);
777}
778
779void
780bawrite(bp)
781 struct buf *bp;
782{
783 (void) bawrite_internal(bp, 1);
784}
785
786/*
787 * bwillwrite:
788 *
789 * Called prior to the locking of any vnodes when we are expecting to
790 * write. We do not want to starve the buffer cache with too many
791 * dirty buffers so we block here. By blocking prior to the locking
792 * of any vnodes we attempt to avoid the situation where a locked vnode
793 * prevents the various system daemons from flushing related buffers.
794 */
795
796void
797bwillwrite(void)
798{
799 /* XXX To be implemented later */
1c79356b
A
800}
801
802/*
803 * Release a buffer on to the free lists.
804 * Described in Bach (p. 46).
805 */
806void
807brelse(bp)
808 struct buf *bp;
809{
810 struct bqueues *bufq;
811 int s;
812 long whichq;
813
814 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
fa4905b1
A
815 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
816 bp->b_flags, 0);
1c79356b
A
817
818 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
819
b4c24cb9
A
820 // if we're invalidating a buffer that has the B_CALL bit
821 // set then call the b_iodone function so it gets cleaned
822 // up properly.
823 //
824 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
825 if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
826 panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
827 }
828 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
829 void (*iodone_func)(struct buf *) = bp->b_iodone;
830
831 CLR(bp->b_flags, B_CALL); /* but note callout done */
832 bp->b_iodone = NULL;
833
834 if (iodone_func == NULL) {
835 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
836 }
837 (*iodone_func)(bp);
838 }
839 }
840
1c79356b
A
841 /* I/O is done. Clean up the UPL state */
842 if (!ISSET(bp->b_flags, B_META)
843 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
844 kern_return_t kret;
845 upl_t upl;
1c79356b
A
846 int upl_flags;
847
848 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
849 if ( !ISSET(bp->b_flags, B_INVAL)) {
0b4e3aa0
A
850 kret = ubc_create_upl(bp->b_vp,
851 ubc_blktooff(bp->b_vp, bp->b_lblkno),
852 bp->b_bufsize,
853 &upl,
854 NULL,
855 UPL_PRECIOUS);
1c79356b
A
856 if (kret != KERN_SUCCESS)
857 panic("brelse: Failed to get pagelists");
858#ifdef UBC_DEBUG
859 upl_ubc_alias_set(upl, bp, 5);
860#endif /* UBC_DEBUG */
861 } else
0b4e3aa0 862 upl = (upl_t) 0;
1c79356b 863 } else {
0b4e3aa0
A
864 upl = bp->b_pagelist;
865 kret = ubc_upl_unmap(upl);
1c79356b
A
866
867 if (kret != KERN_SUCCESS)
868 panic("kernel_upl_unmap failed");
869 bp->b_data = 0;
870 }
871 if (upl) {
1c79356b 872 if (bp->b_flags & (B_ERROR | B_INVAL)) {
0b4e3aa0 873 if (bp->b_flags & (B_READ | B_INVAL))
1c79356b
A
874 upl_flags = UPL_ABORT_DUMP_PAGES;
875 else
876 upl_flags = 0;
0b4e3aa0 877 ubc_upl_abort(upl, upl_flags);
1c79356b 878 } else {
fa4905b1
A
879 if (ISSET(bp->b_flags, B_NEEDCOMMIT))
880 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
881 else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
0b4e3aa0 882 upl_flags = UPL_COMMIT_SET_DIRTY ;
1c79356b 883 else
0b4e3aa0
A
884 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
885 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
886 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1c79356b
A
887 }
888 s = splbio();
889 CLR(bp->b_flags, B_PAGELIST);
890 bp->b_pagelist = 0;
891 splx(s);
892 }
893 } else {
894 if(ISSET(bp->b_flags, B_PAGELIST))
895 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
896 }
897
898 /* Wake up any processes waiting for any buffer to become free. */
899 if (needbuffer) {
900 needbuffer = 0;
901 wakeup(&needbuffer);
902 }
903
904 /* Wake up any processes waiting for _this_ buffer to become free. */
905 if (ISSET(bp->b_flags, B_WANTED)) {
906 CLR(bp->b_flags, B_WANTED);
907 wakeup(bp);
908 }
909
910 /* Block disk interrupts. */
911 s = splbio();
912
913 /*
914 * Determine which queue the buffer should be on, then put it there.
915 */
916
917 /* If it's locked, don't report an error; try again later. */
918 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
919 CLR(bp->b_flags, B_ERROR);
920
921 /* If it's not cacheable, or an error, mark it invalid. */
922 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
923 SET(bp->b_flags, B_INVAL);
924
925 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
926 /*
927 * If it's invalid or empty, dissociate it from its vnode
928 * and put on the head of the appropriate queue.
929 */
930 if (bp->b_vp)
931 brelvp(bp);
d52fe63f
A
932 if (ISSET(bp->b_flags, B_DELWRI)) {
933 CLR(bp->b_flags, B_DELWRI);
934 nbdwrite--;
9bccf70c 935 wakeup((caddr_t)&nbdwrite);
d52fe63f 936 }
1c79356b
A
937 if (bp->b_bufsize <= 0)
938 whichq = BQ_EMPTY; /* no data */
9bccf70c
A
939 else if (ISSET(bp->b_flags, B_META))
940 whichq = BQ_META; /* meta-data */
1c79356b
A
941 else
942 whichq = BQ_AGE; /* invalid data */
943
944 bufq = &bufqueues[whichq];
945 binsheadfree(bp, bufq, whichq);
946 } else {
947 /*
948 * It has valid data. Put it on the end of the appropriate
949 * queue, so that it'll stick around for as long as possible.
950 */
951 if (ISSET(bp->b_flags, B_LOCKED))
952 whichq = BQ_LOCKED; /* locked in core */
953 else if (ISSET(bp->b_flags, B_META))
954 whichq = BQ_META; /* meta-data */
955 else if (ISSET(bp->b_flags, B_AGE))
956 whichq = BQ_AGE; /* stale but valid data */
957 else
958 whichq = BQ_LRU; /* valid data */
959
960 bufq = &bufqueues[whichq];
961 binstailfree(bp, bufq, whichq);
962 }
963
964 /* Unlock the buffer. */
965 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
966
967 /* Allow disk interrupts. */
968 splx(s);
969
970 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
fa4905b1 971 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1c79356b
A
972}
973
974/*
975 * Determine if a block is in the cache.
976 * Just look at what would be its hash chain. If it's there, return
977 * a pointer to it, unless it's marked invalid. If it's marked invalid,
978 * we normally don't return the buffer, unless the caller explicitly
979 * wants us to.
980 */
981struct buf *
982incore(vp, blkno)
983 struct vnode *vp;
984 daddr_t blkno;
985{
986 struct buf *bp;
1c79356b
A
987
988 bp = BUFHASH(vp, blkno)->lh_first;
989
990 /* Search hash chain */
9bccf70c 991 for (; bp != NULL; bp = bp->b_hash.le_next) {
1c79356b
A
992 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
993 !ISSET(bp->b_flags, B_INVAL))
994 return (bp);
1c79356b
A
995 }
996
997 return (0);
998}
999
fa4905b1
A
1000
1001/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
1c79356b
A
1002/*
1003 * Get a block of requested size that is associated with
1004 * a given vnode and block offset. If it is found in the
1005 * block cache, mark it as having been found, make it busy
1006 * and return it. Otherwise, return an empty block of the
1007 * correct size. It is up to the caller to ensure that the
1008 * cached blocks are of the correct size.
1009 */
1010struct buf *
1011getblk(vp, blkno, size, slpflag, slptimeo, operation)
1012 register struct vnode *vp;
1013 daddr_t blkno;
1014 int size, slpflag, slptimeo, operation;
1015{
1016 struct buf *bp;
1017 int s, err;
1018 upl_t upl;
1019 upl_page_info_t *pl;
1c79356b 1020 kern_return_t kret;
1c79356b
A
1021 int error=0;
1022 int pagedirty = 0;
1023
1c79356b
A
1024 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
1025 blkno * PAGE_SIZE, size, operation, 0, 0);
0b4e3aa0 1026start:
1c79356b
A
1027
1028 s = splbio();
9bccf70c 1029 if ((bp = incore(vp, blkno))) {
1c79356b
A
1030 /* Found in the Buffer Cache */
1031 if (ISSET(bp->b_flags, B_BUSY)) {
1032 /* but is busy */
1033 switch (operation) {
1034 case BLK_READ:
1035 case BLK_WRITE:
1036 case BLK_META:
1037 SET(bp->b_flags, B_WANTED);
1038 bufstats.bufs_busyincore++;
1039 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
1040 slptimeo);
1041 splx(s);
1042 /*
1043 * Callers who call with PCATCH or timeout are
1044 * willing to deal with the NULL pointer
1045 */
1046 if (err && ((slpflag & PCATCH) ||
1047 ((err == EWOULDBLOCK) && slptimeo)))
1048 return (NULL);
1049 goto start;
1050 /*NOTREACHED*/
1051 break;
1052
1053 case BLK_PAGEIN:
1054 /* pagein operation must not use getblk */
1055 panic("getblk: pagein for incore busy buffer");
1056 splx(s);
1057 /*NOTREACHED*/
1058 break;
1059
1060 case BLK_PAGEOUT:
1061 /* pageout operation must not use getblk */
1062 panic("getblk: pageout for incore busy buffer");
1063 splx(s);
1064 /*NOTREACHED*/
1065 break;
1066
1067 default:
1068 panic("getblk: %d unknown operation 1", operation);
1069 /*NOTREACHED*/
1070 break;
1071 }
1072 } else {
1073 /* not busy */
1074 SET(bp->b_flags, (B_BUSY | B_CACHE));
1075 bremfree(bp);
1076 bufstats.bufs_incore++;
1077 splx(s);
1078
1079 allocbuf(bp, size);
1080 if (ISSET(bp->b_flags, B_PAGELIST))
1081 panic("pagelist buffer is not busy");
1082
1083 switch (operation) {
1084 case BLK_READ:
1085 case BLK_WRITE:
1086 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
0b4e3aa0
A
1087 kret = ubc_create_upl(vp,
1088 ubc_blktooff(vp, bp->b_lblkno),
1089 bp->b_bufsize,
1090 &upl,
1091 &pl,
1092 UPL_PRECIOUS);
1c79356b
A
1093 if (kret != KERN_SUCCESS)
1094 panic("Failed to get pagelists");
1095
1096 SET(bp->b_flags, B_PAGELIST);
1097 bp->b_pagelist = upl;
1098
fa4905b1
A
1099 if (!upl_valid_page(pl, 0)) {
1100 if (vp->v_tag != VT_NFS)
1101 panic("getblk: incore buffer without valid page");
1102 CLR(bp->b_flags, B_CACHE);
1103 }
1c79356b
A
1104
1105 if (upl_dirty_page(pl, 0))
1106 SET(bp->b_flags, B_WASDIRTY);
1107 else
1108 CLR(bp->b_flags, B_WASDIRTY);
1109
0b4e3aa0 1110 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
9bccf70c 1111 if (kret != KERN_SUCCESS)
0b4e3aa0
A
1112 panic("getblk: ubc_upl_map() failed with (%d)",
1113 kret);
9bccf70c
A
1114 if (bp->b_data == 0)
1115 panic("ubc_upl_map mapped 0");
1c79356b
A
1116 }
1117 break;
1118
1119 case BLK_META:
1120 /*
1121 * VM is not involved in I/O for the meta data;
1122 * the buffer already has valid data
1123 */
9bccf70c 1124 if(bp->b_data == 0)
1c79356b
A
1125 panic("bp->b_data null incore buf=%x", bp);
1126 break;
1127
1128 case BLK_PAGEIN:
1129 case BLK_PAGEOUT:
1130 panic("getblk: paging operation 1");
1131 break;
1132
1133 default:
1134 panic("getblk: %d unknown operation 2", operation);
1135 /*NOTREACHED*/
1136 break;
1137 }
1138 }
1139 } else { /* not incore() */
1140 int queue = BQ_EMPTY; /* Start with no preference */
1141 splx(s);
1142
1143 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
1144 !(UBCINFOEXISTS(vp))) {
1145 operation = BLK_META;
1146 }
1147 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
1148 goto start;
0b4e3aa0
A
1149 if (incore(vp, blkno)) {
1150 SET(bp->b_flags, B_INVAL);
1151 binshash(bp, &invalhash);
1152 brelse(bp);
1153 goto start;
1154 }
b4c24cb9
A
1155 /*
1156 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
1157 * CALLED! BE CAREFUL.
1158 */
0b4e3aa0 1159
1c79356b
A
1160 /*
1161 * if it is meta, the queue may be set to another
1162 * type, so reset it and mark the buffer B_META
1163 * so that when the buffer is released it will go to the META queue.
1164 * Also, if the vnode is not VREG, then it is META
1165 */
1166 if (operation == BLK_META) {
1167 SET(bp->b_flags, B_META);
1168 queue = BQ_META;
1169 }
9bccf70c
A
1170
1171 bp->b_blkno = bp->b_lblkno = blkno;
1172 bp->b_vp = vp;
1173
0b4e3aa0
A
1174 /*
1175 * Insert in the hash so that incore() can find it
1176 */
1177 binshash(bp, BUFHASH(vp, blkno));
1178
9bccf70c
A
1179 s = splbio();
1180 bgetvp(vp, bp);
1181 splx(s);
1182
1c79356b
A
1183 allocbuf(bp, size);
1184
1185 switch (operation) {
1186 case BLK_META:
1187 /* buffer data is invalid */
1188
1c79356b
A
1189 if(bp->b_data == 0)
1190 panic("bp->b_data is null %x",bp);
1191
1c79356b 1192 bufstats.bufs_miss++;
1c79356b
A
1193
1194 /* wakeup the buffer */
1195 CLR(bp->b_flags, B_WANTED);
1196 wakeup(bp);
1197 break;
1198
1199 case BLK_READ:
1200 case BLK_WRITE:
1c79356b
A
1201
1202 if (ISSET(bp->b_flags, B_PAGELIST))
1203 panic("B_PAGELIST in bp=%x",bp);
1204
0b4e3aa0
A
1205 kret = ubc_create_upl(vp,
1206 ubc_blktooff(vp, blkno),
1207 bp->b_bufsize,
1208 &upl,
1209 &pl,
1210 UPL_PRECIOUS);
1c79356b
A
1211 if (kret != KERN_SUCCESS)
1212 panic("Failed to get pagelists");
1213
1214#ifdef UBC_DEBUG
1215 upl_ubc_alias_set(upl, bp, 4);
1216#endif /* UBC_DEBUG */
1c79356b
A
1217 bp->b_pagelist = upl;
1218
1219 SET(bp->b_flags, B_PAGELIST);
1c79356b
A
1220
1221 if (upl_valid_page(pl, 0)) {
1222 SET(bp->b_flags, B_CACHE | B_DONE);
1223 bufstats.bufs_vmhits++;
1224
1225 pagedirty = upl_dirty_page(pl, 0);
1226
1227 if (pagedirty)
1228 SET(bp->b_flags, B_WASDIRTY);
1229
1230 if (vp->v_tag == VT_NFS) {
1231 off_t f_offset;
1232 int valid_size;
1233
1234 bp->b_validoff = 0;
1235 bp->b_dirtyoff = 0;
1236
1237 f_offset = ubc_blktooff(vp, blkno);
1238
1239 if (f_offset > vp->v_ubcinfo->ui_size) {
1240 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1241 bp->b_validend = 0;
1242 bp->b_dirtyend = 0;
1243 } else {
1244 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1245 bp->b_validend = valid_size;
1246
1247 if (pagedirty)
1248 bp->b_dirtyend = valid_size;
1249 else
1250 bp->b_dirtyend = 0;
1251
1252 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1253 bp->b_validend, bp->b_dirtyend,
1254 (int)vp->v_ubcinfo->ui_size, 0, 0);
1255 }
1256 } else {
1257 bp->b_validoff = 0;
1258 bp->b_dirtyoff = 0;
1259
1260 if (pagedirty) {
1261 /* page is dirty */
1262 bp->b_validend = bp->b_bcount;
1263 bp->b_dirtyend = bp->b_bcount;
1264 } else {
1265 /* page is clean */
1266 bp->b_validend = bp->b_bcount;
1267 bp->b_dirtyend = 0;
1268 }
1269 }
9bccf70c
A
1270 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
1271 if(error) {
1272 panic("getblk: VOP_BMAP failed");
1c79356b
A
1273 /*NOTREACHED*/
1274 /*
1275 * XXX: We probably should invalidate the VM Page
1276 */
1277 bp->b_error = error;
1278 SET(bp->b_flags, (B_ERROR | B_INVAL));
1279 /* undo B_DONE that was set before upl_commit() */
1280 CLR(bp->b_flags, B_DONE);
1281 brelse(bp);
1282 return (0);
1283 }
1284 } else {
1285 bufstats.bufs_miss++;
1286 }
0b4e3aa0 1287 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1c79356b 1288 if (kret != KERN_SUCCESS) {
0b4e3aa0 1289 panic("getblk: ubc_upl_map() "
1c79356b
A
1290 "failed with (%d)", kret);
1291 }
9bccf70c
A
1292 if (bp->b_data == 0)
1293 panic("kernel_upl_map mapped 0");
1c79356b
A
1294
1295 break;
1296
1297 case BLK_PAGEIN:
1298 case BLK_PAGEOUT:
1299 panic("getblk: paging operation 2");
1300 break;
1301 default:
1302 panic("getblk: %d unknown operation 3", operation);
1303 /*NOTREACHED*/
1304 break;
1305 }
1306 }
1307
1308 if (bp->b_data == NULL)
1309 panic("getblk: bp->b_addr is null");
1310
1311 if (bp->b_bufsize & 0xfff) {
1c79356b 1312 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1c79356b
A
1313 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1314 }
1315
1316 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
fa4905b1 1317 (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1c79356b
A
1318
1319 return (bp);
1320}
1321
1322/*
1323 * Get an empty, disassociated buffer of given size.
1324 */
1325struct buf *
1326geteblk(size)
1327 int size;
1328{
1329 struct buf *bp;
1330 int queue = BQ_EMPTY;
1c79356b
A
1331
1332 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1333 ;
1c79356b 1334 SET(bp->b_flags, (B_META|B_INVAL));
1c79356b
A
1335
1336#if DIAGNOSTIC
1337 assert(queue == BQ_EMPTY);
1338#endif /* DIAGNOSTIC */
1339 /* XXX need to implement logic to deal with other queues */
1340
1c79356b
A
1341 binshash(bp, &invalhash);
1342 allocbuf(bp, size);
1343 bufstats.bufs_eblk++;
1344
1345 return (bp);
1346}
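/*
 * Illustrative sketch only (kept out of the build): geteblk() returns a
 * busy, disassociated buffer whose b_data can serve as scratch storage;
 * it is released with brelse() when done.
 */
#if 0 /* example only */
	struct buf *bp;

	bp = geteblk(2048);		/* loops until a buffer header is free */
	bzero(bp->b_data, 2048);
	/* ... use bp->b_data as temporary storage ... */
	brelse(bp);
#endif /* example only */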
1347
1c79356b
A
1348/*
1349 * Zones for the meta data buffers
1350 */
1351
1352#define MINMETA 512
1353#define MAXMETA 4096
1354
1355struct meta_zone_entry {
1356 zone_t mz_zone;
1357 vm_size_t mz_size;
1358 vm_size_t mz_max;
1359 char *mz_name;
1360};
1361
1362struct meta_zone_entry meta_zones[] = {
1363 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1364 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1c79356b 1365 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1c79356b
A
1366 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1367 {NULL, 0, 0, "" } /* End */
1368};
765c9de3 1369
1c79356b
A
1370/*
1371 * Initialize the meta data zones
1372 */
1373static void
1374bufzoneinit(void)
1375{
1376 int i;
1377
1378 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1379 meta_zones[i].mz_zone =
1380 zinit(meta_zones[i].mz_size,
1381 meta_zones[i].mz_max,
1382 PAGE_SIZE,
1383 meta_zones[i].mz_name);
1384 }
765c9de3 1385 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1c79356b
A
1386}
1387
9bccf70c 1388static __inline__ zone_t
1c79356b
A
1389getbufzone(size_t size)
1390{
1391 int i;
1392
9bccf70c 1393 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
1c79356b
A
1394 panic("getbufzone: incorect size = %d", size);
1395
9bccf70c
A
1396 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1397 if (meta_zones[i].mz_size >= size)
1398 break;
1399 }
1400
1c79356b
A
1401 return (meta_zones[i].mz_zone);
1402}
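/*
 * Worked example (kept out of the build): getbufzone() returns the first
 * zone whose element size is >= the request, so a 1536 byte meta-data
 * request (a multiple of MINMETA) is served from the "buf.2048" zone.
 */
#if 0 /* example only */
	zone_t z = getbufzone(1536);		/* meta_zones[2], 2048 byte elements */
	caddr_t p = (caddr_t)zalloc(z);
	/* ... */
	zfree(z, (vm_offset_t)p);
#endif /* example only */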
1c79356b
A
1403
1404/*
1405 * With UBC, there is no need to expand / shrink the file data
1406 * buffer. The VM uses the same pages, hence no waste.
1407 * All the file data buffers can have one size.
1408 * In fact expand / shrink would be an expensive operation.
1409 *
1410 * Only exception to this is meta-data buffers. Most of the
1411 * meta data operations are smaller than PAGE_SIZE. Having the
1412 * meta-data buffers grow and shrink as needed, optimizes use
1413 * of the kernel wired memory.
1414 */
1415
1416int
1417allocbuf(bp, size)
1418 struct buf *bp;
1419 int size;
1420{
1421 vm_size_t desired_size;
1422
1423 desired_size = roundup(size, CLBYTES);
1424
1425 if(desired_size < PAGE_SIZE)
1426 desired_size = PAGE_SIZE;
1427 if (desired_size > MAXBSIZE)
1428 panic("allocbuf: buffer larger than MAXBSIZE requested");
1429
1c79356b
A
1430 if (ISSET(bp->b_flags, B_META)) {
1431 kern_return_t kret;
1432 zone_t zprev, z;
1433 size_t nsize = roundup(size, MINMETA);
1434
1435 if (bp->b_data) {
1436 vm_offset_t elem = (vm_offset_t)bp->b_data;
1437
1438 if (ISSET(bp->b_flags, B_ZALLOC))
1439 if (bp->b_bufsize <= MAXMETA) {
1440 if (bp->b_bufsize < nsize) {
1441 /* reallocate to a bigger size */
1442 desired_size = nsize;
1443
1444 zprev = getbufzone(bp->b_bufsize);
1445 z = getbufzone(nsize);
1446 bp->b_data = (caddr_t)zalloc(z);
1447 if(bp->b_data == 0)
1448 panic("allocbuf: zalloc() returned NULL");
1449 bcopy(elem, bp->b_data, bp->b_bufsize);
1450 zfree(zprev, elem);
1451 } else {
1452 desired_size = bp->b_bufsize;
1453 }
1454 } else
1455 panic("allocbuf: B_ZALLOC set incorrectly");
1456 else
1457 if (bp->b_bufsize < desired_size) {
1458 /* reallocate to a bigger size */
1459 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1460 if (kret != KERN_SUCCESS)
1461 panic("allocbuf: kmem_alloc() returned %d", kret);
1462 if(bp->b_data == 0)
1463 panic("allocbuf: null b_data");
1464 bcopy(elem, bp->b_data, bp->b_bufsize);
1465 kmem_free(kernel_map, elem, bp->b_bufsize);
1466 } else {
1467 desired_size = bp->b_bufsize;
1468 }
1469 } else {
1470 /* new allocation */
1471 if (nsize <= MAXMETA) {
1472 desired_size = nsize;
1473 z = getbufzone(nsize);
1474 bp->b_data = (caddr_t)zalloc(z);
1475 if(bp->b_data == 0)
1476 panic("allocbuf: zalloc() returned NULL 2");
1477 SET(bp->b_flags, B_ZALLOC);
1478 } else {
1479 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1480 if (kret != KERN_SUCCESS)
1481 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1482 if(bp->b_data == 0)
1483 panic("allocbuf: null b_data 2");
1484 }
1485 }
1486 }
1487
1488 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
b4c24cb9 1489 panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);
1c79356b 1490
9bccf70c
A
1491 bp->b_bufsize = desired_size;
1492 bp->b_bcount = size;
1493 return (0);
1c79356b
A
1494}
1495
1496/*
1497 * Get a new buffer from one of the free lists.
1498 *
1499 * A request for a queue is passed in. The queue from which the buffer was
1500 * taken is returned. Out of range queue requests get BQ_EMPTY. A request for
1501 * BQUEUES means no preference; use heuristics in that case.
1502 * The heuristics are as follows:
1503 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1504 * If none is available, block until one is made available.
1505 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
1506 * Pick the most stale buffer.
1507 * If the found buffer was marked delayed write, start the async write
1508 * and restart the search.
1509 * Initialize the fields and disassociate the buffer from the vnode.
1510 * Remove the buffer from the hash. Return the buffer and the queue
1511 * on which it was found.
1512 */
1513
1514static struct buf *
1515getnewbuf(slpflag, slptimeo, queue)
1516 int slpflag, slptimeo;
1517 int *queue;
1518{
1519 register struct buf *bp;
1520 register struct buf *lru_bp;
1521 register struct buf *age_bp;
1522 register struct buf *meta_bp;
1523 register int age_time, lru_time, bp_time, meta_time;
1524 int s;
1c79356b
A
1525 int req = *queue; /* save it for restarts */
1526
1527start:
1528 s = splbio();
1529
1530 /* invalid request gets empty queue */
765c9de3
A
1531 if ((*queue > BQUEUES) || (*queue < 0)
1532 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1c79356b
A
1533 *queue = BQ_EMPTY;
1534
1535 /* (*queue == BQUEUES) means no preference */
1536 if (*queue != BQUEUES) {
1537 /* Try for the requested queue first */
1538 bp = bufqueues[*queue].tqh_first;
1539 if (bp)
1540 goto found;
1541 }
1542
1543 /* Unable to use requested queue */
1544 age_bp = bufqueues[BQ_AGE].tqh_first;
1545 lru_bp = bufqueues[BQ_LRU].tqh_first;
1546 meta_bp = bufqueues[BQ_META].tqh_first;
1547
9bccf70c
A
1548 if (!age_bp && !lru_bp && !meta_bp) {
1549 /*
1550 * Unavailable on AGE or LRU or META queues
1551 * Try the empty list first
1552 */
1c79356b
A
1553 bp = bufqueues[BQ_EMPTY].tqh_first;
1554 if (bp) {
1555 *queue = BQ_EMPTY;
1556 goto found;
1557 }
765c9de3
A
1558
1559 /* Create a new temporary buffer header */
1560 bp = (struct buf *)zalloc(buf_hdr_zone);
1561
1562 if (bp) {
1563 bufhdrinit(bp);
1564 BLISTNONE(bp);
1565 binshash(bp, &invalhash);
1566 SET(bp->b_flags, B_HDRALLOC);
1567 *queue = BQ_EMPTY;
1568 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1569 buf_hdr_count++;
1570 goto found;
1571 }
1572
1c79356b
A
1573 /* Log this error condition */
1574 printf("getnewbuf: No useful buffers");
765c9de3 1575
1c79356b
A
1576 /* wait for a free buffer of any kind */
1577 needbuffer = 1;
1578 bufstats.bufs_sleeps++;
1579 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1580 splx(s);
1581 return (0);
1582 }
1583
1584 /* Buffer available either on AGE or LRU or META */
1585 bp = NULL;
1586 *queue = -1;
1587
1588 /* Buffer available either on AGE or LRU */
1589 if (!age_bp) {
1590 bp = lru_bp;
1591 *queue = BQ_LRU;
1592 } else if (!lru_bp) {
1593 bp = age_bp;
1594 *queue = BQ_AGE;
1595 } else { /* buffer available on both AGE and LRU */
1596 age_time = time.tv_sec - age_bp->b_timestamp;
1597 lru_time = time.tv_sec - lru_bp->b_timestamp;
1598 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1599 bp = age_bp;
1600 *queue = BQ_AGE;
1601 /*
1602 * we should probably re-timestamp everything in the
1603 * queues at this point with the current time
1604 */
1605 } else {
1606 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1607 bp = lru_bp;
1608 *queue = BQ_LRU;
1609 } else {
1610 bp = age_bp;
1611 *queue = BQ_AGE;
1612 }
1613 }
1614 }
1615
1616 if (!bp) { /* Neither on AGE nor on LRU */
1617 bp = meta_bp;
1618 *queue = BQ_META;
1619 } else if (meta_bp) {
1620 bp_time = time.tv_sec - bp->b_timestamp;
1621 meta_time = time.tv_sec - meta_bp->b_timestamp;
1622
1623 if (!(bp_time < 0) && !(meta_time < 0)) {
1624 /* time not set backwards */
1625 int bp_is_stale;
1626 bp_is_stale = (*queue == BQ_LRU) ?
1627 lru_is_stale : age_is_stale;
1628
1629 if ((meta_time >= meta_is_stale) &&
1630 (bp_time < bp_is_stale)) {
1631 bp = meta_bp;
1632 *queue = BQ_META;
1633 }
1634 }
1635 }
1636
1637 if (bp == NULL)
1638 panic("getnewbuf: null bp");
1639
1640found:
b4c24cb9
A
1641 if (ISSET(bp->b_flags, B_LOCKED)) {
1642 panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
1643 }
1644
1c79356b 1645 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
b4c24cb9 1646 panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);
1c79356b
A
1647
1648 if(ISSET(bp->b_flags, B_BUSY))
b4c24cb9 1649 panic("getnewbuf reusing BUSY buf @ 0x%x", bp);
1c79356b
A
1650
1651 /* Clean it */
1652 if (bcleanbuf(bp)) {
1653 /* bawrite() issued, buffer not ready */
1654 splx(s);
1655 *queue = req;
1656 goto start;
1657 }
1658 splx(s);
1659 return (bp);
1660}
9bccf70c 1661
1c79356b
A
1662#include <mach/mach_types.h>
1663#include <mach/memory_object_types.h>
9bccf70c 1664#include <kern/sched_prim.h>
1c79356b
A
1665
1666/*
1667 * Clean a buffer.
1668 * Returns 0 if the buffer is ready to use;
1669 * returns 1 if it issued a bawrite() to indicate
1670 * that the buffer is not ready.
1671 */
9bccf70c 1672static int
1c79356b
A
1673bcleanbuf(struct buf *bp)
1674{
1675 int s;
1676 struct ucred *cred;
d52fe63f 1677 int hdralloc = 0;
1c79356b
A
1678
1679 s = splbio();
1680
1681 /* Remove from the queue */
1682 bremfree(bp);
1683
1684 /* Buffer is no longer on free lists. */
1685 SET(bp->b_flags, B_BUSY);
1686
d52fe63f
A
1687 /* Check whether the buffer header was "allocated" */
1688 if (ISSET(bp->b_flags, B_HDRALLOC))
1689 hdralloc = 1;
1690
1c79356b
A
1691 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1692 panic("bcleanbuf: le_prev is deadbeef");
1693
765c9de3
A
1694 /*
1695 * If the buffer was a delayed write, start the I/O by queuing
1696 * it on the LAUNDRY queue, and return 1
1697 */
1c79356b
A
1698 if (ISSET(bp->b_flags, B_DELWRI)) {
1699 splx(s);
765c9de3
A
1700 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1701 blaundrycnt++;
1702 wakeup(&blaundrycnt);
9bccf70c
A
1703 /* and give it a chance to run */
1704 (void)thread_block(THREAD_CONTINUE_NULL);
1c79356b
A
1705 return (1);
1706 }
1707
1708 if (bp->b_vp)
1709 brelvp(bp);
1710 bremhash(bp);
1711 BLISTNONE(bp);
1712
1713 splx(s);
1714
1715 if (ISSET(bp->b_flags, B_META)) {
1c79356b
A
1716 vm_offset_t elem = (vm_offset_t)bp->b_data;
1717 if (elem == 0)
1718 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1719
1720 if (ISSET(bp->b_flags, B_ZALLOC)) {
1721 if (bp->b_bufsize <= MAXMETA) {
1722 zone_t z;
1723
1724 z = getbufzone(bp->b_bufsize);
1725 bp->b_data = (caddr_t)0xdeadbeef;
1726 zfree(z, elem);
1727 CLR(bp->b_flags, B_ZALLOC);
1728 } else
1729 panic("bcleanbuf: B_ZALLOC set incorrectly");
1730 } else {
1731 bp->b_data = (caddr_t)0xdeadbeef;
1732 kmem_free(kernel_map, elem, bp->b_bufsize);
1733 }
1c79356b
A
1734 }
1735
1736 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1737
1738 /* disassociate us from our vnode, if we had one... */
1739 s = splbio();
1740
1741 /* clear out various other fields */
0b4e3aa0 1742 bp->b_bufsize = 0;
1c79356b
A
1743 bp->b_data = 0;
1744 bp->b_flags = B_BUSY;
d52fe63f
A
1745 if (hdralloc)
1746 SET(bp->b_flags, B_HDRALLOC);
1c79356b
A
1747 bp->b_dev = NODEV;
1748 bp->b_blkno = bp->b_lblkno = 0;
1749 bp->b_iodone = 0;
1750 bp->b_error = 0;
1751 bp->b_resid = 0;
1752 bp->b_bcount = 0;
1753 bp->b_dirtyoff = bp->b_dirtyend = 0;
1754 bp->b_validoff = bp->b_validend = 0;
1755
1756 /* nuke any credentials we were holding */
1757 cred = bp->b_rcred;
1758 if (cred != NOCRED) {
1759 bp->b_rcred = NOCRED;
1760 crfree(cred);
1761 }
1762 cred = bp->b_wcred;
1763 if (cred != NOCRED) {
1764 bp->b_wcred = NOCRED;
1765 crfree(cred);
1766 }
1767 splx(s);
1768 return (0);
1769}
1770
1771
1772/*
1773 * Wait for operations on the buffer to complete.
1774 * When they do, extract and return the I/O's error value.
1775 */
1776int
1777biowait(bp)
1778 struct buf *bp;
1779{
1c79356b 1780 int s;
1c79356b
A
1781
1782 s = splbio();
1783 while (!ISSET(bp->b_flags, B_DONE))
1784 tsleep(bp, PRIBIO + 1, "biowait", 0);
1785 splx(s);
1786
1787 /* check for interruption of I/O (e.g. via NFS), then errors. */
1788 if (ISSET(bp->b_flags, B_EINTR)) {
1789 CLR(bp->b_flags, B_EINTR);
1790 return (EINTR);
1791 } else if (ISSET(bp->b_flags, B_ERROR))
1792 return (bp->b_error ? bp->b_error : EIO);
1793 else
1794 return (0);
1795}
1796
1797/*
1798 * Mark I/O complete on a buffer.
1799 *
1800 * If a callback has been requested, e.g. the pageout
1801 * daemon, do so. Otherwise, awaken waiting processes.
1802 *
1803 * [ Leffler, et al., says on p.247:
1804 * "This routine wakes up the blocked process, frees the buffer
1805 * for an asynchronous write, or, for a request by the pagedaemon
1806 * process, invokes a procedure specified in the buffer structure" ]
1807 *
1808 * In real life, the pagedaemon (or other system processes) wants
1809 * to do async stuff too, and doesn't want the buffer brelse()'d.
1810 * (for swap pager, that puts swap buffers on the free lists (!!!),
1811 * for the vn device, that puts malloc'd buffers on the free lists!)
1812 */
1813void
1814biodone(bp)
1815 struct buf *bp;
1816{
1817 boolean_t funnel_state;
d52fe63f 1818 struct vnode *vp;
1c79356b
A
1819
1820 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1821
1822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
fa4905b1 1823 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1c79356b
A
1824
1825 if (ISSET(bp->b_flags, B_DONE))
1826 panic("biodone already");
1827 SET(bp->b_flags, B_DONE); /* note that it's done */
1828 /*
1829 * I/O was done, so don't believe
1830 * the DIRTY state from VM anymore
1831 */
1832 CLR(bp->b_flags, B_WASDIRTY);
1833
1834 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1835 vwakeup(bp); /* wake up reader */
9bccf70c
A
1836
1837 if (kdebug_enable) {
1838 int code = DKIO_DONE;
1839
1840 if (bp->b_flags & B_READ)
1841 code |= DKIO_READ;
1842 if (bp->b_flags & B_ASYNC)
1843 code |= DKIO_ASYNC;
1844
1845 if (bp->b_flags & B_META)
1846 code |= DKIO_META;
1847 else if (bp->b_flags & (B_PGIN | B_PAGEOUT))
1848 code |= DKIO_PAGING;
1849
1850 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1851 bp, bp->b_vp, bp->b_resid, bp->b_error, 0);
1852 }
1853
d52fe63f
A
1854 /* Wakeup the throttled write operations as needed */
1855 vp = bp->b_vp;
1856 if (vp
1857 && (vp->v_flag & VTHROTTLED)
1858 && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
1859 vp->v_flag &= ~VTHROTTLED;
1860 wakeup((caddr_t)&vp->v_numoutput);
1861 }
1862
1c79356b 1863 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
b4c24cb9
A
1864 void (*iodone_func)(struct buf *) = bp->b_iodone;
1865
1c79356b 1866 CLR(bp->b_flags, B_CALL); /* but note callout done */
b4c24cb9
A
1867 bp->b_iodone = NULL;
1868
1869 if (iodone_func == NULL) {
1870 panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
1871 } else {
1872 (*iodone_func)(bp);
1873 }
1c79356b
A
1874 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1875 brelse(bp);
1876 else { /* or just wakeup the buffer */
1877 CLR(bp->b_flags, B_WANTED);
1878 wakeup(bp);
1879 }
1880
1881 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
fa4905b1 1882 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1c79356b
A
1883
1884 thread_funnel_set(kernel_flock, funnel_state);
1885}
1886
1887/*
1888 * Return a count of buffers on the "locked" queue.
1889 */
1890int
1891count_lock_queue()
1892{
1893 register struct buf *bp;
1894 register int n = 0;
1895
1896 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1897 bp = bp->b_freelist.tqe_next)
1898 n++;
1899 return (n);
1900}
1901
1902/*
1903 * Return a count of 'busy' buffers. Used at the time of shutdown.
1904 */
1905int
1906count_busy_buffers()
1907{
1908 register struct buf *bp;
1909 register int nbusy = 0;
1910
1911 for (bp = &buf[nbuf]; --bp >= buf; )
1912 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1913 nbusy++;
1914 return (nbusy);
1915}
1916
9bccf70c 1917#if DIAGNOSTIC
1c79356b
A
1918/*
1919 * Print out statistics on the current allocation of the buffer pool.
1920 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1921 * in vfs_syscalls.c using sysctl.
1922 */
1923void
1924vfs_bufstats()
1925{
1926 int s, i, j, count;
1927 register struct buf *bp;
1928 register struct bqueues *dp;
1929 int counts[MAXBSIZE/CLBYTES+1];
765c9de3
A
1930 static char *bname[BQUEUES] =
1931 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1c79356b
A
1932
1933 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1934 count = 0;
1935 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1936 counts[j] = 0;
1937 s = splbio();
1938 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1939 counts[bp->b_bufsize/CLBYTES]++;
1940 count++;
1941 }
1942 splx(s);
1943 printf("%s: total-%d", bname[i], count);
1944 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1945 if (counts[j] != 0)
1946 printf(", %d-%d", j * CLBYTES, counts[j]);
1947 printf("\n");
1948 }
1949}
1950#endif /* DIAGNOSTIC */
1951
9bccf70c 1952#define NRESERVEDIOBUFS 64
1c79356b 1953
9bccf70c 1954__private_extern__ struct buf *
0b4e3aa0 1955alloc_io_buf(vp, priv)
1c79356b 1956 struct vnode *vp;
0b4e3aa0 1957 int priv;
1c79356b
A
1958{
1959 register struct buf *bp;
1960 int s;
1961
1962 s = splbio();
1963
0b4e3aa0
A
1964 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1965 need_iobuffer = 1;
1966 bufstats.bufs_iobufsleeps++;
1967 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1968 }
1969
1c79356b
A
1970 while ((bp = iobufqueue.tqh_first) == NULL) {
1971 need_iobuffer = 1;
1972 bufstats.bufs_iobufsleeps++;
0b4e3aa0 1973 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1c79356b 1974 }
0b4e3aa0 1975
1976 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1977 bp->b_timestamp = 0;
1978
1979 /* clear out various fields */
1980 bp->b_flags = B_BUSY;
1981 bp->b_blkno = bp->b_lblkno = 0;
b4c24cb9 1982
1983 bp->b_iodone = 0;
1984 bp->b_error = 0;
1985 bp->b_resid = 0;
1986 bp->b_bcount = 0;
1987 bp->b_bufsize = 0;
1988 bp->b_vp = vp;
1989
1990 if (vp->v_type == VBLK || vp->v_type == VCHR)
1991 bp->b_dev = vp->v_rdev;
1992 else
1993 bp->b_dev = NODEV;
1994 bufstats.bufs_iobufinuse++;
1995 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1996 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1997 splx(s);
1998
1999 return (bp);
2000}
2001
9bccf70c 2002__private_extern__ void
2003free_io_buf(bp)
2004 struct buf *bp;
2005{
2006 int s;
2007
2008 s = splbio();
2009 /* put buffer back on the head of the iobufqueue */
2010 bp->b_vp = NULL;
2011 bp->b_flags = B_INVAL;
2012
2013 binsheadfree(bp, &iobufqueue, -1);
2014
2015 /* Wake up any processes waiting for any buffer to become free. */
2016 if (need_iobuffer) {
2017 need_iobuffer = 0;
2018 wakeup(&need_iobuffer);
2019 }
2020 bufstats.bufs_iobufinuse--;
2021 splx(s);
2022}
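
/*
 * Typical use of the private I/O buffer pool above, sketched for
 * illustration only (the function and the particular field choices are
 * a hypothetical example, not a caller that exists in this file):
 * grab a header, describe the transfer, hand it to the driver via
 * VOP_STRATEGY(), wait for biodone(), and put the header back.
 */
#if 0	/* illustrative sketch, not compiled */
static int
io_buf_example(struct vnode *devvp, caddr_t data, int size, daddr_t blkno)
{
	struct buf *bp;
	int error;

	bp = alloc_io_buf(devvp, 0);		/* may sleep for a free header */
	bp->b_flags |= B_READ;			/* direction of the transfer */
	bp->b_data = data;			/* caller-supplied mapped buffer */
	bp->b_bcount = bp->b_bufsize = size;
	bp->b_blkno = bp->b_lblkno = blkno;

	VOP_STRATEGY(bp);			/* queue the I/O */
	error = biowait(bp);			/* biodone() will wake us */

	free_io_buf(bp);
	return (error);
}
#endif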
2023
9bccf70c 2024/* disabled for now */
2025
2026/* XXX move this to a separate file */
2027/*
2028 * Dynamic Scaling of the Buffer Queues
2029 */
2030
2031typedef long long blsize_t;
2032
2033blsize_t MAXNBUF; /* initialize to (mem_size / PAGE_SIZE) */
2034/* Global tunable limits */
2035blsize_t nbufh; /* number of buffer headers */
2036blsize_t nbuflow; /* minimum number of buffer headers required */
2037blsize_t nbufhigh; /* maximum number of buffer headers allowed */
2038blsize_t nbuftarget; /* preferred number of buffer headers */
2039
2040/*
2041 * assertions:
2042 *
2043 * 1. 0 < nbuflow <= nbufh <= nbufhigh
2044 * 2. nbufhigh <= MAXNBUF
2045 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
2046 * 4. nbufh can not be set by sysctl().
2047 */
2048
2049/* Per queue tunable limits */
2050
2051struct bufqlim {
2052 blsize_t bl_nlow; /* minimum number of buffer headers required */
2053 blsize_t bl_num; /* number of buffer headers on the queue */
2054 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
2055 blsize_t bl_target; /* preferred number of buffer headers */
2056 long bl_stale; /* Seconds after which a buffer is considered stale */
2057} bufqlim[BQUEUES];
2058
2059/*
2060 * assertions:
2061 *
2062 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
2063 * 2. bl_nlhigh <= MAXNBUF
2064 * 3. bufqlim[BQ_META].bl_nlow != 0
2065 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
2066 * file system IO operations)
2067 * 5. bl_num can not be set by sysctl().
 2068 * 6. bl_nlhigh <= nbufhigh
2069 */
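
/*
 * A DIAGNOSTIC-style check of assertions 1, 2 and 6 above, included
 * purely as an illustrative sketch (the function name is hypothetical
 * and nothing in this file calls it).
 */
#if 0	/* illustrative sketch, not compiled */
static void
bufqlim_check_example(void)
{
	int q;

	for (q = 0; q < BQUEUES; q++) {
		if ((bufqlim[q].bl_nlow < 0) ||
		    (bufqlim[q].bl_nlow > bufqlim[q].bl_num) ||
		    (bufqlim[q].bl_num > bufqlim[q].bl_nlhigh))
			panic("bufqlim: 0 <= bl_nlow <= bl_num <= bl_nlhigh violated");
		if (bufqlim[q].bl_nlhigh > MAXNBUF)
			panic("bufqlim: bl_nlhigh > MAXNBUF");
		if (bufqlim[q].bl_nlhigh > nbufhigh)
			panic("bufqlim: bl_nlhigh > nbufhigh");
	}
}
#endif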
2070
2071/*
2072 * Rationale:
2073 * ----------
 2074 * Defining blsize_t as a 32-bit long would permit 2^31 buffer headers per queue,
 2075 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
 2076 *
 2077 * These limits are exported by means of sysctl().
 2078 * It was decided to define blsize_t as a 64-bit quantity instead.
 2079 * This makes sure that we will not be required to change it
 2080 * as long as we do not exceed a 64-bit address space for the kernel.
 2081 *
 2082 * The low and high limits are initialized at compile time,
 2083 * and boot arguments can be used to override them. sysctl()
 2084 * does not change them; it can read all of the values but can
 2085 * set only the target. num is the current level.
 2086 *
 2087 * The advantages of having a "bufqscan" thread do the balancing are:
 2088 * it keeps enough bufs on BQ_EMPTY;
 2089 * getnewbuf() by default will always select a buffer from BQ_EMPTY,
 2090 * and getnewbuf() performs best if a buffer is found there.
 2091 * This also minimizes the possibility of starting IO
 2092 * from getnewbuf(), which is a performance win, too.
 2093 *
 2094 * Localize the complex logic (balancing as well as time aging)
 2095 * to balancebufq().
 2096 *
 2097 * Simplify getnewbuf() by eliminating its time aging code.
2098 */
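
/*
 * How a sysctl handler might apply a new nbuftarget while preserving
 * assertion 3 above (0 < nbuflow <= nbuftarget <= nbufhigh).  This is
 * only a sketch; the function name is hypothetical and no such sysctl
 * is wired up in this file.
 */
#if 0	/* illustrative sketch, not compiled */
static int
bufq_set_target_example(blsize_t target)
{
	if ((target < nbuflow) || (target > nbufhigh))
		return (EINVAL);	/* reject values outside the tunable window */
	nbuftarget = target;
	return (0);
}
#endif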
2099
2100/*
2101 * Algorithm:
2102 * -----------
 2103 * The goal of the dynamic scaling of the buffer queues is to keep
 2104 * the size of the LRU close to bl_target. Buffers on a queue are
 2105 * time aged.
 2106 *
 2107 * A dedicated thread is responsible for "balancing" the buffer
 2108 * cache queues.
 2109 *
 2110 * The scan order is: AGE, LRU, META, EMPTY.
2111 */
2112
2113long bufqscanwait = 0;
2114
2115static void bufqscan_thread();
2116static int balancebufq(int q);
2117static int btrimempty(int n);
2118static __inline__ int initbufqscan(void);
2119static __inline__ int nextbufq(int q);
2120static void buqlimprt(int all);
1c79356b 2121
9bccf70c 2122static void
2123bufq_balance_thread_init()
2124{
2125
2126 if (bufqscanwait++ == 0) {
2127
 2128		/* Initialize globals */
2129 MAXNBUF = (mem_size / PAGE_SIZE);
2130 nbufh = nbuf;
2131 nbuflow = min(nbufh, 100);
2132 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
2133 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
2134 nbuftarget = max(nbuflow, nbuftarget);
2135 nbuftarget = min(nbufhigh, nbuftarget);
2136
2137 /*
2138 * Initialize the bufqlim
2139 */
2140
2141 /* LOCKED queue */
2142 bufqlim[BQ_LOCKED].bl_nlow = 0;
2143 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2144 bufqlim[BQ_LOCKED].bl_target = 0;
2145 bufqlim[BQ_LOCKED].bl_stale = 30;
2146
2147 /* LRU queue */
2148 bufqlim[BQ_LRU].bl_nlow = 0;
2149 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
2150 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
2151 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
2152
2153 /* AGE queue */
2154 bufqlim[BQ_AGE].bl_nlow = 0;
2155 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
2156 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
2157 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
2158
2159 /* EMPTY queue */
2160 bufqlim[BQ_EMPTY].bl_nlow = 0;
2161 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
2162 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
2163 bufqlim[BQ_EMPTY].bl_stale = 600000;
2164
2165 /* META queue */
2166 bufqlim[BQ_META].bl_nlow = 0;
2167 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
2168 bufqlim[BQ_META].bl_target = nbuftarget/4;
2169 bufqlim[BQ_META].bl_stale = META_IS_STALE;
2170
2171 /* LAUNDRY queue */
 2172		bufqlim[BQ_LAUNDRY].bl_nlow = 0;
 2173		bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
 2174		bufqlim[BQ_LAUNDRY].bl_target = 0;
 2175		bufqlim[BQ_LAUNDRY].bl_stale = 30;
2176
2177 buqlimprt(1);
2178 }
2179
2180 /* create worker thread */
2181 kernel_thread(kernel_task, bufqscan_thread);
2182}
2183
2184/* The workloop for the buffer balancing thread */
9bccf70c 2185static void
2186bufqscan_thread()
2187{
2188 boolean_t funnel_state;
2189 int moretodo = 0;
2190
2191 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2192
2193 for(;;) {
2194 do {
 2195			int q;	/* buffer queue to process */
 2196
			/* reset before each pass so the loop ends once a full scan finds no work */
			moretodo = 0;
2197 q = initbufqscan();
2198 for (; q; ) {
2199 moretodo |= balancebufq(q);
2200 q = nextbufq(q);
2201 }
2202 } while (moretodo);
2203
9bccf70c 2204#if DIAGNOSTIC
2205 vfs_bufstats();
2206 buqlimprt(0);
2207#endif
2208 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2209 moretodo = 0;
2210 }
2211
2212 (void) thread_funnel_set(kernel_flock, FALSE);
2213}
2214
2215/* Seed for the buffer queue balancing */
9bccf70c 2216static __inline__ int
2217initbufqscan()
2218{
2219 /* Start with AGE queue */
2220 return (BQ_AGE);
2221}
2222
2223/* Pick next buffer queue to balance */
9bccf70c 2224static __inline__ int
2225nextbufq(int q)
2226{
 2227	int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
	int i;
 2228
	/* 'q' is a queue number, not an index into order[]; return the next queue in the scan order */
	for (i = 0; i < (int)(sizeof(order) / sizeof(order[0])) - 1; i++) {
		if (order[i] == q)
			return (order[i + 1]);
	}
	return (0);		/* end of the scan */
2232}
2233
2234/* function to balance the buffer queues */
9bccf70c 2235static int
2236balancebufq(int q)
2237{
2238 int moretodo = 0;
2239 int s = splbio();
2240 int n;
2241
2242 /* reject invalid q */
2243 if ((q < 0) || (q >= BQUEUES))
2244 goto out;
2245
2246 /* LOCKED or LAUNDRY queue MUST not be balanced */
2247 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2248 goto out;
2249
2250 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2251
2252 /* If queue has less than target nothing more to do */
2253 if (n < 0)
2254 goto out;
2255
2256 if ( n > 8 ) {
2257 /* Balance only a small amount (12.5%) at a time */
2258 n >>= 3;
2259 }
2260
2261 /* EMPTY queue needs special handling */
2262 if (q == BQ_EMPTY) {
2263 moretodo |= btrimempty(n);
2264 goto out;
2265 }
2266
2267 for (; n > 0; n--) {
2268 struct buf *bp = bufqueues[q].tqh_first;
2269 if (!bp)
2270 break;
2271
2272 /* check if it's stale */
2273 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2274 if (bcleanbuf(bp)) {
2275 /* bawrite() issued, bp not ready */
2276 moretodo = 1;
2277 } else {
2278 /* release the cleaned buffer to BQ_EMPTY */
2279 SET(bp->b_flags, B_INVAL);
2280 brelse(bp);
2281 }
2282 } else
2283 break;
2284 }
2285
2286out:
2287 splx(s);
2288 return (moretodo);
2289}
2290
9bccf70c 2291static int
2292btrimempty(int n)
2293{
2294 /*
 2295	 * When struct bufs are allocated dynamically, this would
 2296	 * reclaim up to 'n' struct bufs from the empty queue.
2297 */
2298
2299 return (0);
2300}
2301
9bccf70c 2302static __inline__ void
2303bufqinc(int q)
2304{
2305 if ((q < 0) || (q >= BQUEUES))
2306 return;
2307
2308 bufqlim[q].bl_num++;
2309 return;
2310}
2311
9bccf70c 2312static __inline__ void
2313bufqdec(int q)
2314{
2315 if ((q < 0) || (q >= BQUEUES))
2316 return;
2317
2318 bufqlim[q].bl_num--;
2319 return;
2320}
2321
9bccf70c 2322static void
2323buqlimprt(int all)
2324{
2325 int i;
2326 static char *bname[BQUEUES] =
2327 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2328
2329 if (all)
2330 for (i = 0; i < BQUEUES; i++) {
2331 printf("%s : ", bname[i]);
2332 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
2333 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2334 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
2335 printf("target = %ld, ", (long)bufqlim[i].bl_target);
2336 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
2337 }
2338 else
2339 for (i = 0; i < BQUEUES; i++) {
2340 printf("%s : ", bname[i]);
9bccf70c 2341 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2342 }
2343}
2344
2345/*
 2346 * If getnewbuf() called bcleanbuf() on the same thread, there
 2347 * would be a potential for stack overrun and deadlock, so we
 2348 * always hand off the work to a worker thread for completion.
2349 */
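
/*
 * The producer side of that handoff, sketched for illustration (the
 * helper name is hypothetical): bcleanbuf() effectively queues the
 * delayed-write buffer on BQ_LAUNDRY and pokes the laundry thread,
 * which is what bcleanbuf_thread() below sleeps for.
 */
#if 0	/* illustrative sketch, not compiled */
static void
laundry_handoff_example(struct buf *bp)
{
	/* hand the delayed-write buffer to the laundry thread */
	binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
	blaundrycnt++;
	wakeup(&blaundrycnt);	/* satisfies the tsleep() in bcleanbuf_thread() */
}
#endif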
2350
2351static void
2352bcleanbuf_thread_init()
2353{
2354 static void bcleanbuf_thread();
2355
2356 /* create worker thread */
2357 kernel_thread(kernel_task, bcleanbuf_thread);
2358}
2359
2360static void
2361bcleanbuf_thread()
2362{
2363 boolean_t funnel_state;
2364 struct buf *bp;
2365 int error = 0;
2366 int loopcnt = 0;
2367
2368 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2369
2370doit:
2371 while (blaundrycnt == 0)
2372 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2373 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2374 /* Remove from the queue */
2375 bremfree(bp);
2376 blaundrycnt--;
2377 /* do the IO */
2378 error = bawrite_internal(bp, 0);
2379 if (error) {
2380 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2381 blaundrycnt++;
2382 if (loopcnt > 10) {
2383 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
2384 loopcnt = 0;
2385 } else {
2386 (void)thread_block(THREAD_CONTINUE_NULL);
2387 loopcnt++;
2388 }
2389 }
2390 /* start again */
2391 goto doit;
 2392
	/* NOTREACHED */
 2393	(void) thread_funnel_set(kernel_flock, funnel_state);
2394}
2395
2396
2397static int
2398bp_cmp(void *a, void *b)
2399{
2400 struct buf *bp_a = *(struct buf **)a,
2401 *bp_b = *(struct buf **)b;
2402 daddr_t res;
2403
2404 // don't have to worry about negative block
2405 // numbers so this is ok to do.
2406 //
2407 res = (bp_a->b_blkno - bp_b->b_blkno);
2408
2409 return (int)res;
2410}
2411
2412#define NFLUSH 32
2413
2414int
2415bflushq(int whichq, struct mount *mp)
2416{
2417 struct buf *bp, *next;
2418 int i, buf_count, s;
2419 int counter=0, total_writes=0;
2420 static struct buf *flush_table[NFLUSH];
2421
2422 if (whichq < 0 || whichq >= BQUEUES) {
 2423		return (0);
2424 }
2425
2426
2427 restart:
2428 bp = TAILQ_FIRST(&bufqueues[whichq]);
2429 for(buf_count=0; bp; bp=next) {
2430 next = bp->b_freelist.tqe_next;
2431
2432 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
2433 continue;
2434 }
2435
2436 if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
2437 if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
2438 panic("bflushq: bp @ 0x%x is locked!\n", bp);
2439 }
2440
2441 bremfree(bp);
2442 bp->b_flags |= B_BUSY;
2443 flush_table[buf_count] = bp;
2444 buf_count++;
2445 total_writes++;
2446
2447 if (buf_count >= NFLUSH) {
2448 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2449
2450 for(i=0; i < buf_count; i++) {
2451 bawrite(flush_table[i]);
2452 }
2453
2454 goto restart;
2455 }
2456 }
2457 }
2458
2459 if (buf_count > 0) {
2460 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2461 for(i=0; i < buf_count; i++) {
2462 bawrite(flush_table[i]);
2463 }
2464 }
2465
2466 return total_writes;
2467}
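
/*
 * Example use of bflushq(), for illustration only (the helper is
 * hypothetical; real callers live outside this file): push the
 * delayed writes for a mount point from the aged and LRU queues,
 * in sorted batches of NFLUSH as implemented above.
 */
#if 0	/* illustrative sketch, not compiled */
static int
flush_mount_example(struct mount *mp)
{
	int writes = 0;

	writes += bflushq(BQ_AGE, mp);
	writes += bflushq(BQ_LRU, mp);
	return (writes);	/* number of delayed writes issued */
}
#endif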