1/*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
26/*-
27 * Copyright (c) 1994 Christopher G. Demetriou
28 * Copyright (c) 1982, 1986, 1989, 1993
29 * The Regents of the University of California. All rights reserved.
30 * (c) UNIX System Laboratories, Inc.
31 * All or some portions of this file are derived from material licensed
32 * to the University of California by American Telephone and Telegraph
33 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
34 * the permission of UNIX System Laboratories, Inc.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * The NEXTSTEP Software License Agreement specifies the terms
65 * and conditions for redistribution.
66 *
67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
68 */
69
70/*
71 * Some references:
72 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
73 * Leffler, et al.: The Design and Implementation of the 4.3BSD
74 * UNIX Operating System (Addison-Wesley, 1989)
75 */
76
77#include <sys/param.h>
78#include <sys/systm.h>
79#include <sys/proc.h>
80#include <sys/buf.h>
81#include <sys/vnode.h>
82#include <sys/mount.h>
83#include <sys/trace.h>
84#include <sys/malloc.h>
85#include <sys/resourcevar.h>
86#include <miscfs/specfs/specdev.h>
87#include <sys/ubc.h>
88#include <vm/vm_pageout.h>
89#if DIAGNOSTIC
90#include <kern/assert.h>
91#endif /* DIAGNOSTIC */
92#include <kern/task.h>
93#include <kern/zalloc.h>
94
95#include <sys/kdebug.h>
96#include <machine/spl.h>
97
98static __inline__ void bufqinc(int q);
99static __inline__ void bufqdec(int q);
100
101static int do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
102 int *rasizes, int nrablks, struct ucred *cred, struct buf **bpp, int queuetype);
103static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
104static int bcleanbuf(struct buf *bp);
105static int brecover_data(struct buf *bp);
106extern void vwakeup();
107
108extern int niobuf; /* The number of IO buffer headers for cluster IO */
109int blaundrycnt;
110
111/* zone allocated buffer headers */
112static zone_t buf_hdr_zone;
113static int buf_hdr_count;
114
115#if TRACE
116struct proc *traceproc;
117int tracewhich, tracebuf[TRCSIZ];
118u_int tracex;
119char traceflags[TR_NFLAGS];
120#endif /* TRACE */
121
122/*
123 * Definitions for the buffer hash lists.
124 */
125#define BUFHASH(dvp, lbn) \
126 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
127LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
128u_long bufhash;
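/*
 * Descriptive note: BUFHASH() selects a hash chain by mixing the vnode
 * pointer with the logical block number and masking with 'bufhash'
 * (a power-of-two-minus-one mask set up by hashinit() in bufinit()).
 * A minimal lookup sketch, equivalent to what incore() does below:
 *
 *	struct buf *bp;
 *	for (bp = BUFHASH(vp, blkno)->lh_first; bp; bp = bp->b_hash.le_next)
 *		if (bp->b_lblkno == blkno && bp->b_vp == vp)
 *			break;
 */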
129
130/* Definitions for the buffer stats. */
131struct bufstats bufstats;
132
133/* Number of delayed write buffers */
134int nbdwrite = 0;
135
136/*
137 * Insq/Remq for the buffer hash lists.
138 */
139#if 0
140#define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
141#define bremhash(bp) LIST_REMOVE(bp, b_hash)
142#endif /* 0 */
143
144
145TAILQ_HEAD(ioqueue, buf) iobufqueue;
146TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
147static int needbuffer;
148static int need_iobuffer;
149
150/*
151 * Insq/Remq for the buffer free lists.
152 */
153#define binsheadfree(bp, dp, whichq) do { \
154 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
155 bufqinc((whichq)); \
156 (bp)->b_whichq = whichq; \
157 (bp)->b_timestamp = time.tv_sec; \
158 } while (0)
159
160#define binstailfree(bp, dp, whichq) do { \
161 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
162 bufqinc((whichq)); \
163 (bp)->b_whichq = whichq; \
164 (bp)->b_timestamp = time.tv_sec; \
165 } while (0)
166
167#define BHASHENTCHECK(bp) \
168 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
169 panic("%x: b_hash.le_prev is not deadbeef", (bp));
170
171#define BLISTNONE(bp) \
172 (bp)->b_hash.le_next = (struct buf *)0; \
173 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
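/*
 * Note on the 0xdeadbeef sentinel used above: BLISTNONE() marks a buffer
 * as being on no hash chain by poisoning b_hash.le_prev, and
 * BHASHENTCHECK() / blistenterhead() / bremhash() panic when they see the
 * sentinel where a live chain pointer is expected (or vice versa), which
 * catches double insertion and removal of unhashed buffers early.
 */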
174
175/*
176 * Insq/Remq for the vnode usage lists.
177 */
178#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
179#define bufremvn(bp) { \
180 LIST_REMOVE(bp, b_vnbufs); \
181 (bp)->b_vnbufs.le_next = NOLIST; \
182}
183
184simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
185
186/* throttle on the number of "in flight" buffer writes per vnode */
187#define BUFWRITE_THROTTLE 9
188
189
190/*
191 * Time in seconds before a buffer on a list is
192 * considered stale
193 */
194#define LRU_IS_STALE 120 /* default value for the LRU */
195#define AGE_IS_STALE 60 /* default value for the AGE */
196#define META_IS_STALE 180 /* default value for the BQ_META */
197
198int lru_is_stale = LRU_IS_STALE;
199int age_is_stale = AGE_IS_STALE;
200int meta_is_stale = META_IS_STALE;
201
202/* LIST_INSERT_HEAD() with assertions */
203static __inline__ void
204blistenterhead(struct bufhashhdr * head, struct buf * bp)
205{
206 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
207 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
208 (head)->lh_first = bp;
209 bp->b_hash.le_prev = &(head)->lh_first;
210 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
211 panic("blistenterhead: le_prev is deadbeef");
212}
213
214static __inline__ void
215binshash(struct buf *bp, struct bufhashhdr *dp)
216{
217 struct buf *nbp;
218
219 simple_lock(&bufhashlist_slock);
220
221#if 0
222 if((bad = incore(bp->b_vp, bp->b_lblkno)))
223 panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);
224#endif /* 0 */
225
226 BHASHENTCHECK(bp);
227
228 nbp = dp->lh_first;
229 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
230 if(nbp == bp)
231 panic("buf already in hashlist");
232 }
233
234 blistenterhead(dp, bp);
235 simple_unlock(&bufhashlist_slock);
236}
237
238static __inline__ void
239bremhash(struct buf *bp)
240{
241 simple_lock(&bufhashlist_slock);
242 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
243 panic("bremhash le_prev is deadbeef");
244 if (bp->b_hash.le_next == bp)
245 panic("bremhash: next points to self");
246
247 if (bp->b_hash.le_next != NULL)
248 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
249 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
250 simple_unlock(&bufhashlist_slock);
251}
252
253/*
254 * Remove a buffer from the free list it's on
255 */
256void
257bremfree(bp)
258 struct buf *bp;
259{
260 struct bqueues *dp = NULL;
261 int whichq = -1;
262
263 /*
264 * We only calculate the head of the freelist when removing
265 * the last element of the list as that is the only time that
266 * it is needed (e.g. to reset the tail pointer).
267 *
268 * NB: This makes an assumption about how tailq's are implemented.
269 */
270 if (bp->b_freelist.tqe_next == NULL) {
271 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
272 if (dp->tqh_last == &bp->b_freelist.tqe_next)
273 break;
274 if (dp == &bufqueues[BQUEUES])
275 panic("bremfree: lost tail");
276 }
277 TAILQ_REMOVE(dp, bp, b_freelist);
278 whichq = bp->b_whichq;
279 bufqdec(whichq);
280 bp->b_whichq = -1;
281 bp->b_timestamp = 0;
282}
283
284/*
285 * Associate a buffer with a vnode.
286 */
287static void
288bgetvp(vp, bp)
289 register struct vnode *vp;
290 register struct buf *bp;
291{
292
293 if (bp->b_vp != vp)
294 panic("bgetvp: not free");
295 VHOLD(vp);
296 bp->b_vp = vp;
297 if (vp->v_type == VBLK || vp->v_type == VCHR)
298 bp->b_dev = vp->v_rdev;
299 else
300 bp->b_dev = NODEV;
301 /*
302 * Insert onto list for new vnode.
303 */
304 bufinsvn(bp, &vp->v_cleanblkhd);
305}
306
307/*
308 * Disassociate a buffer from a vnode.
309 */
310static void
311brelvp(bp)
312 register struct buf *bp;
313{
314 struct vnode *vp;
315
316 if (bp->b_vp == (struct vnode *) 0)
317 panic("brelvp: NULL vp");
318 /*
319 * Delete from old vnode list, if on one.
320 */
321 if (bp->b_vnbufs.le_next != NOLIST)
322 bufremvn(bp);
323 vp = bp->b_vp;
324 bp->b_vp = (struct vnode *) 0;
325 HOLDRELE(vp);
326}
327
328/*
329 * Reassign a buffer from one vnode to another.
330 * Used to assign file specific control information
331 * (indirect blocks) to the vnode to which they belong.
332 */
333void
334reassignbuf(bp, newvp)
335 register struct buf *bp;
336 register struct vnode *newvp;
337{
338 register struct buflists *listheadp;
339
340 if (newvp == NULL) {
341 printf("reassignbuf: NULL");
342 return;
343 }
344 /*
345 * Delete from old vnode list, if on one.
346 */
347 if (bp->b_vnbufs.le_next != NOLIST)
348 bufremvn(bp);
349 /*
350 * If dirty, put on list of dirty buffers;
351 * otherwise insert onto list of clean buffers.
352 */
353 if (ISSET(bp->b_flags, B_DELWRI))
354 listheadp = &newvp->v_dirtyblkhd;
355 else
356 listheadp = &newvp->v_cleanblkhd;
357 bufinsvn(bp, listheadp);
358}
359
360static __inline__ void
361bufhdrinit(struct buf *bp)
362{
363 bzero((char *)bp, sizeof *bp);
364 bp->b_dev = NODEV;
365 bp->b_rcred = NOCRED;
366 bp->b_wcred = NOCRED;
367 bp->b_vnbufs.le_next = NOLIST;
368 bp->b_flags = B_INVAL;
369
370 return;
371}
372
373/*
374 * Initialize buffers and hash links for buffers.
375 */
376__private_extern__ void
377bufinit()
378{
379 register struct buf *bp;
380 register struct bqueues *dp;
381 register int i;
382 int metabuf;
383 long whichq;
384 static void bufzoneinit();
385 static void bcleanbuf_thread_init();
386
387 /* Initialize the buffer queues ('freelists') and the hash table */
388 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
389 TAILQ_INIT(dp);
390 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
391
392 simple_lock_init(&bufhashlist_slock );
393
394 metabuf = nbuf/8; /* reserved for meta buf */
395
396 /* Initialize the buffer headers */
397 for (i = 0; i < nbuf; i++) {
398 bp = &buf[i];
399 bufhdrinit(bp);
400
401 /*
402 * metabuf buffer headers on the meta-data list and
403 * rest of the buffer headers on the empty list
404 */
405 if (--metabuf)
406 whichq = BQ_META;
407 else
408 whichq = BQ_EMPTY;
409
410 BLISTNONE(bp);
411 dp = &bufqueues[whichq];
412 binsheadfree(bp, dp, whichq);
413 binshash(bp, &invalhash);
414 }
415
416 for (; i < nbuf + niobuf; i++) {
417 bp = &buf[i];
418 bufhdrinit(bp);
419 binsheadfree(bp, &iobufqueue, -1);
420 }
421
422 printf("using %d buffer headers and %d cluster IO buffer headers\n",
423 nbuf, niobuf);
424
425 /* Set up zones used by the buffer cache */
426 bufzoneinit();
427
428 /* start the bcleanbuf() thread */
429 bcleanbuf_thread_init();
430
431#if 0 /* notyet */
432 {
433 static void bufq_balance_thread_init();
434 /* create a thread to do dynamic buffer queue balancing */
435 bufq_balance_thread_init();
436 }
437#endif /* notyet */
438}
439
440static struct buf *
441bio_doread(vp, blkno, size, cred, async, queuetype)
442 struct vnode *vp;
443 daddr_t blkno;
444 int size;
445 struct ucred *cred;
446 int async;
447 int queuetype;
448{
449 register struct buf *bp;
450 struct proc *p = current_proc();
451
452 bp = getblk(vp, blkno, size, 0, 0, queuetype);
453
454 /*
455 * If buffer does not have data valid, start a read.
456 * Note that if buffer is B_INVAL, getblk() won't return it.
457 * Therefore, it's valid if its I/O has completed or been delayed.
458 */
459 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
460 /* Start I/O for the buffer (keeping credentials). */
461 SET(bp->b_flags, B_READ | async);
462 if (cred != NOCRED && bp->b_rcred == NOCRED) {
463 /*
464 * NFS has embedded ucred.
465 * Can not crhold() here as that causes zone corruption
466 */
467 bp->b_rcred = crdup(cred);
468 }
469
470 VOP_STRATEGY(bp);
471
472 trace(TR_BREADMISS, pack(vp, size), blkno);
473
474 /* Pay for the read. */
475 if (p && p->p_stats)
476 p->p_stats->p_ru.ru_inblock++; /* XXX */
477 } else if (async) {
478 brelse(bp);
479 }
480
481 trace(TR_BREADHIT, pack(vp, size), blkno);
482
483 return (bp);
484}
485/*
486 * Read a disk block.
487 * This algorithm described in Bach (p.54).
488 */
489int
490bread(vp, blkno, size, cred, bpp)
491 struct vnode *vp;
492 daddr_t blkno;
493 int size;
494 struct ucred *cred;
495 struct buf **bpp;
496{
497 register struct buf *bp;
498
499 /* Get buffer for block. */
500 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
501
502 /* Wait for the read to complete, and return result. */
503 return (biowait(bp));
504}
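/*
 * Illustrative (hypothetical) caller of bread(): a filesystem would
 * typically read a block, use the data, and release the buffer, e.g.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if ((error = bread(vp, blkno, blksize, NOCRED, &bp))) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	// ... use bp->b_data ...
 *	brelse(bp);
 *
 * Here 'blkno' and 'blksize' stand for whatever the caller computed.
 */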
505
506/*
507 * Read a disk block. [bread() for meta-data]
508 * This algorithm described in Bach (p.54).
509 */
510int
511meta_bread(vp, blkno, size, cred, bpp)
512 struct vnode *vp;
513 daddr_t blkno;
514 int size;
515 struct ucred *cred;
516 struct buf **bpp;
517{
518 register struct buf *bp;
519
520 /* Get buffer for block. */
521 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
522
523 /* Wait for the read to complete, and return result. */
524 return (biowait(bp));
525}
526
527/*
528 * Read-ahead multiple disk blocks. The first is sync, the rest async.
529 */
530int
531breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
532 struct vnode *vp;
533 daddr_t blkno; int size;
534 daddr_t rablks[]; int rasizes[];
535 int nrablks;
536 struct ucred *cred;
537 struct buf **bpp;
538{
539 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
540}
541
542/*
543 * Read-ahead multiple disk blocks. The first is sync, the rest async.
544 * [breadn() for meta-data]
545 */
546int
547meta_breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
548 struct vnode *vp;
549 daddr_t blkno; int size;
550 daddr_t rablks[]; int rasizes[];
551 int nrablks;
552 struct ucred *cred;
553 struct buf **bpp;
554{
555 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
556}
557
558/*
559 * Perform the reads for breadn() and meta_breadn().
560 * Trivial modification to the breada algorithm presented in Bach (p.55).
561 */
562static int
563do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, int *rasizes,
564 int nrablks, struct ucred *cred, struct buf **bpp, int queuetype)
565{
566 register struct buf *bp;
567 int i;
568
569 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
570
571 /*
572 * For each of the read-ahead blocks, start a read, if necessary.
573 */
574 for (i = 0; i < nrablks; i++) {
575 /* If it's in the cache, just go on to next one. */
576 if (incore(vp, rablks[i]))
577 continue;
578
579 /* Get a buffer for the read-ahead block */
580 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
581 }
582
583 /* Otherwise, we had to start a read for it; wait until it's valid. */
584 return (biowait(bp));
585}
586
587/*
588 * Read with single-block read-ahead. Defined in Bach (p.55), but
589 * implemented as a call to breadn().
590 * XXX for compatibility with old file systems.
591 */
592int
593breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
594 struct vnode *vp;
595 daddr_t blkno; int size;
596 daddr_t rablkno; int rabsize;
597 struct ucred *cred;
598 struct buf **bpp;
599{
600
601 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
602}
603
604/*
605 * Block write. Described in Bach (p.56)
606 */
607int
608bwrite(bp)
609 struct buf *bp;
610{
611 int rv, sync, wasdelayed;
612 struct proc *p = current_proc();
613 struct vnode *vp = bp->b_vp;
614
615 if (bp->b_data == 0) {
616 if (brecover_data(bp) == 0)
617 return (0);
618 }
619 /* Remember buffer type, to switch on it later. */
620 sync = !ISSET(bp->b_flags, B_ASYNC);
621 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
622 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
623 if (wasdelayed) {
624 nbdwrite--;
625 wakeup((caddr_t)&nbdwrite);
626 }
627
628 if (!sync) {
629 /*
630 * If not synchronous, pay for the I/O operation and make
631 * sure the buf is on the correct vnode queue. We have
632 * to do this now, because if we don't, the vnode may not
633 * be properly notified that its I/O has completed.
634 */
635 if (wasdelayed)
636 reassignbuf(bp, vp);
637 else
638 if (p && p->p_stats)
639 p->p_stats->p_ru.ru_oublock++; /* XXX */
640 }
641
642 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
643
644 /* Initiate disk write. Make sure the appropriate party is charged. */
645 SET(bp->b_flags, B_WRITEINPROG);
646 vp->v_numoutput++;
647
648 VOP_STRATEGY(bp);
649
650 if (sync) {
651 /*
652 * If I/O was synchronous, wait for it to complete.
653 */
654 rv = biowait(bp);
655
656 /*
657 * Pay for the I/O operation, if it's not been paid for, and
658 * make sure it's on the correct vnode queue. (async operations
659 * were paid for above.)
660 */
661 if (wasdelayed)
662 reassignbuf(bp, vp);
663 else
664 if (p && p->p_stats)
665 p->p_stats->p_ru.ru_oublock++; /* XXX */
666
667 /* Release the buffer. */
668 // XXXdbg - only if the unused bit is set
669 if (!ISSET(bp->b_flags, B_NORELSE)) {
670 brelse(bp);
671 } else {
672 CLR(bp->b_flags, B_NORELSE);
673 }
674
675 return (rv);
676 } else {
677 return (0);
678 }
679}
680
681int
682vn_bwrite(ap)
683 struct vop_bwrite_args *ap;
684{
685 return (bwrite(ap->a_bp));
686}
687
688/*
689 * Delayed write.
690 *
691 * The buffer is marked dirty, but is not queued for I/O.
692 * This routine should be used when the buffer is expected
693 * to be modified again soon, typically a small write that
694 * partially fills a buffer.
695 *
696 * NB: magnetic tapes cannot be delayed; they must be
697 * written in the order that the writes are requested.
698 *
699 * Described in Leffler, et al. (pp. 208-213).
700 *
701 * Note: With the ability to allocate additional buffer
702 * headers, we can get into a situation where "too" many
703 * bdwrite()s allow the kernel to create buffers faster
704 * than the disks can service. Doing a bawrite() in
705 * cases where we have "too many" outstanding bdwrite()s avoids that.
706 */
707__private_extern__ int
708bdwrite_internal(bp, return_error)
709 struct buf *bp;
710 int return_error;
711{
712 struct proc *p = current_proc();
713 struct vnode *vp = bp->b_vp;
714
715 /*
716 * If the block hasn't been seen before:
717 * (1) Mark it as having been seen,
718 * (2) Charge for the write.
719 * (3) Make sure it's on its vnode's correct block list,
720 */
721 if (!ISSET(bp->b_flags, B_DELWRI)) {
722 SET(bp->b_flags, B_DELWRI);
723 if (p && p->p_stats)
724 p->p_stats->p_ru.ru_oublock++; /* XXX */
725 nbdwrite ++;
726 reassignbuf(bp, vp);
727 }
728
729 /* If this is a tape block, write the block now. */
730 if (ISSET(bp->b_flags, B_TAPE)) {
731 /* bwrite(bp); */
732 VOP_BWRITE(bp);
733 return (0);
734 }
735
736 /*
737 * If the vnode has "too many" write operations in progress,
738 * wait for them to finish the IO.
739 */
740 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
741 vp->v_flag |= VTHROTTLED;
742 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
743 }
744
745 /*
746 * If we have too many delayed write buffers,
747 * more than we can "safely" handle, just fall back to
748 * doing the async write
749 */
750 if (nbdwrite < 0)
751 panic("bdwrite: Negative nbdwrite");
752
753 // can't do a bawrite() if the LOCKED bit is set because the
754 // buffer is part of a transaction and can't go to disk until
755 // the LOCKED bit is cleared.
756 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
757 if (return_error)
758 return (EAGAIN);
759 else
760 bawrite(bp);
761 return (0);
762 }
763
764 /* Otherwise, the "write" is done, so mark and release the buffer. */
765 SET(bp->b_flags, B_DONE);
766 brelse(bp);
767 return (0);
768}
769
770void
771bdwrite(bp)
772 struct buf *bp;
773{
774 (void) bdwrite_internal(bp, 0);
775}
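/*
 * Sketch of the delayed-write path as implemented above (assuming a
 * hypothetical caller that modified a metadata block in place):
 *
 *	bread(vp, blkno, blksize, NOCRED, &bp);	// bring the block in
 *	// ... modify bp->b_data ...
 *	bdwrite(bp);				// mark B_DELWRI and release
 *
 * The actual disk write happens later: either the buffer is reclaimed
 * (bcleanbuf() queues it on BQ_LAUNDRY), or bdwrite_internal() falls back
 * to bawrite() because nbdwrite exceeded 3/4 of nbuf.
 */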
776
777
778/*
779 * Asynchronous block write; just an asynchronous bwrite().
780 *
781 * Note: With the ability to allocate additional buffer
782 * headers, we can get into a situation where "too" many
783 * bawrite()s allow the kernel to create buffers faster
784 * than the disks can service.
785 * We limit the number of "in flight" writes a vnode can have to
786 * avoid this.
787 */
788static int
789bawrite_internal(bp, throttle)
790 struct buf *bp;
791 int throttle;
792{
793 struct vnode *vp = bp->b_vp;
794
795 if (vp) {
796 /*
797 * If the vnode has "too many" write operations in progress,
798 * wait for them to finish the IO.
799 */
800 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
801 if (throttle) {
802 vp->v_flag |= VTHROTTLED;
803 (void)tsleep((caddr_t)&vp->v_numoutput,
804 PRIBIO + 1, "bawrite", 0);
805 } else
806 return (EWOULDBLOCK);
807 }
808 }
809
810 SET(bp->b_flags, B_ASYNC);
811 VOP_BWRITE(bp);
812 return (0);
813}
814
815void
816bawrite(bp)
817 struct buf *bp;
818{
819 (void) bawrite_internal(bp, 1);
820}
821
822/*
823 * bwillwrite:
824 *
825 * Called prior to the locking of any vnodes when we are expecting to
826 * write. We do not want to starve the buffer cache with too many
827 * dirty buffers so we block here. By blocking prior to the locking
828 * of any vnodes we attempt to avoid the situation where a locked vnode
829 * prevents the various system daemons from flushing related buffers.
830 */
831
832void
833bwillwrite(void)
834{
835 /* XXX To be implemented later */
836}
837
838/*
839 * Release a buffer on to the free lists.
840 * Described in Bach (p. 46).
841 */
842void
843brelse(bp)
844 struct buf *bp;
845{
846 struct bqueues *bufq;
847 int s;
848 long whichq;
849
850 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
851 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
852 bp->b_flags, 0);
853
854 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
855
856 // if we're invalidating a buffer that has the B_CALL bit
857 // set then call the b_iodone function so it gets cleaned
858 // up properly.
859 //
860 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
861 if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
862 panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
863 }
864 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
865 void (*iodone_func)(struct buf *) = bp->b_iodone;
866
867 CLR(bp->b_flags, B_CALL); /* but note callout done */
868 bp->b_iodone = NULL;
869
870 if (iodone_func == NULL) {
871 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
872 }
873 (*iodone_func)(bp);
874 }
875 }
876
877 /* IO is done. Cleanup the UPL state */
878 if (!ISSET(bp->b_flags, B_META)
879 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
880 kern_return_t kret;
881 upl_t upl;
882 int upl_flags;
883
884 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
885 if ( !ISSET(bp->b_flags, B_INVAL)) {
886 kret = ubc_create_upl(bp->b_vp,
887 ubc_blktooff(bp->b_vp, bp->b_lblkno),
888 bp->b_bufsize,
889 &upl,
890 NULL,
891 UPL_PRECIOUS);
892 if (kret != KERN_SUCCESS)
893 panic("brelse: Failed to get pagelists");
894#ifdef UBC_DEBUG
895 upl_ubc_alias_set(upl, bp, 5);
896#endif /* UBC_DEBUG */
897 } else
898 upl = (upl_t) 0;
899 } else {
900 upl = bp->b_pagelist;
901
902 if (bp->b_data) {
903 kret = ubc_upl_unmap(upl);
904
905 if (kret != KERN_SUCCESS)
906 panic("kernel_upl_unmap failed");
907 bp->b_data = 0;
908 }
909 }
910 if (upl) {
911 if (bp->b_flags & (B_ERROR | B_INVAL)) {
912 if (bp->b_flags & (B_READ | B_INVAL))
913 upl_flags = UPL_ABORT_DUMP_PAGES;
914 else
915 upl_flags = 0;
916 ubc_upl_abort(upl, upl_flags);
917 } else {
918 if (ISSET(bp->b_flags, B_NEEDCOMMIT))
919 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
920 else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
921 upl_flags = UPL_COMMIT_SET_DIRTY ;
922 else
923 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
924 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
925 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
926 }
927 s = splbio();
928 CLR(bp->b_flags, B_PAGELIST);
929 bp->b_pagelist = 0;
930 splx(s);
931 }
932 } else {
933 if(ISSET(bp->b_flags, B_PAGELIST))
934 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
935 }
936
937 /* Wake up any processes waiting for any buffer to become free. */
938 if (needbuffer) {
939 needbuffer = 0;
940 wakeup(&needbuffer);
941 }
942
943 /* Wake up any processes waiting for _this_ buffer to become free. */
944 if (ISSET(bp->b_flags, B_WANTED)) {
945 CLR(bp->b_flags, B_WANTED);
946 wakeup(bp);
947 }
948
949 /* Block disk interrupts. */
950 s = splbio();
951
952 /*
953 * Determine which queue the buffer should be on, then put it there.
954 */
955
956 /* If it's locked, don't report an error; try again later. */
957 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
958 CLR(bp->b_flags, B_ERROR);
959
960 /* If it's not cacheable, or an error, mark it invalid. */
961 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
962 SET(bp->b_flags, B_INVAL);
963
964 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
965 /*
966 * If it's invalid or empty, dissociate it from its vnode
967 * and put on the head of the appropriate queue.
968 */
969 if (bp->b_vp)
970 brelvp(bp);
971 if (ISSET(bp->b_flags, B_DELWRI)) {
972 CLR(bp->b_flags, B_DELWRI);
973 nbdwrite--;
974 wakeup((caddr_t)&nbdwrite);
975 }
976 if (bp->b_bufsize <= 0)
977 whichq = BQ_EMPTY; /* no data */
978 else if (ISSET(bp->b_flags, B_META))
979 whichq = BQ_META; /* meta-data */
980 else
981 whichq = BQ_AGE; /* invalid data */
982
983 bufq = &bufqueues[whichq];
984 binsheadfree(bp, bufq, whichq);
985 } else {
986 /*
987 * It has valid data. Put it on the end of the appropriate
988 * queue, so that it'll stick around for as long as possible.
989 */
990 if (ISSET(bp->b_flags, B_LOCKED))
991 whichq = BQ_LOCKED; /* locked in core */
992 else if (ISSET(bp->b_flags, B_META))
993 whichq = BQ_META; /* meta-data */
994 else if (ISSET(bp->b_flags, B_AGE))
995 whichq = BQ_AGE; /* stale but valid data */
996 else
997 whichq = BQ_LRU; /* valid data */
998
999 bufq = &bufqueues[whichq];
1000 binstailfree(bp, bufq, whichq);
1001 }
1002
1003 /* Unlock the buffer. */
1004 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
1005
1006 /* Allow disk interrupts. */
1007 splx(s);
1008
1009 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
1010 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1011}
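/*
 * Summary of the queue selection done above: an invalid or empty buffer
 * goes to the head of BQ_EMPTY, BQ_META or BQ_AGE (and is disassociated
 * from its vnode), while a buffer with valid data goes to the tail of
 * BQ_LOCKED, BQ_META, BQ_AGE or BQ_LRU depending on its flags, so that
 * it ages out as late as possible.
 */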
1012
1013/*
1014 * Determine if a block is in the cache.
1015 * Just look on what would be its hash chain. If it's there, return
1016 * a pointer to it, unless it's marked invalid. If it's marked invalid,
1017 * we normally don't return the buffer, unless the caller explicitly
1018 * wants us to.
1019 */
1020struct buf *
1021incore(vp, blkno)
1022 struct vnode *vp;
1023 daddr_t blkno;
1024{
1025 struct buf *bp;
1026
1027 bp = BUFHASH(vp, blkno)->lh_first;
1028
1029 /* Search hash chain */
1030 for (; bp != NULL; bp = bp->b_hash.le_next) {
1031 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
1032 !ISSET(bp->b_flags, B_INVAL))
1033 return (bp);
1034 }
1035
1036 return (0);
1037}
1038
1039
1040/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
1041/*
1042 * Get a block of requested size that is associated with
1043 * a given vnode and block offset. If it is found in the
1044 * block cache, mark it as having been found, make it busy
1045 * and return it. Otherwise, return an empty block of the
1046 * correct size. It is up to the caller to ensure that the
1047 * cached blocks are of the correct size.
1048 */
1049struct buf *
1050getblk(vp, blkno, size, slpflag, slptimeo, operation)
1051 register struct vnode *vp;
1052 daddr_t blkno;
1053 int size, slpflag, slptimeo, operation;
1054{
1055 struct buf *bp;
1056 int s, err;
1057 upl_t upl;
1058 upl_page_info_t *pl;
1059 kern_return_t kret;
1060 int error=0;
1061 int pagedirty = 0;
1062
1063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
1064 blkno * PAGE_SIZE, size, operation, 0, 0);
1065start:
1066
1067 s = splbio();
1068 if ((bp = incore(vp, blkno))) {
1069 /* Found in the Buffer Cache */
1070 if (ISSET(bp->b_flags, B_BUSY)) {
1071 /* but is busy */
1072 switch (operation) {
1073 case BLK_READ:
1074 case BLK_WRITE:
1075 case BLK_META:
1076 SET(bp->b_flags, B_WANTED);
1077 bufstats.bufs_busyincore++;
1078 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
1079 slptimeo);
1080 splx(s);
1081 /*
1082 * Callers who call with PCATCH or timeout are
1083 * willing to deal with the NULL pointer
1084 */
1085 if (err && ((slpflag & PCATCH) ||
1086 ((err == EWOULDBLOCK) && slptimeo)))
1087 return (NULL);
1088 goto start;
1089 /*NOTREACHED*/
1090 break;
1091
1092 case BLK_PAGEIN:
1093 /* pagein operation must not use getblk */
1094 panic("getblk: pagein for incore busy buffer");
1095 splx(s);
1096 /*NOTREACHED*/
1097 break;
1098
1099 case BLK_PAGEOUT:
1100 /* pageout operation must not use getblk */
1101 panic("getblk: pageout for incore busy buffer");
1102 splx(s);
1103 /*NOTREACHED*/
1104 break;
1105
1106 default:
1107 panic("getblk: %d unknown operation 1", operation);
1108 /*NOTREACHED*/
1109 break;
1110 }
1111 } else {
1112 /* not busy */
1113 SET(bp->b_flags, (B_BUSY | B_CACHE));
1114 bremfree(bp);
1115 bufstats.bufs_incore++;
1116 splx(s);
1117
1118 allocbuf(bp, size);
1119 if (ISSET(bp->b_flags, B_PAGELIST))
1120 panic("pagelist buffer is not busy");
1121
1122 switch (operation) {
1123 case BLK_READ:
1124 case BLK_WRITE:
1125 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
1126 kret = ubc_create_upl(vp,
1127 ubc_blktooff(vp, bp->b_lblkno),
1128 bp->b_bufsize,
1129 &upl,
1130 &pl,
1131 UPL_PRECIOUS);
1132 if (kret != KERN_SUCCESS)
1133 panic("Failed to get pagelists");
1134
1135 SET(bp->b_flags, B_PAGELIST);
1136 bp->b_pagelist = upl;
1137
1138 if (!upl_valid_page(pl, 0)) {
1139 if (vp->v_tag != VT_NFS)
1140 panic("getblk: incore buffer without valid page");
1141 CLR(bp->b_flags, B_CACHE);
1142 }
1143
1144 if (upl_dirty_page(pl, 0))
1145 SET(bp->b_flags, B_WASDIRTY);
1146 else
1147 CLR(bp->b_flags, B_WASDIRTY);
1148
1149 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1150 if (kret != KERN_SUCCESS)
1151 panic("getblk: ubc_upl_map() failed with (%d)",
1152 kret);
1153 if (bp->b_data == 0)
1154 panic("ubc_upl_map mapped 0");
1155 }
1156 break;
1157
1158 case BLK_META:
1159 /*
1160 * VM is not involved in IO for the meta data;
1161 * the buffer already has valid data
1162 */
1163 if(bp->b_data == 0)
1164 panic("bp->b_data null incore buf=%x", bp);
1165 break;
1166
1167 case BLK_PAGEIN:
1168 case BLK_PAGEOUT:
1169 panic("getblk: paging operation 1");
1170 break;
1171
1172 default:
1173 panic("getblk: %d unknown operation 2", operation);
1174 /*NOTREACHED*/
1175 break;
1176 }
1177 }
1178 } else { /* not incore() */
1179 int queue = BQ_EMPTY; /* Start with no preference */
1180 splx(s);
1181
1182 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
1183 !(UBCINFOEXISTS(vp))) {
1184 operation = BLK_META;
1185 }
1186 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
1187 goto start;
1188 if (incore(vp, blkno)) {
1189 SET(bp->b_flags, B_INVAL);
1190 binshash(bp, &invalhash);
1191 brelse(bp);
1192 goto start;
1193 }
1194 /*
1195 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
1196 * CALLED! BE CAREFUL.
1197 */
1198
1199 /*
1200 * if it is meta, the queue may be set to another
1201 * type, so reset it and mark the buffer B_META
1202 * so that when the buffer is released it will go to the META queue.
1203 * Also, if the vnode is not VREG, then it is META
1204 */
1205 if (operation == BLK_META) {
1206 SET(bp->b_flags, B_META);
1207 queue = BQ_META;
1208 }
1209
1210 bp->b_blkno = bp->b_lblkno = blkno;
1211 bp->b_vp = vp;
1212
1213 /*
1214 * Insert in the hash so that incore() can find it
1215 */
1216 binshash(bp, BUFHASH(vp, blkno));
1217
1218 s = splbio();
1219 bgetvp(vp, bp);
1220 splx(s);
1221
1222 allocbuf(bp, size);
1223
1224 switch (operation) {
1225 case BLK_META:
1226 /* buffer data is invalid */
1227
1228 if(bp->b_data == 0)
1229 panic("bp->b_data is null %x",bp);
1230
1231 bufstats.bufs_miss++;
1232
1233 /* wakeup the buffer */
1234 CLR(bp->b_flags, B_WANTED);
1235 wakeup(bp);
1236 break;
1237
1238 case BLK_READ:
1239 case BLK_WRITE:
1240
1241 if (ISSET(bp->b_flags, B_PAGELIST))
1242 panic("B_PAGELIST in bp=%x",bp);
1243
1244 kret = ubc_create_upl(vp,
1245 ubc_blktooff(vp, blkno),
1246 bp->b_bufsize,
1247 &upl,
1248 &pl,
1249 UPL_PRECIOUS);
1250 if (kret != KERN_SUCCESS)
1251 panic("Failed to get pagelists");
1252
1253#ifdef UBC_DEBUG
1254 upl_ubc_alias_set(upl, bp, 4);
1255#endif /* UBC_DEBUG */
1256 bp->b_pagelist = upl;
1257
1258 SET(bp->b_flags, B_PAGELIST);
1259
1260 if (upl_valid_page(pl, 0)) {
1261 SET(bp->b_flags, B_CACHE | B_DONE);
1262 bufstats.bufs_vmhits++;
1263
1264 pagedirty = upl_dirty_page(pl, 0);
1265
1266 if (pagedirty)
1267 SET(bp->b_flags, B_WASDIRTY);
1268
1269 if (vp->v_tag == VT_NFS) {
1270 off_t f_offset;
1271 int valid_size;
1272
1273 bp->b_validoff = 0;
1274 bp->b_dirtyoff = 0;
1275
1276 f_offset = ubc_blktooff(vp, blkno);
1277
1278 if (f_offset > vp->v_ubcinfo->ui_size) {
1279 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1280 bp->b_validend = 0;
1281 bp->b_dirtyend = 0;
1282 } else {
1283 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1284 bp->b_validend = valid_size;
1285
1286 if (pagedirty)
1287 bp->b_dirtyend = valid_size;
1288 else
1289 bp->b_dirtyend = 0;
1290
1291 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1292 bp->b_validend, bp->b_dirtyend,
1293 (int)vp->v_ubcinfo->ui_size, 0, 0);
1294 }
1295 } else {
1296 bp->b_validoff = 0;
1297 bp->b_dirtyoff = 0;
1298
1299 if (pagedirty) {
1300 /* page is dirty */
1301 bp->b_validend = bp->b_bcount;
1302 bp->b_dirtyend = bp->b_bcount;
1303 } else {
1304 /* page is clean */
1305 bp->b_validend = bp->b_bcount;
1306 bp->b_dirtyend = 0;
1307 }
1308 }
1309 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
1310 if(error) {
1311 panic("getblk: VOP_BMAP failed");
1312 /*NOTREACHED*/
1313 /*
1314 * XXX: We probably should invalidate the VM Page
1315 */
1316 bp->b_error = error;
1317 SET(bp->b_flags, (B_ERROR | B_INVAL));
1318 /* undo B_DONE that was set before upl_commit() */
1319 CLR(bp->b_flags, B_DONE);
1320 brelse(bp);
1321 return (0);
1322 }
1323 } else {
1324 bufstats.bufs_miss++;
1325 }
1326 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1327 if (kret != KERN_SUCCESS) {
1328 panic("getblk: ubc_upl_map() "
1329 "failed with (%d)", kret);
1330 }
1331 if (bp->b_data == 0)
1332 panic("kernel_upl_map mapped 0");
1333
1334 break;
1335
1336 case BLK_PAGEIN:
1337 case BLK_PAGEOUT:
1338 panic("getblk: paging operation 2");
1339 break;
1340 default:
1341 panic("getblk: %d unknown operation 3", operation);
1342 /*NOTREACHED*/
1343 break;
1344 }
1345 }
1346
1347 if (bp->b_data == NULL)
1348 panic("getblk: bp->b_addr is null");
1349
1350 if (bp->b_bufsize & 0xfff) {
1351 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1352 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1353 }
1354
1355 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1356 (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1357
1358 return (bp);
1359}
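/*
 * Illustrative (hypothetical) getblk() usage: a caller that wants to
 * overwrite a whole block without reading it first might do
 *
 *	bp = getblk(vp, blkno, blksize, 0, 0, BLK_WRITE);
 *	// ... fill bp->b_data ...
 *	bwrite(bp);		// or bdwrite(bp) / bawrite(bp)
 *
 * whereas bio_doread() above passes BLK_READ or BLK_META and only issues
 * VOP_STRATEGY() when the returned buffer is not already B_DONE or B_DELWRI.
 */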
1360
1361/*
1362 * Get an empty, disassociated buffer of given size.
1363 */
1364struct buf *
1365geteblk(size)
1366 int size;
1367{
1368 struct buf *bp;
1369 int queue = BQ_EMPTY;
1370
1371 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1372 ;
1373 SET(bp->b_flags, (B_META|B_INVAL));
1374
1375#if DIAGNOSTIC
1376 assert(queue == BQ_EMPTY);
1377#endif /* DIAGNOSTIC */
1378 /* XXX need to implement logic to deal with other queues */
1379
1380 binshash(bp, &invalhash);
1381 allocbuf(bp, size);
1382 bufstats.bufs_eblk++;
1383
1384 return (bp);
1385}
1386
1387/*
1388 * Zones for the meta data buffers
1389 */
1390
1391#define MINMETA 512
1392#define MAXMETA 4096
1393
1394struct meta_zone_entry {
1395 zone_t mz_zone;
1396 vm_size_t mz_size;
1397 vm_size_t mz_max;
1398 char *mz_name;
1399};
1400
1401struct meta_zone_entry meta_zones[] = {
1402 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1403 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1404 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1405 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1406 {NULL, 0, 0, "" } /* End */
1407};
1408
1409/*
1410 * Initialize the meta data zones
1411 */
1412static void
1413bufzoneinit(void)
1414{
1415 int i;
1416
1417 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1418 meta_zones[i].mz_zone =
1419 zinit(meta_zones[i].mz_size,
1420 meta_zones[i].mz_max,
1421 PAGE_SIZE,
1422 meta_zones[i].mz_name);
1423 }
1424 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1425}
1426
1427static __inline__ zone_t
1428getbufzone(size_t size)
1429{
1430 int i;
1431
1432 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
1433 panic("getbufzone: incorect size = %d", size);
1434
1435 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1436 if (meta_zones[i].mz_size >= size)
1437 break;
1438 }
1439
1440 return (meta_zones[i].mz_zone);
1441}
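/*
 * For example (a sketch of the mapping implemented above): a 512-byte
 * request comes from the "buf.512" zone, 1024 from "buf.1024", and a
 * 1536-byte request falls through to the first zone whose element size
 * is >= 1536, i.e. "buf.2048". Sizes above MAXMETA are rejected with a
 * panic here; allocbuf() uses kmem_alloc() for those instead.
 */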
1442
1443/*
1444 * With UBC, there is no need to expand / shrink the file data
1445 * buffer. The VM uses the same pages, hence no waste.
1446 * All the file data buffers can have one size.
1447 * In fact expand / shrink would be an expensive operation.
1448 *
1449 * Only exception to this is meta-data buffers. Most of the
1450 * meta data operations are smaller than PAGE_SIZE. Having the
1451 * meta-data buffers grow and shrink as needed optimizes use
1452 * of the kernel wired memory.
1453 */
1454
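/*
 * Size policy implemented by allocbuf() below: file data buffers are
 * always rounded up to at least PAGE_SIZE (the pages are shared with the
 * VM object, so nothing is wasted), while B_META buffers are rounded to a
 * multiple of MINMETA and carved out of the zones above when they fit in
 * MAXMETA, falling back to kmem_alloc() for larger requests.
 */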
1455int
1456allocbuf(bp, size)
1457 struct buf *bp;
1458 int size;
1459{
1460 vm_size_t desired_size;
1461
1462 desired_size = roundup(size, CLBYTES);
1463
1464 if(desired_size < PAGE_SIZE)
1465 desired_size = PAGE_SIZE;
1466 if (desired_size > MAXBSIZE)
1467 panic("allocbuf: buffer larger than MAXBSIZE requested");
1468
1469 if (ISSET(bp->b_flags, B_META)) {
1470 kern_return_t kret;
1471 zone_t zprev, z;
1472 size_t nsize = roundup(size, MINMETA);
1473
1474 if (bp->b_data) {
1475 vm_offset_t elem = (vm_offset_t)bp->b_data;
1476
1477 if (ISSET(bp->b_flags, B_ZALLOC))
1478 if (bp->b_bufsize <= MAXMETA) {
1479 if (bp->b_bufsize < nsize) {
1480 /* reallocate to a bigger size */
1481
1482 zprev = getbufzone(bp->b_bufsize);
1483 if (nsize <= MAXMETA) {
1484 desired_size = nsize;
1485 z = getbufzone(nsize);
1486 bp->b_data = (caddr_t)zalloc(z);
1487 if(bp->b_data == 0)
1488 panic("allocbuf: zalloc() returned NULL");
1489 } else {
1490 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1491 if (kret != KERN_SUCCESS)
1492 panic("allocbuf: kmem_alloc() 0 returned %d", kret);
1493 if(bp->b_data == 0)
1494 panic("allocbuf: null b_data 0");
1495 CLR(bp->b_flags, B_ZALLOC);
1496 }
1497 bcopy((const void *)elem, bp->b_data, bp->b_bufsize);
1498 zfree(zprev, elem);
1499 } else {
1500 desired_size = bp->b_bufsize;
1501 }
1502 } else
1503 panic("allocbuf: B_ZALLOC set incorrectly");
1504 else
1505 if (bp->b_bufsize < desired_size) {
1506 /* reallocate to a bigger size */
1507 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1508 if (kret != KERN_SUCCESS)
1509 panic("allocbuf: kmem_alloc() returned %d", kret);
1510 if(bp->b_data == 0)
1511 panic("allocbuf: null b_data");
1512 bcopy((const void *)elem, bp->b_data, bp->b_bufsize);
1513 kmem_free(kernel_map, elem, bp->b_bufsize);
1514 } else {
1515 desired_size = bp->b_bufsize;
1516 }
1517 } else {
1518 /* new allocation */
1519 if (nsize <= MAXMETA) {
1520 desired_size = nsize;
1521 z = getbufzone(nsize);
1522 bp->b_data = (caddr_t)zalloc(z);
1523 if(bp->b_data == 0)
1524 panic("allocbuf: zalloc() returned NULL 2");
1525 SET(bp->b_flags, B_ZALLOC);
1526 } else {
1527 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1528 if (kret != KERN_SUCCESS)
1529 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1530 if(bp->b_data == 0)
1531 panic("allocbuf: null b_data 2");
1532 }
1533 }
1534 }
1535
1536 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1537 panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);
1538
1539 bp->b_bufsize = desired_size;
1540 bp->b_bcount = size;
1541 return (0);
1542}
1543
1544/*
1545 * Get a new buffer from one of the free lists.
1546 *
1547 * A request for a queue is passed in. The queue from which the buffer
1548 * was taken is returned. Out of range queue requests get BQ_EMPTY. A request
1549 * for BQUEUES means no preference. Use heuristics in that case.
1550 * The heuristic is as follows:
1551 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1552 * If none are available, block till one is made available.
1553 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
1554 * Pick the most stale buffer.
1555 * If the found buffer was marked for delayed write, start the async write
1556 * and restart the search.
1557 * Initialize the fields and disassociate the buffer from the vnode.
1558 * Remove the buffer from the hash. Return the buffer and the queue
1559 * on which it was found.
1560 */
1561
1562static struct buf *
1563getnewbuf(slpflag, slptimeo, queue)
1564 int slpflag, slptimeo;
1565 int *queue;
1566{
1567 register struct buf *bp;
1568 register struct buf *lru_bp;
1569 register struct buf *age_bp;
1570 register struct buf *meta_bp;
1571 register int age_time, lru_time, bp_time, meta_time;
1572 int s;
1573 int req = *queue; /* save it for restarts */
1574
1575start:
1576 s = splbio();
1577
1578 /* invalid request gets empty queue */
1579 if ((*queue > BQUEUES) || (*queue < 0)
1580 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1581 *queue = BQ_EMPTY;
1582
1583 /* (*queue == BQUEUES) means no preference */
1584 if (*queue != BQUEUES) {
1585 /* Try for the requested queue first */
1586 bp = bufqueues[*queue].tqh_first;
1587 if (bp)
1588 goto found;
1589 }
1590
1591 /* Unable to use requested queue */
1592 age_bp = bufqueues[BQ_AGE].tqh_first;
1593 lru_bp = bufqueues[BQ_LRU].tqh_first;
1594 meta_bp = bufqueues[BQ_META].tqh_first;
1595
1596 if (!age_bp && !lru_bp && !meta_bp) {
1597 /*
1598 * Unavailable on AGE, LRU, or META queues
1599 * Try the empty list first
1600 */
1601 bp = bufqueues[BQ_EMPTY].tqh_first;
1602 if (bp) {
1603 *queue = BQ_EMPTY;
1604 goto found;
1605 }
1606
1607 /* Create a new temporary buffer header */
1608 bp = (struct buf *)zalloc(buf_hdr_zone);
1609
1610 if (bp) {
1611 bufhdrinit(bp);
1612 BLISTNONE(bp);
1613 binshash(bp, &invalhash);
1614 SET(bp->b_flags, B_HDRALLOC);
1615 *queue = BQ_EMPTY;
1616 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1617 buf_hdr_count++;
1618 goto found;
1619 }
1620
1621 /* Log this error condition */
1622 printf("getnewbuf: No useful buffers");
1623
1624 /* wait for a free buffer of any kind */
1625 needbuffer = 1;
1626 bufstats.bufs_sleeps++;
1627 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1628 splx(s);
1629 return (0);
1630 }
1631
1632 /* Buffer available either on AGE or LRU or META */
1633 bp = NULL;
1634 *queue = -1;
1635
1636 /* Buffer available either on AGE or LRU */
1637 if (!age_bp) {
1638 bp = lru_bp;
1639 *queue = BQ_LRU;
1640 } else if (!lru_bp) {
1641 bp = age_bp;
1642 *queue = BQ_AGE;
1643 } else { /* buffer available on both AGE and LRU */
1644 age_time = time.tv_sec - age_bp->b_timestamp;
1645 lru_time = time.tv_sec - lru_bp->b_timestamp;
1646 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1647 bp = age_bp;
1648 *queue = BQ_AGE;
1649 /*
1650 * we should probably re-timestamp everything in the
1651 * queues at this point with the current time
1652 */
1653 } else {
1654 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1655 bp = lru_bp;
1656 *queue = BQ_LRU;
1657 } else {
1658 bp = age_bp;
1659 *queue = BQ_AGE;
1660 }
1661 }
1662 }
1663
1664 if (!bp) { /* Neither on AGE nor on LRU */
1665 bp = meta_bp;
1666 *queue = BQ_META;
1667 } else if (meta_bp) {
1668 bp_time = time.tv_sec - bp->b_timestamp;
1669 meta_time = time.tv_sec - meta_bp->b_timestamp;
1670
1671 if (!(bp_time < 0) && !(meta_time < 0)) {
1672 /* time not set backwards */
1673 int bp_is_stale;
1674 bp_is_stale = (*queue == BQ_LRU) ?
1675 lru_is_stale : age_is_stale;
1676
1677 if ((meta_time >= meta_is_stale) &&
1678 (bp_time < bp_is_stale)) {
1679 bp = meta_bp;
1680 *queue = BQ_META;
1681 }
1682 }
1683 }
1684
1685 if (bp == NULL)
1686 panic("getnewbuf: null bp");
1687
1688found:
1689 if (ISSET(bp->b_flags, B_LOCKED)) {
1690 panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
1691 }
1692
1693 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1694 panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);
1695
1696 if(ISSET(bp->b_flags, B_BUSY))
1697 panic("getnewbuf reusing BUSY buf @ 0x%x", bp);
1698
1699 /* Clean it */
1700 if (bcleanbuf(bp)) {
1701 /* bawrite() issued, buffer not ready */
1702 splx(s);
1703 *queue = req;
1704 goto start;
1705 }
1706 splx(s);
1707 return (bp);
1708}
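/*
 * Illustrative (hypothetical) use of the queue protocol described above:
 *
 *	int queue = BQ_EMPTY;			// or another preference
 *	struct buf *bp;
 *
 *	while ((bp = getnewbuf(0, 0, &queue)) == NULL)
 *		;				// sleeping is handled inside
 *	// 'queue' now names the queue the buffer actually came from
 *
 * geteblk() below uses exactly this loop; getblk() restarts its lookup
 * from the top instead.
 */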
1709
1710#include <mach/mach_types.h>
1711#include <mach/memory_object_types.h>
1712#include <kern/sched_prim.h>
1713
1714/*
1715 * Clean a buffer.
1716 * Returns 0 if the buffer is ready to use,
1717 * Returns 1 if it issued a bawrite() to indicate
1718 * that the buffer is not ready.
1719 */
1720static int
1721bcleanbuf(struct buf *bp)
1722{
1723 int s;
1724 struct ucred *cred;
1725 int hdralloc = 0;
1726
1727 s = splbio();
1728
1729 /* Remove from the queue */
1730 bremfree(bp);
1731
1732 /* Buffer is no longer on free lists. */
1733 SET(bp->b_flags, B_BUSY);
1734
1735 /* Check whether the buffer header was "allocated" */
1736 if (ISSET(bp->b_flags, B_HDRALLOC))
1737 hdralloc = 1;
1738
1739 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1740 panic("bcleanbuf: le_prev is deadbeef");
1741
1742 /*
1743 * If buffer was a delayed write, start the IO by queuing
1744 * it on the LAUNDRY queue, and return 1
1745 */
1746 if (ISSET(bp->b_flags, B_DELWRI)) {
1747 splx(s);
1748 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1749 blaundrycnt++;
1750 wakeup(&blaundrycnt);
1751 /* and give it a chance to run */
1752 (void)thread_block(THREAD_CONTINUE_NULL);
1753 return (1);
1754 }
1755
1756 if (bp->b_vp)
1757 brelvp(bp);
1758 bremhash(bp);
1759 BLISTNONE(bp);
1760
1761 splx(s);
1762
1763 if (ISSET(bp->b_flags, B_META)) {
1764 vm_offset_t elem = (vm_offset_t)bp->b_data;
1765 if (elem == 0)
1766 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1767
1768 if (ISSET(bp->b_flags, B_ZALLOC)) {
1769 if (bp->b_bufsize <= MAXMETA) {
1770 zone_t z;
1771
1772 z = getbufzone(bp->b_bufsize);
1773 bp->b_data = (caddr_t)0xdeadbeef;
1774 zfree(z, elem);
1775 CLR(bp->b_flags, B_ZALLOC);
1776 } else
1777 panic("bcleanbuf: B_ZALLOC set incorrectly");
1778 } else {
1779 bp->b_data = (caddr_t)0xdeadbeef;
1780 kmem_free(kernel_map, elem, bp->b_bufsize);
1781 }
1782 }
1783
1784 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1785
1786 /* disassociate us from our vnode, if we had one... */
1787 s = splbio();
1788
1789 /* clear out various other fields */
1790 bp->b_bufsize = 0;
1791 bp->b_data = 0;
1792 bp->b_flags = B_BUSY;
1793 if (hdralloc)
1794 SET(bp->b_flags, B_HDRALLOC);
1795 bp->b_dev = NODEV;
1796 bp->b_blkno = bp->b_lblkno = 0;
1797 bp->b_iodone = 0;
1798 bp->b_error = 0;
1799 bp->b_resid = 0;
1800 bp->b_bcount = 0;
1801 bp->b_dirtyoff = bp->b_dirtyend = 0;
1802 bp->b_validoff = bp->b_validend = 0;
1803
1804 /* nuke any credentials we were holding */
1805 cred = bp->b_rcred;
1806 if (cred != NOCRED) {
1807 bp->b_rcred = NOCRED;
1808 crfree(cred);
1809 }
1810 cred = bp->b_wcred;
1811 if (cred != NOCRED) {
1812 bp->b_wcred = NOCRED;
1813 crfree(cred);
1814 }
1815 splx(s);
1816 return (0);
1817}
1818
1819
1820/*
1821 * Wait for operations on the buffer to complete.
1822 * When they do, extract and return the I/O's error value.
1823 */
1824int
1825biowait(bp)
1826 struct buf *bp;
1827{
1828 int s;
1829
1830 s = splbio();
1831 while (!ISSET(bp->b_flags, B_DONE))
1832 tsleep(bp, PRIBIO + 1, "biowait", 0);
1833 splx(s);
1834
1835 /* check for interruption of I/O (e.g. via NFS), then errors. */
1836 if (ISSET(bp->b_flags, B_EINTR)) {
1837 CLR(bp->b_flags, B_EINTR);
1838 return (EINTR);
1839 } else if (ISSET(bp->b_flags, B_ERROR))
1840 return (bp->b_error ? bp->b_error : EIO);
1841 else
1842 return (0);
1843}
1844
1845/*
1846 * Mark I/O complete on a buffer.
1847 *
1848 * If a callback has been requested, e.g. the pageout
1849 * daemon, do so. Otherwise, awaken waiting processes.
1850 *
1851 * [ Leffler, et al., says on p.247:
1852 * "This routine wakes up the blocked process, frees the buffer
1853 * for an asynchronous write, or, for a request by the pagedaemon
1854 * process, invokes a procedure specified in the buffer structure" ]
1855 *
1856 * In real life, the pagedaemon (or other system processes) wants
1857 * to do async stuff too, and doesn't want the buffer brelse()'d.
1858 * (for swap pager, that puts swap buffers on the free lists (!!!),
1859 * for the vn device, that puts malloc'd buffers on the free lists!)
1860 */
1861void
1862biodone(bp)
1863 struct buf *bp;
1864{
1865 boolean_t funnel_state;
1866 struct vnode *vp;
1867 extern struct timeval priority_IO_timestamp_for_root;
1868 extern int hard_throttle_on_root;
1869
1870 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1871
1872 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1873 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1874
1875 if (ISSET(bp->b_flags, B_DONE))
1876 panic("biodone already");
1877 SET(bp->b_flags, B_DONE); /* note that it's done */
1878 /*
1879 * I/O was done, so don't believe
1880 * the DIRTY state from VM anymore
1881 */
1882 CLR(bp->b_flags, B_WASDIRTY);
1883
1884 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1885 vwakeup(bp); /* wake up reader */
1886
1887 if (kdebug_enable) {
1888 int code = DKIO_DONE;
1889
1890 if (bp->b_flags & B_READ)
1891 code |= DKIO_READ;
1892 if (bp->b_flags & B_ASYNC)
1893 code |= DKIO_ASYNC;
1894
1895 if (bp->b_flags & B_META)
1896 code |= DKIO_META;
1897 else if (bp->b_flags & (B_PGIN | B_PAGEOUT))
1898 code |= DKIO_PAGING;
1899
1900 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1901 (unsigned int)bp, (unsigned int)bp->b_vp,
1902 bp->b_resid, bp->b_error, 0);
1903 }
1904
1905 /* Wakeup the throttled write operations as needed */
1906 vp = bp->b_vp;
1907 if (vp
1908 && (vp->v_flag & VTHROTTLED)
1909 && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
1910 vp->v_flag &= ~VTHROTTLED;
1911 wakeup((caddr_t)&vp->v_numoutput);
1912 }
1913 if ((bp->b_flags & B_PGIN) && (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
1914 priority_IO_timestamp_for_root = time;
1915 hard_throttle_on_root = 0;
1916 }
1917 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1918 void (*iodone_func)(struct buf *) = bp->b_iodone;
1919
1920 CLR(bp->b_flags, B_CALL); /* but note callout done */
1921 bp->b_iodone = NULL;
1922
1923 if (iodone_func == NULL) {
1924 panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
1925 } else {
1926 (*iodone_func)(bp);
1927 }
1928 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1929 brelse(bp);
1930 else { /* or just wakeup the buffer */
1931 CLR(bp->b_flags, B_WANTED);
1932 wakeup(bp);
1933 }
1934
1935 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1936 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1937
1938 thread_funnel_set(kernel_flock, funnel_state);
1939}
1940
1941/*
1942 * Return a count of buffers on the "locked" queue.
1943 */
1944int
1945count_lock_queue()
1946{
1947 register struct buf *bp;
1948 register int n = 0;
1949
1950 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1951 bp = bp->b_freelist.tqe_next)
1952 n++;
1953 return (n);
1954}
1955
1956/*
1957 * Return a count of 'busy' buffers. Used at the time of shutdown.
1958 */
1959int
1960count_busy_buffers()
1961{
1962 register struct buf *bp;
1963 register int nbusy = 0;
1964
1965 for (bp = &buf[nbuf]; --bp >= buf; )
1966 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1967 nbusy++;
1968 return (nbusy);
1969}
1970
1971#if DIAGNOSTIC
1972/*
1973 * Print out statistics on the current allocation of the buffer pool.
1974 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1975 * in vfs_syscalls.c using sysctl.
1976 */
1977void
1978vfs_bufstats()
1979{
1980 int s, i, j, count;
1981 register struct buf *bp;
1982 register struct bqueues *dp;
1983 int counts[MAXBSIZE/CLBYTES+1];
1984 static char *bname[BQUEUES] =
1985 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1986
1987 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1988 count = 0;
1989 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1990 counts[j] = 0;
1991 s = splbio();
1992 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1993 counts[bp->b_bufsize/CLBYTES]++;
1994 count++;
1995 }
1996 splx(s);
1997 printf("%s: total-%d", bname[i], count);
1998 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1999 if (counts[j] != 0)
2000 printf(", %d-%d", j * CLBYTES, counts[j]);
2001 printf("\n");
2002 }
2003}
2004#endif /* DIAGNOSTIC */
2005
2006#define NRESERVEDIOBUFS 64
2007
2008__private_extern__ struct buf *
2009alloc_io_buf(vp, priv)
2010 struct vnode *vp;
2011 int priv;
2012{
2013 register struct buf *bp;
2014 int s;
2015
2016 s = splbio();
2017
2018 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
2019 need_iobuffer = 1;
2020 bufstats.bufs_iobufsleeps++;
2021 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
2022 }
2023
2024 while ((bp = iobufqueue.tqh_first) == NULL) {
2025 need_iobuffer = 1;
2026 bufstats.bufs_iobufsleeps++;
2027 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
2028 }
2029
2030 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
2031 bp->b_timestamp = 0;
2032
2033 /* clear out various fields */
2034 bp->b_flags = B_BUSY;
2035 bp->b_blkno = bp->b_lblkno = 0;
2036
2037 bp->b_iodone = 0;
2038 bp->b_error = 0;
2039 bp->b_resid = 0;
2040 bp->b_bcount = 0;
2041 bp->b_bufsize = 0;
2042 bp->b_vp = vp;
2043
2044 if (vp->v_type == VBLK || vp->v_type == VCHR)
2045 bp->b_dev = vp->v_rdev;
2046 else
2047 bp->b_dev = NODEV;
2048 bufstats.bufs_iobufinuse++;
2049 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
2050 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
2051 splx(s);
2052
2053 return (bp);
2054}
2055
2056__private_extern__ void
2057free_io_buf(bp)
2058 struct buf *bp;
2059{
2060 int s;
2061
2062 s = splbio();
2063 /* put buffer back on the head of the iobufqueue */
2064 bp->b_vp = NULL;
2065 bp->b_flags = B_INVAL;
2066
2067 binsheadfree(bp, &iobufqueue, -1);
2068
2069 /* Wake up any processes waiting for any buffer to become free. */
2070 if (need_iobuffer) {
2071 need_iobuffer = 0;
2072 wakeup(&need_iobuffer);
2073 }
2074 bufstats.bufs_iobufinuse--;
2075 splx(s);
2076}
2077
2078/* disabled for now */
2079
2080/* XXX move this to a separate file */
2081/*
2082 * Dynamic Scaling of the Buffer Queues
2083 */
2084
2085typedef long long blsize_t;
2086
2087blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
2088/* Global tunable limits */
2089blsize_t nbufh; /* number of buffer headers */
2090blsize_t nbuflow; /* minimum number of buffer headers required */
2091blsize_t nbufhigh; /* maximum number of buffer headers allowed */
2092blsize_t nbuftarget; /* preferred number of buffer headers */
2093
2094/*
2095 * assertions:
2096 *
2097 * 1. 0 < nbuflow <= nbufh <= nbufhigh
2098 * 2. nbufhigh <= MAXNBUF
2099 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
2100 * 4. nbufh can not be set by sysctl().
2101 */
2102
2103/* Per queue tunable limits */
2104
2105struct bufqlim {
2106 blsize_t bl_nlow; /* minimum number of buffer headers required */
2107 blsize_t bl_num; /* number of buffer headers on the queue */
2108 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
2109 blsize_t bl_target; /* preferred number of buffer headers */
2110 long bl_stale; /* Seconds after which a buffer is considered stale */
2111} bufqlim[BQUEUES];
2112
2113/*
2114 * assertions:
2115 *
2116 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
2117 * 2. bl_nlhigh <= MAXNBUF
2118 * 3. bufqlim[BQ_META].bl_nlow != 0
2119 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
2120 * file system IO operations)
2121 * 5. bl_num can not be set by sysctl().
2122 * 6. bl_nlhigh <= nbufhigh
2123 */
2124
2125/*
2126 * Rationale:
2127 * ----------
2128 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
2129 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
2130 *
2131 * These limits are exported to user space by means of sysctl().
2132 * It was decided to define blsize_t as a 64-bit quantity instead.
2133 * This makes sure that we will not be required to change it
2134 * as long as we do not exceed a 64-bit address space for the kernel.
2135 *
2136 * The low and high limits are initialized at compile time,
2137 * and boot arguments can be used to override them. sysctl()
2138 * does not change them; it can get all the values
2139 * but can set only the target. num is the current level.
2140 *
2141 * Advantages of having a "bufqscan" thread doing the balancing:
2142 * Keeps enough bufs on BQ_EMPTY.
2143 * getnewbuf() by default will always select a buffer from BQ_EMPTY;
2144 * getnewbuf() performs best if a buffer is found there.
2145 * This also minimizes the possibility of starting IO
2146 * from getnewbuf(), which is a performance win, too.
2147 *
2148 * Localizes the complex logic [balancing as well as time aging]
2149 * to balancebufq().
2150 *
2151 * Simplifies getnewbuf() logic by eliminating the time-aging code.
2152 */
2153
2154/*
2155 * Algorithm:
2156 * -----------
2157 * The goal of the dynamic scaling of the buffer queues is to keep
2158 * the size of the LRU close to bl_target. Buffers on a queue would
2159 * be time aged.
2160 *
2161 * There would be a thread which will be responsible for "balancing"
2162 * the buffer cache queues.
2163 *
2164 * The scan order would be: AGE, LRU, META, EMPTY.
2165 */
2166
2167long bufqscanwait = 0;
2168
2169static void bufqscan_thread();
2170static int balancebufq(int q);
2171static int btrimempty(int n);
2172static __inline__ int initbufqscan(void);
2173static __inline__ int nextbufq(int q);
2174static void buqlimprt(int all);
2175
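/*
 * Initialize the global and per-queue buffer header limits (first call
 * only) and create the bufqscan worker thread that does the balancing.
 */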
2176static void
2177bufq_balance_thread_init()
2178{
2179
2180 if (bufqscanwait++ == 0) {
2181
2182 /* Initialize globals */
2183 MAXNBUF = (sane_size / PAGE_SIZE);
2184 nbufh = nbuf;
2185 nbuflow = min(nbufh, 100);
2186 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
2187 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
2188 nbuftarget = max(nbuflow, nbuftarget);
2189 nbuftarget = min(nbufhigh, nbuftarget);
2190
2191 /*
2192 * Initialize the bufqlim
2193 */
2194
2195 /* LOCKED queue */
2196 bufqlim[BQ_LOCKED].bl_nlow = 0;
2197 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2198 bufqlim[BQ_LOCKED].bl_target = 0;
2199 bufqlim[BQ_LOCKED].bl_stale = 30;
2200
2201 /* LRU queue */
2202 bufqlim[BQ_LRU].bl_nlow = 0;
2203 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
2204 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
2205 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
2206
2207 /* AGE queue */
2208 bufqlim[BQ_AGE].bl_nlow = 0;
2209 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
2210 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
2211 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
2212
2213 /* EMPTY queue */
2214 bufqlim[BQ_EMPTY].bl_nlow = 0;
2215 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
2216 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
2217 bufqlim[BQ_EMPTY].bl_stale = 600000;
2218
2219 /* META queue */
2220 bufqlim[BQ_META].bl_nlow = 0;
2221 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
2222 bufqlim[BQ_META].bl_target = nbuftarget/4;
2223 bufqlim[BQ_META].bl_stale = META_IS_STALE;
2224
2225 /* LAUNDRY queue */
2226 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
2227 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
2228 bufqlim[BQ_LAUNDRY].bl_target = 0;
2229 bufqlim[BQ_LAUNDRY].bl_stale = 30;
2230
2231 buqlimprt(1);
2232 }
2233
2234 /* create worker thread */
2235 kernel_thread(kernel_task, bufqscan_thread);
2236}
2237
2238/* The workloop for the buffer balancing thread */
2239static void
2240bufqscan_thread()
2241{
2242 boolean_t funnel_state;
2243 int moretodo = 0;
2244
2245 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2246
2247 for(;;) {
2248 do {
2249 int q; /* buffer queue to process */
2250
2251 q = initbufqscan();
2252 for (; q; ) {
2253 moretodo |= balancebufq(q);
2254 q = nextbufq(q);
2255 }
2256 } while (moretodo);
2257
2258#if DIAGNOSTIC
2259 vfs_bufstats();
2260 buqlimprt(0);
2261#endif
2262 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2263 moretodo = 0;
2264 }
2265
2266 (void) thread_funnel_set(kernel_flock, FALSE);
2267}
2268
2269/* Seed for the buffer queue balancing */
2270static __inline__ int
2271initbufqscan()
2272{
2273 /* Start with AGE queue */
2274 return (BQ_AGE);
2275}
2276
2277/* Pick next buffer queue to balance */
2278static __inline__ int
2279nextbufq(int q)
2280{
2281 /* Scan order: AGE, LRU, META, EMPTY; a return of 0 ends the scan */
2282 if (q == BQ_AGE) return (BQ_LRU);
2283 if (q == BQ_LRU) return (BQ_META);
2284 if (q == BQ_META) return (BQ_EMPTY);
2285 return (0);
2286}
2287
2288/* function to balance the buffer queues */
2289static int
2290balancebufq(int q)
2291{
2292 int moretodo = 0;
2293 int s = splbio();
2294 int n;
2295
2296 /* reject invalid q */
2297 if ((q < 0) || (q >= BQUEUES))
2298 goto out;
2299
2300 /* LOCKED or LAUNDRY queue MUST not be balanced */
2301 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2302 goto out;
2303
2304 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2305
2306 /* If queue has less than target nothing more to do */
2307 if (n < 0)
2308 goto out;
2309
2310 if ( n > 8 ) {
2311 /* Balance only a small amount (12.5%) at a time */
2312 n >>= 3;
2313 }
2314
2315 /* EMPTY queue needs special handling */
2316 if (q == BQ_EMPTY) {
2317 moretodo |= btrimempty(n);
2318 goto out;
2319 }
2320
2321 for (; n > 0; n--) {
2322 struct buf *bp = bufqueues[q].tqh_first;
2323 if (!bp)
2324 break;
2325
2326 /* check if it's stale */
2327 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2328 if (bcleanbuf(bp)) {
2329 /* bawrite() issued, bp not ready */
2330 moretodo = 1;
2331 } else {
2332 /* release the cleaned buffer to BQ_EMPTY */
2333 SET(bp->b_flags, B_INVAL);
2334 brelse(bp);
2335 }
2336 } else
2337 break;
2338 }
2339
2340out:
2341 splx(s);
2342 return (moretodo);
2343}
2344
2345static int
2346btrimempty(int n)
2347{
2348 /*
2349 * When struct bufs are allocated dynamically, this would
2350 * reclaim up to 'n' struct bufs from the empty queue.
2351 */
2352
2353 return (0);
2354}
2355
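/* Account for a buffer header added to queue 'q' */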
2356static __inline__ void
2357bufqinc(int q)
2358{
2359 if ((q < 0) || (q >= BQUEUES))
2360 return;
2361
2362 bufqlim[q].bl_num++;
2363 return;
2364}
2365
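/* Account for a buffer header removed from queue 'q' */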
2366static __inline__ void
2367bufqdec(int q)
2368{
2369 if ((q < 0) || (q >= BQUEUES))
2370 return;
2371
2372 bufqlim[q].bl_num--;
2373 return;
2374}
2375
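/*
 * Print the buffer queue limits and counts; if 'all' is set, include the
 * min/max/target/stale limits, otherwise just the current counts.
 */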
2376static void
2377buqlimprt(int all)
2378{
2379 int i;
2380 static char *bname[BQUEUES] =
2381 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2382
2383 if (all)
2384 for (i = 0; i < BQUEUES; i++) {
2385 printf("%s : ", bname[i]);
2386 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
2387 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2388 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
2389 printf("target = %ld, ", (long)bufqlim[i].bl_target);
2390 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
2391 }
2392 else
2393 for (i = 0; i < BQUEUES; i++) {
2394 printf("%s : ", bname[i]);
2395 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2396 }
2397}
2398
2399/*
2400 * If getnewbuf() calls bcleanbuf() on the same thread,
2401 * there is a potential for stack overrun and deadlocks.
2402 * So we always hand off the work to a worker thread for completion.
2403 */
2404
2405static void
2406bcleanbuf_thread_init()
2407{
2408 static void bcleanbuf_thread();
2409
2410 /* create worker thread */
2411 kernel_thread(kernel_task, bcleanbuf_thread);
2412}
2413
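/*
 * Worker loop that waits for buffers to appear on BQ_LAUNDRY and issues
 * their delayed writes via bawrite_internal().  If a write cannot be
 * issued, the buffer is put back on the laundry queue and the thread
 * backs off before retrying.
 */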
2414static void
2415bcleanbuf_thread()
2416{
2417 boolean_t funnel_state;
2418 struct buf *bp;
2419 int error = 0;
2420 int loopcnt = 0;
2421
2422 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2423
2424doit:
2425 while (blaundrycnt == 0)
2426 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2427 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2428 /* Remove from the queue */
2429 bremfree(bp);
2430 blaundrycnt--;
2431
2432 /* do the IO */
2433 error = bawrite_internal(bp, 0);
2434 if (error) {
2435 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2436 blaundrycnt++;
2437 if (loopcnt > 10) {
2438 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
2439 loopcnt = 0;
2440 } else {
2441 (void)thread_block(THREAD_CONTINUE_NULL);
2442 loopcnt++;
2443 }
2444 }
2445 /* start again */
2446 goto doit;
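	/* NOTREACHED */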
2447
2448 (void) thread_funnel_set(kernel_flock, funnel_state);
2449}
2450
2451
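/*
 * Try to reattach a buffer to its backing store by creating and mapping
 * a UPL over the buffer's range.  If any page is no longer both valid
 * and dirty, the data cannot be recovered, so the buffer is invalidated
 * and released.  Returns 1 if the data is intact (or if the vnode is NFS,
 * which handles this case itself), 0 if the buffer was dumped.
 */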
2452static int
2453brecover_data(struct buf *bp)
2454{
2455 upl_t upl;
2456 upl_page_info_t *pl;
2457 int upl_offset;
2458 kern_return_t kret;
2459 struct vnode *vp = bp->b_vp;
2460
2461 if (vp->v_tag == VT_NFS)
2462 /*
2463 * NFS currently deals with this case
2464 * in a slightly different manner...
2465 * continue to let it do so
2466 */
2467 return(1);
2468
2469 if (!UBCISVALID(vp) || bp->b_bufsize == 0)
2470 goto dump_buffer;
2471
2472 kret = ubc_create_upl(vp,
2473 ubc_blktooff(vp, bp->b_lblkno),
2474 bp->b_bufsize,
2475 &upl,
2476 &pl,
2477 UPL_PRECIOUS);
2478 if (kret != KERN_SUCCESS)
2479 panic("Failed to get pagelists");
2480
2481 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
2482
2483 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
2484 ubc_upl_abort(upl, 0);
2485 goto dump_buffer;
2486 }
2487 }
2488 SET(bp->b_flags, B_PAGELIST);
2489 bp->b_pagelist = upl;
2490
2491 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
2492 if (kret != KERN_SUCCESS)
2493 panic("brecover_data: ubc_upl_map() failed with (%d)", kret);
2494 if (bp->b_data == 0)
2495 panic("ubc_upl_map mapped 0");
2496
2497 return (1);
2498
2499dump_buffer:
2500 bp->b_bufsize = 0;
2501 SET(bp->b_flags, B_INVAL);
2502 brelse(bp);
2503
2504 return(0);
2505}
2506
2507
2508static int
2509bp_cmp(void *a, void *b)
2510{
2511 struct buf *bp_a = *(struct buf **)a,
2512 *bp_b = *(struct buf **)b;
2513 daddr_t res;
2514
2515 // don't have to worry about negative block
2516 // numbers so this is ok to do.
2517 //
2518 res = (bp_a->b_blkno - bp_b->b_blkno);
2519
2520 return (int)res;
2521}
2522
2523#define NFLUSH 32
2524
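/*
 * Start asynchronous writes for the delayed-write buffers on 'whichq' that
 * belong to mount 'mp'.  Buffers are collected up to NFLUSH at a time and
 * sorted by block number before being issued, so the writes go out in
 * roughly ascending disk order.  Returns the number of buffers written.
 */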
2525int
2526bflushq(int whichq, struct mount *mp)
2527{
2528 struct buf *bp, *next;
2529 int i, buf_count, s;
2530 int counter=0, total_writes=0;
2531 static struct buf *flush_table[NFLUSH];
2532
2533 if (whichq < 0 || whichq >= BQUEUES) {
2534 return (0);
2535 }
2536
2537
2538 restart:
2539 bp = TAILQ_FIRST(&bufqueues[whichq]);
2540 for(buf_count=0; bp; bp=next) {
2541 next = bp->b_freelist.tqe_next;
2542
2543 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
2544 continue;
2545 }
2546
2547 if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
2548 if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
2549 panic("bflushq: bp @ 0x%x is locked!\n", bp);
2550 }
2551
2552 bremfree(bp);
2553 bp->b_flags |= B_BUSY;
2554 flush_table[buf_count] = bp;
2555 buf_count++;
2556 total_writes++;
2557
2558 if (buf_count >= NFLUSH) {
2559 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2560
2561 for(i=0; i < buf_count; i++) {
2562 bawrite(flush_table[i]);
2563 }
2564
2565 goto restart;
2566 }
2567 }
2568 }
2569
2570 if (buf_count > 0) {
2571 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2572 for(i=0; i < buf_count; i++) {
2573 bawrite(flush_table[i]);
2574 }
2575 }
2576
2577 return (total_writes);
2578}