1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*-
24 * Copyright (c) 1994 Christopher G. Demetriou
25 * Copyright (c) 1982, 1986, 1989, 1993
26 * The Regents of the University of California. All rights reserved.
27 * (c) UNIX System Laboratories, Inc.
28 * All or some portions of this file are derived from material licensed
29 * to the University of California by American Telephone and Telegraph
30 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
31 * the permission of UNIX System Laboratories, Inc.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * The NEXTSTEP Software License Agreement specifies the terms
62 * and conditions for redistribution.
63 *
64 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
65 */
66
67 /*
68 * Some references:
69 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
70 * Leffler, et al.: The Design and Implementation of the 4.3BSD
71 * UNIX Operating System (Addison-Wesley, 1989)
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/proc.h>
77 #include <sys/buf.h>
78 #include <sys/vnode.h>
79 #include <sys/mount.h>
80 #include <sys/trace.h>
81 #include <sys/malloc.h>
82 #include <sys/resourcevar.h>
83 #include <miscfs/specfs/specdev.h>
84 #include <sys/ubc.h>
85 #include <vm/vm_pageout.h>
86 #if DIAGNOSTIC
87 #include <kern/assert.h>
88 #endif /* DIAGNOSTIC */
89 #include <kern/task.h>
90 #include <kern/zalloc.h>
91
92 #include <sys/kdebug.h>
93 #include <machine/spl.h>
94
95 static __inline__ void bufqinc(int q);
96 static __inline__ void bufqdec(int q);
97
98 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
99 static int bcleanbuf(struct buf *bp);
100 extern void vwakeup();
101
102 extern int niobuf; /* The number of IO buffer headers for cluster IO */
103 int blaundrycnt;
104
105 /* zone allocated buffer headers */
106 static zone_t buf_hdr_zone;
107 static int buf_hdr_count;
108
109 #if TRACE
110 struct proc *traceproc;
111 int tracewhich, tracebuf[TRCSIZ];
112 u_int tracex;
113 char traceflags[TR_NFLAGS];
114 #endif /* TRACE */
115
116 /*
117 * Definitions for the buffer hash lists.
118 */
119 #define BUFHASH(dvp, lbn) \
120 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
121 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
122 u_long bufhash;
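/*
 * Illustrative lookup (this is what incore(), below, does): a cached
 * block is found by walking the chain selected by BUFHASH(vp, blkno)
 * and matching on (b_vp, b_lblkno), e.g.:
 *
 *	for (bp = BUFHASH(vp, blkno)->lh_first; bp; bp = bp->b_hash.le_next)
 *		if (bp->b_vp == vp && bp->b_lblkno == blkno)
 *			break;
 */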
123
124 /* Definitions for the buffer stats. */
125 struct bufstats bufstats;
126
127 /* Number of delayed write buffers */
128 int nbdwrite = 0;
129
130 /*
131 * Insq/Remq for the buffer hash lists.
132 */
133 #if 0
134 #define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
135 #define bremhash(bp) LIST_REMOVE(bp, b_hash)
136 #endif /* 0 */
137
138
139 TAILQ_HEAD(ioqueue, buf) iobufqueue;
140 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
141 static int needbuffer;
142 static int need_iobuffer;
143
144 /*
145 * Insq/Remq for the buffer free lists.
146 */
147 #define binsheadfree(bp, dp, whichq) do { \
148 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
149 bufqinc((whichq)); \
150 (bp)->b_whichq = whichq; \
151 (bp)->b_timestamp = time.tv_sec; \
152 } while (0)
153
154 #define binstailfree(bp, dp, whichq) do { \
155 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
156 bufqinc((whichq)); \
157 (bp)->b_whichq = whichq; \
158 (bp)->b_timestamp = time.tv_sec; \
159 } while (0)
160
161 #define BHASHENTCHECK(bp) \
162 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
163 panic("%x: b_hash.le_prev is not deadbeef", (bp));
164
165 #define BLISTNONE(bp) \
166 (bp)->b_hash.le_next = (struct buf *)0; \
167 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
168
169 /*
170 * Insq/Remq for the vnode usage lists.
171 */
172 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
173 #define bufremvn(bp) { \
174 LIST_REMOVE(bp, b_vnbufs); \
175 (bp)->b_vnbufs.le_next = NOLIST; \
176 }
177
178 simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
179
180 /* number of per vnode, "in flight" buffer writes */
181 #define BUFWRITE_THROTTLE 9
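/*
 * bdwrite_internal() and bawrite_internal() sleep on &vp->v_numoutput
 * once a vnode has this many writes in flight; biodone() clears
 * VTHROTTLED and wakes the sleepers once the count drops to a third
 * of this limit.
 */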
182
183 /*
184 * Time in seconds before a buffer on a list is
185 * considered as a stale buffer
186 */
187 #define LRU_IS_STALE 120 /* default value for the LRU */
188 #define AGE_IS_STALE 60 /* default value for the AGE */
189 #define META_IS_STALE 180 /* default value for the BQ_META */
190
191 int lru_is_stale = LRU_IS_STALE;
192 int age_is_stale = AGE_IS_STALE;
193 int meta_is_stale = META_IS_STALE;
194
195 /* LIST_INSERT_HEAD() with assertions */
196 static __inline__ void
197 blistenterhead(struct bufhashhdr * head, struct buf * bp)
198 {
199 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
200 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
201 (head)->lh_first = bp;
202 bp->b_hash.le_prev = &(head)->lh_first;
203 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
204 panic("blistenterhead: le_prev is deadbeef");
205 }
206
207 static __inline__ void
208 binshash(struct buf *bp, struct bufhashhdr *dp)
209 {
210 struct buf *nbp;
211
212 simple_lock(&bufhashlist_slock);
213
214 #if 0
215 if(incore(bp->b_vp, bp->b_lblkno))
216 panic("binshash: already incore");
217 #endif /* 0 */
218
219 BHASHENTCHECK(bp);
220
221 nbp = dp->lh_first;
222 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
223 if(nbp == bp)
224 panic("buf already in hashlist");
225 }
226
227 blistenterhead(dp, bp);
228 simple_unlock(&bufhashlist_slock);
229 }
230
231 static __inline__ void
232 bremhash(struct buf *bp)
233 {
234 simple_lock(&bufhashlist_slock);
235 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
236 panic("bremhash le_prev is deadbeef");
237 if (bp->b_hash.le_next == bp)
238 panic("bremhash: next points to self");
239
240 if (bp->b_hash.le_next != NULL)
241 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
242 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
243 simple_unlock(&bufhashlist_slock);
244 }
245
246 /*
247 * Remove a buffer from the free list it's on
248 */
249 void
250 bremfree(bp)
251 struct buf *bp;
252 {
253 struct bqueues *dp = NULL;
254 int whichq = -1;
255
256 /*
257 * We only calculate the head of the freelist when removing
258 * the last element of the list as that is the only time that
259 * it is needed (e.g. to reset the tail pointer).
260 *
261 * NB: This makes an assumption about how tailq's are implemented.
262 */
263 if (bp->b_freelist.tqe_next == NULL) {
264 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
265 if (dp->tqh_last == &bp->b_freelist.tqe_next)
266 break;
267 if (dp == &bufqueues[BQUEUES])
268 panic("bremfree: lost tail");
269 }
270 TAILQ_REMOVE(dp, bp, b_freelist);
271 whichq = bp->b_whichq;
272 bufqdec(whichq);
273 bp->b_whichq = -1;
274 bp->b_timestamp = 0;
275 }
276
277 /*
278 * Associate a buffer with a vnode.
279 */
280 static void
281 bgetvp(vp, bp)
282 register struct vnode *vp;
283 register struct buf *bp;
284 {
285
286 if (bp->b_vp != vp)
287 panic("bgetvp: not free");
288 VHOLD(vp);
289 bp->b_vp = vp;
290 if (vp->v_type == VBLK || vp->v_type == VCHR)
291 bp->b_dev = vp->v_rdev;
292 else
293 bp->b_dev = NODEV;
294 /*
295 * Insert onto list for new vnode.
296 */
297 bufinsvn(bp, &vp->v_cleanblkhd);
298 }
299
300 /*
301 * Disassociate a buffer from a vnode.
302 */
303 static void
304 brelvp(bp)
305 register struct buf *bp;
306 {
307 struct vnode *vp;
308
309 if (bp->b_vp == (struct vnode *) 0)
310 panic("brelvp: NULL vp");
311 /*
312 * Delete from old vnode list, if on one.
313 */
314 if (bp->b_vnbufs.le_next != NOLIST)
315 bufremvn(bp);
316 vp = bp->b_vp;
317 bp->b_vp = (struct vnode *) 0;
318 HOLDRELE(vp);
319 }
320
321 /*
322 * Reassign a buffer from one vnode to another.
323 * Used to assign file specific control information
324 * (indirect blocks) to the vnode to which they belong.
325 */
326 void
327 reassignbuf(bp, newvp)
328 register struct buf *bp;
329 register struct vnode *newvp;
330 {
331 register struct buflists *listheadp;
332
333 if (newvp == NULL) {
334 printf("reassignbuf: NULL");
335 return;
336 }
337 /*
338 * Delete from old vnode list, if on one.
339 */
340 if (bp->b_vnbufs.le_next != NOLIST)
341 bufremvn(bp);
342 /*
343 * If dirty, put on list of dirty buffers;
344 * otherwise insert onto list of clean buffers.
345 */
346 if (ISSET(bp->b_flags, B_DELWRI))
347 listheadp = &newvp->v_dirtyblkhd;
348 else
349 listheadp = &newvp->v_cleanblkhd;
350 bufinsvn(bp, listheadp);
351 }
352
353 static __inline__ void
354 bufhdrinit(struct buf *bp)
355 {
356 bzero((char *)bp, sizeof *bp);
357 bp->b_dev = NODEV;
358 bp->b_rcred = NOCRED;
359 bp->b_wcred = NOCRED;
360 bp->b_vnbufs.le_next = NOLIST;
361 bp->b_flags = B_INVAL;
362
363 return;
364 }
365
366 /*
367 * Initialize buffers and hash links for buffers.
368 */
369 __private_extern__ void
370 bufinit()
371 {
372 register struct buf *bp;
373 register struct bqueues *dp;
374 register int i;
375 int metabuf;
376 long whichq;
377 static void bufzoneinit();
378 static void bcleanbuf_thread_init();
379
380 /* Initialize the buffer queues ('freelists') and the hash table */
381 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
382 TAILQ_INIT(dp);
383 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
384
385 simple_lock_init(&bufhashlist_slock );
386
387 metabuf = nbuf/8; /* reserved for meta buf */
388
389 /* Initialize the buffer headers */
390 for (i = 0; i < nbuf; i++) {
391 bp = &buf[i];
392 bufhdrinit(bp);
393
394 /*
395 * metabuf buffer headers on the meta-data list and
396 * rest of the buffer headers on the empty list
397 */
398 if (--metabuf)
399 whichq = BQ_META;
400 else
401 whichq = BQ_EMPTY;
402
403 BLISTNONE(bp);
404 dp = &bufqueues[whichq];
405 binsheadfree(bp, dp, whichq);
406 binshash(bp, &invalhash);
407 }
408
409 for (; i < nbuf + niobuf; i++) {
410 bp = &buf[i];
411 bufhdrinit(bp);
412 binsheadfree(bp, &iobufqueue, -1);
413 }
414
415 printf("using %d buffer headers and %d cluster IO buffer headers\n",
416 nbuf, niobuf);
417
418 /* Set up zones used by the buffer cache */
419 bufzoneinit();
420
421 /* start the bcleanbuf() thread */
422 bcleanbuf_thread_init();
423
424 #if 0 /* notyet */
425 {
426 static void bufq_balance_thread_init();
427 /* create a thread to do dynamic buffer queue balancing */
428 bufq_balance_thread_init();
429 }
430 #endif /* notyet */
431 }
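/*
 * At this point the first nbuf headers are hashed into invalhash and
 * sit on the BQ_META/BQ_EMPTY free lists; the additional niobuf
 * headers live on iobufqueue and are handed out only by alloc_io_buf()
 * for cluster IO.
 */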
432
433 static struct buf *
434 bio_doread(vp, blkno, size, cred, async, queuetype)
435 struct vnode *vp;
436 daddr_t blkno;
437 int size;
438 struct ucred *cred;
439 int async;
440 int queuetype;
441 {
442 register struct buf *bp;
443 struct proc *p = current_proc();
444
445 bp = getblk(vp, blkno, size, 0, 0, queuetype);
446
447 /*
448 * If buffer does not have data valid, start a read.
449 * Note that if buffer is B_INVAL, getblk() won't return it.
450 * Therefore, it's valid if its I/O has completed or been delayed.
451 */
452 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
453 /* Start I/O for the buffer (keeping credentials). */
454 SET(bp->b_flags, B_READ | async);
455 if (cred != NOCRED && bp->b_rcred == NOCRED) {
456 /*
457 * NFS has embedded ucred.
458 * Can not crhold() here as that causes zone corruption
459 */
460 bp->b_rcred = crdup(cred);
461 }
462 VOP_STRATEGY(bp);
463
464 trace(TR_BREADMISS, pack(vp, size), blkno);
465
466 /* Pay for the read. */
467 if (p && p->p_stats)
468 p->p_stats->p_ru.ru_inblock++; /* XXX */
469 } else if (async) {
470 brelse(bp);
471 }
472
473 trace(TR_BREADHIT, pack(vp, size), blkno);
474
475 return (bp);
476 }
477 /*
478 * Read a disk block.
479 * This algorithm described in Bach (p.54).
480 */
481 int
482 bread(vp, blkno, size, cred, bpp)
483 struct vnode *vp;
484 daddr_t blkno;
485 int size;
486 struct ucred *cred;
487 struct buf **bpp;
488 {
489 register struct buf *bp;
490
491 /* Get buffer for block. */
492 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
493
494 /* Wait for the read to complete, and return result. */
495 return (biowait(bp));
496 }
497
498 /*
499 * Read a disk block. [bread() for meta-data]
500 * This algorithm described in Bach (p.54).
501 */
502 int
503 meta_bread(vp, blkno, size, cred, bpp)
504 struct vnode *vp;
505 daddr_t blkno;
506 int size;
507 struct ucred *cred;
508 struct buf **bpp;
509 {
510 register struct buf *bp;
511
512 /* Get buffer for block. */
513 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
514
515 /* Wait for the read to complete, and return result. */
516 return (biowait(bp));
517 }
518
519 /*
520 * Read-ahead multiple disk blocks. The first is sync, the rest async.
521 * Trivial modification to the breada algorithm presented in Bach (p.55).
522 */
523 int
524 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
525 struct vnode *vp;
526 daddr_t blkno; int size;
527 daddr_t rablks[]; int rasizes[];
528 int nrablks;
529 struct ucred *cred;
530 struct buf **bpp;
531 {
532 register struct buf *bp;
533 int i;
534
535 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
536
537 /*
538 * For each of the read-ahead blocks, start a read, if necessary.
539 */
540 for (i = 0; i < nrablks; i++) {
541 /* If it's in the cache, just go on to next one. */
542 if (incore(vp, rablks[i]))
543 continue;
544
545 /* Get a buffer for the read-ahead block */
546 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
547 }
548
549 /* Otherwise, we had to start a read for it; wait until it's valid. */
550 return (biowait(bp));
551 }
552
553 /*
554 * Read with single-block read-ahead. Defined in Bach (p.55), but
555 * implemented as a call to breadn().
556 * XXX for compatibility with old file systems.
557 */
558 int
559 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
560 struct vnode *vp;
561 daddr_t blkno; int size;
562 daddr_t rablkno; int rabsize;
563 struct ucred *cred;
564 struct buf **bpp;
565 {
566
567 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
568 }
569
570 /*
571 * Block write. Described in Bach (p.56)
572 */
573 int
574 bwrite(bp)
575 struct buf *bp;
576 {
577 int rv, sync, wasdelayed;
578 struct proc *p = current_proc();
579 struct vnode *vp = bp->b_vp;
580
581 /* Remember buffer type, to switch on it later. */
582 sync = !ISSET(bp->b_flags, B_ASYNC);
583 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
584 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
585 if (wasdelayed) {
586 nbdwrite--;
587 wakeup((caddr_t)&nbdwrite);
588 }
589
590 if (!sync) {
591 /*
592 * If not synchronous, pay for the I/O operation and make
593 * sure the buf is on the correct vnode queue. We have
594 * to do this now, because if we don't, the vnode may not
595 * be properly notified that its I/O has completed.
596 */
597 if (wasdelayed)
598 reassignbuf(bp, vp);
599 else
600 if (p && p->p_stats)
601 p->p_stats->p_ru.ru_oublock++; /* XXX */
602 }
603
604 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
605
606 /* Initiate disk write. Make sure the appropriate party is charged. */
607 SET(bp->b_flags, B_WRITEINPROG);
608 vp->v_numoutput++;
609
610 VOP_STRATEGY(bp);
611
612 if (sync) {
613 /*
614 * If I/O was synchronous, wait for it to complete.
615 */
616 rv = biowait(bp);
617
618 /*
619 * Pay for the I/O operation, if it's not been paid for, and
620 * make sure it's on the correct vnode queue. (async operations
621 * were paid for above.)
622 */
623 if (wasdelayed)
624 reassignbuf(bp, vp);
625 else
626 if (p && p->p_stats)
627 p->p_stats->p_ru.ru_oublock++; /* XXX */
628
629 /* Release the buffer. */
630 brelse(bp);
631
632 return (rv);
633 } else {
634 return (0);
635 }
636 }
637
638 int
639 vn_bwrite(ap)
640 struct vop_bwrite_args *ap;
641 {
642 return (bwrite(ap->a_bp));
643 }
644
645 /*
646 * Delayed write.
647 *
648 * The buffer is marked dirty, but is not queued for I/O.
649 * This routine should be used when the buffer is expected
650 * to be modified again soon, typically a small write that
651 * partially fills a buffer.
652 *
653 * NB: magnetic tapes cannot be delayed; they must be
654 * written in the order that the writes are requested.
655 *
656 * Described in Leffler, et al. (pp. 208-213).
657 *
658 * Note: With the ability to allocate additional buffer
659 * headers, we can get into a situation where "too" many
660 * bdwrite()s can cause the kernel to create buffers faster
661 * than the disks can service them. Doing a bawrite() in
662 * cases where we have "too many" outstanding bdwrite()s avoids that.
663 */
664 __private_extern__ int
665 bdwrite_internal(bp, return_error)
666 struct buf *bp;
667 int return_error;
668 {
669 struct proc *p = current_proc();
670 struct vnode *vp = bp->b_vp;
671
672 /*
673 * If the block hasn't been seen before:
674 * (1) Mark it as having been seen,
675 * (2) Charge for the write.
676 * (3) Make sure it's on its vnode's correct block list,
677 */
678 if (!ISSET(bp->b_flags, B_DELWRI)) {
679 SET(bp->b_flags, B_DELWRI);
680 if (p && p->p_stats)
681 p->p_stats->p_ru.ru_oublock++; /* XXX */
682 nbdwrite ++;
683 reassignbuf(bp, vp);
684 }
685
686 /* If this is a tape block, write the block out now. */
687 if (ISSET(bp->b_flags, B_TAPE)) {
688 /* bwrite(bp); */
689 VOP_BWRITE(bp);
690 return (0);
691 }
692
693 /*
694 * If the vnode has "too many" write operations in progress
695 * wait for them to finish the IO
696 */
697 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
698 vp->v_flag |= VTHROTTLED;
699 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
700 }
701
702 /*
703 * If we have too many delayed write buffers,
704 * more than we can "safely" handle, just fall back to
705 * doing the async write
706 */
707 if (nbdwrite < 0)
708 panic("bdwrite: Negative nbdwrite");
709
710 if (nbdwrite > ((nbuf/4)*3)) {
711 if (return_error)
712 return (EAGAIN);
713 else
714 bawrite(bp);
715 return (0);
716 }
717
718 /* Otherwise, the "write" is done, so mark and release the buffer. */
719 SET(bp->b_flags, B_DONE);
720 brelse(bp);
721 return (0);
722 }
723
724 void
725 bdwrite(bp)
726 struct buf *bp;
727 {
728 (void) bdwrite_internal(bp, 0);
729 }
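/*
 * Illustrative (hypothetical) use of the delayed-write path by a file
 * system that expects to touch the same block again soon; the names
 * here (vp, lbn, size, cred) are placeholders:
 *
 *	if ((error = bread(vp, lbn, size, cred, &bp)))
 *		return (error);
 *	... modify part of bp->b_data ...
 *	bdwrite(bp);	marks B_DELWRI and releases the buffer; the write
 *			happens later, or is pushed out via bawrite() when
 *			too many delayed writes accumulate
 */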
730
731
732 /*
733 * Asynchronous block write; just an asynchronous bwrite().
734 *
735 * Note: With the ability to allocate additional buffer
736 * headers, we can get into a situation where "too" many
737 * bawrite()s can cause the kernel to create buffers faster
738 * than the disks can service them.
739 * We limit the number of "in flight" writes a vnode can have to
740 * avoid this.
741 */
742 static int
743 bawrite_internal(bp, throttle)
744 struct buf *bp;
745 int throttle;
746 {
747 struct vnode *vp = bp->b_vp;
748
749 if (vp) {
750 /*
751 * If the vnode has "too many" write operations in progress
752 * wait for them to finish the IO
753 */
754 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
755 if (throttle) {
756 vp->v_flag |= VTHROTTLED;
757 (void)tsleep((caddr_t)&vp->v_numoutput,
758 PRIBIO + 1, "bawrite", 0);
759 } else
760 return (EWOULDBLOCK);
761 }
762 }
763
764 SET(bp->b_flags, B_ASYNC);
765 VOP_BWRITE(bp);
766 return (0);
767 }
768
769 void
770 bawrite(bp)
771 struct buf *bp;
772 {
773 (void) bawrite_internal(bp, 1);
774 }
775
776 /*
777 * bwillwrite:
778 *
779 * Called prior to the locking of any vnodes when we are expecting to
780 * write. We do not want to starve the buffer cache with too many
781 * dirty buffers so we block here. By blocking prior to the locking
782 * of any vnodes we attempt to avoid the situation where a locked vnode
783 * prevents the various system daemons from flushing related buffers.
784 */
785
786 void
787 bwillwrite(void)
788 {
789 /* XXX To be implemented later */
790 }
791
792 /*
793 * Release a buffer on to the free lists.
794 * Described in Bach (p. 46).
795 */
796 void
797 brelse(bp)
798 struct buf *bp;
799 {
800 struct bqueues *bufq;
801 int s;
802 long whichq;
803
804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
805 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
806 bp->b_flags, 0);
807
808 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
809
810 /* IO is done. Cleanup the UPL state */
811 if (!ISSET(bp->b_flags, B_META)
812 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
813 kern_return_t kret;
814 upl_t upl;
815 int upl_flags;
816
817 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
818 if ( !ISSET(bp->b_flags, B_INVAL)) {
819 kret = ubc_create_upl(bp->b_vp,
820 ubc_blktooff(bp->b_vp, bp->b_lblkno),
821 bp->b_bufsize,
822 &upl,
823 NULL,
824 UPL_PRECIOUS);
825 if (kret != KERN_SUCCESS)
826 panic("brelse: Failed to get pagelists");
827 #ifdef UBC_DEBUG
828 upl_ubc_alias_set(upl, bp, 5);
829 #endif /* UBC_DEBUG */
830 } else
831 upl = (upl_t) 0;
832 } else {
833 upl = bp->b_pagelist;
834 kret = ubc_upl_unmap(upl);
835
836 if (kret != KERN_SUCCESS)
837 panic("kernel_upl_unmap failed");
838 bp->b_data = 0;
839 }
840 if (upl) {
841 if (bp->b_flags & (B_ERROR | B_INVAL)) {
842 if (bp->b_flags & (B_READ | B_INVAL))
843 upl_flags = UPL_ABORT_DUMP_PAGES;
844 else
845 upl_flags = 0;
846 ubc_upl_abort(upl, upl_flags);
847 } else {
848 if (ISSET(bp->b_flags, B_NEEDCOMMIT))
849 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
850 else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
851 upl_flags = UPL_COMMIT_SET_DIRTY ;
852 else
853 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
854 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
855 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
856 }
857 s = splbio();
858 CLR(bp->b_flags, B_PAGELIST);
859 bp->b_pagelist = 0;
860 splx(s);
861 }
862 } else {
863 if(ISSET(bp->b_flags, B_PAGELIST))
864 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
865 }
866
867 /* Wake up any processes waiting for any buffer to become free. */
868 if (needbuffer) {
869 needbuffer = 0;
870 wakeup(&needbuffer);
871 }
872
873 /* Wake up any processes waiting for _this_ buffer to become free. */
874 if (ISSET(bp->b_flags, B_WANTED)) {
875 CLR(bp->b_flags, B_WANTED);
876 wakeup(bp);
877 }
878
879 /* Block disk interrupts. */
880 s = splbio();
881
882 /*
883 * Determine which queue the buffer should be on, then put it there.
884 */
885
886 /* If it's locked, don't report an error; try again later. */
887 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
888 CLR(bp->b_flags, B_ERROR);
889
890 /* If it's not cacheable, or an error, mark it invalid. */
891 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
892 SET(bp->b_flags, B_INVAL);
893
894 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
895 /*
896 * If it's invalid or empty, dissociate it from its vnode
897 * and put on the head of the appropriate queue.
898 */
899 if (bp->b_vp)
900 brelvp(bp);
901 if (ISSET(bp->b_flags, B_DELWRI)) {
902 CLR(bp->b_flags, B_DELWRI);
903 nbdwrite--;
904 wakeup((caddr_t)&nbdwrite);
905 }
906 if (bp->b_bufsize <= 0)
907 whichq = BQ_EMPTY; /* no data */
908 else if (ISSET(bp->b_flags, B_META))
909 whichq = BQ_META; /* meta-data */
910 else
911 whichq = BQ_AGE; /* invalid data */
912
913 bufq = &bufqueues[whichq];
914 binsheadfree(bp, bufq, whichq);
915 } else {
916 /*
917 * It has valid data. Put it on the end of the appropriate
918 * queue, so that it'll stick around for as long as possible.
919 */
920 if (ISSET(bp->b_flags, B_LOCKED))
921 whichq = BQ_LOCKED; /* locked in core */
922 else if (ISSET(bp->b_flags, B_META))
923 whichq = BQ_META; /* meta-data */
924 else if (ISSET(bp->b_flags, B_AGE))
925 whichq = BQ_AGE; /* stale but valid data */
926 else
927 whichq = BQ_LRU; /* valid data */
928
929 bufq = &bufqueues[whichq];
930 binstailfree(bp, bufq, whichq);
931 }
932
933 /* Unlock the buffer. */
934 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
935
936 /* Allow disk interrupts. */
937 splx(s);
938
939 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
940 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
941 }
942
943 /*
944 * Determine if a block is in the cache.
945 * Just look on what would be its hash chain. If it's there, return
946 * a pointer to it, unless it's marked invalid. If it's marked invalid,
947 * we normally don't return the buffer, unless the caller explicitly
948 * wants us to.
949 */
950 struct buf *
951 incore(vp, blkno)
952 struct vnode *vp;
953 daddr_t blkno;
954 {
955 struct buf *bp;
956
957 bp = BUFHASH(vp, blkno)->lh_first;
958
959 /* Search hash chain */
960 for (; bp != NULL; bp = bp->b_hash.le_next) {
961 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
962 !ISSET(bp->b_flags, B_INVAL))
963 return (bp);
964 }
965
966 return (0);
967 }
968
969
970 /*
971 * Get a block of requested size that is associated with
972 * a given vnode and block offset. If it is found in the
973 * block cache, mark it as having been found, make it busy
974 * and return it. Otherwise, return an empty block of the
975 * correct size. It is up to the caller to ensure that the
976 * cached blocks are of the correct size. With UBC, file-data
977 * buffers get their pages from the vnode's pager via a UPL;
978 * meta-data (BLK_META) buffers carry their own storage. */
979 struct buf *
980 getblk(vp, blkno, size, slpflag, slptimeo, operation)
981 register struct vnode *vp;
982 daddr_t blkno;
983 int size, slpflag, slptimeo, operation;
984 {
985 struct buf *bp;
986 int s, err;
987 upl_t upl;
988 upl_page_info_t *pl;
989 kern_return_t kret;
990 int error=0;
991 int pagedirty = 0;
992
993 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
994 blkno * PAGE_SIZE, size, operation, 0, 0);
995 start:
996
997 s = splbio();
998 if ((bp = incore(vp, blkno))) {
999 /* Found in the Buffer Cache */
1000 if (ISSET(bp->b_flags, B_BUSY)) {
1001 /* but is busy */
1002 switch (operation) {
1003 case BLK_READ:
1004 case BLK_WRITE:
1005 case BLK_META:
1006 SET(bp->b_flags, B_WANTED);
1007 bufstats.bufs_busyincore++;
1008 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
1009 slptimeo);
1010 splx(s);
1011 /*
1012 * Callers who call with PCATCH or timeout are
1013 * willing to deal with the NULL pointer
1014 */
1015 if (err && ((slpflag & PCATCH) ||
1016 ((err == EWOULDBLOCK) && slptimeo)))
1017 return (NULL);
1018 goto start;
1019 /*NOTREACHED*/
1020 break;
1021
1022 case BLK_PAGEIN:
1023 /* pagein operation must not use getblk */
1024 panic("getblk: pagein for incore busy buffer");
1025 splx(s);
1026 /*NOTREACHED*/
1027 break;
1028
1029 case BLK_PAGEOUT:
1030 /* pageout operation must not use getblk */
1031 panic("getblk: pageout for incore busy buffer");
1032 splx(s);
1033 /*NOTREACHED*/
1034 break;
1035
1036 default:
1037 panic("getblk: %d unknown operation 1", operation);
1038 /*NOTREACHED*/
1039 break;
1040 }
1041 } else {
1042 /* not busy */
1043 SET(bp->b_flags, (B_BUSY | B_CACHE));
1044 bremfree(bp);
1045 bufstats.bufs_incore++;
1046 splx(s);
1047
1048 allocbuf(bp, size);
1049 if (ISSET(bp->b_flags, B_PAGELIST))
1050 panic("pagelist buffer is not busy");
1051
1052 switch (operation) {
1053 case BLK_READ:
1054 case BLK_WRITE:
1055 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
1056 kret = ubc_create_upl(vp,
1057 ubc_blktooff(vp, bp->b_lblkno),
1058 bp->b_bufsize,
1059 &upl,
1060 &pl,
1061 UPL_PRECIOUS);
1062 if (kret != KERN_SUCCESS)
1063 panic("Failed to get pagelists");
1064
1065 SET(bp->b_flags, B_PAGELIST);
1066 bp->b_pagelist = upl;
1067
1068 if (!upl_valid_page(pl, 0)) {
1069 if (vp->v_tag != VT_NFS)
1070 panic("getblk: incore buffer without valid page");
1071 CLR(bp->b_flags, B_CACHE);
1072 }
1073
1074 if (upl_dirty_page(pl, 0))
1075 SET(bp->b_flags, B_WASDIRTY);
1076 else
1077 CLR(bp->b_flags, B_WASDIRTY);
1078
1079 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1080 if (kret != KERN_SUCCESS)
1081 panic("getblk: ubc_upl_map() failed with (%d)",
1082 kret);
1083 if (bp->b_data == 0)
1084 panic("ubc_upl_map mapped 0");
1085 }
1086 break;
1087
1088 case BLK_META:
1089 /*
1090 * VM is not involved in I/O for the meta data;
1091 * the buffer already has valid data.
1092 */
1093 if(bp->b_data == 0)
1094 panic("bp->b_data null incore buf=%x", bp);
1095 break;
1096
1097 case BLK_PAGEIN:
1098 case BLK_PAGEOUT:
1099 panic("getblk: paging operation 1");
1100 break;
1101
1102 default:
1103 panic("getblk: %d unknown operation 2", operation);
1104 /*NOTREACHED*/
1105 break;
1106 }
1107 }
1108 } else { /* not incore() */
1109 int queue = BQ_EMPTY; /* Start with no preference */
1110 splx(s);
1111
1112 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
1113 !(UBCINFOEXISTS(vp))) {
1114 operation = BLK_META;
1115 }
1116 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
1117 goto start;
1118 if (incore(vp, blkno)) {
1119 SET(bp->b_flags, B_INVAL);
1120 binshash(bp, &invalhash);
1121 brelse(bp);
1122 goto start;
1123 }
1124
1125 /*
1126 * If it is meta, the queue may have been set to another type,
1127 * so reset it and mark the buffer B_META so that when the
1128 * buffer is released it will go to the META queue.
1129 * Also, if the vnode is not VREG, then it is META.
1130 */
1131 if (operation == BLK_META) {
1132 SET(bp->b_flags, B_META);
1133 queue = BQ_META;
1134 }
1135
1136 bp->b_blkno = bp->b_lblkno = blkno;
1137 bp->b_vp = vp;
1138
1139 /*
1140 * Insert in the hash so that incore() can find it
1141 */
1142 binshash(bp, BUFHASH(vp, blkno));
1143
1144 s = splbio();
1145 bgetvp(vp, bp);
1146 splx(s);
1147
1148 allocbuf(bp, size);
1149
1150 switch (operation) {
1151 case BLK_META:
1152 /* buffer data is invalid */
1153
1154 if(bp->b_data == 0)
1155 panic("bp->b_data is null %x",bp);
1156
1157 bufstats.bufs_miss++;
1158
1159 /* wakeup the buffer */
1160 CLR(bp->b_flags, B_WANTED);
1161 wakeup(bp);
1162 break;
1163
1164 case BLK_READ:
1165 case BLK_WRITE:
1166
1167 if (ISSET(bp->b_flags, B_PAGELIST))
1168 panic("B_PAGELIST in bp=%x",bp);
1169
1170 kret = ubc_create_upl(vp,
1171 ubc_blktooff(vp, blkno),
1172 bp->b_bufsize,
1173 &upl,
1174 &pl,
1175 UPL_PRECIOUS);
1176 if (kret != KERN_SUCCESS)
1177 panic("Failed to get pagelists");
1178
1179 #ifdef UBC_DEBUG
1180 upl_ubc_alias_set(upl, bp, 4);
1181 #endif /* UBC_DEBUG */
1182 bp->b_pagelist = upl;
1183
1184 SET(bp->b_flags, B_PAGELIST);
1185
1186 if (upl_valid_page(pl, 0)) {
1187 SET(bp->b_flags, B_CACHE | B_DONE);
1188 bufstats.bufs_vmhits++;
1189
1190 pagedirty = upl_dirty_page(pl, 0);
1191
1192 if (pagedirty)
1193 SET(bp->b_flags, B_WASDIRTY);
1194
1195 if (vp->v_tag == VT_NFS) {
1196 off_t f_offset;
1197 int valid_size;
1198
1199 bp->b_validoff = 0;
1200 bp->b_dirtyoff = 0;
1201
1202 f_offset = ubc_blktooff(vp, blkno);
1203
1204 if (f_offset > vp->v_ubcinfo->ui_size) {
1205 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1206 bp->b_validend = 0;
1207 bp->b_dirtyend = 0;
1208 } else {
1209 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1210 bp->b_validend = valid_size;
1211
1212 if (pagedirty)
1213 bp->b_dirtyend = valid_size;
1214 else
1215 bp->b_dirtyend = 0;
1216
1217 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1218 bp->b_validend, bp->b_dirtyend,
1219 (int)vp->v_ubcinfo->ui_size, 0, 0);
1220 }
1221 } else {
1222 bp->b_validoff = 0;
1223 bp->b_dirtyoff = 0;
1224
1225 if (pagedirty) {
1226 /* page is dirty */
1227 bp->b_validend = bp->b_bcount;
1228 bp->b_dirtyend = bp->b_bcount;
1229 } else {
1230 /* page is clean */
1231 bp->b_validend = bp->b_bcount;
1232 bp->b_dirtyend = 0;
1233 }
1234 }
1235 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
1236 if(error) {
1237 panic("getblk: VOP_BMAP failed");
1238 /*NOTREACHED*/
1239 /*
1240 * XXX: We probably should invalidate the VM Page
1241 */
1242 bp->b_error = error;
1243 SET(bp->b_flags, (B_ERROR | B_INVAL));
1244 /* undo B_DONE that was set before upl_commit() */
1245 CLR(bp->b_flags, B_DONE);
1246 brelse(bp);
1247 return (0);
1248 }
1249 } else {
1250 bufstats.bufs_miss++;
1251 }
1252 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1253 if (kret != KERN_SUCCESS) {
1254 panic("getblk: ubc_upl_map() "
1255 "failed with (%d)", kret);
1256 }
1257 if (bp->b_data == 0)
1258 panic("kernel_upl_map mapped 0");
1259
1260 break;
1261
1262 case BLK_PAGEIN:
1263 case BLK_PAGEOUT:
1264 panic("getblk: paging operation 2");
1265 break;
1266 default:
1267 panic("getblk: %d unknown operation 3", operation);
1268 /*NOTREACHED*/
1269 break;
1270 }
1271 }
1272
1273 if (bp->b_data == NULL)
1274 panic("getblk: bp->b_addr is null");
1275
1276 if (bp->b_bufsize & 0xfff) {
1277 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1278 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1279 }
1280
1281 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1282 (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1283
1284 return (bp);
1285 }
1286
1287 /*
1288 * Get an empty, disassociated buffer of given size.
1289 */
1290 struct buf *
1291 geteblk(size)
1292 int size;
1293 {
1294 struct buf *bp;
1295 int queue = BQ_EMPTY;
1296
1297 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1298 ;
1299 SET(bp->b_flags, (B_META|B_INVAL));
1300
1301 #if DIAGNOSTIC
1302 assert(queue == BQ_EMPTY);
1303 #endif /* DIAGNOSTIC */
1304 /* XXX need to implement logic to deal with other queues */
1305
1306 binshash(bp, &invalhash);
1307 allocbuf(bp, size);
1308 bufstats.bufs_eblk++;
1309
1310 return (bp);
1311 }
1312
1313 /*
1314 * Zones for the meta data buffers
1315 */
1316
1317 #define MINMETA 512
1318 #define MAXMETA 4096
1319
1320 struct meta_zone_entry {
1321 zone_t mz_zone;
1322 vm_size_t mz_size;
1323 vm_size_t mz_max;
1324 char *mz_name;
1325 };
1326
1327 struct meta_zone_entry meta_zones[] = {
1328 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1329 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1330 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1331 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1332 {NULL, 0, 0, "" } /* End */
1333 };
1334
1335 /*
1336 * Initialize the meta data zones
1337 */
1338 static void
1339 bufzoneinit(void)
1340 {
1341 int i;
1342
1343 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1344 meta_zones[i].mz_zone =
1345 zinit(meta_zones[i].mz_size,
1346 meta_zones[i].mz_max,
1347 PAGE_SIZE,
1348 meta_zones[i].mz_name);
1349 }
1350 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1351 }
1352
1353 static __inline__ zone_t
1354 getbufzone(size_t size)
1355 {
1356 int i;
1357
1358 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
1359 panic("getbufzone: incorrect size = %d", size);
1360
1361 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1362 if (meta_zones[i].mz_size >= size)
1363 break;
1364 }
1365
1366 return (meta_zones[i].mz_zone);
1367 }
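/*
 * Worked example: allocbuf() below first rounds a meta-data request up
 * to a MINMETA multiple, so a 1536-byte request stays 1536 and the loop
 * above returns the "buf.2048" zone (the first zone whose element size
 * covers it); a 1024-byte request comes from "buf.1024".
 */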
1368
1369 /*
1370 * With UBC, there is no need to expand / shrink the file data
1371 * buffer. The VM uses the same pages, hence no waste.
1372 * All the file data buffers can have one size.
1373 * In fact expand / shrink would be an expensive operation.
1374 *
1375 * Only exception to this is meta-data buffers. Most of the
1376 * meta data operations are smaller than PAGE_SIZE. Having the
1377 * meta-data buffers grow and shrink as needed, optimizes use
1378 * of the kernel wired memory.
1379 */
1380
1381 int
1382 allocbuf(bp, size)
1383 struct buf *bp;
1384 int size;
1385 {
1386 vm_size_t desired_size;
1387
1388 desired_size = roundup(size, CLBYTES);
1389
1390 if(desired_size < PAGE_SIZE)
1391 desired_size = PAGE_SIZE;
1392 if (desired_size > MAXBSIZE)
1393 panic("allocbuf: buffer larger than MAXBSIZE requested");
1394
1395 if (ISSET(bp->b_flags, B_META)) {
1396 kern_return_t kret;
1397 zone_t zprev, z;
1398 size_t nsize = roundup(size, MINMETA);
1399
1400 if (bp->b_data) {
1401 vm_offset_t elem = (vm_offset_t)bp->b_data;
1402
1403 if (ISSET(bp->b_flags, B_ZALLOC))
1404 if (bp->b_bufsize <= MAXMETA) {
1405 if (bp->b_bufsize < nsize) {
1406 /* reallocate to a bigger size */
1407 desired_size = nsize;
1408
1409 zprev = getbufzone(bp->b_bufsize);
1410 z = getbufzone(nsize);
1411 bp->b_data = (caddr_t)zalloc(z);
1412 if(bp->b_data == 0)
1413 panic("allocbuf: zalloc() returned NULL");
1414 bcopy(elem, bp->b_data, bp->b_bufsize);
1415 zfree(zprev, elem);
1416 } else {
1417 desired_size = bp->b_bufsize;
1418 }
1419 } else
1420 panic("allocbuf: B_ZALLOC set incorrectly");
1421 else
1422 if (bp->b_bufsize < desired_size) {
1423 /* reallocate to a bigger size */
1424 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1425 if (kret != KERN_SUCCESS)
1426 panic("allocbuf: kmem_alloc() returned %d", kret);
1427 if(bp->b_data == 0)
1428 panic("allocbuf: null b_data");
1429 bcopy(elem, bp->b_data, bp->b_bufsize);
1430 kmem_free(kernel_map, elem, bp->b_bufsize);
1431 } else {
1432 desired_size = bp->b_bufsize;
1433 }
1434 } else {
1435 /* new allocation */
1436 if (nsize <= MAXMETA) {
1437 desired_size = nsize;
1438 z = getbufzone(nsize);
1439 bp->b_data = (caddr_t)zalloc(z);
1440 if(bp->b_data == 0)
1441 panic("allocbuf: zalloc() returned NULL 2");
1442 SET(bp->b_flags, B_ZALLOC);
1443 } else {
1444 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1445 if (kret != KERN_SUCCESS)
1446 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1447 if(bp->b_data == 0)
1448 panic("allocbuf: null b_data 2");
1449 }
1450 }
1451 }
1452
1453 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1454 panic("allocbuf: bp->b_data is NULL");
1455
1456 bp->b_bufsize = desired_size;
1457 bp->b_bcount = size;
1458 return (0);
1459 }
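/*
 * Example of the paths above: growing a 512-byte B_ZALLOC meta buffer
 * to 2048 bytes takes an element from the "buf.2048" zone, bcopy()s the
 * old 512 bytes across and zfree()s the old element.  For non-meta
 * (file data) buffers allocbuf() only records b_bufsize/b_bcount; the
 * backing pages are mapped from the UPL in getblk().
 */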
1460
1461 /*
1462 * Get a new buffer from one of the free lists.
1463 *
1464 * A request for a queue is passed in. The queue from which the buffer
1465 * was taken is returned. Out-of-range queue requests get BQ_EMPTY. A
1466 * request for BQUEUES means no preference; use heuristics in that case.
1467 * The heuristic is as follows:
1468 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1469 * If none are available, block until one is made available.
1470 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
1471 * Pick the most stale buffer.
1472 * If the found buffer was marked delayed write, start the async write
1473 * and restart the search.
1474 * Initialize the fields and disassociate the buffer from the vnode.
1475 * Remove the buffer from the hash. Return the buffer and the queue
1476 * on which it was found.
1477 */
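/*
 * Example of the staleness check with the defaults above
 * (lru_is_stale 120, age_is_stale 60): an LRU buffer idle for 130
 * seconds is taken in preference to an AGE buffer that is only 30
 * seconds old; in every other combination the AGE buffer wins.
 */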
1478
1479 static struct buf *
1480 getnewbuf(slpflag, slptimeo, queue)
1481 int slpflag, slptimeo;
1482 int *queue;
1483 {
1484 register struct buf *bp;
1485 register struct buf *lru_bp;
1486 register struct buf *age_bp;
1487 register struct buf *meta_bp;
1488 register int age_time, lru_time, bp_time, meta_time;
1489 int s;
1490 int req = *queue; /* save it for restarts */
1491
1492 start:
1493 s = splbio();
1494
1495 /* invalid request gets empty queue */
1496 if ((*queue > BQUEUES) || (*queue < 0)
1497 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1498 *queue = BQ_EMPTY;
1499
1500 /* (*queue == BQUEUES) means no preference */
1501 if (*queue != BQUEUES) {
1502 /* Try for the requested queue first */
1503 bp = bufqueues[*queue].tqh_first;
1504 if (bp)
1505 goto found;
1506 }
1507
1508 /* Unable to use requested queue */
1509 age_bp = bufqueues[BQ_AGE].tqh_first;
1510 lru_bp = bufqueues[BQ_LRU].tqh_first;
1511 meta_bp = bufqueues[BQ_META].tqh_first;
1512
1513 if (!age_bp && !lru_bp && !meta_bp) {
1514 /*
1515 * Unavailable on the AGE, LRU and META queues
1516 * Try the empty list first
1517 */
1518 bp = bufqueues[BQ_EMPTY].tqh_first;
1519 if (bp) {
1520 *queue = BQ_EMPTY;
1521 goto found;
1522 }
1523
1524 /* Create a new temporary buffer header */
1525 bp = (struct buf *)zalloc(buf_hdr_zone);
1526
1527 if (bp) {
1528 bufhdrinit(bp);
1529 BLISTNONE(bp);
1530 binshash(bp, &invalhash);
1531 SET(bp->b_flags, B_HDRALLOC);
1532 *queue = BQ_EMPTY;
1533 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1534 buf_hdr_count++;
1535 goto found;
1536 }
1537
1538 /* Log this error condition */
1539 printf("getnewbuf: No useful buffers\n");
1540
1541 /* wait for a free buffer of any kind */
1542 needbuffer = 1;
1543 bufstats.bufs_sleeps++;
1544 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1545 splx(s);
1546 return (0);
1547 }
1548
1549 /* Buffer available either on AGE or LRU or META */
1550 bp = NULL;
1551 *queue = -1;
1552
1553 /* Buffer available either on AGE or LRU */
1554 if (!age_bp) {
1555 bp = lru_bp;
1556 *queue = BQ_LRU;
1557 } else if (!lru_bp) {
1558 bp = age_bp;
1559 *queue = BQ_AGE;
1560 } else { /* buffer available on both AGE and LRU */
1561 age_time = time.tv_sec - age_bp->b_timestamp;
1562 lru_time = time.tv_sec - lru_bp->b_timestamp;
1563 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1564 bp = age_bp;
1565 *queue = BQ_AGE;
1566 /*
1567 * we should probably re-timestamp everything in the
1568 * queues at this point with the current time
1569 */
1570 } else {
1571 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1572 bp = lru_bp;
1573 *queue = BQ_LRU;
1574 } else {
1575 bp = age_bp;
1576 *queue = BQ_AGE;
1577 }
1578 }
1579 }
1580
1581 if (!bp) { /* Neither on AGE nor on LRU */
1582 bp = meta_bp;
1583 *queue = BQ_META;
1584 } else if (meta_bp) {
1585 bp_time = time.tv_sec - bp->b_timestamp;
1586 meta_time = time.tv_sec - meta_bp->b_timestamp;
1587
1588 if (!(bp_time < 0) && !(meta_time < 0)) {
1589 /* time not set backwards */
1590 int bp_is_stale;
1591 bp_is_stale = (*queue == BQ_LRU) ?
1592 lru_is_stale : age_is_stale;
1593
1594 if ((meta_time >= meta_is_stale) &&
1595 (bp_time < bp_is_stale)) {
1596 bp = meta_bp;
1597 *queue = BQ_META;
1598 }
1599 }
1600 }
1601
1602 if (bp == NULL)
1603 panic("getnewbuf: null bp");
1604
1605 found:
1606 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1607 panic("getnewbuf: le_prev is deadbeef");
1608
1609 if(ISSET(bp->b_flags, B_BUSY))
1610 panic("getnewbuf reusing BUSY buf");
1611
1612 /* Clean it */
1613 if (bcleanbuf(bp)) {
1614 /* bawrite() issued, buffer not ready */
1615 splx(s);
1616 *queue = req;
1617 goto start;
1618 }
1619 splx(s);
1620 return (bp);
1621 }
1622
1623 #include <mach/mach_types.h>
1624 #include <mach/memory_object_types.h>
1625 #include <kern/sched_prim.h>
1626
1627 /*
1628 * Clean a buffer.
1629 * Returns 0 if the buffer is ready to use,
1630 * Returns 1 if issued a bawrite() to indicate
1631 * that the buffer is not ready.
1632 */
1633 static int
1634 bcleanbuf(struct buf *bp)
1635 {
1636 int s;
1637 struct ucred *cred;
1638 int hdralloc = 0;
1639
1640 s = splbio();
1641
1642 /* Remove from the queue */
1643 bremfree(bp);
1644
1645 /* Buffer is no longer on free lists. */
1646 SET(bp->b_flags, B_BUSY);
1647
1648 /* Check whether the buffer header was "allocated" */
1649 if (ISSET(bp->b_flags, B_HDRALLOC))
1650 hdralloc = 1;
1651
1652 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1653 panic("bcleanbuf: le_prev is deadbeef");
1654
1655 /*
1656 * If buffer was a delayed write, start the IO by queuing
1657 * it on the LAUNDRY queue, and return 1
1658 */
1659 if (ISSET(bp->b_flags, B_DELWRI)) {
1660 splx(s);
1661 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1662 blaundrycnt++;
1663 wakeup(&blaundrycnt);
1664 /* and give it a chance to run */
1665 (void)thread_block(THREAD_CONTINUE_NULL);
1666 return (1);
1667 }
1668
1669 if (bp->b_vp)
1670 brelvp(bp);
1671 bremhash(bp);
1672 BLISTNONE(bp);
1673
1674 splx(s);
1675
1676 if (ISSET(bp->b_flags, B_META)) {
1677 vm_offset_t elem = (vm_offset_t)bp->b_data;
1678 if (elem == 0)
1679 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1680
1681 if (ISSET(bp->b_flags, B_ZALLOC)) {
1682 if (bp->b_bufsize <= MAXMETA) {
1683 zone_t z;
1684
1685 z = getbufzone(bp->b_bufsize);
1686 bp->b_data = (caddr_t)0xdeadbeef;
1687 zfree(z, elem);
1688 CLR(bp->b_flags, B_ZALLOC);
1689 } else
1690 panic("bcleanbuf: B_ZALLOC set incorrectly");
1691 } else {
1692 bp->b_data = (caddr_t)0xdeadbeef;
1693 kmem_free(kernel_map, elem, bp->b_bufsize);
1694 }
1695 }
1696
1697 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1698
1699 /* disassociate us from our vnode, if we had one... */
1700 s = splbio();
1701
1702 /* clear out various other fields */
1703 bp->b_bufsize = 0;
1704 bp->b_data = 0;
1705 bp->b_flags = B_BUSY;
1706 if (hdralloc)
1707 SET(bp->b_flags, B_HDRALLOC);
1708 bp->b_dev = NODEV;
1709 bp->b_blkno = bp->b_lblkno = 0;
1710 bp->b_iodone = 0;
1711 bp->b_error = 0;
1712 bp->b_resid = 0;
1713 bp->b_bcount = 0;
1714 bp->b_dirtyoff = bp->b_dirtyend = 0;
1715 bp->b_validoff = bp->b_validend = 0;
1716
1717 /* nuke any credentials we were holding */
1718 cred = bp->b_rcred;
1719 if (cred != NOCRED) {
1720 bp->b_rcred = NOCRED;
1721 crfree(cred);
1722 }
1723 cred = bp->b_wcred;
1724 if (cred != NOCRED) {
1725 bp->b_wcred = NOCRED;
1726 crfree(cred);
1727 }
1728 splx(s);
1729 return (0);
1730 }
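/*
 * Note that a B_DELWRI buffer is not written here: it is parked on
 * BQ_LAUNDRY and bcleanbuf_thread() (at the end of this file) issues
 * the write, which is why getnewbuf() restarts its scan whenever
 * bcleanbuf() returns 1.
 */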
1731
1732
1733 /*
1734 * Wait for operations on the buffer to complete.
1735 * When they do, extract and return the I/O's error value.
1736 */
1737 int
1738 biowait(bp)
1739 struct buf *bp;
1740 {
1741 int s;
1742
1743 s = splbio();
1744 while (!ISSET(bp->b_flags, B_DONE))
1745 tsleep(bp, PRIBIO + 1, "biowait", 0);
1746 splx(s);
1747
1748 /* check for interruption of I/O (e.g. via NFS), then errors. */
1749 if (ISSET(bp->b_flags, B_EINTR)) {
1750 CLR(bp->b_flags, B_EINTR);
1751 return (EINTR);
1752 } else if (ISSET(bp->b_flags, B_ERROR))
1753 return (bp->b_error ? bp->b_error : EIO);
1754 else
1755 return (0);
1756 }
1757
1758 /*
1759 * Mark I/O complete on a buffer.
1760 *
1761 * If a callback has been requested, e.g. the pageout
1762 * daemon, do so. Otherwise, awaken waiting processes.
1763 *
1764 * [ Leffler, et al., says on p.247:
1765 * "This routine wakes up the blocked process, frees the buffer
1766 * for an asynchronous write, or, for a request by the pagedaemon
1767 * process, invokes a procedure specified in the buffer structure" ]
1768 *
1769 * In real life, the pagedaemon (or other system processes) wants
1770 * to do async stuff too, and doesn't want the buffer brelse()'d.
1771 * (for swap pager, that puts swap buffers on the free lists (!!!),
1772 * for the vn device, that puts malloc'd buffers on the free lists!)
1773 */
1774 void
1775 biodone(bp)
1776 struct buf *bp;
1777 {
1778 boolean_t funnel_state;
1779 struct vnode *vp;
1780
1781 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1782
1783 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1784 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1785
1786 if (ISSET(bp->b_flags, B_DONE))
1787 panic("biodone already");
1788 SET(bp->b_flags, B_DONE); /* note that it's done */
1789 /*
1790 * I/O was done, so don't believe
1791 * the DIRTY state from VM anymore
1792 */
1793 CLR(bp->b_flags, B_WASDIRTY);
1794
1795 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1796 vwakeup(bp); /* wake up reader */
1797
1798 if (kdebug_enable) {
1799 int code = DKIO_DONE;
1800
1801 if (bp->b_flags & B_READ)
1802 code |= DKIO_READ;
1803 if (bp->b_flags & B_ASYNC)
1804 code |= DKIO_ASYNC;
1805
1806 if (bp->b_flags & B_META)
1807 code |= DKIO_META;
1808 else if (bp->b_flags & (B_PGIN | B_PAGEOUT))
1809 code |= DKIO_PAGING;
1810
1811 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1812 bp, bp->b_vp, bp->b_resid, bp->b_error, 0);
1813 }
1814
1815 /* Wakeup the throttled write operations as needed */
1816 vp = bp->b_vp;
1817 if (vp
1818 && (vp->v_flag & VTHROTTLED)
1819 && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
1820 vp->v_flag &= ~VTHROTTLED;
1821 wakeup((caddr_t)&vp->v_numoutput);
1822 }
1823
1824 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1825 CLR(bp->b_flags, B_CALL); /* but note callout done */
1826 (*bp->b_iodone)(bp);
1827 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1828 brelse(bp);
1829 else { /* or just wakeup the buffer */
1830 CLR(bp->b_flags, B_WANTED);
1831 wakeup(bp);
1832 }
1833
1834 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1835 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1836
1837 thread_funnel_set(kernel_flock, funnel_state);
1838 }
1839
1840 /*
1841 * Return a count of buffers on the "locked" queue.
1842 */
1843 int
1844 count_lock_queue()
1845 {
1846 register struct buf *bp;
1847 register int n = 0;
1848
1849 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1850 bp = bp->b_freelist.tqe_next)
1851 n++;
1852 return (n);
1853 }
1854
1855 /*
1856 * Return a count of 'busy' buffers. Used at the time of shutdown.
1857 */
1858 int
1859 count_busy_buffers()
1860 {
1861 register struct buf *bp;
1862 register int nbusy = 0;
1863
1864 for (bp = &buf[nbuf]; --bp >= buf; )
1865 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1866 nbusy++;
1867 return (nbusy);
1868 }
1869
1870 #if DIAGNOSTIC
1871 /*
1872 * Print out statistics on the current allocation of the buffer pool.
1873 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1874 * in vfs_syscalls.c using sysctl.
1875 */
1876 void
1877 vfs_bufstats()
1878 {
1879 int s, i, j, count;
1880 register struct buf *bp;
1881 register struct bqueues *dp;
1882 int counts[MAXBSIZE/CLBYTES+1];
1883 static char *bname[BQUEUES] =
1884 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1885
1886 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1887 count = 0;
1888 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1889 counts[j] = 0;
1890 s = splbio();
1891 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1892 counts[bp->b_bufsize/CLBYTES]++;
1893 count++;
1894 }
1895 splx(s);
1896 printf("%s: total-%d", bname[i], count);
1897 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1898 if (counts[j] != 0)
1899 printf(", %d-%d", j * CLBYTES, counts[j]);
1900 printf("\n");
1901 }
1902 }
1903 #endif /* DIAGNOSTIC */
1904
1905 #define NRESERVEDIOBUFS 64
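/*
 * alloc_io_buf() below makes unprivileged callers sleep once fewer than
 * NRESERVEDIOBUFS of the niobuf headers remain free; callers that pass
 * priv may dip into that reserve.
 */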
1906
1907 __private_extern__ struct buf *
1908 alloc_io_buf(vp, priv)
1909 struct vnode *vp;
1910 int priv;
1911 {
1912 register struct buf *bp;
1913 int s;
1914
1915 s = splbio();
1916
1917 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1918 need_iobuffer = 1;
1919 bufstats.bufs_iobufsleeps++;
1920 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1921 }
1922
1923 while ((bp = iobufqueue.tqh_first) == NULL) {
1924 need_iobuffer = 1;
1925 bufstats.bufs_iobufsleeps++;
1926 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1927 }
1928
1929 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1930 bp->b_timestamp = 0;
1931
1932 /* clear out various fields */
1933 bp->b_flags = B_BUSY;
1934 bp->b_blkno = bp->b_lblkno = 0;
1935 bp->b_iodone = 0;
1936 bp->b_error = 0;
1937 bp->b_resid = 0;
1938 bp->b_bcount = 0;
1939 bp->b_bufsize = 0;
1940 bp->b_vp = vp;
1941
1942 if (vp->v_type == VBLK || vp->v_type == VCHR)
1943 bp->b_dev = vp->v_rdev;
1944 else
1945 bp->b_dev = NODEV;
1946 bufstats.bufs_iobufinuse++;
1947 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1948 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1949 splx(s);
1950
1951 return (bp);
1952 }
1953
1954 __private_extern__ void
1955 free_io_buf(bp)
1956 struct buf *bp;
1957 {
1958 int s;
1959
1960 s = splbio();
1961 /* put buffer back on the head of the iobufqueue */
1962 bp->b_vp = NULL;
1963 bp->b_flags = B_INVAL;
1964
1965 binsheadfree(bp, &iobufqueue, -1);
1966
1967 /* Wake up any processes waiting for any buffer to become free. */
1968 if (need_iobuffer) {
1969 need_iobuffer = 0;
1970 wakeup(&need_iobuffer);
1971 }
1972 bufstats.bufs_iobufinuse--;
1973 splx(s);
1974 }
1975
1976 /* disabled for now */
1977
1978 /* XXX move this to a separate file */
1979 /*
1980 * Dynamic Scaling of the Buffer Queues
1981 */
1982
1983 typedef long long blsize_t;
1984
1985 blsize_t MAXNBUF; /* initialize to (mem_size / PAGE_SIZE) */
1986 /* Global tunable limits */
1987 blsize_t nbufh; /* number of buffer headers */
1988 blsize_t nbuflow; /* minimum number of buffer headers required */
1989 blsize_t nbufhigh; /* maximum number of buffer headers allowed */
1990 blsize_t nbuftarget; /* preferred number of buffer headers */
1991
1992 /*
1993 * assertions:
1994 *
1995 * 1. 0 < nbuflow <= nbufh <= nbufhigh
1996 * 2. nbufhigh <= MAXNBUF
1997 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
1998 * 4. nbufh can not be set by sysctl().
1999 */
2000
2001 /* Per queue tunable limits */
2002
2003 struct bufqlim {
2004 blsize_t bl_nlow; /* minimum number of buffer headers required */
2005 blsize_t bl_num; /* number of buffer headers on the queue */
2006 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
2007 blsize_t bl_target; /* preferred number of buffer headers */
2008 long bl_stale; /* Seconds after which a buffer is considered stale */
2009 } bufqlim[BQUEUES];
2010
2011 /*
2012 * assertions:
2013 *
2014 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
2015 * 2. bl_nlhigh <= MAXNBUF
2016 * 3. bufqlim[BQ_META].bl_nlow != 0
2017 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
2018 * file system IO operations)
2019 * 5. bl_num can not be set by sysctl().
2020 * 6. bl_nhigh <= nbufhigh
2021 */
2022
2023 /*
2024 * Rationale:
2025 * ----------
2026 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
2027 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
2028 *
2029 * These limits are exported by means of sysctl().
2030 * It was decided to define blsize_t as a 64 bit quantity.
2031 * This will make sure that we will not be required to change it
2032 * as long as we do not exceed 64 bit address space for the kernel.
2033 *
2034 * The low and high limits are initialized at compile time,
2035 * and boot arguments can be used to override them. sysctl()
2036 * does not change them; it can read all of the values but
2037 * can set only the target. num is the current level.
2038 *
2039 * Advantages of having a "bufqscan" thread doing the balancing are:
2040 * Keep enough bufs on BQ_EMPTY.
2041 * getnewbuf() by default will always select a buffer from the BQ_EMPTY.
2042 * getnewbuf() performs best if a buffer was found there.
2043 * Also this minimizes the possibility of starting IO
2044 * from getnewbuf(). That's a performance win, too.
2045 *
2046 * Localize complex logic [balancing as well as time aging]
2047 * to balancebufq().
2048 *
2049 * Simplify getnewbuf() logic by elimination of time aging code.
2050 */
2051
2052 /*
2053 * Algorithm:
2054 * -----------
2055 * The goal of the dynamic scaling of the buffer queues is to keep
2056 * the size of the LRU close to bl_target. Buffers on a queue would
2057 * be time aged.
2058 *
2059 * There would be a thread which will be responsible for "balancing"
2060 * the buffer cache queues.
2061 *
2062 * The scan order would be: AGE, LRU, META, EMPTY.
2063 */
2064
2065 long bufqscanwait = 0;
2066
2067 static void bufqscan_thread();
2068 static int balancebufq(int q);
2069 static int btrimempty(int n);
2070 static __inline__ int initbufqscan(void);
2071 static __inline__ int nextbufq(int q);
2072 static void buqlimprt(int all);
2073
2074 static void
2075 bufq_balance_thread_init()
2076 {
2077
2078 if (bufqscanwait++ == 0) {
2079
2080 /* Initialize globals */
2081 MAXNBUF = (mem_size / PAGE_SIZE);
2082 nbufh = nbuf;
2083 nbuflow = min(nbufh, 100);
2084 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
2085 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
2086 nbuftarget = max(nbuflow, nbuftarget);
2087 nbuftarget = min(nbufhigh, nbuftarget);
2088
2089 /*
2090 * Initialize the bufqlim
2091 */
2092
2093 /* LOCKED queue */
2094 bufqlim[BQ_LOCKED].bl_nlow = 0;
2095 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2096 bufqlim[BQ_LOCKED].bl_target = 0;
2097 bufqlim[BQ_LOCKED].bl_stale = 30;
2098
2099 /* LRU queue */
2100 bufqlim[BQ_LRU].bl_nlow = 0;
2101 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
2102 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
2103 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
2104
2105 /* AGE queue */
2106 bufqlim[BQ_AGE].bl_nlow = 0;
2107 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
2108 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
2109 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
2110
2111 /* EMPTY queue */
2112 bufqlim[BQ_EMPTY].bl_nlow = 0;
2113 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
2114 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
2115 bufqlim[BQ_EMPTY].bl_stale = 600000;
2116
2117 /* META queue */
2118 bufqlim[BQ_META].bl_nlow = 0;
2119 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
2120 bufqlim[BQ_META].bl_target = nbuftarget/4;
2121 bufqlim[BQ_META].bl_stale = META_IS_STALE;
2122
2123 /* LAUNDRY queue */
2124 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
2125 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
2126 bufqlim[BQ_LAUNDRY].bl_target = 0;
2127 bufqlim[BQ_LAUNDRY].bl_stale = 30;
2128
2129 buqlimprt(1);
2130 }
2131
2132 /* create worker thread */
2133 kernel_thread(kernel_task, bufqscan_thread);
2134 }
2135
2136 /* The workloop for the buffer balancing thread */
2137 static void
2138 bufqscan_thread()
2139 {
2140 boolean_t funnel_state;
2141 int moretodo = 0;
2142
2143 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2144
2145 for(;;) {
2146 do {
2147 int q; /* buffer queue to process */
2148
2149 q = initbufqscan();
2150 for (; q; ) {
2151 moretodo |= balancebufq(q);
2152 q = nextbufq(q);
2153 }
2154 } while (moretodo);
2155
2156 #if DIAGNOSTIC
2157 vfs_bufstats();
2158 buqlimprt(0);
2159 #endif
2160 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2161 moretodo = 0;
2162 }
2163
2164 (void) thread_funnel_set(kernel_flock, FALSE);
2165 }
2166
2167 /* Seed for the buffer queue balancing */
2168 static __inline__ int
2169 initbufqscan()
2170 {
2171 /* Start with AGE queue */
2172 return (BQ_AGE);
2173 }
2174
2175 /* Pick next buffer queue to balance */
2176 static __inline__ int
2177 nextbufq(int q)
2178 {
2179 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2180 int i;
2181 
2182 for (i = 0; order[i] && order[i] != q; i++);	/* locate q in the scan order */
2183 return (order[i] ? order[i + 1] : 0);	/* return its successor; 0 ends one pass */
2184 }
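/*
 * The order[] array above is terminated with a 0 entry, and
 * bufqscan_thread()'s "for (; q; )" loop relies on that 0 to finish
 * one pass over the queues.
 */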
2185
2186 /* function to balance the buffer queues */
2187 static int
2188 balancebufq(int q)
2189 {
2190 int moretodo = 0;
2191 int s = splbio();
2192 int n;
2193
2194 /* reject invalid q */
2195 if ((q < 0) || (q >= BQUEUES))
2196 goto out;
2197
2198 /* LOCKED or LAUNDRY queue MUST not be balanced */
2199 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2200 goto out;
2201
2202 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2203
2204 /* If queue has less than target nothing more to do */
2205 if (n < 0)
2206 goto out;
2207
2208 if ( n > 8 ) {
2209 /* Balance only a small amount (12.5%) at a time */
2210 n >>= 3;
2211 }
2212
2213 /* EMPTY queue needs special handling */
2214 if (q == BQ_EMPTY) {
2215 moretodo |= btrimempty(n);
2216 goto out;
2217 }
2218
2219 for (; n > 0; n--) {
2220 struct buf *bp = bufqueues[q].tqh_first;
2221 if (!bp)
2222 break;
2223
2224 /* check if it's stale */
2225 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2226 if (bcleanbuf(bp)) {
2227 /* bawrite() issued, bp not ready */
2228 moretodo = 1;
2229 } else {
2230 /* release the cleaned buffer to BQ_EMPTY */
2231 SET(bp->b_flags, B_INVAL);
2232 brelse(bp);
2233 }
2234 } else
2235 break;
2236 }
2237
2238 out:
2239 splx(s);
2240 return (moretodo);
2241 }
2242
2243 static int
2244 btrimempty(int n)
2245 {
2246 /*
2247 * When struct bufs are allocated dynamically, this would
2248 * reclaim up to 'n' struct bufs from the empty queue.
2249 */
2250
2251 return (0);
2252 }
2253
2254 static __inline__ void
2255 bufqinc(int q)
2256 {
2257 if ((q < 0) || (q >= BQUEUES))
2258 return;
2259
2260 bufqlim[q].bl_num++;
2261 return;
2262 }
2263
2264 static __inline__ void
2265 bufqdec(int q)
2266 {
2267 if ((q < 0) || (q >= BQUEUES))
2268 return;
2269
2270 bufqlim[q].bl_num--;
2271 return;
2272 }
2273
2274 static void
2275 buqlimprt(int all)
2276 {
2277 int i;
2278 static char *bname[BQUEUES] =
2279 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2280
2281 if (all)
2282 for (i = 0; i < BQUEUES; i++) {
2283 printf("%s : ", bname[i]);
2284 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
2285 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2286 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
2287 printf("target = %ld, ", (long)bufqlim[i].bl_target);
2288 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
2289 }
2290 else
2291 for (i = 0; i < BQUEUES; i++) {
2292 printf("%s : ", bname[i]);
2293 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2294 }
2295 }
2296
2297 /*
2298 * If bcleanbuf(), called from getnewbuf(), issued the delayed write
2299 * on the same thread, there would be a potential for stack overrun
2300 * and deadlocks. So we always hand the work off to a worker thread.
2301 */
2302
2303 static void
2304 bcleanbuf_thread_init()
2305 {
2306 static void bcleanbuf_thread();
2307
2308 /* create worker thread */
2309 kernel_thread(kernel_task, bcleanbuf_thread);
2310 }
2311
2312 static void
2313 bcleanbuf_thread()
2314 {
2315 boolean_t funnel_state;
2316 struct buf *bp;
2317 int error = 0;
2318 int loopcnt = 0;
2319
2320 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2321
2322 doit:
2323 while (blaundrycnt == 0)
2324 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2325 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2326 /* Remove from the queue */
2327 bremfree(bp);
2328 blaundrycnt--;
2329 /* do the IO */
2330 error = bawrite_internal(bp, 0);
2331 if (error) {
2332 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2333 blaundrycnt++;
2334 if (loopcnt > 10) {
2335 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
2336 loopcnt = 0;
2337 } else {
2338 (void)thread_block(THREAD_CONTINUE_NULL);
2339 loopcnt++;
2340 }
2341 }
2342 /* start again */
2343 goto doit;
2344
2345 (void) thread_funnel_set(kernel_flock, funnel_state);
2346 }