[apple/xnu.git] bsd/vfs/vfs_bio.c
1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*-
24 * Copyright (c) 1994 Christopher G. Demetriou
25 * Copyright (c) 1982, 1986, 1989, 1993
26 * The Regents of the University of California. All rights reserved.
27 * (c) UNIX System Laboratories, Inc.
28 * All or some portions of this file are derived from material licensed
29 * to the University of California by American Telephone and Telegraph
30 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
31 * the permission of UNIX System Laboratories, Inc.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * The NEXTSTEP Software License Agreement specifies the terms
62 * and conditions for redistribution.
63 *
64 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
65 */
66
67 /*
68 * Some references:
69 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
70 * Leffler, et al.: The Design and Implementation of the 4.3BSD
71 * UNIX Operating System (Addison-Wesley, 1989)
72 */
73 #define ZALLOC_METADATA 1
74
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/proc.h>
78 #include <sys/buf.h>
79 #include <sys/vnode.h>
80 #include <sys/mount.h>
81 #include <sys/trace.h>
82 #include <sys/malloc.h>
83 #include <sys/resourcevar.h>
84 #include <miscfs/specfs/specdev.h>
85 #include <sys/ubc.h>
86 #include <vm/vm_pageout.h>
87 #if DIAGNOSTIC
88 #include <kern/assert.h>
89 #endif /* DIAGNOSTIC */
90 #include <kern/task.h>
91 #include <kern/zalloc.h>
92
93 #include <sys/kdebug.h>
94
95 extern void bufqinc(int q);
96 extern void bufqdec(int q);
97 extern void bufq_balance_thread_init();
98
99 extern void reassignbuf(struct buf *, struct vnode *);
100 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
101
102 extern int niobuf; /* The number of IO buffer headers for cluster IO */
103
104 #if TRACE
105 struct proc *traceproc;
106 int tracewhich, tracebuf[TRCSIZ];
107 u_int tracex;
108 char traceflags[TR_NFLAGS];
109 #endif /* TRACE */
110
111 /*
112 * Definitions for the buffer hash lists.
113 */
114 #define BUFHASH(dvp, lbn) \
115 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
116 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
117 u_long bufhash;
118
119 /* Definitions for the buffer stats. */
120 struct bufstats bufstats;
121
122 /*
123 * Insq/Remq for the buffer hash lists.
124 */
125 #if 0
126 #define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
127 #define bremhash(bp) LIST_REMOVE(bp, b_hash)
128 #endif /* 0 */
129
130
131 TAILQ_HEAD(ioqueue, buf) iobufqueue;
132 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
133 int needbuffer;
134 int need_iobuffer;
135
136 /*
137 * Insq/Remq for the buffer free lists.
138 */
139 #define binsheadfree(bp, dp, whichq) do { \
140 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
141 bufqinc((whichq)); \
142 (bp)->b_whichq = whichq; \
143 (bp)->b_timestamp = time.tv_sec; \
144 } while (0)
145
146 #define binstailfree(bp, dp, whichq) do { \
147 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
148 bufqinc((whichq)); \
149 (bp)->b_whichq = whichq; \
150 (bp)->b_timestamp = time.tv_sec; \
151 } while (0)
152
153 #define BHASHENTCHECK(bp) \
154 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
155 panic("%x: b_hash.le_prev is deadb", (bp));
156
157 #define BLISTNONE(bp) \
158 (bp)->b_hash.le_next = (struct buf *)0; \
159 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
160
161 simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
162
163 /*
164 * Time in seconds before a buffer on a list is
165 * considered a stale buffer
166 */
167 #define LRU_IS_STALE 120 /* default value for the LRU */
168 #define AGE_IS_STALE 60 /* default value for the AGE */
169 #define META_IS_STALE 180 /* default value for the BQ_META */
170
171 int lru_is_stale = LRU_IS_STALE;
172 int age_is_stale = AGE_IS_STALE;
173 int meta_is_stale = META_IS_STALE;
174
175 #if 1
176 void
177 blistenterhead(struct bufhashhdr * head, struct buf * bp)
178 {
179 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
180 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
181 (head)->lh_first = bp;
182 bp->b_hash.le_prev = &(head)->lh_first;
183 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
184 panic("blistenterhead: le_prev is deadbeef");
185
186 }
187 #endif
188
189 #if 1
190 void
191 binshash(struct buf *bp, struct bufhashhdr *dp)
192 {
193 int s;
194
195 struct buf *nbp;
196
197 simple_lock(&bufhashlist_slock);
198 #if 0
199 if(incore(bp->b_vp, bp->b_lblkno)) {
200 panic("adding to queue already existing element");
201 }
202 #endif /* 0 */
203 BHASHENTCHECK(bp);
204
205 nbp = dp->lh_first;
206 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
207 if(nbp == bp)
208 panic("buf already in hashlist");
209 }
210
211 #if 0
212 LIST_INSERT_HEAD(dp, bp, b_hash);
213 #else
214 blistenterhead(dp, bp);
215 #endif
216 simple_unlock(&bufhashlist_slock);
217 }
218
219 void
220 bremhash(struct buf *bp)
221 {
222 int s;
223
224 simple_lock(&bufhashlist_slock);
225 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
226 panic("bremhash le_prev is deadbeef");
227 if (bp->b_hash.le_next == bp)
228 panic("bremhash: next points to self");
229
230 if (bp->b_hash.le_next != NULL)
231 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
232 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
233 simple_unlock(&bufhashlist_slock);
234 }
235
236 #endif /* 1 */
237
238
239 /*
240 * Remove a buffer from the free list it's on
241 */
242 void
243 bremfree(bp)
244 struct buf *bp;
245 {
246 struct bqueues *dp = NULL;
247 int whichq = -1;
248
249 /*
250 * We only calculate the head of the freelist when removing
251 * the last element of the list as that is the only time that
252 * it is needed (e.g. to reset the tail pointer).
253 *
254 * NB: This makes an assumption about how tailq's are implemented.
255 */
256 if (bp->b_freelist.tqe_next == NULL) {
257 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
258 if (dp->tqh_last == &bp->b_freelist.tqe_next)
259 break;
260 if (dp == &bufqueues[BQUEUES])
261 panic("bremfree: lost tail");
262 }
263 TAILQ_REMOVE(dp, bp, b_freelist);
264 whichq = bp->b_whichq;
265 bufqdec(whichq);
266 bp->b_whichq = -1;
267 bp->b_timestamp = 0;
268 }
269
270 /*
271 * Initialize buffers and hash links for buffers.
272 */
273 void
274 bufinit()
275 {
276 register struct buf *bp;
277 register struct bqueues *dp;
278 register int i;
279 int metabuf;
280 long whichq;
281 #if ZALLOC_METADATA
282 static void bufzoneinit();
283 #endif /* ZALLOC_METADATA */
284
285 /* Initialize the buffer queues ('freelists') and the hash table */
286 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
287 TAILQ_INIT(dp);
288 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
289
290 simple_lock_init(&bufhashlist_slock );
291
292 metabuf = nbuf/8; /* reserved for meta buf */
293
294 /* Initialize the buffer headers */
295 for (i = 0; i < nbuf; i++) {
296 bp = &buf[i];
297 bzero((char *)bp, sizeof *bp);
298 bp->b_dev = NODEV;
299 bp->b_rcred = NOCRED;
300 bp->b_wcred = NOCRED;
301 bp->b_vnbufs.le_next = NOLIST;
302 bp->b_flags = B_INVAL;
303 /*
304 * Put the first 'metabuf' buffer headers on the meta-data list and
305 * the rest of the buffer headers on the empty list.
306 */
307 if (--metabuf )
308 whichq = BQ_META;
309 else
310 whichq = BQ_EMPTY;
311
312 BLISTNONE(bp);
313 dp = &bufqueues[whichq];
314 binsheadfree(bp, dp, whichq);
315 binshash(bp, &invalhash);
316 }
317
318 for (; i < nbuf + niobuf; i++) {
319 bp = &buf[i];
320 bzero((char *)bp, sizeof *bp);
321 bp->b_dev = NODEV;
322 bp->b_rcred = NOCRED;
323 bp->b_wcred = NOCRED;
324 bp->b_vnbufs.le_next = NOLIST;
325 bp->b_flags = B_INVAL;
326 binsheadfree(bp, &iobufqueue, -1);
327 }
328
329 printf("using %d buffer headers and %d cluster IO buffer headers\n",
330 nbuf, niobuf);
331
332 #if ZALLOC_METADATA
333 /* Set up zones for meta-data */
334 bufzoneinit();
335 #endif
336
337 #if XXX
338 /* create a thread to do dynamic buffer queue balancing */
339 bufq_balance_thread_init();
340 #endif /* XXX */
341 }
342
343 /* __inline */
344 struct buf *
345 bio_doread(vp, blkno, size, cred, async, queuetype)
346 struct vnode *vp;
347 daddr_t blkno;
348 int size;
349 struct ucred *cred;
350 int async;
351 int queuetype;
352 {
353 register struct buf *bp;
354 struct proc *p = current_proc();
355
356 bp = getblk(vp, blkno, size, 0, 0, queuetype);
357
358 /*
359 * If buffer does not have data valid, start a read.
360 * Note that if buffer is B_INVAL, getblk() won't return it.
361 * Therefore, it's valid if its I/O has completed or been delayed.
362 */
363 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
364 /* Start I/O for the buffer (keeping credentials). */
365 SET(bp->b_flags, B_READ | async);
366 if (cred != NOCRED && bp->b_rcred == NOCRED) {
367 crhold(cred);
368 bp->b_rcred = cred;
369 }
370 VOP_STRATEGY(bp);
371
372 trace(TR_BREADMISS, pack(vp, size), blkno);
373
374 /* Pay for the read. */
375 if (p && p->p_stats)
376 p->p_stats->p_ru.ru_inblock++; /* XXX */
377 } else if (async) {
378 brelse(bp);
379 }
380
381 trace(TR_BREADHIT, pack(vp, size), blkno);
382
383 return (bp);
384 }
385 /*
386 * Read a disk block.
387 * This algorithm described in Bach (p.54).
388 */
389 int
390 bread(vp, blkno, size, cred, bpp)
391 struct vnode *vp;
392 daddr_t blkno;
393 int size;
394 struct ucred *cred;
395 struct buf **bpp;
396 {
397 register struct buf *bp;
398
399 /* Get buffer for block. */
400 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
401
402 /* Wait for the read to complete, and return result. */
403 return (biowait(bp));
404 }
405
406 /*
407 * Read a disk block. [bread() for meta-data]
408 * This algorithm described in Bach (p.54).
409 */
410 int
411 meta_bread(vp, blkno, size, cred, bpp)
412 struct vnode *vp;
413 daddr_t blkno;
414 int size;
415 struct ucred *cred;
416 struct buf **bpp;
417 {
418 register struct buf *bp;
419
420 /* Get buffer for block. */
421 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
422
423 /* Wait for the read to complete, and return result. */
424 return (biowait(bp));
425 }
426
427 /*
428 * Read-ahead multiple disk blocks. The first is sync, the rest async.
429 * Trivial modification to the breada algorithm presented in Bach (p.55).
430 */
431 int
432 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
433 struct vnode *vp;
434 daddr_t blkno; int size;
435 daddr_t rablks[]; int rasizes[];
436 int nrablks;
437 struct ucred *cred;
438 struct buf **bpp;
439 {
440 register struct buf *bp;
441 int i;
442
443 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
444
445 /*
446 * For each of the read-ahead blocks, start a read, if necessary.
447 */
448 for (i = 0; i < nrablks; i++) {
449 /* If it's in the cache, just go on to next one. */
450 if (incore(vp, rablks[i]))
451 continue;
452
453 /* Get a buffer for the read-ahead block */
454 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
455 }
456
457 /* Otherwise, we had to start a read for it; wait until it's valid. */
458 return (biowait(bp));
459 }
460
461 /*
462 * Read with single-block read-ahead. Defined in Bach (p.55), but
463 * implemented as a call to breadn().
464 * XXX for compatibility with old file systems.
465 */
466 int
467 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
468 struct vnode *vp;
469 daddr_t blkno; int size;
470 daddr_t rablkno; int rabsize;
471 struct ucred *cred;
472 struct buf **bpp;
473 {
474
475 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
476 }
477
478 /*
479 * Block write. Described in Bach (p.56)
480 */
481 int
482 bwrite(bp)
483 struct buf *bp;
484 {
485 int rv, sync, wasdelayed;
486 struct proc *p = current_proc();
487 upl_t upl;
488 upl_page_info_t *pl;
489 void * object;
490 kern_return_t kret;
491 struct vnode *vp = bp->b_vp;
492
493 /* Remember buffer type, to switch on it later. */
494 sync = !ISSET(bp->b_flags, B_ASYNC);
495 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
496 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
497
498 if (!sync) {
499 /*
500 * If not synchronous, pay for the I/O operation and make
501 * sure the buf is on the correct vnode queue. We have
502 * to do this now, because if we don't, the vnode may not
503 * be properly notified that its I/O has completed.
504 */
505 if (wasdelayed)
506 reassignbuf(bp, vp);
507 else
508 if (p && p->p_stats)
509 p->p_stats->p_ru.ru_oublock++; /* XXX */
510 }
511
512 trace(TR_BWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
513
514 /* Initiate disk write. Make sure the appropriate party is charged. */
515 SET(bp->b_flags, B_WRITEINPROG);
516 vp->v_numoutput++;
517
518 VOP_STRATEGY(bp);
519
520 if (sync) {
521 /*
522 * If I/O was synchronous, wait for it to complete.
523 */
524 rv = biowait(bp);
525
526 /*
527 * Pay for the I/O operation, if it's not been paid for, and
528 * make sure it's on the correct vnode queue. (Async operations
529 * were paid for above.)
530 */
531 if (wasdelayed)
532 reassignbuf(bp, vp);
533 else
534 if (p && p->p_stats)
535 p->p_stats->p_ru.ru_oublock++; /* XXX */
536
537 /* Release the buffer. */
538 brelse(bp);
539
540 return (rv);
541 } else {
542 return (0);
543 }
544 }
545
546 int
547 vn_bwrite(ap)
548 struct vop_bwrite_args *ap;
549 {
550 return (bwrite(ap->a_bp));
551 }
552
553 /*
554 * Delayed write.
555 *
556 * The buffer is marked dirty, but is not queued for I/O.
557 * This routine should be used when the buffer is expected
558 * to be modified again soon, typically a small write that
559 * partially fills a buffer.
560 *
561 * NB: magnetic tapes cannot be delayed; they must be
562 * written in the order that the writes are requested.
563 *
564 * Described in Leffler, et al. (pp. 208-213).
565 */
566 void
567 bdwrite(bp)
568 struct buf *bp;
569 {
570 struct proc *p = current_proc();
571 kern_return_t kret;
572 upl_t upl;
573 upl_page_info_t *pl;
574
575 /*
576 * If the block hasn't been seen before:
577 * (1) Mark it as having been seen,
578 * (2) Charge for the write,
579 * (3) Make sure it's on its vnode's correct block list.
580 */
581 if (!ISSET(bp->b_flags, B_DELWRI)) {
582 SET(bp->b_flags, B_DELWRI);
583 if (p && p->p_stats)
584 p->p_stats->p_ru.ru_oublock++; /* XXX */
585
586 reassignbuf(bp, bp->b_vp);
587 }
588
589
590 /* If this is a tape block, write the block now. */
591 if (ISSET(bp->b_flags, B_TAPE)) {
592 /* bwrite(bp); */
593 VOP_BWRITE(bp);
594 return;
595 }
596
597 /* Otherwise, the "write" is done, so mark and release the buffer. */
598 SET(bp->b_flags, B_DONE);
599 brelse(bp);
600 }
601
602 /*
603 * Asynchronous block write; just an asynchronous bwrite().
604 */
605 void
606 bawrite(bp)
607 struct buf *bp;
608 {
609
610 SET(bp->b_flags, B_ASYNC);
611 VOP_BWRITE(bp);
612 }
613
614 /*
615 * Release a buffer on to the free lists.
616 * Described in Bach (p. 46).
617 */
618 void
619 brelse(bp)
620 struct buf *bp;
621 {
622 struct bqueues *bufq;
623 int s;
624 long whichq;
625
626 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
627 bp->b_lblkno * PAGE_SIZE, bp, bp->b_data, bp->b_flags, 0);
628
629 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
630
631 /* IO is done. Cleanup the UPL state */
632 if (!ISSET(bp->b_flags, B_META)
633 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
634 kern_return_t kret;
635 upl_t upl;
636 upl_page_info_t *pl;
637 int upl_flags;
638
639 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
640 if ( !ISSET(bp->b_flags, B_INVAL)) {
641 void *object;
642 off_t file_offset;
643
644 object = ubc_getobject(bp->b_vp, UBC_NOREACTIVATE);
645 if (object == (void *)NULL)
646 panic("vmobject for vp is null");
647 if (bp->b_bufsize & 0xfff)
648 panic("list request is with less than 4k");
649
650 file_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
651
652 kret = vm_fault_list_request(object,
653 (vm_object_offset_t)file_offset, bp->b_bufsize,
654 &upl, NULL, 0,
655 (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS
656 | UPL_SET_INTERNAL));
657 if (kret != KERN_SUCCESS)
658 panic("brelse: Failed to get pagelists");
659 #ifdef UBC_DEBUG
660 upl_ubc_alias_set(upl, bp, 5);
661 #endif /* UBC_DEBUG */
662 } else
663 upl = (upl_t) 0;
664 } else {
665 upl = bp->b_pagelist;
666 kret = kernel_upl_unmap(kernel_map, upl);
667
668 if (kret != KERN_SUCCESS)
669 panic("kernel_upl_unmap failed");
670 bp->b_data = 0;
671 }
672 if (upl) {
673 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
674
675 if (bp->b_flags & (B_ERROR | B_INVAL)) {
676 if (bp->b_flags & (B_READ | B_INVAL))
677 upl_flags = UPL_ABORT_DUMP_PAGES;
678 else
679 upl_flags = 0;
680 kernel_upl_abort(upl, upl_flags);
681 } else {
682 if (ISSET(bp->b_flags, (B_DELWRI | B_WASDIRTY)))
683 upl_flags = UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
684 else
685 upl_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
686 kernel_upl_commit_range(upl, 0, bp->b_bufsize,
687 upl_flags
688 | UPL_COMMIT_INACTIVATE,
689 pl, MAX_UPL_TRANSFER);
690 }
691 s = splbio();
692 CLR(bp->b_flags, B_PAGELIST);
693 bp->b_pagelist = 0;
694 splx(s);
695 }
696 } else {
697 if(ISSET(bp->b_flags, B_PAGELIST))
698 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
699 }
700
701 /* Wake up any processes waiting for any buffer to become free. */
702 if (needbuffer) {
703 needbuffer = 0;
704 wakeup(&needbuffer);
705 }
706
707 /* Wake up any processes waiting for _this_ buffer to become free. */
708 if (ISSET(bp->b_flags, B_WANTED)) {
709 CLR(bp->b_flags, B_WANTED);
710 wakeup(bp);
711 }
712
713 /* Block disk interrupts. */
714 s = splbio();
715
716 /*
717 * Determine which queue the buffer should be on, then put it there.
718 */
719
720 /* If it's locked, don't report an error; try again later. */
721 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
722 CLR(bp->b_flags, B_ERROR);
723
724 /* If it's not cacheable, or an error, mark it invalid. */
725 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
726 SET(bp->b_flags, B_INVAL);
727
728 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
729 /*
730 * If it's invalid or empty, dissociate it from its vnode
731 * and put on the head of the appropriate queue.
732 */
733 if (bp->b_vp)
734 brelvp(bp);
735 CLR(bp->b_flags, B_DELWRI);
736 if (bp->b_bufsize <= 0)
737 whichq = BQ_EMPTY; /* no data */
738 else
739 whichq = BQ_AGE; /* invalid data */
740
741 bufq = &bufqueues[whichq];
742 binsheadfree(bp, bufq, whichq);
743 } else {
744 /*
745 * It has valid data. Put it on the end of the appropriate
746 * queue, so that it'll stick around for as long as possible.
747 */
748 if (ISSET(bp->b_flags, B_LOCKED))
749 whichq = BQ_LOCKED; /* locked in core */
750 else if (ISSET(bp->b_flags, B_META))
751 whichq = BQ_META; /* meta-data */
752 else if (ISSET(bp->b_flags, B_AGE))
753 whichq = BQ_AGE; /* stale but valid data */
754 else
755 whichq = BQ_LRU; /* valid data */
756
757 bufq = &bufqueues[whichq];
758 binstailfree(bp, bufq, whichq);
759 }
760
761 /* Unlock the buffer. */
762 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
763
764 /* Allow disk interrupts. */
765 splx(s);
766
767 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
768 bp, bp->b_data, bp->b_flags, 0, 0);
769 }
770
771 /*
772 * Determine if a block is in the cache.
773 * Just look on what would be its hash chain. If it's there, return
774 * a pointer to it, unless it's marked invalid. If it's marked invalid,
775 * we normally don't return the buffer, unless the caller explicitly
776 * wants us to.
777 */
778 struct buf *
779 incore(vp, blkno)
780 struct vnode *vp;
781 daddr_t blkno;
782 {
783 struct buf *bp;
784 int bufseen = 0;
785
786 bp = BUFHASH(vp, blkno)->lh_first;
787
788 /* Search hash chain */
789 for (; bp != NULL; bp = bp->b_hash.le_next, bufseen++) {
790 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
791 !ISSET(bp->b_flags, B_INVAL))
792 return (bp);
793 if(bufseen >= nbuf)
794 panic("walked more than nbuf in incore");
795
796 }
797
798 return (0);
799 }
800
801 /* XXX FIXME -- Update the comment to reflect the UBC changes -- */
802 /*
803 * Get a block of requested size that is associated with
804 * a given vnode and block offset. If it is found in the
805 * block cache, mark it as having been found, make it busy
806 * and return it. Otherwise, return an empty block of the
807 * correct size. It is up to the caller to ensure that the
808 * cached blocks are of the correct size.
809 */
810 struct buf *
811 getblk(vp, blkno, size, slpflag, slptimeo, operation)
812 register struct vnode *vp;
813 daddr_t blkno;
814 int size, slpflag, slptimeo, operation;
815 {
816 struct buf *bp;
817 int s, err;
818 upl_t upl;
819 upl_page_info_t *pl;
820 void * object;
821 kern_return_t kret;
822 void *pager;
823 off_t file_offset;
824 int error=0;
825 int pagedirty = 0;
826
827 start:
828 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
829 blkno * PAGE_SIZE, size, operation, 0, 0);
830
831 s = splbio();
832 if (bp = incore(vp, blkno)) {
833 /* Found in the Buffer Cache */
834 if (ISSET(bp->b_flags, B_BUSY)) {
835 /* but is busy */
836 switch (operation) {
837 case BLK_READ:
838 case BLK_WRITE:
839 case BLK_META:
840 SET(bp->b_flags, B_WANTED);
841 bufstats.bufs_busyincore++;
842 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
843 slptimeo);
844 splx(s);
845 /*
846 * Callers who call with PCATCH or timeout are
847 * willing to deal with the NULL pointer
848 */
849 if (err && ((slpflag & PCATCH) ||
850 ((err == EWOULDBLOCK) && slptimeo)))
851 return (NULL);
852 goto start;
853 /*NOTREACHED*/
854 break;
855
856 case BLK_PAGEIN:
857 /* pagein operation must not use getblk */
858 panic("getblk: pagein for incore busy buffer");
859 splx(s);
860 /*NOTREACHED*/
861 break;
862
863 case BLK_PAGEOUT:
864 /* pageout operation must not use getblk */
865 panic("getblk: pageout for incore busy buffer");
866 splx(s);
867 /*NOTREACHED*/
868 break;
869
870 default:
871 panic("getblk: %d unknown operation 1", operation);
872 /*NOTREACHED*/
873 break;
874 }
875 } else {
876 /* not busy */
877 SET(bp->b_flags, (B_BUSY | B_CACHE));
878 bremfree(bp);
879 bufstats.bufs_incore++;
880 splx(s);
881
882 allocbuf(bp, size);
883 if (ISSET(bp->b_flags, B_PAGELIST))
884 panic("pagelist buffer is not busy");
885
886 switch (operation) {
887 case BLK_READ:
888 case BLK_WRITE:
889 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
890
891 if (bp->b_bufsize & 0xfff)
892 panic("list request is with less than 4k");
893
894 object = ubc_getobject(vp, UBC_NOREACTIVATE);
895 if (object == (void *)NULL)
896 panic("vmobject for vp is null");
897
898 file_offset = ubc_blktooff(vp, bp->b_lblkno);
899
900 kret = vm_fault_list_request(object,
901 (vm_object_offset_t)file_offset, bp->b_bufsize,
902 &upl, NULL, 0,
903 (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_SET_INTERNAL));
904
905 if (kret != KERN_SUCCESS)
906 panic("Failed to get pagelists");
907
908 SET(bp->b_flags, B_PAGELIST);
909 bp->b_pagelist = upl;
910
911 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
912
913 if ( !upl_valid_page(pl, 0))
914 panic("getblk: incore buffer without valid page");
915
916 if (upl_dirty_page(pl, 0))
917 SET(bp->b_flags, B_WASDIRTY);
918 else
919 CLR(bp->b_flags, B_WASDIRTY);
920
921 kret = kernel_upl_map(kernel_map, upl, (vm_address_t *)&(bp->b_data));
922 if (kret != KERN_SUCCESS) {
923 panic("getblk: kernel_upl_map() "
924 "failed with (%d)", kret);
925 }
926 if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
927 }
928 break;
929
930 case BLK_META:
931 /*
932 * VM is not involved in I/O for meta-data;
933 * the buffer already has valid data
934 */
935 if(bp->b_data == 0)
936 panic("bp->b_data null incore buf=%x", bp);
937 break;
938
939 case BLK_PAGEIN:
940 case BLK_PAGEOUT:
941 panic("getblk: paging operation 1");
942 break;
943
944 default:
945 panic("getblk: %d unknown operation 2", operation);
946 /*NOTREACHED*/
947 break;
948 }
949 }
950 } else { /* not incore() */
951 int queue = BQ_EMPTY; /* Start with no preference */
952 splx(s);
953
954 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
955 !(UBCINFOEXISTS(vp))) {
956 operation = BLK_META;
957 }
958 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
959 goto start;
960 /*
961 * If it is meta-data, the queue may have been set to another
962 * type, so reset it and mark the buffer B_META so that it
963 * goes to the META queue when released.
964 * Also, if the vnode is not VREG, then it is meta-data.
965 */
966 if (operation == BLK_META) {
967 SET(bp->b_flags, B_META);
968 queue = BQ_META;
969 }
970 allocbuf(bp, size);
971
972 switch (operation) {
973 case BLK_META:
974 /* buffer data is invalid */
975
976 /*
977 * Insert in the hash so that incore() can find it
978 */
979 binshash(bp, BUFHASH(vp, blkno));
980 #if !ZALLOC_METADATA
981 if (bp->b_data)
982 panic("bp->b_data is not nul; %x",bp);
983 kret = kmem_alloc(kernel_map,
984 &bp->b_data, bp->b_bufsize);
985 if (kret != KERN_SUCCESS)
986 panic("getblk: kmem_alloc() returned %d", kret);
987 #endif /* ZALLOC_METADATA */
988
989 if(bp->b_data == 0)
990 panic("bp->b_data is null %x",bp);
991
992 bp->b_blkno = bp->b_lblkno = blkno;
993 s = splbio();
994 bgetvp(vp, bp);
995 bufstats.bufs_miss++;
996 splx(s);
997 if (bp->b_data == 0)
998 panic("b_data is 0: 2");
999
1000 /* wakeup the buffer */
1001 CLR(bp->b_flags, B_WANTED);
1002 wakeup(bp);
1003 break;
1004
1005 case BLK_READ:
1006 case BLK_WRITE:
1007 /*
1008 * Insert in the hash so that incore() can find it
1009 */
1010 binshash(bp, BUFHASH(vp, blkno));
1011 pager = ubc_getpager(vp);
1012 file_offset = ubc_blktooff(vp, blkno);
1013
1014 object = ubc_getobject(vp, UBC_NOREACTIVATE);
1015 if (object == (void *)NULL)
1016 panic("vmobject for vp is null");
1017 if (bp->b_bufsize & 0xfff)
1018 panic("list request is with less than 4k");
1019
1020 if (ISSET(bp->b_flags, B_PAGELIST))
1021 panic("B_PAGELIST in bp=%x",bp);
1022
1023 kret = vm_fault_list_request(object,
1024 (vm_object_offset_t)file_offset, bp->b_bufsize,
1025 &upl, NULL, 0,
1026 (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_SET_INTERNAL));
1027
1028 if (kret != KERN_SUCCESS)
1029 panic("Failed to get pagelists");
1030
1031 #ifdef UBC_DEBUG
1032 upl_ubc_alias_set(upl, bp, 4);
1033 #endif /* UBC_DEBUG */
1034 bp->b_blkno = bp->b_lblkno = blkno;
1035 bp->b_pagelist = upl;
1036
1037 SET(bp->b_flags, B_PAGELIST);
1038 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1039
1040 if (upl_valid_page(pl, 0)) {
1041 SET(bp->b_flags, B_CACHE | B_DONE);
1042 bufstats.bufs_vmhits++;
1043
1044 pagedirty = upl_dirty_page(pl, 0);
1045
1046 if (pagedirty)
1047 SET(bp->b_flags, B_WASDIRTY);
1048
1049 if (vp->v_tag == VT_NFS) {
1050 off_t f_offset;
1051 int valid_size;
1052
1053 bp->b_validoff = 0;
1054 bp->b_dirtyoff = 0;
1055
1056 f_offset = ubc_blktooff(vp, blkno);
1057
1058 if (f_offset > vp->v_ubcinfo->ui_size) {
1059 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1060 bp->b_validend = 0;
1061 bp->b_dirtyend = 0;
1062 } else {
1063 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1064 bp->b_validend = valid_size;
1065
1066 if (pagedirty)
1067 bp->b_dirtyend = valid_size;
1068 else
1069 bp->b_dirtyend = 0;
1070
1071 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1072 bp->b_validend, bp->b_dirtyend,
1073 (int)vp->v_ubcinfo->ui_size, 0, 0);
1074 }
1075 } else {
1076 bp->b_validoff = 0;
1077 bp->b_dirtyoff = 0;
1078
1079 if (pagedirty) {
1080 /* page is dirty */
1081 bp->b_validend = bp->b_bcount;
1082 bp->b_dirtyend = bp->b_bcount;
1083 } else {
1084 /* page is clean */
1085 bp->b_validend = bp->b_bcount;
1086 bp->b_dirtyend = 0;
1087 }
1088 }
1089 if (error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) {
1090 panic("VOP_BMAP failed in getblk");
1091 /*NOTREACHED*/
1092 /*
1093 * XXX: We probably should invalidate the VM Page
1094 */
1095 bp->b_error = error;
1096 SET(bp->b_flags, (B_ERROR | B_INVAL));
1097 /* undo B_DONE that was set before upl_commit() */
1098 CLR(bp->b_flags, B_DONE);
1099 brelse(bp);
1100 return (0);
1101 }
1102 } else {
1103 bufstats.bufs_miss++;
1104 }
1105 kret = kernel_upl_map(kernel_map, upl, (vm_address_t *)&(bp->b_data));
1106 if (kret != KERN_SUCCESS) {
1107 panic("getblk: kernel_upl_map() "
1108 "failed with (%d)", kret);
1109 }
1110 if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
1111
1112 s = splbio();
1113 bgetvp(vp, bp);
1114 splx(s);
1115
1116 break;
1117
1118 case BLK_PAGEIN:
1119 case BLK_PAGEOUT:
1120 panic("getblk: paging operation 2");
1121 break;
1122 default:
1123 panic("getblk: %d unknown operation 3", operation);
1124 /*NOTREACHED*/
1125 break;
1126 }
1127 }
1128
1129 if (bp->b_data == NULL)
1130 panic("getblk: bp->b_addr is null");
1131
1132 if (bp->b_bufsize & 0xfff) {
1133 #if ZALLOC_METADATA
1134 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1135 #endif /* ZALLOC_METADATA */
1136 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1137 }
1138
1139 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1140 bp, bp->b_data, bp->b_flags, 3, 0);
1141
1142 return (bp);
1143 }
1144
1145 /*
1146 * Get an empty, disassociated buffer of given size.
1147 */
1148 struct buf *
1149 geteblk(size)
1150 int size;
1151 {
1152 struct buf *bp;
1153 int queue = BQ_EMPTY;
1154 #if !ZALLOC_METADATA
1155 kern_return_t kret;
1156 vm_size_t desired_size = roundup(size, CLBYTES);
1157
1158 if (desired_size > MAXBSIZE)
1159 panic("geteblk: buffer larger than MAXBSIZE requested");
1160 #endif /* ZALLOC_METADATA */
1161
1162 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1163 ;
1164 #if ZALLOC_METADATA
1165 SET(bp->b_flags, (B_META|B_INVAL));
1166 #else
1167 SET(bp->b_flags, B_INVAL);
1168 #endif /* ZALLOC_METADATA */
1169
1170 #if DIAGNOSTIC
1171 assert(queue == BQ_EMPTY);
1172 #endif /* DIAGNOSTIC */
1173 /* XXX need to implement logic to deal with other queues */
1174
1175 #if !ZALLOC_METADATA
1176 /* Empty buffer - allocate pages */
1177 kret = kmem_alloc_aligned(kernel_map, &bp->b_data, desired_size);
1178 if (kret != KERN_SUCCESS)
1179 panic("geteblk: kmem_alloc_aligned returned %d", kret);
1180 #endif /* ZALLOC_METADATA */
1181
1182 binshash(bp, &invalhash);
1183 allocbuf(bp, size);
1184 bufstats.bufs_eblk++;
1185
1186 return (bp);
1187 }
1188
1189 #if ZALLOC_METADATA
1190 /*
1191 * Zones for the meta data buffers
1192 */
1193
1194 #define MINMETA 512
1195 #define MAXMETA 4096
1196
1197 struct meta_zone_entry {
1198 zone_t mz_zone;
1199 vm_size_t mz_size;
1200 vm_size_t mz_max;
1201 char *mz_name;
1202 };
1203
1204 struct meta_zone_entry meta_zones[] = {
1205 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1206 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1207 {NULL, (MINMETA * 3), 16 * (MINMETA * 3), "buf.1536" },
1208 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1209 {NULL, (MINMETA * 5), 16 * (MINMETA * 5), "buf.2560" },
1210 {NULL, (MINMETA * 6), 16 * (MINMETA * 6), "buf.3072" },
1211 {NULL, (MINMETA * 7), 16 * (MINMETA * 7), "buf.3584" },
1212 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1213 {NULL, 0, 0, "" } /* End */
1214 };
1215
1216 /*
1217 * Initialize the meta data zones
1218 */
1219 static void
1220 bufzoneinit(void)
1221 {
1222 int i;
1223
1224 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1225 meta_zones[i].mz_zone =
1226 zinit(meta_zones[i].mz_size,
1227 meta_zones[i].mz_max,
1228 PAGE_SIZE,
1229 meta_zones[i].mz_name);
1230 }
1231 }
1232
1233 static zone_t
1234 getbufzone(size_t size)
1235 {
1236 int i;
1237
1238 if (size % 512)
1239 panic("getbufzone: incorect size = %d", size);
1240
1241 i = (size / 512) - 1;
1242 return (meta_zones[i].mz_zone);
1243 }
1244 #endif /* ZALLOC_METADATA */
1245
1246 /*
1247 * With UBC, there is no need to expand / shrink the file data
1248 * buffer. The VM uses the same pages, hence no waste.
1249 * All the file data buffers can have one size.
1250 * In fact expand / shrink would be an expensive operation.
1251 *
1252 * The only exception to this is meta-data buffers. Most
1253 * meta-data operations are smaller than PAGE_SIZE. Having the
1254 * meta-data buffers grow and shrink as needed optimizes use
1255 * of kernel wired memory.
1256 */
1257
1258 int
1259 allocbuf(bp, size)
1260 struct buf *bp;
1261 int size;
1262 {
1263 vm_size_t desired_size;
1264
1265 desired_size = roundup(size, CLBYTES);
1266
1267 if(desired_size < PAGE_SIZE)
1268 desired_size = PAGE_SIZE;
1269 if (desired_size > MAXBSIZE)
1270 panic("allocbuf: buffer larger than MAXBSIZE requested");
1271
1272 #if ZALLOC_METADATA
1273 if (ISSET(bp->b_flags, B_META)) {
1274 kern_return_t kret;
1275 zone_t zprev, z;
1276 size_t nsize = roundup(size, MINMETA);
1277
1278 if (bp->b_data) {
1279 vm_offset_t elem = (vm_offset_t)bp->b_data;
1280
1281 if (ISSET(bp->b_flags, B_ZALLOC))
1282 if (bp->b_bufsize <= MAXMETA) {
1283 if (bp->b_bufsize < nsize) {
1284 /* reallocate to a bigger size */
1285 desired_size = nsize;
1286
1287 zprev = getbufzone(bp->b_bufsize);
1288 z = getbufzone(nsize);
1289 bp->b_data = (caddr_t)zalloc(z);
1290 if(bp->b_data == 0)
1291 panic("allocbuf: zalloc() returned NULL");
1292 bcopy(elem, bp->b_data, bp->b_bufsize);
1293 zfree(zprev, elem);
1294 } else {
1295 desired_size = bp->b_bufsize;
1296 }
1297 } else
1298 panic("allocbuf: B_ZALLOC set incorrectly");
1299 else
1300 if (bp->b_bufsize < desired_size) {
1301 /* reallocate to a bigger size */
1302 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1303 if (kret != KERN_SUCCESS)
1304 panic("allocbuf: kmem_alloc() returned %d", kret);
1305 if(bp->b_data == 0)
1306 panic("allocbuf: null b_data");
1307 bcopy(elem, bp->b_data, bp->b_bufsize);
1308 kmem_free(kernel_map, elem, bp->b_bufsize);
1309 } else {
1310 desired_size = bp->b_bufsize;
1311 }
1312 } else {
1313 /* new allocation */
1314 if (nsize <= MAXMETA) {
1315 desired_size = nsize;
1316 z = getbufzone(nsize);
1317 bp->b_data = (caddr_t)zalloc(z);
1318 if(bp->b_data == 0)
1319 panic("allocbuf: zalloc() returned NULL 2");
1320 SET(bp->b_flags, B_ZALLOC);
1321 } else {
1322 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1323 if (kret != KERN_SUCCESS)
1324 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1325 if(bp->b_data == 0)
1326 panic("allocbuf: null b_data 2");
1327 }
1328 }
1329 }
1330
1331 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1332 panic("allocbuf: bp->b_data is NULL");
1333 #endif /* ZALLOC_METADATA */
1334
1335 bp->b_bufsize = desired_size;
1336 bp->b_bcount = size;
1337 }
1338
1339 /*
1340 * Get a new buffer from one of the free lists.
1341 *
1342 * A request for a queue is passed in. The queue from which the buffer
1343 * was taken is returned. Out-of-range queue requests get BQ_EMPTY. A request
1344 * for BQUEUES means no preference; use heuristics in that case.
1345 * The heuristics are as follows:
1346 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1347 * If none is available, block until one is made available.
1348 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps
1349 * and pick the most stale buffer.
1350 * If the buffer found was marked for delayed write, start the async write
1351 * and restart the search.
1352 * Initialize the fields and disassociate the buffer from the vnode.
1353 * Remove the buffer from the hash. Return the buffer and the queue
1354 * on which it was found.
1355 */
1356
1357 static struct buf *
1358 getnewbuf(slpflag, slptimeo, queue)
1359 int slpflag, slptimeo;
1360 int *queue;
1361 {
1362 register struct buf *bp;
1363 register struct buf *lru_bp;
1364 register struct buf *age_bp;
1365 register struct buf *meta_bp;
1366 register int age_time, lru_time, bp_time, meta_time;
1367 int s;
1368 struct ucred *cred;
1369 int req = *queue; /* save it for restarts */
1370
1371 start:
1372 s = splbio();
1373
1374 /* invalid request gets empty queue */
1375 if ((*queue > BQUEUES) || (*queue < 0))
1376 *queue = BQ_EMPTY;
1377
1378 /* (*queue == BQUEUES) means no preference */
1379 if (*queue != BQUEUES) {
1380 /* Try for the requested queue first */
1381 bp = bufqueues[*queue].tqh_first;
1382 if (bp)
1383 goto found;
1384 }
1385
1386 /* Unable to use requested queue */
1387 age_bp = bufqueues[BQ_AGE].tqh_first;
1388 lru_bp = bufqueues[BQ_LRU].tqh_first;
1389 meta_bp = bufqueues[BQ_META].tqh_first;
1390
1391 if (!age_bp && !lru_bp && !meta_bp) { /* Unavailable on AGE, LRU, or META */
1392 /* Try the empty list first */
1393 bp = bufqueues[BQ_EMPTY].tqh_first;
1394 if (bp) {
1395 *queue = BQ_EMPTY;
1396 goto found;
1397 }
1398 #if DIAGNOSTIC
1399 /* with UBC this is a fatal condition */
1400 panic("getnewbuf: No useful buffers");
1401 #else
1402 /* Log this error condition */
1403 printf("getnewbuf: No useful buffers");
1404 #endif /* DIAGNOSTIC */
1405
1406 /* wait for a free buffer of any kind */
1407 needbuffer = 1;
1408 bufstats.bufs_sleeps++;
1409 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1410 splx(s);
1411 return (0);
1412 }
1413
1414 /* Buffer available either on AGE or LRU or META */
1415 bp = NULL;
1416 *queue = -1;
1417
1418 /* Buffer available either on AGE or LRU */
1419 if (!age_bp) {
1420 bp = lru_bp;
1421 *queue = BQ_LRU;
1422 } else if (!lru_bp) {
1423 bp = age_bp;
1424 *queue = BQ_AGE;
1425 } else { /* buffer available on both AGE and LRU */
1426 age_time = time.tv_sec - age_bp->b_timestamp;
1427 lru_time = time.tv_sec - lru_bp->b_timestamp;
1428 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1429 bp = age_bp;
1430 *queue = BQ_AGE;
1431 /*
1432 * we should probably re-timestamp everything in the
1433 * queues at this point with the current time
1434 */
1435 } else {
1436 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1437 bp = lru_bp;
1438 *queue = BQ_LRU;
1439 } else {
1440 bp = age_bp;
1441 *queue = BQ_AGE;
1442 }
1443 }
1444 }
1445
1446 if (!bp) { /* Neither on AGE nor on LRU */
1447 bp = meta_bp;
1448 *queue = BQ_META;
1449 } else if (meta_bp) {
1450 bp_time = time.tv_sec - bp->b_timestamp;
1451 meta_time = time.tv_sec - meta_bp->b_timestamp;
1452
1453 if (!(bp_time < 0) && !(meta_time < 0)) {
1454 /* time not set backwards */
1455 int bp_is_stale;
1456 bp_is_stale = (*queue == BQ_LRU) ?
1457 lru_is_stale : age_is_stale;
1458
1459 if ((meta_time >= meta_is_stale) &&
1460 (bp_time < bp_is_stale)) {
1461 bp = meta_bp;
1462 *queue = BQ_META;
1463 }
1464 }
1465 }
1466
1467 if (bp == NULL)
1468 panic("getnewbuf: null bp");
1469
1470 found:
1471 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1472 panic("getnewbuf: le_prev is deadbeef");
1473
1474 if(ISSET(bp->b_flags, B_BUSY))
1475 panic("getnewbuf reusing BUSY buf");
1476
1477 /* Clean it */
1478 if (bcleanbuf(bp)) {
1479 /* bawrite() issued, buffer not ready */
1480 splx(s);
1481 *queue = req;
1482 goto start;
1483 }
1484 splx(s);
1485 return (bp);
1486 }
1487 #include <mach/mach_types.h>
1488 #include <mach/memory_object_types.h>
1489
1490 /*
1491 * Clean a buffer.
1492 * Returns 0 if the buffer is ready to use,
1493 * Returns 1 if a bawrite() was issued to indicate
1494 * that the buffer is not ready.
1495 */
1496 int
1497 bcleanbuf(struct buf *bp)
1498 {
1499 int s;
1500 struct ucred *cred;
1501
1502 s = splbio();
1503
1504 /* Remove from the queue */
1505 bremfree(bp);
1506
1507 /* Buffer is no longer on free lists. */
1508 SET(bp->b_flags, B_BUSY);
1509
1510 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1511 panic("bcleanbuf: le_prev is deadbeef");
1512
1513 /* If buffer was a delayed write, start it, and return 1 */
1514 if (ISSET(bp->b_flags, B_DELWRI)) {
1515 splx(s);
1516 bawrite (bp);
1517 return (1);
1518 }
1519
1520 if (bp->b_vp)
1521 brelvp(bp);
1522 bremhash(bp);
1523 BLISTNONE(bp);
1524
1525 splx(s);
1526
1527 if (ISSET(bp->b_flags, B_META)) {
1528 #if ZALLOC_METADATA
1529 vm_offset_t elem = (vm_offset_t)bp->b_data;
1530 if (elem == 0)
1531 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1532
1533 if (ISSET(bp->b_flags, B_ZALLOC)) {
1534 if (bp->b_bufsize <= MAXMETA) {
1535 zone_t z;
1536
1537 z = getbufzone(bp->b_bufsize);
1538 bp->b_data = (caddr_t)0xdeadbeef;
1539 zfree(z, elem);
1540 CLR(bp->b_flags, B_ZALLOC);
1541 } else
1542 panic("bcleanbuf: B_ZALLOC set incorrectly");
1543 } else {
1544 bp->b_data = (caddr_t)0xdeadbeef;
1545 kmem_free(kernel_map, elem, bp->b_bufsize);
1546 }
1547 #else
1548 if (bp->b_data == 0)
1549 panic("bcleanbuf: bp->b_data == NULL for B_META buffer");
1550
1551 kmem_free(kernel_map, bp->b_data, bp->b_bufsize);
1552 #endif /* ZALLOC_METADATA */
1553 }
1554
1555 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1556
1557 /* disassociate us from our vnode, if we had one... */
1558 s = splbio();
1559
1560 /* clear out various other fields */
1561 bp->b_data = 0;
1562 bp->b_flags = B_BUSY;
1563 bp->b_dev = NODEV;
1564 bp->b_blkno = bp->b_lblkno = 0;
1565 bp->b_iodone = 0;
1566 bp->b_error = 0;
1567 bp->b_resid = 0;
1568 bp->b_bcount = 0;
1569 bp->b_dirtyoff = bp->b_dirtyend = 0;
1570 bp->b_validoff = bp->b_validend = 0;
1571
1572 /* nuke any credentials we were holding */
1573 cred = bp->b_rcred;
1574 if (cred != NOCRED) {
1575 bp->b_rcred = NOCRED;
1576 crfree(cred);
1577 }
1578 cred = bp->b_wcred;
1579 if (cred != NOCRED) {
1580 bp->b_wcred = NOCRED;
1581 crfree(cred);
1582 }
1583 splx(s);
1584 return (0);
1585 }
1586
1587
1588 /*
1589 * Wait for operations on the buffer to complete.
1590 * When they do, extract and return the I/O's error value.
1591 */
1592 int
1593 biowait(bp)
1594 struct buf *bp;
1595 {
1596 upl_t upl;
1597 upl_page_info_t *pl;
1598 int s;
1599 kern_return_t kret;
1600
1601 s = splbio();
1602 while (!ISSET(bp->b_flags, B_DONE))
1603 tsleep(bp, PRIBIO + 1, "biowait", 0);
1604 splx(s);
1605
1606 /* check for interruption of I/O (e.g. via NFS), then errors. */
1607 if (ISSET(bp->b_flags, B_EINTR)) {
1608 CLR(bp->b_flags, B_EINTR);
1609 return (EINTR);
1610 } else if (ISSET(bp->b_flags, B_ERROR))
1611 return (bp->b_error ? bp->b_error : EIO);
1612 else
1613 return (0);
1614 }
1615
1616 /*
1617 * Mark I/O complete on a buffer.
1618 *
1619 * If a callback has been requested, e.g. the pageout
1620 * daemon, do so. Otherwise, awaken waiting processes.
1621 *
1622 * [ Leffler, et al., says on p.247:
1623 * "This routine wakes up the blocked process, frees the buffer
1624 * for an asynchronous write, or, for a request by the pagedaemon
1625 * process, invokes a procedure specified in the buffer structure" ]
1626 *
1627 * In real life, the pagedaemon (or other system processes) wants
1628 * to do async stuff too, and doesn't want the buffer brelse()'d.
1629 * (for swap pager, that puts swap buffers on the free lists (!!!),
1630 * for the vn device, that puts malloc'd buffers on the free lists!)
1631 */
1632 void
1633 biodone(bp)
1634 struct buf *bp;
1635 {
1636 boolean_t funnel_state;
1637 int s;
1638
1639 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1640
1641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1642 bp, bp->b_data, bp->b_flags, 0, 0);
1643
1644 if (ISSET(bp->b_flags, B_DONE))
1645 panic("biodone already");
1646 SET(bp->b_flags, B_DONE); /* note that it's done */
1647 /*
1648 * I/O was done, so don't believe
1649 * the DIRTY state from VM anymore
1650 */
1651 CLR(bp->b_flags, B_WASDIRTY);
1652
1653 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1654 vwakeup(bp); /* wake up reader */
1655
1656 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1657 CLR(bp->b_flags, B_CALL); /* but note callout done */
1658 (*bp->b_iodone)(bp);
1659 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1660 brelse(bp);
1661 else { /* or just wakeup the buffer */
1662 CLR(bp->b_flags, B_WANTED);
1663 wakeup(bp);
1664 }
1665
1666 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1667 bp, bp->b_data, bp->b_flags, 0, 0);
1668
1669 thread_funnel_set(kernel_flock, funnel_state);
1670 }
1671
1672 /*
1673 * Return a count of buffers on the "locked" queue.
1674 */
1675 int
1676 count_lock_queue()
1677 {
1678 register struct buf *bp;
1679 register int n = 0;
1680
1681 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1682 bp = bp->b_freelist.tqe_next)
1683 n++;
1684 return (n);
1685 }
1686
1687 /*
1688 * Return a count of 'busy' buffers. Used at the time of shutdown.
1689 */
1690 int
1691 count_busy_buffers()
1692 {
1693 register struct buf *bp;
1694 register int nbusy = 0;
1695
1696 for (bp = &buf[nbuf]; --bp >= buf; )
1697 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1698 nbusy++;
1699 return (nbusy);
1700 }
1701
1702 #if 1 /*DIAGNOSTIC */
1703 /*
1704 * Print out statistics on the current allocation of the buffer pool.
1705 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1706 * in vfs_syscalls.c using sysctl.
1707 */
1708 void
1709 vfs_bufstats()
1710 {
1711 int s, i, j, count;
1712 register struct buf *bp;
1713 register struct bqueues *dp;
1714 int counts[MAXBSIZE/CLBYTES+1];
1715 static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY", "META" };
1716
1717 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1718 count = 0;
1719 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1720 counts[j] = 0;
1721 s = splbio();
1722 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1723 counts[bp->b_bufsize/CLBYTES]++;
1724 count++;
1725 }
1726 splx(s);
1727 printf("%s: total-%d", bname[i], count);
1728 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1729 if (counts[j] != 0)
1730 printf(", %d-%d", j * CLBYTES, counts[j]);
1731 printf("\n");
1732 }
1733 }
1734 #endif /* DIAGNOSTIC */
1735
1736
1737 struct buf *
1738 alloc_io_buf(vp)
1739 struct vnode *vp;
1740 {
1741 register struct buf *bp;
1742 int s;
1743
1744 s = splbio();
1745
1746 while ((bp = iobufqueue.tqh_first) == NULL) {
1747 need_iobuffer = 1;
1748 bufstats.bufs_iobufsleeps++;
1749 tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1750 }
1751 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1752 bp->b_timestamp = 0;
1753
1754 /* clear out various fields */
1755 bp->b_flags = B_BUSY;
1756 bp->b_blkno = bp->b_lblkno = 0;
1757 bp->b_iodone = 0;
1758 bp->b_error = 0;
1759 bp->b_resid = 0;
1760 bp->b_bcount = 0;
1761 bp->b_bufsize = 0;
1762 bp->b_vp = vp;
1763
1764 if (vp->v_type == VBLK || vp->v_type == VCHR)
1765 bp->b_dev = vp->v_rdev;
1766 else
1767 bp->b_dev = NODEV;
1768 bufstats.bufs_iobufinuse++;
1769 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1770 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1771 splx(s);
1772
1773 return (bp);
1774 }
1775
1776 void
1777 free_io_buf(bp)
1778 struct buf *bp;
1779 {
1780 int s;
1781
1782 s = splbio();
1783 /* put buffer back on the head of the iobufqueue */
1784 bp->b_vp = NULL;
1785 bp->b_flags = B_INVAL;
1786
1787 binsheadfree(bp, &iobufqueue, -1);
1788
1789 /* Wake up any processes waiting for any buffer to become free. */
1790 if (need_iobuffer) {
1791 need_iobuffer = 0;
1792 wakeup(&need_iobuffer);
1793 }
1794 bufstats.bufs_iobufinuse--;
1795 splx(s);
1796 }
1797
1798
1799 /* not hooked up yet */
1800
1801 /* XXX move this to a separate file */
1802 /*
1803 * Dynamic Scaling of the Buffer Queues
1804 */
1805
1806 typedef long long blsize_t;
1807
1808 blsize_t MAXNBUF; /* initialize to (mem_size / PAGE_SIZE) */
1809 /* Global tunable limits */
1810 blsize_t nbufh; /* number of buffer headers */
1811 blsize_t nbuflow; /* minimum number of buffer headers required */
1812 blsize_t nbufhigh; /* maximum number of buffer headers allowed */
1813 blsize_t nbuftarget; /* preferred number of buffer headers */
1814
1815 /*
1816 * assertions:
1817 *
1818 * 1. 0 < nbuflow <= nbufh <= nbufhigh
1819 * 2. nbufhigh <= MAXNBUF
1820 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
1821 * 4. nbufh can not be set by sysctl().
1822 */
1823
1824 /* Per queue tunable limits */
1825
1826 struct bufqlim {
1827 blsize_t bl_nlow; /* minimum number of buffer headers required */
1828 blsize_t bl_num; /* number of buffer headers on the queue */
1829 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
1830 blsize_t bl_target; /* preferred number of buffer headers */
1831 long bl_stale; /* Seconds after which a buffer is considered stale */
1832 } bufqlim[BQUEUES];
1833
1834 /*
1835 * assertions:
1836 *
1837 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
1838 * 2. bl_nlhigh <= MAXNBUF
1839 * 3. bufqlim[BQ_META].bl_nlow != 0
1840 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
1841 * file system IO operations)
1842 * 5. bl_num can not be set by sysctl().
1843 * 6. bl_nlhigh <= nbufhigh
1844 */
1845
1846 /*
1847 * Rationale:
1848 * ----------
1849 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
1850 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
1851 *
1852 * These limits are exported by means of sysctl().
1853 * It was decided to define blsize_t as a 64 bit quantity.
1854 * This will make sure that we will not be required to change it
1855 * as long as we do not exceed 64 bit address space for the kernel.
1856 *
1857 * low and high numbers parameters initialized at compile time
1858 * and boot arguments can be used to override them. sysctl()
1859 * would not change the value. sysctl() can get all the values
1860 * but can set only target. num is the current level.
1861 *
1862 * Advantages of having a "bufqscan" thread doing the balancing are:
1863 * Keep enough bufs on BQ_EMPTY.
1864 * getnewbuf() by default will always select a buffer from the BQ_EMPTY.
1865 * getnewbuf() performs best if a buffer was found there.
1866 * Also this minimizes the possibility of starting IO
1867 * from getnewbuf(). That's a performance win, too.
1868 *
1869 * Localize complex logic [balancing as well as time aging]
1870 * to balancebufq().
1871 *
1872 * Simplify getnewbuf() logic by elimination of time aging code.
1873 */
1874
1875 /*
1876 * Algorithm:
1877 * -----------
1878 * The goal of the dynamic scaling of the buffer queues is to keep
1879 * the size of the LRU close to bl_target. Buffers on a queue would
1880 * be time aged.
1881 *
1882 * There would be a thread which will be responsible for "balancing"
1883 * the buffer cache queues.
1884 *
1885 * The scan order would be: AGE, LRU, META, EMPTY.
1886 */
1887
1888 long bufqscanwait = 0;
1889
1890 extern void bufqscan_thread();
1891 extern int balancebufq(int q);
1892 extern int btrimempty(int n);
1893 extern int initbufqscan(void);
1894 extern int nextbufq(int q);
1895 extern void buqlimprt(int all);
1896
1897 void
1898 bufq_balance_thread_init()
1899 {
1900
1901 if (bufqscanwait++ == 0) {
1902 int i;
1903
1904 /* Initialize globals */
1905 MAXNBUF = (mem_size / PAGE_SIZE);
1906 nbufh = nbuf;
1907 nbuflow = min(nbufh, 100);
1908 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
1909 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
1910 nbuftarget = max(nbuflow, nbuftarget);
1911 nbuftarget = min(nbufhigh, nbuftarget);
1912
1913 /*
1914 * Initialize the bufqlim
1915 */
1916
1917 /* LOCKED queue */
1918 bufqlim[BQ_LOCKED].bl_nlow = 0;
1919 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
1920 bufqlim[BQ_LOCKED].bl_target = 0;
1921 bufqlim[BQ_LOCKED].bl_stale = 30;
1922
1923 /* LRU queue */
1924 bufqlim[BQ_LRU].bl_nlow = 0;
1925 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
1926 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
1927 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
1928
1929 /* AGE queue */
1930 bufqlim[BQ_AGE].bl_nlow = 0;
1931 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
1932 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
1933 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
1934
1935 /* EMPTY queue */
1936 bufqlim[BQ_EMPTY].bl_nlow = 0;
1937 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
1938 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
1939 bufqlim[BQ_EMPTY].bl_stale = 600000;
1940
1941 /* META queue */
1942 bufqlim[BQ_META].bl_nlow = 0;
1943 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
1944 bufqlim[BQ_META].bl_target = nbuftarget/4;
1945 bufqlim[BQ_META].bl_stale = META_IS_STALE;
1946
1947 buqlimprt(1);
1948 }
1949
1950 /* create worker thread */
1951 kernel_thread(kernel_task, bufqscan_thread);
1952 }
1953
1954 /* The workloop for the buffer balancing thread */
1955 void
1956 bufqscan_thread()
1957 {
1958 boolean_t funnel_state;
1959 int moretodo = 0;
1960
1961 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1962
1963 for(;;) {
1964 do {
1965 int q; /* buffer queue to process */
1966
1967 for (q = initbufqscan(); q; ) {
1968 moretodo |= balancebufq(q);
1969 q = nextbufq(q);
1970 }
1971 } while (moretodo);
1972
1973 #if 1 || DIAGNOSTIC
1974 vfs_bufstats();
1975 buqlimprt(0);
1976 #endif
1977 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
1978 moretodo = 0;
1979 }
1980
1981 (void) thread_funnel_set(kernel_flock, FALSE);
1982 }
1983
1984 /* Seed for the buffer queue balancing */
1985 int
1986 initbufqscan()
1987 {
1988 /* Start with AGE queue */
1989 return (BQ_AGE);
1990 }
1991
1992 /* Pick next buffer queue to balance */
1993 int
1994 nextbufq(int q)
1995 {
1996 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
1997
1998 q++;
1999 q %= sizeof(order);
2000 return (order[q]);
2001 }
2002
2003 /* function to balance the buffer queues */
2004 int
2005 balancebufq(int q)
2006 {
2007 int moretodo = 0;
2008 int s = splbio();
2009 int n;
2010
2011 /* reject invalid q */
2012 if ((q < 0) || (q >= BQUEUES))
2013 goto out;
2014
2015 /* LOCKED queue MUST not be balanced */
2016 if (q == BQ_LOCKED)
2017 goto out;
2018
2019 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2020
2021 /* If queue has less than target nothing more to do */
2022 if (n < 0)
2023 goto out;
2024
2025 if ( n > 8 ) {
2026 /* Balance only a small amount (12.5%) at a time */
2027 n >>= 3;
2028 }
2029
2030 /* EMPTY queue needs special handling */
2031 if (q == BQ_EMPTY) {
2032 moretodo |= btrimempty(n);
2033 goto out;
2034 }
2035
2036 for (; n > 0; n--) {
2037 struct buf *bp = bufqueues[q].tqh_first;
2038 if (!bp)
2039 break;
2040
2041 /* check if it's stale */
2042 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2043 if (bcleanbuf(bp)) {
2044 /* bawrite() issued, bp not ready */
2045 moretodo = 1;
2046 } else {
2047 /* release the cleaned buffer to BQ_EMPTY */
2048 SET(bp->b_flags, B_INVAL);
2049 brelse(bp);
2050 }
2051 } else
2052 break;
2053 }
2054
2055 out:
2056 splx(s);
2057 return (moretodo);
2058 }
2059
2060 int
2061 btrimempty(int n)
2062 {
2063 /*
2064 * When struct buf are allocated dynamically, this would
2065 * reclaim up to 'n' struct buf from the empty queue.
2066 */
2067
2068 return (0);
2069 }
2070
2071 void
2072 bufqinc(int q)
2073 {
2074 if ((q < 0) || (q >= BQUEUES))
2075 return;
2076
2077 bufqlim[q].bl_num++;
2078 return;
2079 }
2080
2081 void
2082 bufqdec(int q)
2083 {
2084 if ((q < 0) || (q >= BQUEUES))
2085 return;
2086
2087 bufqlim[q].bl_num--;
2088 return;
2089 }
2090
2091 void
2092 buqlimprt(int all)
2093 {
2094 int i;
2095 static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY", "META" };
2096
2097 if (all)
2098 for (i = 0; i < BQUEUES; i++) {
2099 printf("%s : ", bname[i]);
2100 printf("min = %d, ", (long)bufqlim[i].bl_nlow);
2101 printf("cur = %d, ", (long)bufqlim[i].bl_num);
2102 printf("max = %d, ", (long)bufqlim[i].bl_nlhigh);
2103 printf("target = %d, ", (long)bufqlim[i].bl_target);
2104 printf("stale after %d seconds\n", bufqlim[i].bl_stale);
2105 }
2106 else
2107 for (i = 0; i < BQUEUES; i++) {
2108 printf("%s : ", bname[i]);
2109 printf("cur = %d, ", (long)bufqlim[i].bl_num);
2110 }
2111 }