apple/xnu (xnu-201.19.3) - bsd/vfs/vfs_bio.c
1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*-
24 * Copyright (c) 1994 Christopher G. Demetriou
25 * Copyright (c) 1982, 1986, 1989, 1993
26 * The Regents of the University of California. All rights reserved.
27 * (c) UNIX System Laboratories, Inc.
28 * All or some portions of this file are derived from material licensed
29 * to the University of California by American Telephone and Telegraph
30 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
31 * the permission of UNIX System Laboratories, Inc.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * The NEXTSTEP Software License Agreement specifies the terms
62 * and conditions for redistribution.
63 *
64 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
65 */
66
67
68 /*
69 * Some references:
70 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
71 * Leffler, et al.: The Design and Implementation of the 4.3BSD
72 * UNIX Operating System (Addison-Wesley, 1989)
73 */
74 #define ZALLOC_METADATA 1
75
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/proc.h>
79 #include <sys/buf.h>
80 #include <sys/vnode.h>
81 #include <sys/mount.h>
82 #include <sys/trace.h>
83 #include <sys/malloc.h>
84 #include <sys/resourcevar.h>
85 #include <miscfs/specfs/specdev.h>
86 #include <sys/ubc.h>
87 #include <vm/vm_pageout.h>
88 #if DIAGNOSTIC
89 #include <kern/assert.h>
90 #endif /* DIAGNOSTIC */
91 #include <kern/task.h>
92 #include <kern/zalloc.h>
93
94 #include <sys/kdebug.h>
95
96 extern void bufqinc(int q);
97 extern void bufqdec(int q);
98 extern void bufq_balance_thread_init();
99
100 extern void reassignbuf(struct buf *, struct vnode *);
101 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
102
103 extern int niobuf; /* The number of IO buffer headers for cluster IO */
104 int blaundrycnt;
105
106 #if TRACE
107 struct proc *traceproc;
108 int tracewhich, tracebuf[TRCSIZ];
109 u_int tracex;
110 char traceflags[TR_NFLAGS];
111 #endif /* TRACE */
112
113 /*
114 * Definitions for the buffer hash lists.
115 */
116 #define BUFHASH(dvp, lbn) \
117 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
118 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
119 u_long bufhash;
120
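#if 0	/* illustrative sketch only -- not compiled */
/*
 * Editor's note: a minimal, hedged example of how BUFHASH() resolves a
 * (vnode, logical block) pair to a hash chain; it mirrors the walk that
 * incore() performs later in this file.  "example_vp" and "example_blkno"
 * are hypothetical names, not part of this source.
 */
static struct buf *
example_bufhash_lookup(struct vnode *example_vp, daddr_t example_blkno)
{
	struct buf *bp;

	for (bp = BUFHASH(example_vp, example_blkno)->lh_first;
	    bp != NULL; bp = bp->b_hash.le_next)
		if (bp->b_lblkno == example_blkno && bp->b_vp == example_vp &&
		    !ISSET(bp->b_flags, B_INVAL))
			return (bp);
	return (NULL);
}
#endif /* 0 */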
121 /* Definitions for the buffer stats. */
122 struct bufstats bufstats;
123
124 /*
125 * Insq/Remq for the buffer hash lists.
126 */
127 #if 0
128 #define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
129 #define bremhash(bp) LIST_REMOVE(bp, b_hash)
130 #endif /* 0 */
131
132
133 TAILQ_HEAD(ioqueue, buf) iobufqueue;
134 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
135 int needbuffer;
136 int need_iobuffer;
137
138 /*
139 * Insq/Remq for the buffer free lists.
140 */
141 #define binsheadfree(bp, dp, whichq) do { \
142 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
143 bufqinc((whichq)); \
144 (bp)->b_whichq = whichq; \
145 (bp)->b_timestamp = time.tv_sec; \
146 } while (0)
147
148 #define binstailfree(bp, dp, whichq) do { \
149 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
150 bufqinc((whichq)); \
151 (bp)->b_whichq = whichq; \
152 (bp)->b_timestamp = time.tv_sec; \
153 } while (0)
154
155 #define BHASHENTCHECK(bp) \
156 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
157 panic("%x: b_hash.le_prev is not deadbeef", (bp));
158
159 #define BLISTNONE(bp) \
160 (bp)->b_hash.le_next = (struct buf *)0; \
161 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
162
163 simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
164
165 /*
166 * Time in seconds before a buffer on a list is
167 * considered as a stale buffer
168 */
169 #define LRU_IS_STALE 120 /* default value for the LRU */
170 #define AGE_IS_STALE 60 /* default value for the AGE */
171 #define META_IS_STALE 180 /* default value for the BQ_META */
172
173 int lru_is_stale = LRU_IS_STALE;
174 int age_is_stale = AGE_IS_STALE;
175 int meta_is_stale = META_IS_STALE;
176
177 #if 1
178 void
179 blistenterhead(struct bufhashhdr * head, struct buf * bp)
180 {
181 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
182 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
183 (head)->lh_first = bp;
184 bp->b_hash.le_prev = &(head)->lh_first;
185 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
186 panic("blistenterhead: le_prev is deadbeef");
187
188 }
189 #endif
190
191 #if 1
192 void
193 binshash(struct buf *bp, struct bufhashhdr *dp)
194 {
195 int s;
196
197 struct buf *nbp;
198
199 simple_lock(&bufhashlist_slock);
200 #if 0
201 if(incore(bp->b_vp, bp->b_lblkno)) {
202 panic("adding to queue already existing element");
203 }
204 #endif /* 0 */
205 BHASHENTCHECK(bp);
206
207 nbp = dp->lh_first;
208 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
209 if(nbp == bp)
210 panic("buf already in hashlist");
211 }
212
213 #if 0
214 LIST_INSERT_HEAD(dp, bp, b_hash);
215 #else
216 blistenterhead(dp, bp);
217 #endif
218 simple_unlock(&bufhashlist_slock);
219 }
220
221 void
222 bremhash(struct buf *bp)
223 {
224 int s;
225
226 simple_lock(&bufhashlist_slock);
227 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
228 panic("bremhash le_prev is deadbeef");
229 if (bp->b_hash.le_next == bp)
230 panic("bremhash: next points to self");
231
232 if (bp->b_hash.le_next != NULL)
233 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
234 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
235 simple_unlock(&bufhashlist_slock);
236 }
237
238 #endif /* 1 */
239
240
241 /*
242 * Remove a buffer from the free list it's on
243 */
244 void
245 bremfree(bp)
246 struct buf *bp;
247 {
248 struct bqueues *dp = NULL;
249 int whichq = -1;
250
251 /*
252 * We only calculate the head of the freelist when removing
253 * the last element of the list as that is the only time that
254 * it is needed (e.g. to reset the tail pointer).
255 *
256 * NB: This makes an assumption about how tailq's are implemented.
257 */
258 if (bp->b_freelist.tqe_next == NULL) {
259 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
260 if (dp->tqh_last == &bp->b_freelist.tqe_next)
261 break;
262 if (dp == &bufqueues[BQUEUES])
263 panic("bremfree: lost tail");
264 }
265 TAILQ_REMOVE(dp, bp, b_freelist);
266 whichq = bp->b_whichq;
267 bufqdec(whichq);
268 bp->b_whichq = -1;
269 bp->b_timestamp = 0;
270 }
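#if 0	/* illustrative sketch only -- not compiled */
/*
 * Editor's note: the "NB" in bremfree() relies on the <sys/queue.h> TAILQ
 * layout -- for the last element of a queue, the head's tqh_last points at
 * that element's tqe_next field.  A minimal statement of that assumption,
 * with hypothetical "example_" names:
 */
static int
example_is_last_on_queue(struct bqueues *example_dp, struct buf *example_bp)
{
	return (example_bp->b_freelist.tqe_next == NULL &&
	    example_dp->tqh_last == &example_bp->b_freelist.tqe_next);
}
#endif /* 0 */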
271
272 static __inline__ void
273 bufhdrinit(struct buf *bp)
274 {
275 bzero((char *)bp, sizeof *bp);
276 bp->b_dev = NODEV;
277 bp->b_rcred = NOCRED;
278 bp->b_wcred = NOCRED;
279 bp->b_vnbufs.le_next = NOLIST;
280 bp->b_flags = B_INVAL;
281
282 return;
283 }
284
285 /*
286 * Initialize buffers and hash links for buffers.
287 */
288 void
289 bufinit()
290 {
291 register struct buf *bp;
292 register struct bqueues *dp;
293 register int i;
294 int metabuf;
295 long whichq;
296 static void bufzoneinit();
297 static void bcleanbuf_thread_init();
298
299 /* Initialize the buffer queues ('freelists') and the hash table */
300 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
301 TAILQ_INIT(dp);
302 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
303
304 simple_lock_init(&bufhashlist_slock );
305
306 metabuf = nbuf/8; /* reserved for meta buf */
307
308 /* Initialize the buffer headers */
309 for (i = 0; i < nbuf; i++) {
310 bp = &buf[i];
311 bufhdrinit(bp);
312
313 /*
314 * metabuf buffer headers on the meta-data list and
315 * rest of the buffer headers on the empty list
316 */
317 if (--metabuf)
318 whichq = BQ_META;
319 else
320 whichq = BQ_EMPTY;
321
322 BLISTNONE(bp);
323 dp = &bufqueues[whichq];
324 binsheadfree(bp, dp, whichq);
325 binshash(bp, &invalhash);
326 }
327
328 for (; i < nbuf + niobuf; i++) {
329 bp = &buf[i];
330 bufhdrinit(bp);
331 binsheadfree(bp, &iobufqueue, -1);
332 }
333
334 printf("using %d buffer headers and %d cluster IO buffer headers\n",
335 nbuf, niobuf);
336
337 /* Set up zones used by the buffer cache */
338 bufzoneinit();
339
340 /* start the bcleanbuf() thread */
341 bcleanbuf_thread_init();
342
343 #if 0 /* notyet */
344 /* create a thread to do dynamic buffer queue balancing */
345 bufq_balance_thread_init();
346 #endif /* XXX */
347 }
348
349 /* __inline */
350 struct buf *
351 bio_doread(vp, blkno, size, cred, async, queuetype)
352 struct vnode *vp;
353 daddr_t blkno;
354 int size;
355 struct ucred *cred;
356 int async;
357 int queuetype;
358 {
359 register struct buf *bp;
360 struct proc *p = current_proc();
361
362 bp = getblk(vp, blkno, size, 0, 0, queuetype);
363
364 /*
365 * If buffer does not have data valid, start a read.
366 * Note that if buffer is B_INVAL, getblk() won't return it.
367 * Therefore, it's valid if its I/O has completed or been delayed.
368 */
369 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
370 /* Start I/O for the buffer (keeping credentials). */
371 SET(bp->b_flags, B_READ | async);
372 if (cred != NOCRED && bp->b_rcred == NOCRED) {
373 /*
374 * NFS has embedded ucred.
375 * Can not crhold() here as that causes zone corruption
376 */
377 bp->b_rcred = crdup(cred);
378 }
379 VOP_STRATEGY(bp);
380
381 trace(TR_BREADMISS, pack(vp, size), blkno);
382
383 /* Pay for the read. */
384 if (p && p->p_stats)
385 p->p_stats->p_ru.ru_inblock++; /* XXX */
386 } else if (async) {
387 brelse(bp);
388 }
389
390 trace(TR_BREADHIT, pack(vp, size), blkno);
391
392 return (bp);
393 }
394 /*
395 * Read a disk block.
396 * This algorithm described in Bach (p.54).
397 */
398 int
399 bread(vp, blkno, size, cred, bpp)
400 struct vnode *vp;
401 daddr_t blkno;
402 int size;
403 struct ucred *cred;
404 struct buf **bpp;
405 {
406 register struct buf *bp;
407
408 /* Get buffer for block. */
409 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
410
411 /* Wait for the read to complete, and return result. */
412 return (biowait(bp));
413 }
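#if 0	/* illustrative sketch only -- not compiled */
/*
 * Editor's note: a hedged sketch of the typical caller pattern for bread()
 * -- read a block, check the error from biowait(), use the data, then
 * release the buffer.  "example_vp", "example_blkno" and "example_size"
 * are hypothetical.
 */
static int
example_read_block(struct vnode *example_vp, daddr_t example_blkno,
    int example_size)
{
	struct buf *bp;
	int error;

	error = bread(example_vp, example_blkno, example_size, NOCRED, &bp);
	if (error) {
		brelse(bp);	/* bread() always hands back a buffer to release */
		return (error);
	}
	/* ... inspect bp->b_data for up to bp->b_bcount bytes ... */
	brelse(bp);
	return (0);
}
#endif /* 0 */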
414
415 /*
416 * Read a disk block. [bread() for meta-data]
417 * This algorithm described in Bach (p.54).
418 */
419 int
420 meta_bread(vp, blkno, size, cred, bpp)
421 struct vnode *vp;
422 daddr_t blkno;
423 int size;
424 struct ucred *cred;
425 struct buf **bpp;
426 {
427 register struct buf *bp;
428
429 /* Get buffer for block. */
430 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
431
432 /* Wait for the read to complete, and return result. */
433 return (biowait(bp));
434 }
435
436 /*
437 * Read-ahead multiple disk blocks. The first is sync, the rest async.
438 * Trivial modification to the breada algorithm presented in Bach (p.55).
439 */
440 int
441 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
442 struct vnode *vp;
443 daddr_t blkno; int size;
444 daddr_t rablks[]; int rasizes[];
445 int nrablks;
446 struct ucred *cred;
447 struct buf **bpp;
448 {
449 register struct buf *bp;
450 int i;
451
452 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
453
454 /*
455 * For each of the read-ahead blocks, start a read, if necessary.
456 */
457 for (i = 0; i < nrablks; i++) {
458 /* If it's in the cache, just go on to next one. */
459 if (incore(vp, rablks[i]))
460 continue;
461
462 /* Get a buffer for the read-ahead block */
463 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
464 }
465
466 /* Otherwise, we had to start a read for it; wait until it's valid. */
467 return (biowait(bp));
468 }
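#if 0	/* illustrative sketch only -- not compiled */
/*
 * Editor's note: a sketch of read-ahead via breadn() -- the first block is
 * read synchronously, the two hypothetical read-ahead blocks asynchronously.
 * Names prefixed "example_" and the assumption of contiguous logical blocks
 * are invented for illustration.
 */
static int
example_read_with_readahead(struct vnode *example_vp, daddr_t example_blkno,
    int example_size, struct buf **example_bpp)
{
	daddr_t rablks[2];
	int rasizes[2];

	rablks[0] = example_blkno + 1;	rasizes[0] = example_size;
	rablks[1] = example_blkno + 2;	rasizes[1] = example_size;

	return (breadn(example_vp, example_blkno, example_size,
	    rablks, rasizes, 2, NOCRED, example_bpp));
}
#endif /* 0 */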
469
470 /*
471 * Read with single-block read-ahead. Defined in Bach (p.55), but
472 * implemented as a call to breadn().
473 * XXX for compatibility with old file systems.
474 */
475 int
476 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
477 struct vnode *vp;
478 daddr_t blkno; int size;
479 daddr_t rablkno; int rabsize;
480 struct ucred *cred;
481 struct buf **bpp;
482 {
483
484 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
485 }
486
487 /*
488 * Block write. Described in Bach (p.56)
489 */
490 int
491 bwrite(bp)
492 struct buf *bp;
493 {
494 int rv, sync, wasdelayed;
495 struct proc *p = current_proc();
496 upl_t upl;
497 upl_page_info_t *pl;
498 void * object;
499 kern_return_t kret;
500 struct vnode *vp = bp->b_vp;
501
502 /* Remember buffer type, to switch on it later. */
503 sync = !ISSET(bp->b_flags, B_ASYNC);
504 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
505 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
506
507 if (!sync) {
508 /*
509 * If not synchronous, pay for the I/O operation and make
510 * sure the buf is on the correct vnode queue. We have
511 * to do this now, because if we don't, the vnode may not
512 * be properly notified that its I/O has completed.
513 */
514 if (wasdelayed)
515 reassignbuf(bp, vp);
516 else
517 if (p && p->p_stats)
518 p->p_stats->p_ru.ru_oublock++; /* XXX */
519 }
520
521 trace(TR_BWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
522
523 /* Initiate disk write. Make sure the appropriate party is charged. */
524 SET(bp->b_flags, B_WRITEINPROG);
525 vp->v_numoutput++;
526
527 VOP_STRATEGY(bp);
528
529 if (sync) {
530 /*
531 * If I/O was synchronous, wait for it to complete.
532 */
533 rv = biowait(bp);
534
535 /*
536 * Pay for the I/O operation, if it hasn't been paid for, and
537 * make sure it's on the correct vnode queue. (Async operations
538 * were paid for above.)
539 */
540 if (wasdelayed)
541 reassignbuf(bp, vp);
542 else
543 if (p && p->p_stats)
544 p->p_stats->p_ru.ru_oublock++; /* XXX */
545
546 /* Release the buffer. */
547 brelse(bp);
548
549 return (rv);
550 } else {
551 return (0);
552 }
553 }
554
555 int
556 vn_bwrite(ap)
557 struct vop_bwrite_args *ap;
558 {
559 return (bwrite(ap->a_bp));
560 }
561
562 /*
563 * Delayed write.
564 *
565 * The buffer is marked dirty, but is not queued for I/O.
566 * This routine should be used when the buffer is expected
567 * to be modified again soon, typically a small write that
568 * partially fills a buffer.
569 *
570 * NB: magnetic tapes cannot be delayed; they must be
571 * written in the order that the writes are requested.
572 *
573 * Described in Leffler, et al. (pp. 208-213).
574 */
575 void
576 bdwrite(bp)
577 struct buf *bp;
578 {
579 struct proc *p = current_proc();
580 kern_return_t kret;
581 upl_t upl;
582 upl_page_info_t *pl;
583
584 /*
585 * If the block hasn't been seen before:
586 * (1) Mark it as having been seen,
587 * (2) Charge for the write,
588 * (3) Make sure it's on its vnode's correct block list.
589 */
590 if (!ISSET(bp->b_flags, B_DELWRI)) {
591 SET(bp->b_flags, B_DELWRI);
592 if (p && p->p_stats)
593 p->p_stats->p_ru.ru_oublock++; /* XXX */
594
595 reassignbuf(bp, bp->b_vp);
596 }
597
598
599 /* If this is a tape block, write the block now. */
600 if (ISSET(bp->b_flags, B_TAPE)) {
601 /* bwrite(bp); */
602 VOP_BWRITE(bp);
603 return;
604 }
605
606 /* Otherwise, the "write" is done, so mark and release the buffer. */
607 SET(bp->b_flags, B_DONE);
608 brelse(bp);
609 }
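#if 0	/* illustrative sketch only -- not compiled */
/*
 * Editor's note: a hedged sketch contrasting the write paths above.  A
 * small update that will likely be modified again soon is marked dirty
 * with bdwrite(); a change that must reach the disk before the caller
 * proceeds uses bwrite() and checks its result.  The helper name and the
 * "must_be_synchronous" flag are hypothetical.
 */
static int
example_update_block(struct buf *bp, int must_be_synchronous)
{
	/* ... caller has modified bp->b_data ... */
	if (must_be_synchronous)
		return (bwrite(bp));	/* issues the I/O and waits via biowait() */
	bdwrite(bp);			/* marks B_DELWRI and releases the buffer */
	return (0);
}
#endif /* 0 */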
610
611 /*
612 * Asynchronous block write; just an asynchronous bwrite().
613 */
614 void
615 bawrite(bp)
616 struct buf *bp;
617 {
618
619 SET(bp->b_flags, B_ASYNC);
620 VOP_BWRITE(bp);
621 }
622
623 /*
624 * Release a buffer on to the free lists.
625 * Described in Bach (p. 46).
626 */
627 void
628 brelse(bp)
629 struct buf *bp;
630 {
631 struct bqueues *bufq;
632 int s;
633 long whichq;
634
635 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
636 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
637 bp->b_flags, 0);
638
639 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
640
641 /* IO is done. Cleanup the UPL state */
642 if (!ISSET(bp->b_flags, B_META)
643 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
644 kern_return_t kret;
645 upl_t upl;
646 int upl_flags;
647
648 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
649 if ( !ISSET(bp->b_flags, B_INVAL)) {
650 kret = ubc_create_upl(bp->b_vp,
651 ubc_blktooff(bp->b_vp, bp->b_lblkno),
652 bp->b_bufsize,
653 &upl,
654 NULL,
655 UPL_PRECIOUS);
656 if (kret != KERN_SUCCESS)
657 panic("brelse: Failed to get pagelists");
658 #ifdef UBC_DEBUG
659 upl_ubc_alias_set(upl, bp, 5);
660 #endif /* UBC_DEBUG */
661 } else
662 upl = (upl_t) 0;
663 } else {
664 upl = bp->b_pagelist;
665 kret = ubc_upl_unmap(upl);
666
667 if (kret != KERN_SUCCESS)
668 panic("kernel_upl_unmap failed");
669 bp->b_data = 0;
670 }
671 if (upl) {
672 if (bp->b_flags & (B_ERROR | B_INVAL)) {
673 if (bp->b_flags & (B_READ | B_INVAL))
674 upl_flags = UPL_ABORT_DUMP_PAGES;
675 else
676 upl_flags = 0;
677 ubc_upl_abort(upl, upl_flags);
678 } else {
679 if (ISSET(bp->b_flags, B_NEEDCOMMIT))
680 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
681 else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
682 upl_flags = UPL_COMMIT_SET_DIRTY ;
683 else
684 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
685 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
686 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
687 }
688 s = splbio();
689 CLR(bp->b_flags, B_PAGELIST);
690 bp->b_pagelist = 0;
691 splx(s);
692 }
693 } else {
694 if(ISSET(bp->b_flags, B_PAGELIST))
695 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
696 }
697
698 /* Wake up any processes waiting for any buffer to become free. */
699 if (needbuffer) {
700 needbuffer = 0;
701 wakeup(&needbuffer);
702 }
703
704 /* Wake up any processes waiting for _this_ buffer to become free. */
705 if (ISSET(bp->b_flags, B_WANTED)) {
706 CLR(bp->b_flags, B_WANTED);
707 wakeup(bp);
708 }
709
710 /* Block disk interrupts. */
711 s = splbio();
712
713 /*
714 * Determine which queue the buffer should be on, then put it there.
715 */
716
717 /* If it's locked, don't report an error; try again later. */
718 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
719 CLR(bp->b_flags, B_ERROR);
720
721 /* If it's not cacheable, or an error, mark it invalid. */
722 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
723 SET(bp->b_flags, B_INVAL);
724
725 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
726 /*
727 * If it's invalid or empty, dissociate it from its vnode
728 * and put on the head of the appropriate queue.
729 */
730 if (bp->b_vp)
731 brelvp(bp);
732 CLR(bp->b_flags, B_DELWRI);
733 if (bp->b_bufsize <= 0)
734 whichq = BQ_EMPTY; /* no data */
735 else
736 whichq = BQ_AGE; /* invalid data */
737
738 bufq = &bufqueues[whichq];
739 binsheadfree(bp, bufq, whichq);
740 } else {
741 /*
742 * It has valid data. Put it on the end of the appropriate
743 * queue, so that it'll stick around for as long as possible.
744 */
745 if (ISSET(bp->b_flags, B_LOCKED))
746 whichq = BQ_LOCKED; /* locked in core */
747 else if (ISSET(bp->b_flags, B_META))
748 whichq = BQ_META; /* meta-data */
749 else if (ISSET(bp->b_flags, B_AGE))
750 whichq = BQ_AGE; /* stale but valid data */
751 else
752 whichq = BQ_LRU; /* valid data */
753
754 bufq = &bufqueues[whichq];
755 binstailfree(bp, bufq, whichq);
756 }
757
758 /* Unlock the buffer. */
759 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
760
761 /* Allow disk interrupts. */
762 splx(s);
763
764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
765 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
766 }
767
768 /*
769 * Determine if a block is in the cache.
770 * Just look on what would be its hash chain. If it's there, return
771 * a pointer to it, unless it's marked invalid. If it's marked invalid,
772 * we normally don't return the buffer, unless the caller explicitly
773 * wants us to.
774 */
775 struct buf *
776 incore(vp, blkno)
777 struct vnode *vp;
778 daddr_t blkno;
779 {
780 struct buf *bp;
781 int bufseen = 0;
782
783 bp = BUFHASH(vp, blkno)->lh_first;
784
785 /* Search hash chain */
786 for (; bp != NULL; bp = bp->b_hash.le_next, bufseen++) {
787 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
788 !ISSET(bp->b_flags, B_INVAL))
789 return (bp);
790 if(bufseen >= nbuf)
791 panic("walked more than nbuf in incore");
792
793 }
794
795 return (0);
796 }
797
798
799 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
800 /*
801 * Get a block of requested size that is associated with
802 * a given vnode and block offset. If it is found in the
803 * block cache, mark it as having been found, make it busy
804 * and return it. Otherwise, return an empty block of the
805 * correct size. It is up to the caller to ensure that the
806 * cached blocks are of the correct size.
807 */
808 struct buf *
809 getblk(vp, blkno, size, slpflag, slptimeo, operation)
810 register struct vnode *vp;
811 daddr_t blkno;
812 int size, slpflag, slptimeo, operation;
813 {
814 struct buf *bp;
815 int s, err;
816 upl_t upl;
817 upl_page_info_t *pl;
818 kern_return_t kret;
819 int error=0;
820 int pagedirty = 0;
821
822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
823 blkno * PAGE_SIZE, size, operation, 0, 0);
824 start:
825
826 s = splbio();
827 if (bp = incore(vp, blkno)) {
828 /* Found in the Buffer Cache */
829 if (ISSET(bp->b_flags, B_BUSY)) {
830 /* but is busy */
831 switch (operation) {
832 case BLK_READ:
833 case BLK_WRITE:
834 case BLK_META:
835 SET(bp->b_flags, B_WANTED);
836 bufstats.bufs_busyincore++;
837 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
838 slptimeo);
839 splx(s);
840 /*
841 * Callers who call with PCATCH or timeout are
842 * willing to deal with the NULL pointer
843 */
844 if (err && ((slpflag & PCATCH) ||
845 ((err == EWOULDBLOCK) && slptimeo)))
846 return (NULL);
847 goto start;
848 /*NOTREACHED*/
849 break;
850
851 case BLK_PAGEIN:
852 /* pagein operation must not use getblk */
853 panic("getblk: pagein for incore busy buffer");
854 splx(s);
855 /*NOTREACHED*/
856 break;
857
858 case BLK_PAGEOUT:
859 /* pageout operation must not use getblk */
860 panic("getblk: pageout for incore busy buffer");
861 splx(s);
862 /*NOTREACHED*/
863 break;
864
865 default:
866 panic("getblk: %d unknown operation 1", operation);
867 /*NOTREACHED*/
868 break;
869 }
870 } else {
871 /* not busy */
872 SET(bp->b_flags, (B_BUSY | B_CACHE));
873 bremfree(bp);
874 bufstats.bufs_incore++;
875 splx(s);
876
877 allocbuf(bp, size);
878 if (ISSET(bp->b_flags, B_PAGELIST))
879 panic("pagelist buffer is not busy");
880
881 switch (operation) {
882 case BLK_READ:
883 case BLK_WRITE:
884 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
885 kret = ubc_create_upl(vp,
886 ubc_blktooff(vp, bp->b_lblkno),
887 bp->b_bufsize,
888 &upl,
889 &pl,
890 UPL_PRECIOUS);
891 if (kret != KERN_SUCCESS)
892 panic("Failed to get pagelists");
893
894 SET(bp->b_flags, B_PAGELIST);
895 bp->b_pagelist = upl;
896
897 if (!upl_valid_page(pl, 0)) {
898 if (vp->v_tag != VT_NFS)
899 panic("getblk: incore buffer without valid page");
900 CLR(bp->b_flags, B_CACHE);
901 }
902
903 if (upl_dirty_page(pl, 0))
904 SET(bp->b_flags, B_WASDIRTY);
905 else
906 CLR(bp->b_flags, B_WASDIRTY);
907
908 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
909 if (kret != KERN_SUCCESS) {
910 panic("getblk: ubc_upl_map() failed with (%d)",
911 kret);
912 }
913 if (bp->b_data == 0) panic("ubc_upl_map mapped 0");
914 }
915 break;
916
917 case BLK_META:
918 /*
919 * VM is not involved in I/O for the meta-data;
920 * the buffer already has valid data
921 */
922 if(bp->b_data == 0)
923 panic("bp->b_data null incore buf=%x", bp);
924 break;
925
926 case BLK_PAGEIN:
927 case BLK_PAGEOUT:
928 panic("getblk: paging operation 1");
929 break;
930
931 default:
932 panic("getblk: %d unknown operation 2", operation);
933 /*NOTREACHED*/
934 break;
935 }
936 }
937 } else { /* not incore() */
938 int queue = BQ_EMPTY; /* Start with no preference */
939 splx(s);
940
941 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
942 !(UBCINFOEXISTS(vp))) {
943 operation = BLK_META;
944 }
945 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
946 goto start;
947 if (incore(vp, blkno)) {
948 SET(bp->b_flags, B_INVAL);
949 binshash(bp, &invalhash);
950 brelse(bp);
951 goto start;
952 }
953
954 /*
955 * If it is meta-data, the queue may have been set to another
956 * type, so reset it and mark the buffer B_META so that when the
957 * buffer is released it will go to the META queue.
958 * Also, if the vnode is not VREG, then it is meta-data.
959 */
960 if (operation == BLK_META) {
961 SET(bp->b_flags, B_META);
962 queue = BQ_META;
963 }
964 /*
965 * Insert in the hash so that incore() can find it
966 */
967 binshash(bp, BUFHASH(vp, blkno));
968
969 allocbuf(bp, size);
970
971 switch (operation) {
972 case BLK_META:
973 /* buffer data is invalid */
974
975 #if !ZALLOC_METADATA
976 if (bp->b_data)
977 panic("bp->b_data is not nul; %x",bp);
978 kret = kmem_alloc(kernel_map,
979 &bp->b_data, bp->b_bufsize);
980 if (kret != KERN_SUCCESS)
981 panic("getblk: kmem_alloc() returned %d", kret);
982 #endif /* ZALLOC_METADATA */
983
984 if(bp->b_data == 0)
985 panic("bp->b_data is null %x",bp);
986
987 bp->b_blkno = bp->b_lblkno = blkno;
988 s = splbio();
989 bgetvp(vp, bp);
990 bufstats.bufs_miss++;
991 splx(s);
992 if (bp->b_data == 0)
993 panic("b_data is 0: 2");
994
995 /* wakeup the buffer */
996 CLR(bp->b_flags, B_WANTED);
997 wakeup(bp);
998 break;
999
1000 case BLK_READ:
1001 case BLK_WRITE:
1002
1003 if (ISSET(bp->b_flags, B_PAGELIST))
1004 panic("B_PAGELIST in bp=%x",bp);
1005
1006 kret = ubc_create_upl(vp,
1007 ubc_blktooff(vp, blkno),
1008 bp->b_bufsize,
1009 &upl,
1010 &pl,
1011 UPL_PRECIOUS);
1012 if (kret != KERN_SUCCESS)
1013 panic("Failed to get pagelists");
1014
1015 #ifdef UBC_DEBUG
1016 upl_ubc_alias_set(upl, bp, 4);
1017 #endif /* UBC_DEBUG */
1018 bp->b_blkno = bp->b_lblkno = blkno;
1019 bp->b_pagelist = upl;
1020
1021 SET(bp->b_flags, B_PAGELIST);
1022
1023 if (upl_valid_page(pl, 0)) {
1024 SET(bp->b_flags, B_CACHE | B_DONE);
1025 bufstats.bufs_vmhits++;
1026
1027 pagedirty = upl_dirty_page(pl, 0);
1028
1029 if (pagedirty)
1030 SET(bp->b_flags, B_WASDIRTY);
1031
1032 if (vp->v_tag == VT_NFS) {
1033 off_t f_offset;
1034 int valid_size;
1035
1036 bp->b_validoff = 0;
1037 bp->b_dirtyoff = 0;
1038
1039 f_offset = ubc_blktooff(vp, blkno);
1040
1041 if (f_offset > vp->v_ubcinfo->ui_size) {
1042 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1043 bp->b_validend = 0;
1044 bp->b_dirtyend = 0;
1045 } else {
1046 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1047 bp->b_validend = valid_size;
1048
1049 if (pagedirty)
1050 bp->b_dirtyend = valid_size;
1051 else
1052 bp->b_dirtyend = 0;
1053
1054 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1055 bp->b_validend, bp->b_dirtyend,
1056 (int)vp->v_ubcinfo->ui_size, 0, 0);
1057 }
1058 } else {
1059 bp->b_validoff = 0;
1060 bp->b_dirtyoff = 0;
1061
1062 if (pagedirty) {
1063 /* page is dirty */
1064 bp->b_validend = bp->b_bcount;
1065 bp->b_dirtyend = bp->b_bcount;
1066 } else {
1067 /* page is clean */
1068 bp->b_validend = bp->b_bcount;
1069 bp->b_dirtyend = 0;
1070 }
1071 }
1072 if (error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) {
1073 panic("VOP_BMAP failed in getblk");
1074 /*NOTREACHED*/
1075 /*
1076 * XXX: We probably should invalidate the VM Page
1077 */
1078 bp->b_error = error;
1079 SET(bp->b_flags, (B_ERROR | B_INVAL));
1080 /* undo B_DONE that was set before upl_commit() */
1081 CLR(bp->b_flags, B_DONE);
1082 brelse(bp);
1083 return (0);
1084 }
1085 } else {
1086 bufstats.bufs_miss++;
1087 }
1088 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1089 if (kret != KERN_SUCCESS) {
1090 panic("getblk: ubc_upl_map() "
1091 "failed with (%d)", kret);
1092 }
1093 if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
1094
1095 s = splbio();
1096 bgetvp(vp, bp);
1097 splx(s);
1098
1099 break;
1100
1101 case BLK_PAGEIN:
1102 case BLK_PAGEOUT:
1103 panic("getblk: paging operation 2");
1104 break;
1105 default:
1106 panic("getblk: %d unknown operation 3", operation);
1107 /*NOTREACHED*/
1108 break;
1109 }
1110 }
1111
1112 if (bp->b_data == NULL)
1113 panic("getblk: bp->b_addr is null");
1114
1115 if (bp->b_bufsize & 0xfff) {
1116 #if ZALLOC_METADATA
1117 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1118 #endif /* ZALLOC_METADATA */
1119 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1120 }
1121
1122 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1123 (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1124
1125 return (bp);
1126 }
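#if 0	/* illustrative sketch only -- not compiled */
/*
 * Editor's note: a sketch of the classic "allocate and overwrite" use of
 * getblk() -- when the caller will rewrite the whole block, it grabs the
 * buffer directly (no bread()), fills b_data, and writes it out.
 * "example_" names are hypothetical.
 */
static int
example_overwrite_block(struct vnode *example_vp, daddr_t example_blkno,
    int example_size)
{
	struct buf *bp;

	bp = getblk(example_vp, example_blkno, example_size, 0, 0, BLK_WRITE);
	bzero(bp->b_data, bp->b_bcount);
	/* ... fill in the new contents of bp->b_data ... */
	return (bwrite(bp));
}
#endif /* 0 */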
1127
1128 /*
1129 * Get an empty, disassociated buffer of given size.
1130 */
1131 struct buf *
1132 geteblk(size)
1133 int size;
1134 {
1135 struct buf *bp;
1136 int queue = BQ_EMPTY;
1137 #if !ZALLOC_METADATA
1138 kern_return_t kret;
1139 vm_size_t desired_size = roundup(size, CLBYTES);
1140
1141 if (desired_size > MAXBSIZE)
1142 panic("geteblk: buffer larger than MAXBSIZE requested");
1143 #endif /* ZALLOC_METADATA */
1144
1145 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1146 ;
1147 #if ZALLOC_METADATA
1148 SET(bp->b_flags, (B_META|B_INVAL));
1149 #else
1150 SET(bp->b_flags, B_INVAL);
1151 #endif /* ZALLOC_METADATA */
1152
1153 #if DIAGNOSTIC
1154 assert(queue == BQ_EMPTY);
1155 #endif /* DIAGNOSTIC */
1156 /* XXX need to implement logic to deal with other queues */
1157
1158 #if !ZALLOC_METADATA
1159 /* Empty buffer - allocate pages */
1160 kret = kmem_alloc_aligned(kernel_map, &bp->b_data, desired_size);
1161 if (kret != KERN_SUCCESS)
1162 panic("geteblk: kmem_alloc_aligned returned %d", kret);
1163 #endif /* ZALLOC_METADATA */
1164
1165 binshash(bp, &invalhash);
1166 allocbuf(bp, size);
1167 bufstats.bufs_eblk++;
1168
1169 return (bp);
1170 }
1171
1172 #if ZALLOC_METADATA
1173 /*
1174 * Zones for the meta data buffers
1175 */
1176
1177 #define MINMETA 512
1178 #define MAXMETA 4096
1179
1180 struct meta_zone_entry {
1181 zone_t mz_zone;
1182 vm_size_t mz_size;
1183 vm_size_t mz_max;
1184 char *mz_name;
1185 };
1186
1187 struct meta_zone_entry meta_zones[] = {
1188 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1189 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1190 {NULL, (MINMETA * 3), 16 * (MINMETA * 3), "buf.1536" },
1191 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1192 {NULL, (MINMETA * 5), 16 * (MINMETA * 5), "buf.2560" },
1193 {NULL, (MINMETA * 6), 16 * (MINMETA * 6), "buf.3072" },
1194 {NULL, (MINMETA * 7), 16 * (MINMETA * 7), "buf.3584" },
1195 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1196 {NULL, 0, 0, "" } /* End */
1197 };
1198 #endif /* ZALLOC_METADATA */
1199
1200 zone_t buf_hdr_zone;
1201 int buf_hdr_count;
1202
1203 /*
1204 * Initialize the meta data zones
1205 */
1206 static void
1207 bufzoneinit(void)
1208 {
1209 #if ZALLOC_METADATA
1210 int i;
1211
1212 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1213 meta_zones[i].mz_zone =
1214 zinit(meta_zones[i].mz_size,
1215 meta_zones[i].mz_max,
1216 PAGE_SIZE,
1217 meta_zones[i].mz_name);
1218 }
1219 #endif /* ZALLOC_METADATA */
1220 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1221 }
1222
1223 #if ZALLOC_METADATA
1224 static zone_t
1225 getbufzone(size_t size)
1226 {
1227 int i;
1228
1229 if (size % 512)
1230 panic("getbufzone: incorrect size = %d", size);
1231
1232 i = (size / 512) - 1;
1233 return (meta_zones[i].mz_zone);
1234 }
1235 #endif /* ZALLOC_METADATA */
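#if 0	/* illustrative sketch only -- not compiled */
/*
 * Editor's note: the mapping performed by getbufzone() above is simply
 * (size / 512) - 1 into meta_zones[], so a 512-byte request maps to
 * meta_zones[0] ("buf.512") and a 2048-byte request to meta_zones[3]
 * ("buf.2048").  A hypothetical caller for a 1536-byte meta-data
 * allocation would look like this:
 */
static caddr_t
example_alloc_meta_1536(void)
{
	zone_t z = getbufzone(1536);	/* meta_zones[2], "buf.1536" */

	return ((caddr_t)zalloc(z));
}
#endif /* 0 */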
1236
1237 /*
1238 * With UBC, there is no need to expand / shrink the file data
1239 * buffer. The VM uses the same pages, hence no waste.
1240 * All the file data buffers can have one size.
1241 * In fact expand / shrink would be an expensive operation.
1242 *
1243 * Only exception to this is meta-data buffers. Most of the
1244 * meta data operations are smaller than PAGE_SIZE. Having the
1245 * meta-data buffers grow and shrink as needed, optimizes use
1246 * of the kernel wired memory.
1247 */
1248
1249 int
1250 allocbuf(bp, size)
1251 struct buf *bp;
1252 int size;
1253 {
1254 vm_size_t desired_size;
1255
1256 desired_size = roundup(size, CLBYTES);
1257
1258 if(desired_size < PAGE_SIZE)
1259 desired_size = PAGE_SIZE;
1260 if (desired_size > MAXBSIZE)
1261 panic("allocbuf: buffer larger than MAXBSIZE requested");
1262
1263 #if ZALLOC_METADATA
1264 if (ISSET(bp->b_flags, B_META)) {
1265 kern_return_t kret;
1266 zone_t zprev, z;
1267 size_t nsize = roundup(size, MINMETA);
1268
1269 if (bp->b_data) {
1270 vm_offset_t elem = (vm_offset_t)bp->b_data;
1271
1272 if (ISSET(bp->b_flags, B_ZALLOC))
1273 if (bp->b_bufsize <= MAXMETA) {
1274 if (bp->b_bufsize < nsize) {
1275 /* reallocate to a bigger size */
1276 desired_size = nsize;
1277
1278 zprev = getbufzone(bp->b_bufsize);
1279 z = getbufzone(nsize);
1280 bp->b_data = (caddr_t)zalloc(z);
1281 if(bp->b_data == 0)
1282 panic("allocbuf: zalloc() returned NULL");
1283 bcopy(elem, bp->b_data, bp->b_bufsize);
1284 zfree(zprev, elem);
1285 } else {
1286 desired_size = bp->b_bufsize;
1287 }
1288 } else
1289 panic("allocbuf: B_ZALLOC set incorrectly");
1290 else
1291 if (bp->b_bufsize < desired_size) {
1292 /* reallocate to a bigger size */
1293 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1294 if (kret != KERN_SUCCESS)
1295 panic("allocbuf: kmem_alloc() returned %d", kret);
1296 if(bp->b_data == 0)
1297 panic("allocbuf: null b_data");
1298 bcopy(elem, bp->b_data, bp->b_bufsize);
1299 kmem_free(kernel_map, elem, bp->b_bufsize);
1300 } else {
1301 desired_size = bp->b_bufsize;
1302 }
1303 } else {
1304 /* new allocation */
1305 if (nsize <= MAXMETA) {
1306 desired_size = nsize;
1307 z = getbufzone(nsize);
1308 bp->b_data = (caddr_t)zalloc(z);
1309 if(bp->b_data == 0)
1310 panic("allocbuf: zalloc() returned NULL 2");
1311 SET(bp->b_flags, B_ZALLOC);
1312 } else {
1313 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1314 if (kret != KERN_SUCCESS)
1315 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1316 if(bp->b_data == 0)
1317 panic("allocbuf: null b_data 2");
1318 }
1319 }
1320 }
1321
1322 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1323 panic("allocbuf: bp->b_data is NULL");
1324 #endif /* ZALLOC_METADATA */
1325
1326 bp->b_bufsize = desired_size;
1327 bp->b_bcount = size;
1328 }
1329
1330 /*
1331 * Get a new buffer from one of the free lists.
1332 *
1333 * A request for a queue is passed in. The queue from which the buffer was
1334 * taken is returned. Out-of-range queue requests get BQ_EMPTY. A request for
1335 * BQUEUES means no preference; use heuristics in that case.
1336 * The heuristic is as follows:
1337 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1338 * If none available block till one is made available.
1339 * If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
1340 * Pick the most stale buffer.
1341 * If the found buffer was marked delayed write, start the async write
1342 * and restart the search.
1343 * Initialize the fields and disassociate the buffer from the vnode.
1344 * Remove the buffer from the hash. Return the buffer and the queue
1345 * on which it was found.
1346 */
1347
1348 static struct buf *
1349 getnewbuf(slpflag, slptimeo, queue)
1350 int slpflag, slptimeo;
1351 int *queue;
1352 {
1353 register struct buf *bp;
1354 register struct buf *lru_bp;
1355 register struct buf *age_bp;
1356 register struct buf *meta_bp;
1357 register int age_time, lru_time, bp_time, meta_time;
1358 int s;
1359 struct ucred *cred;
1360 int req = *queue; /* save it for restarts */
1361
1362 start:
1363 s = splbio();
1364
1365 /* invalid request gets empty queue */
1366 if ((*queue > BQUEUES) || (*queue < 0)
1367 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1368 *queue = BQ_EMPTY;
1369
1370 /* (*queue == BQUEUES) means no preference */
1371 if (*queue != BQUEUES) {
1372 /* Try for the requested queue first */
1373 bp = bufqueues[*queue].tqh_first;
1374 if (bp)
1375 goto found;
1376 }
1377
1378 /* Unable to use requested queue */
1379 age_bp = bufqueues[BQ_AGE].tqh_first;
1380 lru_bp = bufqueues[BQ_LRU].tqh_first;
1381 meta_bp = bufqueues[BQ_META].tqh_first;
1382
1383 if (!age_bp && !lru_bp && !meta_bp) { /* Unavailable on AGE, LRU, or META */
1384 /* Try the empty list first */
1385 bp = bufqueues[BQ_EMPTY].tqh_first;
1386 if (bp) {
1387 *queue = BQ_EMPTY;
1388 goto found;
1389 }
1390
1391 /* Create a new temporary buffer header */
1392 bp = (struct buf *)zalloc(buf_hdr_zone);
1393
1394 if (bp) {
1395 bufhdrinit(bp);
1396 BLISTNONE(bp);
1397 binshash(bp, &invalhash);
1398 SET(bp->b_flags, B_HDRALLOC);
1399 *queue = BQ_EMPTY;
1400 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1401 buf_hdr_count++;
1402 goto found;
1403 }
1404
1405 /* Log this error condition */
1406 printf("getnewbuf: No useful buffers");
1407
1408 /* wait for a free buffer of any kind */
1409 needbuffer = 1;
1410 bufstats.bufs_sleeps++;
1411 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1412 splx(s);
1413 return (0);
1414 }
1415
1416 /* Buffer available either on AGE or LRU or META */
1417 bp = NULL;
1418 *queue = -1;
1419
1420 /* Buffer available either on AGE or LRU */
1421 if (!age_bp) {
1422 bp = lru_bp;
1423 *queue = BQ_LRU;
1424 } else if (!lru_bp) {
1425 bp = age_bp;
1426 *queue = BQ_AGE;
1427 } else { /* buffer available on both AGE and LRU */
1428 age_time = time.tv_sec - age_bp->b_timestamp;
1429 lru_time = time.tv_sec - lru_bp->b_timestamp;
1430 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1431 bp = age_bp;
1432 *queue = BQ_AGE;
1433 /*
1434 * we should probably re-timestamp everything in the
1435 * queues at this point with the current time
1436 */
1437 } else {
1438 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1439 bp = lru_bp;
1440 *queue = BQ_LRU;
1441 } else {
1442 bp = age_bp;
1443 *queue = BQ_AGE;
1444 }
1445 }
1446 }
1447
1448 if (!bp) { /* Neither on AGE nor on LRU */
1449 bp = meta_bp;
1450 *queue = BQ_META;
1451 } else if (meta_bp) {
1452 bp_time = time.tv_sec - bp->b_timestamp;
1453 meta_time = time.tv_sec - meta_bp->b_timestamp;
1454
1455 if (!(bp_time < 0) && !(meta_time < 0)) {
1456 /* time not set backwards */
1457 int bp_is_stale;
1458 bp_is_stale = (*queue == BQ_LRU) ?
1459 lru_is_stale : age_is_stale;
1460
1461 if ((meta_time >= meta_is_stale) &&
1462 (bp_time < bp_is_stale)) {
1463 bp = meta_bp;
1464 *queue = BQ_META;
1465 }
1466 }
1467 }
1468
1469 if (bp == NULL)
1470 panic("getnewbuf: null bp");
1471
1472 found:
1473 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1474 panic("getnewbuf: le_prev is deadbeef");
1475
1476 if(ISSET(bp->b_flags, B_BUSY))
1477 panic("getnewbuf reusing BUSY buf");
1478
1479 /* Clean it */
1480 if (bcleanbuf(bp)) {
1481 /* bawrite() issued, buffer not ready */
1482 splx(s);
1483 *queue = req;
1484 goto start;
1485 }
1486 splx(s);
1487 return (bp);
1488 }
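#if 0	/* illustrative sketch only -- not compiled */
/*
 * Editor's note: a condensed restatement of the AGE/LRU choice made in
 * getnewbuf() above -- prefer the LRU buffer only when it has gone stale
 * (older than lru_is_stale seconds) while the AGE buffer has not;
 * otherwise take the AGE buffer.  The helper name is hypothetical.
 */
static struct buf *
example_pick_age_or_lru(struct buf *age_bp, struct buf *lru_bp)
{
	int age_time = time.tv_sec - age_bp->b_timestamp;
	int lru_time = time.tv_sec - lru_bp->b_timestamp;

	if ((age_time < 0) || (lru_time < 0))	/* clock went backwards */
		return (age_bp);
	if ((lru_time >= lru_is_stale) && (age_time < age_is_stale))
		return (lru_bp);
	return (age_bp);
}
#endif /* 0 */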
1489 #include <mach/mach_types.h>
1490 #include <mach/memory_object_types.h>
1491
1492 /*
1493 * Clean a buffer.
1494 * Returns 0 if the buffer is ready to use,
1495 * Returns 1 if issued a bawrite() to indicate
1496 * that the buffer is not ready.
1497 */
1498 int
1499 bcleanbuf(struct buf *bp)
1500 {
1501 int s;
1502 struct ucred *cred;
1503
1504 s = splbio();
1505
1506 /* Remove from the queue */
1507 bremfree(bp);
1508
1509 /* Buffer is no longer on free lists. */
1510 SET(bp->b_flags, B_BUSY);
1511
1512 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1513 panic("bcleanbuf: le_prev is deadbeef");
1514
1515 /*
1516 * If buffer was a delayed write, start the IO by queuing
1517 * it on the LAUNDRY queue, and return 1
1518 */
1519 if (ISSET(bp->b_flags, B_DELWRI)) {
1520 splx(s);
1521 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1522 blaundrycnt++;
1523 wakeup(&blaundrycnt);
1524 return (1);
1525 }
1526
1527 if (bp->b_vp)
1528 brelvp(bp);
1529 bremhash(bp);
1530 BLISTNONE(bp);
1531
1532 splx(s);
1533
1534 if (ISSET(bp->b_flags, B_META)) {
1535 #if ZALLOC_METADATA
1536 vm_offset_t elem = (vm_offset_t)bp->b_data;
1537 if (elem == 0)
1538 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1539
1540 if (ISSET(bp->b_flags, B_ZALLOC)) {
1541 if (bp->b_bufsize <= MAXMETA) {
1542 zone_t z;
1543
1544 z = getbufzone(bp->b_bufsize);
1545 bp->b_data = (caddr_t)0xdeadbeef;
1546 zfree(z, elem);
1547 CLR(bp->b_flags, B_ZALLOC);
1548 } else
1549 panic("bcleanbuf: B_ZALLOC set incorrectly");
1550 } else {
1551 bp->b_data = (caddr_t)0xdeadbeef;
1552 kmem_free(kernel_map, elem, bp->b_bufsize);
1553 }
1554 #else
1555 if (bp->b_data == 0)
1556 panic("bcleanbuf: bp->b_data == NULL for B_META buffer");
1557
1558 kmem_free(kernel_map, bp->b_data, bp->b_bufsize);
1559 #endif /* ZALLOC_METADATA */
1560 }
1561
1562 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1563
1564 /* disassociate us from our vnode, if we had one... */
1565 s = splbio();
1566
1567 /* clear out various other fields */
1568 bp->b_bufsize = 0;
1569 bp->b_data = 0;
1570 bp->b_flags = B_BUSY;
1571 bp->b_dev = NODEV;
1572 bp->b_blkno = bp->b_lblkno = 0;
1573 bp->b_iodone = 0;
1574 bp->b_error = 0;
1575 bp->b_resid = 0;
1576 bp->b_bcount = 0;
1577 bp->b_dirtyoff = bp->b_dirtyend = 0;
1578 bp->b_validoff = bp->b_validend = 0;
1579
1580 /* nuke any credentials we were holding */
1581 cred = bp->b_rcred;
1582 if (cred != NOCRED) {
1583 bp->b_rcred = NOCRED;
1584 crfree(cred);
1585 }
1586 cred = bp->b_wcred;
1587 if (cred != NOCRED) {
1588 bp->b_wcred = NOCRED;
1589 crfree(cred);
1590 }
1591 splx(s);
1592 return (0);
1593 }
1594
1595
1596 /*
1597 * Wait for operations on the buffer to complete.
1598 * When they do, extract and return the I/O's error value.
1599 */
1600 int
1601 biowait(bp)
1602 struct buf *bp;
1603 {
1604 upl_t upl;
1605 upl_page_info_t *pl;
1606 int s;
1607 kern_return_t kret;
1608
1609 s = splbio();
1610 while (!ISSET(bp->b_flags, B_DONE))
1611 tsleep(bp, PRIBIO + 1, "biowait", 0);
1612 splx(s);
1613
1614 /* check for interruption of I/O (e.g. via NFS), then errors. */
1615 if (ISSET(bp->b_flags, B_EINTR)) {
1616 CLR(bp->b_flags, B_EINTR);
1617 return (EINTR);
1618 } else if (ISSET(bp->b_flags, B_ERROR))
1619 return (bp->b_error ? bp->b_error : EIO);
1620 else
1621 return (0);
1622 }
1623
1624 /*
1625 * Mark I/O complete on a buffer.
1626 *
1627 * If a callback has been requested, e.g. the pageout
1628 * daemon, do so. Otherwise, awaken waiting processes.
1629 *
1630 * [ Leffler, et al., says on p.247:
1631 * "This routine wakes up the blocked process, frees the buffer
1632 * for an asynchronous write, or, for a request by the pagedaemon
1633 * process, invokes a procedure specified in the buffer structure" ]
1634 *
1635 * In real life, the pagedaemon (or other system processes) wants
1636 * to do async stuff too, and doesn't want the buffer brelse()'d.
1637 * (for swap pager, that puts swap buffers on the free lists (!!!),
1638 * for the vn device, that puts malloc'd buffers on the free lists!)
1639 */
1640 void
1641 biodone(bp)
1642 struct buf *bp;
1643 {
1644 boolean_t funnel_state;
1645 int s;
1646
1647 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1648
1649 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1650 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1651
1652 if (ISSET(bp->b_flags, B_DONE))
1653 panic("biodone already");
1654 SET(bp->b_flags, B_DONE); /* note that it's done */
1655 /*
1656 * I/O was done, so don't believe
1657 * the DIRTY state from VM anymore
1658 */
1659 CLR(bp->b_flags, B_WASDIRTY);
1660
1661 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1662 vwakeup(bp); /* wake up reader */
1663
1664 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1665 CLR(bp->b_flags, B_CALL); /* but note callout done */
1666 (*bp->b_iodone)(bp);
1667 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1668 brelse(bp);
1669 else { /* or just wakeup the buffer */
1670 CLR(bp->b_flags, B_WANTED);
1671 wakeup(bp);
1672 }
1673
1674 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1675 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1676
1677 thread_funnel_set(kernel_flock, funnel_state);
1678 }
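#if 0	/* illustrative sketch only -- not compiled */
/*
 * Editor's note: a sketch of the B_CALL path that biodone() handles above
 * -- the issuer installs a completion routine and sets B_CALL (and usually
 * B_ASYNC) before handing the buffer to the strategy routine; biodone()
 * then invokes the callback instead of waking a sleeper.  "example_iodone"
 * and "example_async_io" are hypothetical names.
 */
static void
example_iodone(struct buf *bp)
{
	/* ... examine bp->b_error / bp->b_resid, then dispose of bp ... */
}

static void
example_async_io(struct buf *bp)
{
	bp->b_iodone = example_iodone;
	SET(bp->b_flags, B_CALL | B_ASYNC);
	VOP_STRATEGY(bp);
}
#endif /* 0 */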
1679
1680 /*
1681 * Return a count of buffers on the "locked" queue.
1682 */
1683 int
1684 count_lock_queue()
1685 {
1686 register struct buf *bp;
1687 register int n = 0;
1688
1689 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1690 bp = bp->b_freelist.tqe_next)
1691 n++;
1692 return (n);
1693 }
1694
1695 /*
1696 * Return a count of 'busy' buffers. Used at the time of shutdown.
1697 */
1698 int
1699 count_busy_buffers()
1700 {
1701 register struct buf *bp;
1702 register int nbusy = 0;
1703
1704 for (bp = &buf[nbuf]; --bp >= buf; )
1705 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1706 nbusy++;
1707 return (nbusy);
1708 }
1709
1710 #if 1 /*DIAGNOSTIC */
1711 /*
1712 * Print out statistics on the current allocation of the buffer pool.
1713 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1714 * in vfs_syscalls.c using sysctl.
1715 */
1716 void
1717 vfs_bufstats()
1718 {
1719 int s, i, j, count;
1720 register struct buf *bp;
1721 register struct bqueues *dp;
1722 int counts[MAXBSIZE/CLBYTES+1];
1723 static char *bname[BQUEUES] =
1724 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1725
1726 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1727 count = 0;
1728 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1729 counts[j] = 0;
1730 s = splbio();
1731 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1732 counts[bp->b_bufsize/CLBYTES]++;
1733 count++;
1734 }
1735 splx(s);
1736 printf("%s: total-%d", bname[i], count);
1737 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1738 if (counts[j] != 0)
1739 printf(", %d-%d", j * CLBYTES, counts[j]);
1740 printf("\n");
1741 }
1742 }
1743 #endif /* DIAGNOSTIC */
1744
1745 #define NRESERVEDIOBUFS 16
1746
1747 struct buf *
1748 alloc_io_buf(vp, priv)
1749 struct vnode *vp;
1750 int priv;
1751 {
1752 register struct buf *bp;
1753 int s;
1754
1755 s = splbio();
1756
1757 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1758 need_iobuffer = 1;
1759 bufstats.bufs_iobufsleeps++;
1760 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1761 }
1762
1763 while ((bp = iobufqueue.tqh_first) == NULL) {
1764 need_iobuffer = 1;
1765 bufstats.bufs_iobufsleeps++;
1766 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1767 }
1768
1769 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1770 bp->b_timestamp = 0;
1771
1772 /* clear out various fields */
1773 bp->b_flags = B_BUSY;
1774 bp->b_blkno = bp->b_lblkno = 0;
1775 bp->b_iodone = 0;
1776 bp->b_error = 0;
1777 bp->b_resid = 0;
1778 bp->b_bcount = 0;
1779 bp->b_bufsize = 0;
1780 bp->b_vp = vp;
1781
1782 if (vp->v_type == VBLK || vp->v_type == VCHR)
1783 bp->b_dev = vp->v_rdev;
1784 else
1785 bp->b_dev = NODEV;
1786 bufstats.bufs_iobufinuse++;
1787 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1788 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1789 splx(s);
1790
1791 return (bp);
1792 }
1793
1794 void
1795 free_io_buf(bp)
1796 struct buf *bp;
1797 {
1798 int s;
1799
1800 s = splbio();
1801 /* put buffer back on the head of the iobufqueue */
1802 bp->b_vp = NULL;
1803 bp->b_flags = B_INVAL;
1804
1805 binsheadfree(bp, &iobufqueue, -1);
1806
1807 /* Wake up any processes waiting for any buffer to become free. */
1808 if (need_iobuffer) {
1809 need_iobuffer = 0;
1810 wakeup(&need_iobuffer);
1811 }
1812 bufstats.bufs_iobufinuse--;
1813 splx(s);
1814 }
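#if 0	/* illustrative sketch only -- not compiled */
/*
 * Editor's note: a hedged sketch of the cluster-I/O buffer header pattern
 * served by alloc_io_buf()/free_io_buf() above -- the caller fills in the
 * transfer, issues it, and returns the header from its completion routine.
 * All "example_" names and field values are placeholders.
 */
static void
example_cluster_iodone(struct buf *bp)
{
	/* ... note completion, then return the header to the pool ... */
	free_io_buf(bp);
}

static void
example_issue_cluster_io(struct vnode *example_vp, daddr_t example_blkno,
    caddr_t example_data, int example_len)
{
	struct buf *bp = alloc_io_buf(example_vp, 0);

	bp->b_blkno = bp->b_lblkno = example_blkno;
	bp->b_data = example_data;
	bp->b_bcount = example_len;
	SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
	bp->b_iodone = example_cluster_iodone;
	VOP_STRATEGY(bp);
}
#endif /* 0 */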
1815
1816
1817 /* not hooked up yet */
1818
1819 /* XXX move this to a separate file */
1820 /*
1821 * Dynamic Scaling of the Buffer Queues
1822 */
1823
1824 typedef long long blsize_t;
1825
1826 blsize_t MAXNBUF; /* initialize to (mem_size / PAGE_SIZE) */
1827 /* Global tunable limits */
1828 blsize_t nbufh; /* number of buffer headers */
1829 blsize_t nbuflow; /* minimum number of buffer headers required */
1830 blsize_t nbufhigh; /* maximum number of buffer headers allowed */
1831 blsize_t nbuftarget; /* preferred number of buffer headers */
1832
1833 /*
1834 * assertions:
1835 *
1836 * 1. 0 < nbuflow <= nbufh <= nbufhigh
1837 * 2. nbufhigh <= MAXNBUF
1838 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
1839 * 4. nbufh can not be set by sysctl().
1840 */
1841
1842 /* Per queue tunable limits */
1843
1844 struct bufqlim {
1845 blsize_t bl_nlow; /* minimum number of buffer headers required */
1846 blsize_t bl_num; /* number of buffer headers on the queue */
1847 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
1848 blsize_t bl_target; /* preferred number of buffer headers */
1849 long bl_stale; /* Seconds after which a buffer is considered stale */
1850 } bufqlim[BQUEUES];
1851
1852 /*
1853 * assertions:
1854 *
1855 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
1856 * 2. bl_nlhigh <= MAXNBUF
1857 * 3. bufqlim[BQ_META].bl_nlow != 0
1858 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
1859 * file system IO operations)
1860 * 5. bl_num can not be set by sysctl().
1861 * 6. bl_nlhigh <= nbufhigh
1862 */
1863
1864 /*
1865 * Rationale:
1866 * ----------
1867 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
1868 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
1869 *
1870 * These limits are exported by means of sysctl().
1871 * It was decided to define blsize_t as a 64 bit quantity.
1872 * This will make sure that we will not be required to change it
1873 * as long as we do not exceed 64 bit address space for the kernel.
1874 *
1875 * The low and high watermark parameters are initialized at compile time,
1876 * and boot arguments can be used to override them. sysctl()
1877 * does not change them directly; it can get all the values
1878 * but can set only the target. num is the current level.
1879 *
1880 * Advantages of having a "bufqscan" thread doing the balancing are:
1881 * It keeps enough bufs on BQ_EMPTY.
1882 * getnewbuf() by default will always select a buffer from BQ_EMPTY.
1883 * getnewbuf() performs best if a buffer is found there.
1884 * Also this minimizes the possibility of starting IO
1885 * from getnewbuf(). That's a performance win, too.
1886 *
1887 * Localize complex logic [balancing as well as time aging]
1888 * to balancebufq().
1889 *
1890 * Simplify getnewbuf() logic by elimination of time aging code.
1891 */
1892
1893 /*
1894 * Algorithm:
1895 * -----------
1896 * The goal of the dynamic scaling of the buffer queues is to keep
1897 * the size of the LRU close to bl_target. Buffers on a queue would
1898 * be time aged.
1899 *
1900 * There would be a thread which will be responsible for "balancing"
1901 * the buffer cache queues.
1902 *
1903 * The scan order would be: AGE, LRU, META, EMPTY.
1904 */
1905
1906 long bufqscanwait = 0;
1907
1908 extern void bufqscan_thread();
1909 extern int balancebufq(int q);
1910 extern int btrimempty(int n);
1911 extern int initbufqscan(void);
1912 extern int nextbufq(int q);
1913 extern void buqlimprt(int all);
1914
1915 void
1916 bufq_balance_thread_init()
1917 {
1918
1919 if (bufqscanwait++ == 0) {
1920 int i;
1921
1922 /* Initialize globals */
1923 MAXNBUF = (mem_size / PAGE_SIZE);
1924 nbufh = nbuf;
1925 nbuflow = min(nbufh, 100);
1926 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
1927 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
1928 nbuftarget = max(nbuflow, nbuftarget);
1929 nbuftarget = min(nbufhigh, nbuftarget);
1930
1931 /*
1932 * Initialize the bufqlim
1933 */
1934
1935 /* LOCKED queue */
1936 bufqlim[BQ_LOCKED].bl_nlow = 0;
1937 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
1938 bufqlim[BQ_LOCKED].bl_target = 0;
1939 bufqlim[BQ_LOCKED].bl_stale = 30;
1940
1941 /* LRU queue */
1942 bufqlim[BQ_LRU].bl_nlow = 0;
1943 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
1944 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
1945 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
1946
1947 /* AGE queue */
1948 bufqlim[BQ_AGE].bl_nlow = 0;
1949 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
1950 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
1951 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
1952
1953 /* EMPTY queue */
1954 bufqlim[BQ_EMPTY].bl_nlow = 0;
1955 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
1956 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
1957 bufqlim[BQ_EMPTY].bl_stale = 600000;
1958
1959 /* META queue */
1960 bufqlim[BQ_META].bl_nlow = 0;
1961 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
1962 bufqlim[BQ_META].bl_target = nbuftarget/4;
1963 bufqlim[BQ_META].bl_stale = META_IS_STALE;
1964
1965 /* LAUNDRY queue */
1966 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
1967 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
1968 bufqlim[BQ_LAUNDRY].bl_target = 0;
1969 bufqlim[BQ_LAUNDRY].bl_stale = 30;
1970
1971 buqlimprt(1);
1972 }
1973
1974 /* create worker thread */
1975 kernel_thread(kernel_task, bufqscan_thread);
1976 }
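/*
 * Editor's note: worked numbers for the initialization above, for a
 * hypothetical machine with mem_size = 128 MB and PAGE_SIZE = 4096:
 *
 *	MAXNBUF    = 128 MB / 4 KB        = 32768
 *	nbuftarget = (128 MB >> 5) / 4 KB = 4 MB / 4 KB = 1024
 *	             (then clamped to the range [nbuflow, nbufhigh])
 *	LRU/AGE/EMPTY/META bl_target      = 1024 / 4 = 256
 *	LRU/AGE/EMPTY/META bl_nlhigh      = nbufhigh / 4
 *
 * These figures are illustrative only; the real values depend on nbuf
 * and mem_size at boot.
 */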
1977
1978 /* The workloop for the buffer balancing thread */
1979 void
1980 bufqscan_thread()
1981 {
1982 boolean_t funnel_state;
1983 int moretodo = 0;
1984
1985 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1986
1987 for(;;) {
1988 do {
1989 int q; /* buffer queue to process */
1990
1991 for (q = initbufqscan(); q; ) {
1992 moretodo |= balancebufq(q);
1993 q = nextbufq(q);
1994 }
1995 } while (moretodo);
1996
1997 #if 1 || DIAGNOSTIC
1998 vfs_bufstats();
1999 buqlimprt(0);
2000 #endif
2001 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2002 moretodo = 0;
2003 }
2004
2005 (void) thread_funnel_set(kernel_flock, FALSE);
2006 }
2007
2008 /* Seed for the buffer queue balancing */
2009 int
2010 initbufqscan()
2011 {
2012 /* Start with AGE queue */
2013 return (BQ_AGE);
2014 }
2015
2016 /* Pick next buffer queue to balance */
2017 int
2018 nextbufq(int q)
2019 {
2020 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2021
2022 q++;
2023 q %= sizeof(order);
2024 return (order[q]);
2025 }
2026
2027 /* function to balance the buffer queues */
2028 int
2029 balancebufq(int q)
2030 {
2031 int moretodo = 0;
2032 int s = splbio();
2033 int n;
2034
2035 /* reject invalid q */
2036 if ((q < 0) || (q >= BQUEUES))
2037 goto out;
2038
2039 /* LOCKED or LAUNDRY queue MUST not be balanced */
2040 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2041 goto out;
2042
2043 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2044
2045 /* If queue has less than target nothing more to do */
2046 if (n < 0)
2047 goto out;
2048
2049 if ( n > 8 ) {
2050 /* Balance only a small amount (12.5%) at a time */
2051 n >>= 3;
2052 }
2053
2054 /* EMPTY queue needs special handling */
2055 if (q == BQ_EMPTY) {
2056 moretodo |= btrimempty(n);
2057 goto out;
2058 }
2059
2060 for (; n > 0; n--) {
2061 struct buf *bp = bufqueues[q].tqh_first;
2062 if (!bp)
2063 break;
2064
2065 /* check if it's stale */
2066 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2067 if (bcleanbuf(bp)) {
2068 /* bawrite() issued, bp not ready */
2069 moretodo = 1;
2070 } else {
2071 /* release the cleaned buffer to BQ_EMPTY */
2072 SET(bp->b_flags, B_INVAL);
2073 brelse(bp);
2074 }
2075 } else
2076 break;
2077 }
2078
2079 out:
2080 splx(s);
2081 return (moretodo);
2082 }
2083
2084 int
2085 btrimempty(int n)
2086 {
2087 /*
2088 * When struct buf are allocated dynamically, this would
2089 * reclaim up to 'n' struct buf from the empty queue.
2090 */
2091
2092 return (0);
2093 }
2094
2095 void
2096 bufqinc(int q)
2097 {
2098 if ((q < 0) || (q >= BQUEUES))
2099 return;
2100
2101 bufqlim[q].bl_num++;
2102 return;
2103 }
2104
2105 void
2106 bufqdec(int q)
2107 {
2108 if ((q < 0) || (q >= BQUEUES))
2109 return;
2110
2111 bufqlim[q].bl_num--;
2112 return;
2113 }
2114
2115 void
2116 buqlimprt(int all)
2117 {
2118 int i;
2119 static char *bname[BQUEUES] =
2120 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2121
2122 if (all)
2123 for (i = 0; i < BQUEUES; i++) {
2124 printf("%s : ", bname[i]);
2125 printf("min = %d, ", (long)bufqlim[i].bl_nlow);
2126 printf("cur = %d, ", (long)bufqlim[i].bl_num);
2127 printf("max = %d, ", (long)bufqlim[i].bl_nlhigh);
2128 printf("target = %d, ", (long)bufqlim[i].bl_target);
2129 printf("stale after %d seconds\n", bufqlim[i].bl_stale);
2130 }
2131 else
2132 for (i = 0; i < BQUEUES; i++) {
2133 printf("%s : ", bname[i]);
2134 printf("cur = %d, ", (long)bufqlim[i].bl_num);
2135 }
2136 }
2137
2138 /*
2139 * If getnewbuf() calls bcleanbuf() on the same thread,
2140 * there is a potential for stack overrun and deadlocks,
2141 * so we always hand off the work to a worker thread for completion.
2142 */
2143
2144 static void
2145 bcleanbuf_thread_init()
2146 {
2147 static void bcleanbuf_thread();
2148
2149 /* create worker thread */
2150 kernel_thread(kernel_task, bcleanbuf_thread);
2151 }
2152
2153 static void
2154 bcleanbuf_thread()
2155 {
2156 boolean_t funnel_state;
2157 struct buf *bp;
2158
2159 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2160
2161 doit:
2162 while (blaundrycnt == 0)
2163 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2164 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2165 /* Remove from the queue */
2166 bremfree(bp);
2167 blaundrycnt--;
2168 /* do the IO */
2169 bawrite(bp);
2170 /* start again */
2171 goto doit;
2172
2173 (void) thread_funnel_set(kernel_flock, funnel_state);
2174 }
2175