1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*-
24 * Copyright (c) 1994 Christopher G. Demetriou
25 * Copyright (c) 1982, 1986, 1989, 1993
26 * The Regents of the University of California. All rights reserved.
27 * (c) UNIX System Laboratories, Inc.
28 * All or some portions of this file are derived from material licensed
29 * to the University of California by American Telephone and Telegraph
30 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
31 * the permission of UNIX System Laboratories, Inc.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * The NEXTSTEP Software License Agreement specifies the terms
62 * and conditions for redistribution.
63 *
64 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
65 */
66
67
68 /*
69 * Some references:
70 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
71 * Leffler, et al.: The Design and Implementation of the 4.3BSD
72 * UNIX Operating System (Addison-Wesley, 1989)
73 */
74 #define ZALLOC_METADATA 1
75
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/proc.h>
79 #include <sys/buf.h>
80 #include <sys/vnode.h>
81 #include <sys/mount.h>
82 #include <sys/trace.h>
83 #include <sys/malloc.h>
84 #include <sys/resourcevar.h>
85 #include <miscfs/specfs/specdev.h>
86 #include <sys/ubc.h>
87 #include <vm/vm_pageout.h>
88 #if DIAGNOSTIC
89 #include <kern/assert.h>
90 #endif /* DIAGNOSTIC */
91 #include <kern/task.h>
92 #include <kern/zalloc.h>
93
94 #include <sys/kdebug.h>
95
96 extern void bufqinc(int q);
97 extern void bufqdec(int q);
98 extern void bufq_balance_thread_init();
99
100 extern void reassignbuf(struct buf *, struct vnode *);
101 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
102
103 extern int niobuf; /* The number of IO buffer headers for cluster IO */
104 int blaundrycnt;
105
106 /* zone allocated buffer headers */
107 static zone_t buf_hdr_zone;
108 static int buf_hdr_count;
109
110 #if TRACE
111 struct proc *traceproc;
112 int tracewhich, tracebuf[TRCSIZ];
113 u_int tracex;
114 char traceflags[TR_NFLAGS];
115 #endif /* TRACE */
116
117 /*
118 * Definitions for the buffer hash lists.
119 */
120 #define BUFHASH(dvp, lbn) \
121 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
122 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
123 u_long bufhash;
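/*
 * Illustrative note (added for clarity, not from the original source):
 * BUFHASH(vp, lbn) scales the vnode pointer down by sizeof(struct vnode),
 * adds the logical block number, and masks the sum with 'bufhash' (the
 * hash-table size minus one, as set up by hashinit() in bufinit() below),
 * so buffers for the same vnode and nearby logical blocks tend to land on
 * nearby chains.
 */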
124
125 /* Definitions for the buffer stats. */
126 struct bufstats bufstats;
127
128 /* Number of delayed write buffers */
129 int nbdwrite = 0;
130
131 /*
132 * Insq/Remq for the buffer hash lists.
133 */
134 #if 0
135 #define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
136 #define bremhash(bp) LIST_REMOVE(bp, b_hash)
137 #endif /* 0 */
138
139
140 TAILQ_HEAD(ioqueue, buf) iobufqueue;
141 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
142 static int needbuffer;
143 static int need_iobuffer;
144
145 /*
146 * Insq/Remq for the buffer free lists.
147 */
148 #define binsheadfree(bp, dp, whichq) do { \
149 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
150 bufqinc((whichq)); \
151 (bp)->b_whichq = whichq; \
152 (bp)->b_timestamp = time.tv_sec; \
153 } while (0)
154
155 #define binstailfree(bp, dp, whichq) do { \
156 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
157 bufqinc((whichq)); \
158 (bp)->b_whichq = whichq; \
159 (bp)->b_timestamp = time.tv_sec; \
160 } while (0)
161
162 #define BHASHENTCHECK(bp) \
163 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
164 panic("%x: b_hash.le_prev is not deadbeef", (bp));
165
166 #define BLISTNONE(bp) \
167 (bp)->b_hash.le_next = (struct buf *)0; \
168 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
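/*
 * Descriptive note (added for clarity): 0xdeadbeef in b_hash.le_prev is a
 * sentinel meaning "this buffer is on no hash chain".  BLISTNONE() plants
 * the sentinel and BHASHENTCHECK() asserts it is still present before the
 * buffer is (re)inserted by binshash().
 */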
169
170 simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
171
172 /* maximum number of "in flight" buffer writes allowed per vnode */
173 #define BUFWRITE_THROTTLE 9
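/*
 * Descriptive note (added for clarity): writers that would exceed this
 * limit sleep on &vp->v_numoutput in bdwrite()/bawrite(); biodone() wakes
 * them once the count of in-flight writes drops to BUFWRITE_THROTTLE / 3.
 */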
174
175 /*
176 * Time in seconds before a buffer on a list is
177 * considered as a stale buffer
178 */
179 #define LRU_IS_STALE 120 /* default value for the LRU */
180 #define AGE_IS_STALE 60 /* default value for the AGE */
181 #define META_IS_STALE 180 /* default value for the BQ_META */
182
183 int lru_is_stale = LRU_IS_STALE;
184 int age_is_stale = AGE_IS_STALE;
185 int meta_is_stale = META_IS_STALE;
186
187 #if 1
188 void
189 blistenterhead(struct bufhashhdr * head, struct buf * bp)
190 {
191 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
192 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
193 (head)->lh_first = bp;
194 bp->b_hash.le_prev = &(head)->lh_first;
195 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
196 panic("blistenterhead: le_prev is deadbeef");
197
198 }
199 #endif
200
201 #if 1
202 void
203 binshash(struct buf *bp, struct bufhashhdr *dp)
204 {
205 int s;
206
207 struct buf *nbp;
208
209 simple_lock(&bufhashlist_slock);
210 #if 0
211 if(incore(bp->b_vp, bp->b_lblkno)) {
212 panic("adding to queue already existing element");
213 }
214 #endif /* 0 */
215 BHASHENTCHECK(bp);
216
217 nbp = dp->lh_first;
218 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
219 if(nbp == bp)
220 panic("buf already in hashlist");
221 }
222
223 #if 0
224 LIST_INSERT_HEAD(dp, bp, b_hash);
225 #else
226 blistenterhead(dp, bp);
227 #endif
228 simple_unlock(&bufhashlist_slock);
229 }
230
231 void
232 bremhash(struct buf *bp)
233 {
234 int s;
235
236 simple_lock(&bufhashlist_slock);
237 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
238 panic("bremhash le_prev is deadbeef");
239 if (bp->b_hash.le_next == bp)
240 panic("bremhash: next points to self");
241
242 if (bp->b_hash.le_next != NULL)
243 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
244 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
245 simple_unlock(&bufhashlist_slock);
246 }
247
248 #endif /* 1 */
249
250
251 /*
252 * Remove a buffer from the free list it's on
253 */
254 void
255 bremfree(bp)
256 struct buf *bp;
257 {
258 struct bqueues *dp = NULL;
259 int whichq = -1;
260
261 /*
262 * We only calculate the head of the freelist when removing
263 * the last element of the list as that is the only time that
264 * it is needed (e.g. to reset the tail pointer).
265 *
266 * NB: This makes an assumption about how tailq's are implemented.
267 */
268 if (bp->b_freelist.tqe_next == NULL) {
269 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
270 if (dp->tqh_last == &bp->b_freelist.tqe_next)
271 break;
272 if (dp == &bufqueues[BQUEUES])
273 panic("bremfree: lost tail");
274 }
275 TAILQ_REMOVE(dp, bp, b_freelist);
276 whichq = bp->b_whichq;
277 bufqdec(whichq);
278 bp->b_whichq = -1;
279 bp->b_timestamp = 0;
280 }
281
282 static __inline__ void
283 bufhdrinit(struct buf *bp)
284 {
285 bzero((char *)bp, sizeof *bp);
286 bp->b_dev = NODEV;
287 bp->b_rcred = NOCRED;
288 bp->b_wcred = NOCRED;
289 bp->b_vnbufs.le_next = NOLIST;
290 bp->b_flags = B_INVAL;
291
292 return;
293 }
294
295 /*
296 * Initialize buffers and hash links for buffers.
297 */
298 void
299 bufinit()
300 {
301 register struct buf *bp;
302 register struct bqueues *dp;
303 register int i;
304 int metabuf;
305 long whichq;
306 static void bufzoneinit();
307 static void bcleanbuf_thread_init();
308
309 /* Initialize the buffer queues ('freelists') and the hash table */
310 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
311 TAILQ_INIT(dp);
312 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
313
314 simple_lock_init(&bufhashlist_slock );
315
316 metabuf = nbuf/8; /* reserved for meta buf */
317
318 /* Initialize the buffer headers */
319 for (i = 0; i < nbuf; i++) {
320 bp = &buf[i];
321 bufhdrinit(bp);
322
323 /*
324 * metabuf buffer headers on the meta-data list and
325 * rest of the buffer headers on the empty list
326 */
327 if (--metabuf)
328 whichq = BQ_META;
329 else
330 whichq = BQ_EMPTY;
331
332 BLISTNONE(bp);
333 dp = &bufqueues[whichq];
334 binsheadfree(bp, dp, whichq);
335 binshash(bp, &invalhash);
336 }
337
338 for (; i < nbuf + niobuf; i++) {
339 bp = &buf[i];
340 bufhdrinit(bp);
341 binsheadfree(bp, &iobufqueue, -1);
342 }
343
344 printf("using %d buffer headers and %d cluster IO buffer headers\n",
345 nbuf, niobuf);
346
347 /* Set up zones used by the buffer cache */
348 bufzoneinit();
349
350 /* start the bcleanbuf() thread */
351 bcleanbuf_thread_init();
352
353 #if 0 /* notyet */
354 /* create a thread to do dynamic buffer queue balancing */
355 bufq_balance_thread_init();
356 #endif /* XXX */
357 }
358
359 /* __inline */
360 struct buf *
361 bio_doread(vp, blkno, size, cred, async, queuetype)
362 struct vnode *vp;
363 daddr_t blkno;
364 int size;
365 struct ucred *cred;
366 int async;
367 int queuetype;
368 {
369 register struct buf *bp;
370 struct proc *p = current_proc();
371
372 bp = getblk(vp, blkno, size, 0, 0, queuetype);
373
374 /*
375 * If buffer does not have data valid, start a read.
376 * Note that if buffer is B_INVAL, getblk() won't return it.
377 * Therefore, it's valid if its I/O has completed or been delayed.
378 */
379 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
380 /* Start I/O for the buffer (keeping credentials). */
381 SET(bp->b_flags, B_READ | async);
382 if (cred != NOCRED && bp->b_rcred == NOCRED) {
383 /*
384 * NFS has embedded ucred.
385 * Can not crhold() here as that causes zone corruption
386 */
387 bp->b_rcred = crdup(cred);
388 }
389 VOP_STRATEGY(bp);
390
391 trace(TR_BREADMISS, pack(vp, size), blkno);
392
393 /* Pay for the read. */
394 if (p && p->p_stats)
395 p->p_stats->p_ru.ru_inblock++; /* XXX */
396 } else if (async) {
397 brelse(bp);
398 }
399
400 trace(TR_BREADHIT, pack(vp, size), blkno);
401
402 return (bp);
403 }
404 /*
405 * Read a disk block.
406 * This algorithm described in Bach (p.54).
407 */
408 int
409 bread(vp, blkno, size, cred, bpp)
410 struct vnode *vp;
411 daddr_t blkno;
412 int size;
413 struct ucred *cred;
414 struct buf **bpp;
415 {
416 register struct buf *bp;
417
418 /* Get buffer for block. */
419 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
420
421 /* Wait for the read to complete, and return result. */
422 return (biowait(bp));
423 }
424
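/*
 * Illustrative caller pattern (hypothetical, for exposition only; the
 * names lbn, blksize, boff, dest and len are placeholders): a filesystem
 * read path typically pairs bread() with brelse():
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, blksize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	bcopy((char *)bp->b_data + boff, dest, len);
 *	brelse(bp);
 */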
425 /*
426 * Read a disk block. [bread() for meta-data]
427 * This algorithm described in Bach (p.54).
428 */
429 int
430 meta_bread(vp, blkno, size, cred, bpp)
431 struct vnode *vp;
432 daddr_t blkno;
433 int size;
434 struct ucred *cred;
435 struct buf **bpp;
436 {
437 register struct buf *bp;
438
439 /* Get buffer for block. */
440 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
441
442 /* Wait for the read to complete, and return result. */
443 return (biowait(bp));
444 }
445
446 /*
447 * Read-ahead multiple disk blocks. The first is sync, the rest async.
448 * Trivial modification to the breada algorithm presented in Bach (p.55).
449 */
450 int
451 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
452 struct vnode *vp;
453 daddr_t blkno; int size;
454 daddr_t rablks[]; int rasizes[];
455 int nrablks;
456 struct ucred *cred;
457 struct buf **bpp;
458 {
459 register struct buf *bp;
460 int i;
461
462 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
463
464 /*
465 * For each of the read-ahead blocks, start a read, if necessary.
466 */
467 for (i = 0; i < nrablks; i++) {
468 /* If it's in the cache, just go on to next one. */
469 if (incore(vp, rablks[i]))
470 continue;
471
472 /* Get a buffer for the read-ahead block */
473 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
474 }
475
476 /* Otherwise, we had to start a read for it; wait until it's valid. */
477 return (biowait(bp));
478 }
479
480 /*
481 * Read with single-block read-ahead. Defined in Bach (p.55), but
482 * implemented as a call to breadn().
483 * XXX for compatibility with old file systems.
484 */
485 int
486 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
487 struct vnode *vp;
488 daddr_t blkno; int size;
489 daddr_t rablkno; int rabsize;
490 struct ucred *cred;
491 struct buf **bpp;
492 {
493
494 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
495 }
496
497 /*
498 * Block write. Described in Bach (p.56)
499 */
500 int
501 bwrite(bp)
502 struct buf *bp;
503 {
504 int rv, sync, wasdelayed;
505 struct proc *p = current_proc();
506 upl_t upl;
507 upl_page_info_t *pl;
508 void * object;
509 kern_return_t kret;
510 struct vnode *vp = bp->b_vp;
511
512 /* Remember buffer type, to switch on it later. */
513 sync = !ISSET(bp->b_flags, B_ASYNC);
514 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
515 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
516 if (wasdelayed)
517 nbdwrite--;
518
519 if (!sync) {
520 /*
521 * If not synchronous, pay for the I/O operation and make
522 * sure the buf is on the correct vnode queue. We have
523 * to do this now, because if we don't, the vnode may not
524 * be properly notified that its I/O has completed.
525 */
526 if (wasdelayed)
527 reassignbuf(bp, vp);
528 else
529 if (p && p->p_stats)
530 p->p_stats->p_ru.ru_oublock++; /* XXX */
531 }
532
533 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
534
535 /* Initiate disk write. Make sure the appropriate party is charged. */
536 SET(bp->b_flags, B_WRITEINPROG);
537 vp->v_numoutput++;
538
539 VOP_STRATEGY(bp);
540
541 if (sync) {
542 /*
543 * If I/O was synchronous, wait for it to complete.
544 */
545 rv = biowait(bp);
546
547 /*
548 * Pay for the I/O operation, if it's not been paid for, and
549 * make sure it's on the correct vnode queue. (async operations
550 * were paid for above.)
551 */
552 if (wasdelayed)
553 reassignbuf(bp, vp);
554 else
555 if (p && p->p_stats)
556 p->p_stats->p_ru.ru_oublock++; /* XXX */
557
558 /* Release the buffer. */
559 brelse(bp);
560
561 return (rv);
562 } else {
563 return (0);
564 }
565 }
566
567 int
568 vn_bwrite(ap)
569 struct vop_bwrite_args *ap;
570 {
571 return (bwrite(ap->a_bp));
572 }
573
574 /*
575 * Delayed write.
576 *
577 * The buffer is marked dirty, but is not queued for I/O.
578 * This routine should be used when the buffer is expected
579 * to be modified again soon, typically a small write that
580 * partially fills a buffer.
581 *
582 * NB: magnetic tapes cannot be delayed; they must be
583 * written in the order that the writes are requested.
584 *
585 * Described in Leffler, et al. (pp. 208-213).
586 *
587 * Note: With the ability to allocate additional buffer
588 * headers, we can get into a situation where "too" many
589 * bdwrite()s allow the kernel to create
590 * buffers faster than the disks can service them. Doing a bawrite() in
591 * cases where we have "too many" outstanding bdwrite()s avoids that.
592 */
593 void
594 bdwrite(bp)
595 struct buf *bp;
596 {
597 struct proc *p = current_proc();
598 struct vnode *vp = bp->b_vp;
599
600 /*
601 * If the block hasn't been seen before:
602 * (1) Mark it as having been seen,
603 * (2) Charge for the write.
604 * (3) Make sure it's on its vnode's correct block list,
605 */
606 if (!ISSET(bp->b_flags, B_DELWRI)) {
607 SET(bp->b_flags, B_DELWRI);
608 if (p && p->p_stats)
609 p->p_stats->p_ru.ru_oublock++; /* XXX */
610 nbdwrite ++;
611 reassignbuf(bp, vp);
612 }
613
614
615 /* If this is a tape block, write the block now. */
616 if (ISSET(bp->b_flags, B_TAPE)) {
617 /* bwrite(bp); */
618 VOP_BWRITE(bp);
619 return;
620 }
621
622 /*
623 * If the vnode has "too many" write operations in progress,
624 * wait for them to finish their IO
625 */
626 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
627 vp->v_flag |= VTHROTTLED;
628 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
629 }
630
631 /*
632 * If we have too many delayed write buffers,
633 * more than we can "safely" handle, just fall back to
634 * doing the async write
635 */
636 if (nbdwrite < 0)
637 panic("bdwrite: Negative nbdwrite");
638
639 if (nbdwrite > ((nbuf/4)*3)) {
640 bawrite(bp);
641 return;
642 }
643
644 /* Otherwise, the "write" is done, so mark and release the buffer. */
645 SET(bp->b_flags, B_DONE);
646 brelse(bp);
647 }
648
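/*
 * Illustrative caller pattern (hypothetical, for exposition only; devvp,
 * lbn and blksize are placeholders): a small metadata update typically
 * reads the block, modifies part of it, and lets the delayed write be
 * flushed later:
 *
 *	if (error = meta_bread(devvp, lbn, blksize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... modify a few bytes of bp->b_data ...
 *	bdwrite(bp);
 */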
649 /*
650 * Asynchronous block write; just an asynchronous bwrite().
651 *
652 * Note: With the ability to allocate additional buffer
653 * headers, we can get into a situation where "too" many
654 * bawrite()s allow the kernel to create
655 * buffers faster than the disks can service them.
656 * We limit the number of "in flight" writes a vnode can have to
657 * avoid this.
658 */
659 void
660 bawrite(bp)
661 struct buf *bp;
662 {
663 struct vnode *vp = bp->b_vp;
664
665 if (vp) {
666 /*
667 * If the vnode has "too many" write operations in progress,
668 * wait for them to finish their IO
669 */
670 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
671 vp->v_flag |= VTHROTTLED;
672 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bawrite", 0);
673 }
674 }
675
676 SET(bp->b_flags, B_ASYNC);
677 VOP_BWRITE(bp);
678 }
679
680 /*
681 * Release a buffer on to the free lists.
682 * Described in Bach (p. 46).
683 */
684 void
685 brelse(bp)
686 struct buf *bp;
687 {
688 struct bqueues *bufq;
689 int s;
690 long whichq;
691
692 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
693 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
694 bp->b_flags, 0);
695
696 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
697
698 /* IO is done. Cleanup the UPL state */
699 if (!ISSET(bp->b_flags, B_META)
700 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
701 kern_return_t kret;
702 upl_t upl;
703 int upl_flags;
704
705 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
706 if ( !ISSET(bp->b_flags, B_INVAL)) {
707 kret = ubc_create_upl(bp->b_vp,
708 ubc_blktooff(bp->b_vp, bp->b_lblkno),
709 bp->b_bufsize,
710 &upl,
711 NULL,
712 UPL_PRECIOUS);
713 if (kret != KERN_SUCCESS)
714 panic("brelse: Failed to get pagelists");
715 #ifdef UBC_DEBUG
716 upl_ubc_alias_set(upl, bp, 5);
717 #endif /* UBC_DEBUG */
718 } else
719 upl = (upl_t) 0;
720 } else {
721 upl = bp->b_pagelist;
722 kret = ubc_upl_unmap(upl);
723
724 if (kret != KERN_SUCCESS)
725 panic("kernel_upl_unmap failed");
726 bp->b_data = 0;
727 }
728 if (upl) {
729 if (bp->b_flags & (B_ERROR | B_INVAL)) {
730 if (bp->b_flags & (B_READ | B_INVAL))
731 upl_flags = UPL_ABORT_DUMP_PAGES;
732 else
733 upl_flags = 0;
734 ubc_upl_abort(upl, upl_flags);
735 } else {
736 if (ISSET(bp->b_flags, B_NEEDCOMMIT))
737 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
738 else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
739 upl_flags = UPL_COMMIT_SET_DIRTY ;
740 else
741 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
742 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
743 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
744 }
745 s = splbio();
746 CLR(bp->b_flags, B_PAGELIST);
747 bp->b_pagelist = 0;
748 splx(s);
749 }
750 } else {
751 if(ISSET(bp->b_flags, B_PAGELIST))
752 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
753 }
754
755 /* Wake up any processes waiting for any buffer to become free. */
756 if (needbuffer) {
757 needbuffer = 0;
758 wakeup(&needbuffer);
759 }
760
761 /* Wake up any processes waiting for _this_ buffer to become free. */
762 if (ISSET(bp->b_flags, B_WANTED)) {
763 CLR(bp->b_flags, B_WANTED);
764 wakeup(bp);
765 }
766
767 /* Block disk interrupts. */
768 s = splbio();
769
770 /*
771 * Determine which queue the buffer should be on, then put it there.
772 */
773
774 /* If it's locked, don't report an error; try again later. */
775 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
776 CLR(bp->b_flags, B_ERROR);
777
778 /* If it's not cacheable, or an error, mark it invalid. */
779 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
780 SET(bp->b_flags, B_INVAL);
781
782 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
783 /*
784 * If it's invalid or empty, dissociate it from its vnode
785 * and put on the head of the appropriate queue.
786 */
787 if (bp->b_vp)
788 brelvp(bp);
789 if (ISSET(bp->b_flags, B_DELWRI)) {
790 CLR(bp->b_flags, B_DELWRI);
791 nbdwrite--;
792 }
793 if (bp->b_bufsize <= 0)
794 whichq = BQ_EMPTY; /* no data */
795 else
796 whichq = BQ_AGE; /* invalid data */
797
798 bufq = &bufqueues[whichq];
799 binsheadfree(bp, bufq, whichq);
800 } else {
801 /*
802 * It has valid data. Put it on the end of the appropriate
803 * queue, so that it'll stick around for as long as possible.
804 */
805 if (ISSET(bp->b_flags, B_LOCKED))
806 whichq = BQ_LOCKED; /* locked in core */
807 else if (ISSET(bp->b_flags, B_META))
808 whichq = BQ_META; /* meta-data */
809 else if (ISSET(bp->b_flags, B_AGE))
810 whichq = BQ_AGE; /* stale but valid data */
811 else
812 whichq = BQ_LRU; /* valid data */
813
814 bufq = &bufqueues[whichq];
815 binstailfree(bp, bufq, whichq);
816 }
817
818 /* Unlock the buffer. */
819 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
820
821 /* Allow disk interrupts. */
822 splx(s);
823
824 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
825 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
826 }
827
828 /*
829 * Determine if a block is in the cache.
830 * Just look on what would be its hash chain. If it's there, return
831 * a pointer to it, unless it's marked invalid. If it's marked invalid,
832 * we normally don't return the buffer, unless the caller explicitly
833 * wants us to.
834 */
835 struct buf *
836 incore(vp, blkno)
837 struct vnode *vp;
838 daddr_t blkno;
839 {
840 struct buf *bp;
841 int bufseen = 0;
842
843 bp = BUFHASH(vp, blkno)->lh_first;
844
845 /* Search hash chain */
846 for (; bp != NULL; bp = bp->b_hash.le_next, bufseen++) {
847 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
848 !ISSET(bp->b_flags, B_INVAL))
849 return (bp);
850 if(bufseen >= nbuf)
851 panic("walked more than nbuf in incore");
852
853 }
854
855 return (0);
856 }
857
858
859 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
860 /*
861 * Get a block of requested size that is associated with
862 * a given vnode and block offset. If it is found in the
863 * block cache, mark it as having been found, make it busy
864 * and return it. Otherwise, return an empty block of the
865 * correct size. It is up to the caller to ensure that the
866 * cached blocks are of the correct size.
867 */
868 struct buf *
869 getblk(vp, blkno, size, slpflag, slptimeo, operation)
870 register struct vnode *vp;
871 daddr_t blkno;
872 int size, slpflag, slptimeo, operation;
873 {
874 struct buf *bp;
875 int s, err;
876 upl_t upl;
877 upl_page_info_t *pl;
878 kern_return_t kret;
879 int error=0;
880 int pagedirty = 0;
881
882 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
883 blkno * PAGE_SIZE, size, operation, 0, 0);
884 start:
885
886 s = splbio();
887 if (bp = incore(vp, blkno)) {
888 /* Found in the Buffer Cache */
889 if (ISSET(bp->b_flags, B_BUSY)) {
890 /* but is busy */
891 switch (operation) {
892 case BLK_READ:
893 case BLK_WRITE:
894 case BLK_META:
895 SET(bp->b_flags, B_WANTED);
896 bufstats.bufs_busyincore++;
897 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
898 slptimeo);
899 splx(s);
900 /*
901 * Callers who call with PCATCH or timeout are
902 * willing to deal with the NULL pointer
903 */
904 if (err && ((slpflag & PCATCH) ||
905 ((err == EWOULDBLOCK) && slptimeo)))
906 return (NULL);
907 goto start;
908 /*NOTREACHED*/
909 break;
910
911 case BLK_PAGEIN:
912 /* pagein operation must not use getblk */
913 panic("getblk: pagein for incore busy buffer");
914 splx(s);
915 /*NOTREACHED*/
916 break;
917
918 case BLK_PAGEOUT:
919 /* pageout operation must not use getblk */
920 panic("getblk: pageout for incore busy buffer");
921 splx(s);
922 /*NOTREACHED*/
923 break;
924
925 default:
926 panic("getblk: %d unknown operation 1", operation);
927 /*NOTREACHED*/
928 break;
929 }
930 } else {
931 /* not busy */
932 SET(bp->b_flags, (B_BUSY | B_CACHE));
933 bremfree(bp);
934 bufstats.bufs_incore++;
935 splx(s);
936
937 allocbuf(bp, size);
938 if (ISSET(bp->b_flags, B_PAGELIST))
939 panic("pagelist buffer is not busy");
940
941 switch (operation) {
942 case BLK_READ:
943 case BLK_WRITE:
944 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
945 kret = ubc_create_upl(vp,
946 ubc_blktooff(vp, bp->b_lblkno),
947 bp->b_bufsize,
948 &upl,
949 &pl,
950 UPL_PRECIOUS);
951 if (kret != KERN_SUCCESS)
952 panic("Failed to get pagelists");
953
954 SET(bp->b_flags, B_PAGELIST);
955 bp->b_pagelist = upl;
956
957 if (!upl_valid_page(pl, 0)) {
958 if (vp->v_tag != VT_NFS)
959 panic("getblk: incore buffer without valid page");
960 CLR(bp->b_flags, B_CACHE);
961 }
962
963 if (upl_dirty_page(pl, 0))
964 SET(bp->b_flags, B_WASDIRTY);
965 else
966 CLR(bp->b_flags, B_WASDIRTY);
967
968 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
969 if (kret != KERN_SUCCESS) {
970 panic("getblk: ubc_upl_map() failed with (%d)",
971 kret);
972 }
973 if (bp->b_data == 0) panic("ubc_upl_map mapped 0");
974 }
975 break;
976
977 case BLK_META:
978 /*
979 * VM is not involved in IO for the meta data;
980 * the buffer already has valid data
981 */
982 if(bp->b_data == 0)
983 panic("bp->b_data null incore buf=%x", bp);
984 break;
985
986 case BLK_PAGEIN:
987 case BLK_PAGEOUT:
988 panic("getblk: paging operation 1");
989 break;
990
991 default:
992 panic("getblk: %d unknown operation 2", operation);
993 /*NOTREACHED*/
994 break;
995 }
996 }
997 } else { /* not incore() */
998 int queue = BQ_EMPTY; /* Start with no preference */
999 splx(s);
1000
1001 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
1002 !(UBCINFOEXISTS(vp))) {
1003 operation = BLK_META;
1004 }
1005 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
1006 goto start;
1007 if (incore(vp, blkno)) {
1008 SET(bp->b_flags, B_INVAL);
1009 binshash(bp, &invalhash);
1010 brelse(bp);
1011 goto start;
1012 }
1013
1014 /*
1015 * If it is meta, the queue may be set to another
1016 * type, so reset it and mark the buffer B_META
1017 * so that when the buffer is released it will go to the META queue.
1018 * Also, if the vnode is not VREG, it is treated as META.
1019 */
1020 if (operation == BLK_META) {
1021 SET(bp->b_flags, B_META);
1022 queue = BQ_META;
1023 }
1024 /*
1025 * Insert in the hash so that incore() can find it
1026 */
1027 binshash(bp, BUFHASH(vp, blkno));
1028
1029 allocbuf(bp, size);
1030
1031 switch (operation) {
1032 case BLK_META:
1033 /* buffer data is invalid */
1034
1035 #if !ZALLOC_METADATA
1036 if (bp->b_data)
1037 panic("bp->b_data is not nul; %x",bp);
1038 kret = kmem_alloc(kernel_map,
1039 &bp->b_data, bp->b_bufsize);
1040 if (kret != KERN_SUCCESS)
1041 panic("getblk: kmem_alloc() returned %d", kret);
1042 #endif /* ZALLOC_METADATA */
1043
1044 if(bp->b_data == 0)
1045 panic("bp->b_data is null %x",bp);
1046
1047 bp->b_blkno = bp->b_lblkno = blkno;
1048 s = splbio();
1049 bgetvp(vp, bp);
1050 bufstats.bufs_miss++;
1051 splx(s);
1052 if (bp->b_data == 0)
1053 panic("b_data is 0: 2");
1054
1055 /* wakeup the buffer */
1056 CLR(bp->b_flags, B_WANTED);
1057 wakeup(bp);
1058 break;
1059
1060 case BLK_READ:
1061 case BLK_WRITE:
1062
1063 if (ISSET(bp->b_flags, B_PAGELIST))
1064 panic("B_PAGELIST in bp=%x",bp);
1065
1066 kret = ubc_create_upl(vp,
1067 ubc_blktooff(vp, blkno),
1068 bp->b_bufsize,
1069 &upl,
1070 &pl,
1071 UPL_PRECIOUS);
1072 if (kret != KERN_SUCCESS)
1073 panic("Failed to get pagelists");
1074
1075 #ifdef UBC_DEBUG
1076 upl_ubc_alias_set(upl, bp, 4);
1077 #endif /* UBC_DEBUG */
1078 bp->b_blkno = bp->b_lblkno = blkno;
1079 bp->b_pagelist = upl;
1080
1081 SET(bp->b_flags, B_PAGELIST);
1082
1083 if (upl_valid_page(pl, 0)) {
1084 SET(bp->b_flags, B_CACHE | B_DONE);
1085 bufstats.bufs_vmhits++;
1086
1087 pagedirty = upl_dirty_page(pl, 0);
1088
1089 if (pagedirty)
1090 SET(bp->b_flags, B_WASDIRTY);
1091
1092 if (vp->v_tag == VT_NFS) {
1093 off_t f_offset;
1094 int valid_size;
1095
1096 bp->b_validoff = 0;
1097 bp->b_dirtyoff = 0;
1098
1099 f_offset = ubc_blktooff(vp, blkno);
1100
1101 if (f_offset > vp->v_ubcinfo->ui_size) {
1102 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1103 bp->b_validend = 0;
1104 bp->b_dirtyend = 0;
1105 } else {
1106 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1107 bp->b_validend = valid_size;
1108
1109 if (pagedirty)
1110 bp->b_dirtyend = valid_size;
1111 else
1112 bp->b_dirtyend = 0;
1113
1114 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1115 bp->b_validend, bp->b_dirtyend,
1116 (int)vp->v_ubcinfo->ui_size, 0, 0);
1117 }
1118 } else {
1119 bp->b_validoff = 0;
1120 bp->b_dirtyoff = 0;
1121
1122 if (pagedirty) {
1123 /* page is dirty */
1124 bp->b_validend = bp->b_bcount;
1125 bp->b_dirtyend = bp->b_bcount;
1126 } else {
1127 /* page is clean */
1128 bp->b_validend = bp->b_bcount;
1129 bp->b_dirtyend = 0;
1130 }
1131 }
1132 if (error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) {
1133 panic("VOP_BMAP failed in getblk");
1134 /*NOTREACHED*/
1135 /*
1136 * XXX: We probably should invalidate the VM Page
1137 */
1138 bp->b_error = error;
1139 SET(bp->b_flags, (B_ERROR | B_INVAL));
1140 /* undo B_DONE that was set before upl_commit() */
1141 CLR(bp->b_flags, B_DONE);
1142 brelse(bp);
1143 return (0);
1144 }
1145 } else {
1146 bufstats.bufs_miss++;
1147 }
1148 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1149 if (kret != KERN_SUCCESS) {
1150 panic("getblk: ubc_upl_map() "
1151 "failed with (%d)", kret);
1152 }
1153 if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
1154
1155 s = splbio();
1156 bgetvp(vp, bp);
1157 splx(s);
1158
1159 break;
1160
1161 case BLK_PAGEIN:
1162 case BLK_PAGEOUT:
1163 panic("getblk: paging operation 2");
1164 break;
1165 default:
1166 panic("getblk: %d unknown operation 3", operation);
1167 /*NOTREACHED*/
1168 break;
1169 }
1170 }
1171
1172 if (bp->b_data == NULL)
1173 panic("getblk: bp->b_addr is null");
1174
1175 if (bp->b_bufsize & 0xfff) {
1176 #if ZALLOC_METADATA
1177 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1178 #endif /* ZALLOC_METADATA */
1179 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1180 }
1181
1182 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1183 (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1184
1185 return (bp);
1186 }
1187
1188 /*
1189 * Get an empty, disassociated buffer of given size.
1190 */
1191 struct buf *
1192 geteblk(size)
1193 int size;
1194 {
1195 struct buf *bp;
1196 int queue = BQ_EMPTY;
1197 #if !ZALLOC_METADATA
1198 kern_return_t kret;
1199 vm_size_t desired_size = roundup(size, CLBYTES);
1200
1201 if (desired_size > MAXBSIZE)
1202 panic("geteblk: buffer larger than MAXBSIZE requested");
1203 #endif /* ZALLOC_METADATA */
1204
1205 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1206 ;
1207 #if ZALLOC_METADATA
1208 SET(bp->b_flags, (B_META|B_INVAL));
1209 #else
1210 SET(bp->b_flags, B_INVAL);
1211 #endif /* ZALLOC_METADATA */
1212
1213 #if DIAGNOSTIC
1214 assert(queue == BQ_EMPTY);
1215 #endif /* DIAGNOSTIC */
1216 /* XXX need to implement logic to deal with other queues */
1217
1218 #if !ZALLOC_METADATA
1219 /* Empty buffer - allocate pages */
1220 kret = kmem_alloc_aligned(kernel_map, &bp->b_data, desired_size);
1221 if (kret != KERN_SUCCESS)
1222 panic("geteblk: kmem_alloc_aligned returned %d", kret);
1223 #endif /* ZALLOC_METADATA */
1224
1225 binshash(bp, &invalhash);
1226 allocbuf(bp, size);
1227 bufstats.bufs_eblk++;
1228
1229 return (bp);
1230 }
1231
1232 #if ZALLOC_METADATA
1233 /*
1234 * Zones for the meta data buffers
1235 */
1236
1237 #define MINMETA 512
1238 #define MAXMETA 4096
1239
1240 struct meta_zone_entry {
1241 zone_t mz_zone;
1242 vm_size_t mz_size;
1243 vm_size_t mz_max;
1244 char *mz_name;
1245 };
1246
1247 struct meta_zone_entry meta_zones[] = {
1248 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1249 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1250 {NULL, (MINMETA * 3), 16 * (MINMETA * 3), "buf.1536" },
1251 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1252 {NULL, (MINMETA * 5), 16 * (MINMETA * 5), "buf.2560" },
1253 {NULL, (MINMETA * 6), 16 * (MINMETA * 6), "buf.3072" },
1254 {NULL, (MINMETA * 7), 16 * (MINMETA * 7), "buf.3584" },
1255 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1256 {NULL, 0, 0, "" } /* End */
1257 };
1258 #endif /* ZALLOC_METADATA */
1259
1260 /*
1261 * Initialize the meta data zones
1262 */
1263 static void
1264 bufzoneinit(void)
1265 {
1266 #if ZALLOC_METADATA
1267 int i;
1268
1269 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1270 meta_zones[i].mz_zone =
1271 zinit(meta_zones[i].mz_size,
1272 meta_zones[i].mz_max,
1273 PAGE_SIZE,
1274 meta_zones[i].mz_name);
1275 }
1276 #endif /* ZALLOC_METADATA */
1277 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1278 }
1279
1280 #if ZALLOC_METADATA
1281 static zone_t
1282 getbufzone(size_t size)
1283 {
1284 int i;
1285
1286 if (size % 512)
1287 panic("getbufzone: incorect size = %d", size);
1288
1289 i = (size / 512) - 1;
1290 return (meta_zones[i].mz_zone);
1291 }
1292 #endif /* ZALLOC_METADATA */
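/*
 * Example (added for clarity): getbufzone(1536) computes
 * (1536 / 512) - 1 == 2 and returns meta_zones[2].mz_zone, i.e. the
 * "buf.1536" zone; sizes that are not a multiple of 512 panic.
 */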
1293
1294 /*
1295 * With UBC, there is no need to expand / shrink the file data
1296 * buffer. The VM uses the same pages, hence no waste.
1297 * All the file data buffers can have one size.
1298 * In fact expand / shrink would be an expensive operation.
1299 *
1300 * The only exception to this is meta-data buffers. Most of the
1301 * meta-data operations are smaller than PAGE_SIZE. Having the
1302 * meta-data buffers grow and shrink as needed optimizes use
1303 * of the kernel wired memory.
1304 */
1305
1306 int
1307 allocbuf(bp, size)
1308 struct buf *bp;
1309 int size;
1310 {
1311 vm_size_t desired_size;
1312
1313 desired_size = roundup(size, CLBYTES);
1314
1315 if(desired_size < PAGE_SIZE)
1316 desired_size = PAGE_SIZE;
1317 if (desired_size > MAXBSIZE)
1318 panic("allocbuf: buffer larger than MAXBSIZE requested");
1319
1320 #if ZALLOC_METADATA
1321 if (ISSET(bp->b_flags, B_META)) {
1322 kern_return_t kret;
1323 zone_t zprev, z;
1324 size_t nsize = roundup(size, MINMETA);
1325
1326 if (bp->b_data) {
1327 vm_offset_t elem = (vm_offset_t)bp->b_data;
1328
1329 if (ISSET(bp->b_flags, B_ZALLOC))
1330 if (bp->b_bufsize <= MAXMETA) {
1331 if (bp->b_bufsize < nsize) {
1332 /* reallocate to a bigger size */
1333 desired_size = nsize;
1334
1335 zprev = getbufzone(bp->b_bufsize);
1336 z = getbufzone(nsize);
1337 bp->b_data = (caddr_t)zalloc(z);
1338 if(bp->b_data == 0)
1339 panic("allocbuf: zalloc() returned NULL");
1340 bcopy(elem, bp->b_data, bp->b_bufsize);
1341 zfree(zprev, elem);
1342 } else {
1343 desired_size = bp->b_bufsize;
1344 }
1345 } else
1346 panic("allocbuf: B_ZALLOC set incorrectly");
1347 else
1348 if (bp->b_bufsize < desired_size) {
1349 /* reallocate to a bigger size */
1350 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1351 if (kret != KERN_SUCCESS)
1352 panic("allocbuf: kmem_alloc() returned %d", kret);
1353 if(bp->b_data == 0)
1354 panic("allocbuf: null b_data");
1355 bcopy(elem, bp->b_data, bp->b_bufsize);
1356 kmem_free(kernel_map, elem, bp->b_bufsize);
1357 } else {
1358 desired_size = bp->b_bufsize;
1359 }
1360 } else {
1361 /* new allocation */
1362 if (nsize <= MAXMETA) {
1363 desired_size = nsize;
1364 z = getbufzone(nsize);
1365 bp->b_data = (caddr_t)zalloc(z);
1366 if(bp->b_data == 0)
1367 panic("allocbuf: zalloc() returned NULL 2");
1368 SET(bp->b_flags, B_ZALLOC);
1369 } else {
1370 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1371 if (kret != KERN_SUCCESS)
1372 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1373 if(bp->b_data == 0)
1374 panic("allocbuf: null b_data 2");
1375 }
1376 }
1377 }
1378
1379 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1380 panic("allocbuf: bp->b_data is NULL");
1381 #endif /* ZALLOC_METADATA */
1382
1383 bp->b_bufsize = desired_size;
1384 bp->b_bcount = size;
1385 }
1386
1387 /*
1388 * Get a new buffer from one of the free lists.
1389 *
1390 * A request for a queue is passed in. The queue from which the buffer was
1391 * taken is returned. Out-of-range queue requests get BQ_EMPTY. A request for
1392 * BQUEUES means no preference. Use heuristics in that case.
1393 * The heuristics are as follows:
1394 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1395 * If none is available, block until one is made available.
1396 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
1397 * Pick the most stale buffer.
1398 * If the buffer found was marked for delayed write, start the async write
1399 * and restart the search.
1400 * Initialize the fields and disassociate the buffer from the vnode.
1401 * Remove the buffer from the hash. Return the buffer and the queue
1402 * on which it was found.
1403 */
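/*
 * Worked example (added for clarity): with lru_is_stale == 120 and
 * age_is_stale == 60, a buffer that has sat on BQ_LRU for 130 seconds is
 * preferred over one that has sat on BQ_AGE for only 30 seconds; in every
 * other case where both queues are populated, the BQ_AGE buffer is taken.
 */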
1404
1405 static struct buf *
1406 getnewbuf(slpflag, slptimeo, queue)
1407 int slpflag, slptimeo;
1408 int *queue;
1409 {
1410 register struct buf *bp;
1411 register struct buf *lru_bp;
1412 register struct buf *age_bp;
1413 register struct buf *meta_bp;
1414 register int age_time, lru_time, bp_time, meta_time;
1415 int s;
1416 struct ucred *cred;
1417 int req = *queue; /* save it for restarts */
1418
1419 start:
1420 s = splbio();
1421
1422 /* invalid request gets empty queue */
1423 if ((*queue > BQUEUES) || (*queue < 0)
1424 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1425 *queue = BQ_EMPTY;
1426
1427 /* (*queue == BQUEUES) means no preference */
1428 if (*queue != BQUEUES) {
1429 /* Try for the requested queue first */
1430 bp = bufqueues[*queue].tqh_first;
1431 if (bp)
1432 goto found;
1433 }
1434
1435 /* Unable to use requested queue */
1436 age_bp = bufqueues[BQ_AGE].tqh_first;
1437 lru_bp = bufqueues[BQ_LRU].tqh_first;
1438 meta_bp = bufqueues[BQ_META].tqh_first;
1439
1440 if (!age_bp && !lru_bp && !meta_bp) { /* Unavailable on AGE, LRU, or META */
1441 /* Try the empty list first */
1442 bp = bufqueues[BQ_EMPTY].tqh_first;
1443 if (bp) {
1444 *queue = BQ_EMPTY;
1445 goto found;
1446 }
1447
1448 /* Create a new temporary buffer header */
1449 bp = (struct buf *)zalloc(buf_hdr_zone);
1450
1451 if (bp) {
1452 bufhdrinit(bp);
1453 BLISTNONE(bp);
1454 binshash(bp, &invalhash);
1455 SET(bp->b_flags, B_HDRALLOC);
1456 *queue = BQ_EMPTY;
1457 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1458 buf_hdr_count++;
1459 goto found;
1460 }
1461
1462 /* Log this error condition */
1463 printf("getnewbuf: No useful buffers");
1464
1465 /* wait for a free buffer of any kind */
1466 needbuffer = 1;
1467 bufstats.bufs_sleeps++;
1468 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1469 splx(s);
1470 return (0);
1471 }
1472
1473 /* Buffer available either on AGE or LRU or META */
1474 bp = NULL;
1475 *queue = -1;
1476
1477 /* Buffer available either on AGE or LRU */
1478 if (!age_bp) {
1479 bp = lru_bp;
1480 *queue = BQ_LRU;
1481 } else if (!lru_bp) {
1482 bp = age_bp;
1483 *queue = BQ_AGE;
1484 } else { /* buffer available on both AGE and LRU */
1485 age_time = time.tv_sec - age_bp->b_timestamp;
1486 lru_time = time.tv_sec - lru_bp->b_timestamp;
1487 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1488 bp = age_bp;
1489 *queue = BQ_AGE;
1490 /*
1491 * we should probably re-timestamp everything in the
1492 * queues at this point with the current time
1493 */
1494 } else {
1495 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1496 bp = lru_bp;
1497 *queue = BQ_LRU;
1498 } else {
1499 bp = age_bp;
1500 *queue = BQ_AGE;
1501 }
1502 }
1503 }
1504
1505 if (!bp) { /* Neither on AGE nor on LRU */
1506 bp = meta_bp;
1507 *queue = BQ_META;
1508 } else if (meta_bp) {
1509 bp_time = time.tv_sec - bp->b_timestamp;
1510 meta_time = time.tv_sec - meta_bp->b_timestamp;
1511
1512 if (!(bp_time < 0) && !(meta_time < 0)) {
1513 /* time not set backwards */
1514 int bp_is_stale;
1515 bp_is_stale = (*queue == BQ_LRU) ?
1516 lru_is_stale : age_is_stale;
1517
1518 if ((meta_time >= meta_is_stale) &&
1519 (bp_time < bp_is_stale)) {
1520 bp = meta_bp;
1521 *queue = BQ_META;
1522 }
1523 }
1524 }
1525
1526 if (bp == NULL)
1527 panic("getnewbuf: null bp");
1528
1529 found:
1530 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1531 panic("getnewbuf: le_prev is deadbeef");
1532
1533 if(ISSET(bp->b_flags, B_BUSY))
1534 panic("getnewbuf reusing BUSY buf");
1535
1536 /* Clean it */
1537 if (bcleanbuf(bp)) {
1538 /* bawrite() issued, buffer not ready */
1539 splx(s);
1540 *queue = req;
1541 goto start;
1542 }
1543 splx(s);
1544 return (bp);
1545 }
1546 #include <mach/mach_types.h>
1547 #include <mach/memory_object_types.h>
1548
1549 /*
1550 * Clean a buffer.
1551 * Returns 0 if the buffer is ready to use;
1552 * returns 1 if a bawrite() was issued to indicate
1553 * that the buffer is not ready.
1554 */
1555 int
1556 bcleanbuf(struct buf *bp)
1557 {
1558 int s;
1559 struct ucred *cred;
1560 int hdralloc = 0;
1561
1562 s = splbio();
1563
1564 /* Remove from the queue */
1565 bremfree(bp);
1566
1567 /* Buffer is no longer on free lists. */
1568 SET(bp->b_flags, B_BUSY);
1569
1570 /* Check whether the buffer header was "allocated" */
1571 if (ISSET(bp->b_flags, B_HDRALLOC))
1572 hdralloc = 1;
1573
1574 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1575 panic("bcleanbuf: le_prev is deadbeef");
1576
1577 /*
1578 * If buffer was a delayed write, start the IO by queuing
1579 * it on the LAUNDRY queue, and return 1
1580 */
1581 if (ISSET(bp->b_flags, B_DELWRI)) {
1582 splx(s);
1583 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1584 blaundrycnt++;
1585 wakeup(&blaundrycnt);
1586 return (1);
1587 }
1588
1589 if (bp->b_vp)
1590 brelvp(bp);
1591 bremhash(bp);
1592 BLISTNONE(bp);
1593
1594 splx(s);
1595
1596 if (ISSET(bp->b_flags, B_META)) {
1597 #if ZALLOC_METADATA
1598 vm_offset_t elem = (vm_offset_t)bp->b_data;
1599 if (elem == 0)
1600 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1601
1602 if (ISSET(bp->b_flags, B_ZALLOC)) {
1603 if (bp->b_bufsize <= MAXMETA) {
1604 zone_t z;
1605
1606 z = getbufzone(bp->b_bufsize);
1607 bp->b_data = (caddr_t)0xdeadbeef;
1608 zfree(z, elem);
1609 CLR(bp->b_flags, B_ZALLOC);
1610 } else
1611 panic("bcleanbuf: B_ZALLOC set incorrectly");
1612 } else {
1613 bp->b_data = (caddr_t)0xdeadbeef;
1614 kmem_free(kernel_map, elem, bp->b_bufsize);
1615 }
1616 #else
1617 if (bp->b_data == 0)
1618 panic("bcleanbuf: bp->b_data == NULL for B_META buffer");
1619
1620 kmem_free(kernel_map, bp->b_data, bp->b_bufsize);
1621 #endif /* ZALLOC_METADATA */
1622 }
1623
1624 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1625
1626 /* disassociate us from our vnode, if we had one... */
1627 s = splbio();
1628
1629 /* clear out various other fields */
1630 bp->b_bufsize = 0;
1631 bp->b_data = 0;
1632 bp->b_flags = B_BUSY;
1633 if (hdralloc)
1634 SET(bp->b_flags, B_HDRALLOC);
1635 bp->b_dev = NODEV;
1636 bp->b_blkno = bp->b_lblkno = 0;
1637 bp->b_iodone = 0;
1638 bp->b_error = 0;
1639 bp->b_resid = 0;
1640 bp->b_bcount = 0;
1641 bp->b_dirtyoff = bp->b_dirtyend = 0;
1642 bp->b_validoff = bp->b_validend = 0;
1643
1644 /* nuke any credentials we were holding */
1645 cred = bp->b_rcred;
1646 if (cred != NOCRED) {
1647 bp->b_rcred = NOCRED;
1648 crfree(cred);
1649 }
1650 cred = bp->b_wcred;
1651 if (cred != NOCRED) {
1652 bp->b_wcred = NOCRED;
1653 crfree(cred);
1654 }
1655 splx(s);
1656 return (0);
1657 }
1658
1659
1660 /*
1661 * Wait for operations on the buffer to complete.
1662 * When they do, extract and return the I/O's error value.
1663 */
1664 int
1665 biowait(bp)
1666 struct buf *bp;
1667 {
1668 upl_t upl;
1669 upl_page_info_t *pl;
1670 int s;
1671 kern_return_t kret;
1672
1673 s = splbio();
1674 while (!ISSET(bp->b_flags, B_DONE))
1675 tsleep(bp, PRIBIO + 1, "biowait", 0);
1676 splx(s);
1677
1678 /* check for interruption of I/O (e.g. via NFS), then errors. */
1679 if (ISSET(bp->b_flags, B_EINTR)) {
1680 CLR(bp->b_flags, B_EINTR);
1681 return (EINTR);
1682 } else if (ISSET(bp->b_flags, B_ERROR))
1683 return (bp->b_error ? bp->b_error : EIO);
1684 else
1685 return (0);
1686 }
1687
1688 /*
1689 * Mark I/O complete on a buffer.
1690 *
1691 * If a callback has been requested, e.g. the pageout
1692 * daemon, do so. Otherwise, awaken waiting processes.
1693 *
1694 * [ Leffler, et al., says on p.247:
1695 * "This routine wakes up the blocked process, frees the buffer
1696 * for an asynchronous write, or, for a request by the pagedaemon
1697 * process, invokes a procedure specified in the buffer structure" ]
1698 *
1699 * In real life, the pagedaemon (or other system processes) wants
1700 * to do async stuff too, and doesn't want the buffer brelse()'d.
1701 * (for swap pager, that puts swap buffers on the free lists (!!!),
1702 * for the vn device, that puts malloc'd buffers on the free lists!)
1703 */
1704 void
1705 biodone(bp)
1706 struct buf *bp;
1707 {
1708 boolean_t funnel_state;
1709 struct vnode *vp;
1710
1711 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1712
1713 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1714 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1715
1716 if (ISSET(bp->b_flags, B_DONE))
1717 panic("biodone already");
1718 SET(bp->b_flags, B_DONE); /* note that it's done */
1719 /*
1720 * I/O was done, so don't believe
1721 * the DIRTY state from VM anymore
1722 */
1723 CLR(bp->b_flags, B_WASDIRTY);
1724
1725 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1726 vwakeup(bp); /* wake up reader */
1727
1728 /* Wakeup the throttled write operations as needed */
1729 vp = bp->b_vp;
1730 if (vp
1731 && (vp->v_flag & VTHROTTLED)
1732 && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
1733 vp->v_flag &= ~VTHROTTLED;
1734 wakeup((caddr_t)&vp->v_numoutput);
1735 }
1736
1737 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1738 CLR(bp->b_flags, B_CALL); /* but note callout done */
1739 (*bp->b_iodone)(bp);
1740 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1741 brelse(bp);
1742 else { /* or just wakeup the buffer */
1743 CLR(bp->b_flags, B_WANTED);
1744 wakeup(bp);
1745 }
1746
1747 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1748 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1749
1750 thread_funnel_set(kernel_flock, funnel_state);
1751 }
1752
1753 /*
1754 * Return a count of buffers on the "locked" queue.
1755 */
1756 int
1757 count_lock_queue()
1758 {
1759 register struct buf *bp;
1760 register int n = 0;
1761
1762 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1763 bp = bp->b_freelist.tqe_next)
1764 n++;
1765 return (n);
1766 }
1767
1768 /*
1769 * Return a count of 'busy' buffers. Used at the time of shutdown.
1770 */
1771 int
1772 count_busy_buffers()
1773 {
1774 register struct buf *bp;
1775 register int nbusy = 0;
1776
1777 for (bp = &buf[nbuf]; --bp >= buf; )
1778 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1779 nbusy++;
1780 return (nbusy);
1781 }
1782
1783 #if 1 /*DIAGNOSTIC */
1784 /*
1785 * Print out statistics on the current allocation of the buffer pool.
1786 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1787 * in vfs_syscalls.c using sysctl.
1788 */
1789 void
1790 vfs_bufstats()
1791 {
1792 int s, i, j, count;
1793 register struct buf *bp;
1794 register struct bqueues *dp;
1795 int counts[MAXBSIZE/CLBYTES+1];
1796 static char *bname[BQUEUES] =
1797 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1798
1799 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1800 count = 0;
1801 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1802 counts[j] = 0;
1803 s = splbio();
1804 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1805 counts[bp->b_bufsize/CLBYTES]++;
1806 count++;
1807 }
1808 splx(s);
1809 printf("%s: total-%d", bname[i], count);
1810 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1811 if (counts[j] != 0)
1812 printf(", %d-%d", j * CLBYTES, counts[j]);
1813 printf("\n");
1814 }
1815 }
1816 #endif /* DIAGNOSTIC */
1817
1818 #define NRESERVEDIOBUFS 16
1819
1820 struct buf *
1821 alloc_io_buf(vp, priv)
1822 struct vnode *vp;
1823 int priv;
1824 {
1825 register struct buf *bp;
1826 int s;
1827
1828 s = splbio();
1829
1830 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1831 need_iobuffer = 1;
1832 bufstats.bufs_iobufsleeps++;
1833 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1834 }
1835
1836 while ((bp = iobufqueue.tqh_first) == NULL) {
1837 need_iobuffer = 1;
1838 bufstats.bufs_iobufsleeps++;
1839 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1840 }
1841
1842 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1843 bp->b_timestamp = 0;
1844
1845 /* clear out various fields */
1846 bp->b_flags = B_BUSY;
1847 bp->b_blkno = bp->b_lblkno = 0;
1848 bp->b_iodone = 0;
1849 bp->b_error = 0;
1850 bp->b_resid = 0;
1851 bp->b_bcount = 0;
1852 bp->b_bufsize = 0;
1853 bp->b_vp = vp;
1854
1855 if (vp->v_type == VBLK || vp->v_type == VCHR)
1856 bp->b_dev = vp->v_rdev;
1857 else
1858 bp->b_dev = NODEV;
1859 bufstats.bufs_iobufinuse++;
1860 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1861 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1862 splx(s);
1863
1864 return (bp);
1865 }
1866
1867 void
1868 free_io_buf(bp)
1869 struct buf *bp;
1870 {
1871 int s;
1872
1873 s = splbio();
1874 /* put buffer back on the head of the iobufqueue */
1875 bp->b_vp = NULL;
1876 bp->b_flags = B_INVAL;
1877
1878 binsheadfree(bp, &iobufqueue, -1);
1879
1880 /* Wake up any processes waiting for any buffer to become free. */
1881 if (need_iobuffer) {
1882 need_iobuffer = 0;
1883 wakeup(&need_iobuffer);
1884 }
1885 bufstats.bufs_iobufinuse--;
1886 splx(s);
1887 }
1888
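/*
 * Illustrative pattern (hypothetical, for exposition only; blkno,
 * xfer_size and buffer are placeholders) for the cluster-IO helpers
 * above: a caller typically fills in the header, issues the IO, waits,
 * and returns the header to the pool:
 *
 *	bp = alloc_io_buf(vp, 0);
 *	SET(bp->b_flags, B_READ);
 *	bp->b_blkno = blkno;
 *	bp->b_bcount = bp->b_bufsize = xfer_size;
 *	bp->b_data = (caddr_t)buffer;
 *	VOP_STRATEGY(bp);
 *	error = biowait(bp);
 *	free_io_buf(bp);
 */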
1889
1890 /* not hookedup yet */
1891
1892 /* XXX move this to a separate file */
1893 /*
1894 * Dynamic Scaling of the Buffer Queues
1895 */
1896
1897 typedef long long blsize_t;
1898
1899 blsize_t MAXNBUF; /* initialize to (mem_size / PAGE_SIZE) */
1900 /* Global tunable limits */
1901 blsize_t nbufh; /* number of buffer headers */
1902 blsize_t nbuflow; /* minimum number of buffer headers required */
1903 blsize_t nbufhigh; /* maximum number of buffer headers allowed */
1904 blsize_t nbuftarget; /* preferred number of buffer headers */
1905
1906 /*
1907 * assertions:
1908 *
1909 * 1. 0 < nbuflow <= nbufh <= nbufhigh
1910 * 2. nbufhigh <= MAXNBUF
1911 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
1912 * 4. nbufh can not be set by sysctl().
1913 */
1914
1915 /* Per queue tunable limits */
1916
1917 struct bufqlim {
1918 blsize_t bl_nlow; /* minimum number of buffer headers required */
1919 blsize_t bl_num; /* number of buffer headers on the queue */
1920 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
1921 blsize_t bl_target; /* preferred number of buffer headers */
1922 long bl_stale; /* Seconds after which a buffer is considered stale */
1923 } bufqlim[BQUEUES];
1924
1925 /*
1926 * assertions:
1927 *
1928 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
1929 * 2. bl_nlhigh <= MAXNBUF
1930 * 3. bufqlim[BQ_META].bl_nlow != 0
1931 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
1932 * file system IO operations)
1933 * 5. bl_num can not be set by sysctl().
1934 * 6. bl_nlhigh <= nbufhigh
1935 */
1936
1937 /*
1938 * Rationale:
1939 * ----------
1940 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
1941 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
1942 *
1943 * These limits are exported by means of sysctl().
1944 * It was decided to define blsize_t as a 64 bit quantity.
1945 * This will make sure that we will not be required to change it
1946 * as long as we do not exceed 64 bit address space for the kernel.
1947 *
1948 * The low and high water parameters are initialized at compile time,
1949 * and boot arguments can be used to override them. sysctl()
1950 * does not change those values. sysctl() can get all the values
1951 * but can set only the target. num is the current level.
1952 *
1953 * Advantages of having a "bufqscan" thread doing the balancing are:
1954 * It keeps enough bufs on BQ_EMPTY.
1955 * getnewbuf() by default will always select a buffer from BQ_EMPTY.
1956 * getnewbuf() performs best if a buffer is found there.
1957 * Also this minimizes the possibility of starting IO
1958 * from getnewbuf(). That's a performance win, too.
1959 *
1960 * Localize complex logic [balancing as well as time aging]
1961 * to balancebufq().
1962 *
1963 * Simplify getnewbuf() logic by elimination of time aging code.
1964 */
1965
1966 /*
1967 * Algorithm:
1968 * -----------
1969 * The goal of the dynamic scaling of the buffer queues is to keep
1970 * the size of the LRU close to bl_target. Buffers on a queue would
1971 * be time aged.
1972 *
1973 * There would be a thread which will be responsible for "balancing"
1974 * the buffer cache queues.
1975 *
1976 * The scan order would be: AGE, LRU, META, EMPTY.
1977 */
1978
1979 long bufqscanwait = 0;
1980
1981 extern void bufqscan_thread();
1982 extern int balancebufq(int q);
1983 extern int btrimempty(int n);
1984 extern int initbufqscan(void);
1985 extern int nextbufq(int q);
1986 extern void buqlimprt(int all);
1987
1988 void
1989 bufq_balance_thread_init()
1990 {
1991
1992 if (bufqscanwait++ == 0) {
1993 int i;
1994
1995 /* Initialize globals */
1996 MAXNBUF = (mem_size / PAGE_SIZE);
1997 nbufh = nbuf;
1998 nbuflow = min(nbufh, 100);
1999 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
2000 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
2001 nbuftarget = max(nbuflow, nbuftarget);
2002 nbuftarget = min(nbufhigh, nbuftarget);
2003
2004 /*
2005 * Initialize the bufqlim
2006 */
2007
2008 /* LOCKED queue */
2009 bufqlim[BQ_LOCKED].bl_nlow = 0;
2010 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2011 bufqlim[BQ_LOCKED].bl_target = 0;
2012 bufqlim[BQ_LOCKED].bl_stale = 30;
2013
2014 /* LRU queue */
2015 bufqlim[BQ_LRU].bl_nlow = 0;
2016 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
2017 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
2018 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
2019
2020 /* AGE queue */
2021 bufqlim[BQ_AGE].bl_nlow = 0;
2022 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
2023 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
2024 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
2025
2026 /* EMPTY queue */
2027 bufqlim[BQ_EMPTY].bl_nlow = 0;
2028 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
2029 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
2030 bufqlim[BQ_EMPTY].bl_stale = 600000;
2031
2032 /* META queue */
2033 bufqlim[BQ_META].bl_nlow = 0;
2034 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
2035 bufqlim[BQ_META].bl_target = nbuftarget/4;
2036 bufqlim[BQ_META].bl_stale = META_IS_STALE;
2037
2038 /* LAUNDRY queue */
2039 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
2040 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
2041 bufqlim[BQ_LAUNDRY].bl_target = 0;
2042 bufqlim[BQ_LAUNDRY].bl_stale = 30;
2043
2044 buqlimprt(1);
2045 }
2046
2047 /* create worker thread */
2048 kernel_thread(kernel_task, bufqscan_thread);
2049 }
2050
2051 /* The workloop for the buffer balancing thread */
2052 void
2053 bufqscan_thread()
2054 {
2055 boolean_t funnel_state;
2056 int moretodo = 0;
2057
2058 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2059
2060 for(;;) {
2061 do {
2062 int q; /* buffer queue to process */
2063
2064 for (q = initbufqscan(); q; ) {
2065 moretodo |= balancebufq(q);
2066 q = nextbufq(q);
2067 }
2068 } while (moretodo);
2069
2070 #if 1 || DIAGNOSTIC
2071 vfs_bufstats();
2072 buqlimprt(0);
2073 #endif
2074 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2075 moretodo = 0;
2076 }
2077
2078 (void) thread_funnel_set(kernel_flock, FALSE);
2079 }
2080
2081 /* Seed for the buffer queue balancing */
2082 int
2083 initbufqscan()
2084 {
2085 /* Start with AGE queue */
2086 return (BQ_AGE);
2087 }
2088
2089 /* Pick next buffer queue to balance */
2090 int
2091 nextbufq(int q)
2092 {
2093 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2094
2095 q++;
2096 q %= sizeof(order) / sizeof(order[0]);
2097 return (order[q]);
2098 }
2099
2100 /* function to balance the buffer queues */
2101 int
2102 balancebufq(int q)
2103 {
2104 int moretodo = 0;
2105 int s = splbio();
2106 int n;
2107
2108 /* reject invalid q */
2109 if ((q < 0) || (q >= BQUEUES))
2110 goto out;
2111
2112 /* LOCKED or LAUNDRY queue MUST not be balanced */
2113 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2114 goto out;
2115
2116 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2117
2118 /* If queue has less than target nothing more to do */
2119 if (n < 0)
2120 goto out;
2121
2122 if ( n > 8 ) {
2123 /* Balance only a small amount (12.5%) at a time */
2124 n >>= 3;
2125 }
2126
2127 /* EMPTY queue needs special handling */
2128 if (q == BQ_EMPTY) {
2129 moretodo |= btrimempty(n);
2130 goto out;
2131 }
2132
2133 for (; n > 0; n--) {
2134 struct buf *bp = bufqueues[q].tqh_first;
2135 if (!bp)
2136 break;
2137
2138 /* check if it's stale */
2139 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2140 if (bcleanbuf(bp)) {
2141 /* bawrite() issued, bp not ready */
2142 moretodo = 1;
2143 } else {
2144 /* release the cleaned buffer to BQ_EMPTY */
2145 SET(bp->b_flags, B_INVAL);
2146 brelse(bp);
2147 }
2148 } else
2149 break;
2150 }
2151
2152 out:
2153 splx(s);
2154 return (moretodo);
2155 }
2156
2157 int
2158 btrimempty(int n)
2159 {
2160 /*
2161 * When struct bufs are allocated dynamically, this would
2162 * reclaim up to 'n' struct bufs from the empty queue.
2163 */
2164
2165 return (0);
2166 }
2167
2168 void
2169 bufqinc(int q)
2170 {
2171 if ((q < 0) || (q >= BQUEUES))
2172 return;
2173
2174 bufqlim[q].bl_num++;
2175 return;
2176 }
2177
2178 void
2179 bufqdec(int q)
2180 {
2181 if ((q < 0) || (q >= BQUEUES))
2182 return;
2183
2184 bufqlim[q].bl_num--;
2185 return;
2186 }
2187
2188 void
2189 buqlimprt(int all)
2190 {
2191 int i;
2192 static char *bname[BQUEUES] =
2193 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2194
2195 if (all)
2196 for (i = 0; i < BQUEUES; i++) {
2197 printf("%s : ", bname[i]);
2198 printf("min = %d, ", (long)bufqlim[i].bl_nlow);
2199 printf("cur = %d, ", (long)bufqlim[i].bl_num);
2200 printf("max = %d, ", (long)bufqlim[i].bl_nlhigh);
2201 printf("target = %d, ", (long)bufqlim[i].bl_target);
2202 printf("stale after %d seconds\n", bufqlim[i].bl_stale);
2203 }
2204 else
2205 for (i = 0; i < BQUEUES; i++) {
2206 printf("%s : ", bname[i]);
2207 printf("cur = %d, ", (long)bufqlim[i].bl_num);
2208 }
2209 }
2210
2211 /*
2212 * If getnewbuf() calls bcleanbuf() on the same thread,
2213 * there is a potential for stack overrun and deadlocks,
2214 * so we always hand off the work to a worker thread for completion.
2215 */
2216
2217 static void
2218 bcleanbuf_thread_init()
2219 {
2220 static void bcleanbuf_thread();
2221
2222 /* create worker thread */
2223 kernel_thread(kernel_task, bcleanbuf_thread);
2224 }
2225
2226 static void
2227 bcleanbuf_thread()
2228 {
2229 boolean_t funnel_state;
2230 struct buf *bp;
2231
2232 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2233
2234 doit:
2235 while (blaundrycnt == 0)
2236 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2237 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2238 /* Remove from the queue */
2239 bremfree(bp);
2240 blaundrycnt--;
2241 /* do the IO */
2242 bawrite(bp);
2243 /* start again */
2244 goto doit;
2245
2246 (void) thread_funnel_set(kernel_flock, funnel_state);
2247 }
2248