]> git.saurik.com Git - apple/xnu.git/blame - bsd/vfs/vfs_bio.c
xnu-517.12.7.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_bio.c
CommitLineData
1c79356b 1/*
d52fe63f 2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
1c79356b
A
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
e5568f75
A
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
1c79356b 11 *
e5568f75
A
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
1c79356b
A
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
e5568f75
A
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
1c79356b
A
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23/*-
24 * Copyright (c) 1994 Christopher G. Demetriou
25 * Copyright (c) 1982, 1986, 1989, 1993
26 * The Regents of the University of California. All rights reserved.
27 * (c) UNIX System Laboratories, Inc.
28 * All or some portions of this file are derived from material licensed
29 * to the University of California by American Telephone and Telegraph
30 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
31 * the permission of UNIX System Laboratories, Inc.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * The NEXTSTEP Software License Agreement specifies the terms
62 * and conditions for redistribution.
63 *
64 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
65 */
66
67/*
68 * Some references:
69 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
70 * Leffler, et al.: The Design and Implementation of the 4.3BSD
71 * UNIX Operating System (Addison Welley, 1989)
72 */
1c79356b
A
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/proc.h>
77#include <sys/buf.h>
78#include <sys/vnode.h>
79#include <sys/mount.h>
80#include <sys/trace.h>
81#include <sys/malloc.h>
82#include <sys/resourcevar.h>
83#include <miscfs/specfs/specdev.h>
84#include <sys/ubc.h>
85#include <vm/vm_pageout.h>
86#if DIAGNOSTIC
87#include <kern/assert.h>
88#endif /* DIAGNOSTIC */
89#include <kern/task.h>
90#include <kern/zalloc.h>
91
92#include <sys/kdebug.h>
9bccf70c 93#include <machine/spl.h>
1c79356b 94
9bccf70c
A
95static __inline__ void bufqinc(int q);
96static __inline__ void bufqdec(int q);
1c79356b 97
55e303ae
A
98static int do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
99 int *rasizes, int nrablks, struct ucred *cred, struct buf **bpp, int queuetype);
1c79356b 100static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
9bccf70c 101static int bcleanbuf(struct buf *bp);
55e303ae 102static int brecover_data(struct buf *bp);
9bccf70c 103extern void vwakeup();
1c79356b 104
fa4905b1 105extern int niobuf; /* The number of IO buffer headers for cluster IO */
765c9de3 106int blaundrycnt;
1c79356b 107
d52fe63f
A
108/* zone allocated buffer headers */
109static zone_t buf_hdr_zone;
110static int buf_hdr_count;
111
1c79356b
A
112#if TRACE
113struct proc *traceproc;
114int tracewhich, tracebuf[TRCSIZ];
115u_int tracex;
116char traceflags[TR_NFLAGS];
117#endif /* TRACE */
118
119/*
120 * Definitions for the buffer hash lists.
121 */
122#define BUFHASH(dvp, lbn) \
123 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
124LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
125u_long bufhash;
126
127/* Definitions for the buffer stats. */
128struct bufstats bufstats;
129
d52fe63f
A
130/* Number of delayed write buffers */
131int nbdwrite = 0;
132
1c79356b
A
133/*
134 * Insq/Remq for the buffer hash lists.
135 */
136#if 0
137#define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
138#define bremhash(bp) LIST_REMOVE(bp, b_hash)
139#endif /* 0 */
140
141
142TAILQ_HEAD(ioqueue, buf) iobufqueue;
143TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
d52fe63f
A
144static int needbuffer;
145static int need_iobuffer;
1c79356b
A
146
147/*
148 * Insq/Remq for the buffer free lists.
149 */
150#define binsheadfree(bp, dp, whichq) do { \
151 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
152 bufqinc((whichq)); \
153 (bp)->b_whichq = whichq; \
154 (bp)->b_timestamp = time.tv_sec; \
155 } while (0)
156
157#define binstailfree(bp, dp, whichq) do { \
158 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
159 bufqinc((whichq)); \
160 (bp)->b_whichq = whichq; \
161 (bp)->b_timestamp = time.tv_sec; \
162 } while (0)
163
164#define BHASHENTCHECK(bp) \
165 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
765c9de3 166 panic("%x: b_hash.le_prev is not deadbeef", (bp));
1c79356b
A
167
168#define BLISTNONE(bp) \
169 (bp)->b_hash.le_next = (struct buf *)0; \
170 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
171
9bccf70c
A
172/*
173 * Insq/Remq for the vnode usage lists.
174 */
175#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
176#define bufremvn(bp) { \
177 LIST_REMOVE(bp, b_vnbufs); \
178 (bp)->b_vnbufs.le_next = NOLIST; \
179}
180
1c79356b
A
181simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
182
d52fe63f
A
183/* number of per vnode, "in flight" buffer writes */
184#define BUFWRITE_THROTTLE 9
185
b4c24cb9 186
1c79356b
A
187/*
188 * Time in seconds before a buffer on a list is
189 * considered as a stale buffer
190 */
191#define LRU_IS_STALE 120 /* default value for the LRU */
192#define AGE_IS_STALE 60 /* default value for the AGE */
193#define META_IS_STALE 180 /* default value for the BQ_META */
194
195int lru_is_stale = LRU_IS_STALE;
196int age_is_stale = AGE_IS_STALE;
197int meta_is_stale = META_IS_STALE;
198
9bccf70c
A
199/* LIST_INSERT_HEAD() with assertions */
200static __inline__ void
1c79356b
A
201blistenterhead(struct bufhashhdr * head, struct buf * bp)
202{
203 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
204 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
205 (head)->lh_first = bp;
206 bp->b_hash.le_prev = &(head)->lh_first;
207 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
208 panic("blistenterhead: le_prev is deadbeef");
1c79356b 209}
1c79356b 210
9bccf70c 211static __inline__ void
1c79356b
A
212binshash(struct buf *bp, struct bufhashhdr *dp)
213{
9bccf70c 214 struct buf *nbp;
1c79356b
A
215
216 simple_lock(&bufhashlist_slock);
9bccf70c 217
b4c24cb9
A
218#if 0
219 if((bad = incore(bp->b_vp, bp->b_lblkno)))
220 panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);
1c79356b 221#endif /* 0 */
9bccf70c 222
1c79356b 223 BHASHENTCHECK(bp);
9bccf70c 224
1c79356b
A
225 nbp = dp->lh_first;
226 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
227 if(nbp == bp)
228 panic("buf already in hashlist");
229 }
230
1c79356b 231 blistenterhead(dp, bp);
1c79356b
A
232 simple_unlock(&bufhashlist_slock);
233}
234
9bccf70c 235static __inline__ void
1c79356b
A
236bremhash(struct buf *bp)
237{
1c79356b
A
238 simple_lock(&bufhashlist_slock);
239 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
240 panic("bremhash le_prev is deadbeef");
241 if (bp->b_hash.le_next == bp)
242 panic("bremhash: next points to self");
243
244 if (bp->b_hash.le_next != NULL)
245 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
246 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
247 simple_unlock(&bufhashlist_slock);
248}
249
1c79356b
A
250/*
251 * Remove a buffer from the free list it's on
252 */
253void
254bremfree(bp)
255 struct buf *bp;
256{
257 struct bqueues *dp = NULL;
258 int whichq = -1;
259
260 /*
261 * We only calculate the head of the freelist when removing
262 * the last element of the list as that is the only time that
263 * it is needed (e.g. to reset the tail pointer).
264 *
265 * NB: This makes an assumption about how tailq's are implemented.
266 */
267 if (bp->b_freelist.tqe_next == NULL) {
268 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
269 if (dp->tqh_last == &bp->b_freelist.tqe_next)
270 break;
271 if (dp == &bufqueues[BQUEUES])
272 panic("bremfree: lost tail");
273 }
274 TAILQ_REMOVE(dp, bp, b_freelist);
275 whichq = bp->b_whichq;
276 bufqdec(whichq);
277 bp->b_whichq = -1;
278 bp->b_timestamp = 0;
279}
280
9bccf70c
A
281/*
282 * Associate a buffer with a vnode.
283 */
284static void
285bgetvp(vp, bp)
286 register struct vnode *vp;
287 register struct buf *bp;
288{
289
290 if (bp->b_vp != vp)
291 panic("bgetvp: not free");
292 VHOLD(vp);
293 bp->b_vp = vp;
294 if (vp->v_type == VBLK || vp->v_type == VCHR)
295 bp->b_dev = vp->v_rdev;
296 else
297 bp->b_dev = NODEV;
298 /*
299 * Insert onto list for new vnode.
300 */
301 bufinsvn(bp, &vp->v_cleanblkhd);
302}
303
304/*
305 * Disassociate a buffer from a vnode.
306 */
307static void
308brelvp(bp)
309 register struct buf *bp;
310{
311 struct vnode *vp;
312
313 if (bp->b_vp == (struct vnode *) 0)
314 panic("brelvp: NULL vp");
315 /*
316 * Delete from old vnode list, if on one.
317 */
318 if (bp->b_vnbufs.le_next != NOLIST)
319 bufremvn(bp);
320 vp = bp->b_vp;
321 bp->b_vp = (struct vnode *) 0;
322 HOLDRELE(vp);
323}
324
325/*
326 * Reassign a buffer from one vnode to another.
327 * Used to assign file specific control information
328 * (indirect blocks) to the vnode to which they belong.
329 */
330void
331reassignbuf(bp, newvp)
332 register struct buf *bp;
333 register struct vnode *newvp;
334{
335 register struct buflists *listheadp;
336
337 if (newvp == NULL) {
338 printf("reassignbuf: NULL");
339 return;
340 }
341 /*
342 * Delete from old vnode list, if on one.
343 */
344 if (bp->b_vnbufs.le_next != NOLIST)
345 bufremvn(bp);
346 /*
347 * If dirty, put on list of dirty buffers;
348 * otherwise insert onto list of clean buffers.
349 */
350 if (ISSET(bp->b_flags, B_DELWRI))
351 listheadp = &newvp->v_dirtyblkhd;
352 else
353 listheadp = &newvp->v_cleanblkhd;
354 bufinsvn(bp, listheadp);
355}
356
765c9de3
A
357static __inline__ void
358bufhdrinit(struct buf *bp)
359{
360 bzero((char *)bp, sizeof *bp);
361 bp->b_dev = NODEV;
362 bp->b_rcred = NOCRED;
363 bp->b_wcred = NOCRED;
364 bp->b_vnbufs.le_next = NOLIST;
365 bp->b_flags = B_INVAL;
366
367 return;
368}
369
1c79356b
A
370/*
371 * Initialize buffers and hash links for buffers.
372 */
9bccf70c 373__private_extern__ void
1c79356b
A
374bufinit()
375{
376 register struct buf *bp;
377 register struct bqueues *dp;
378 register int i;
379 int metabuf;
380 long whichq;
1c79356b 381 static void bufzoneinit();
765c9de3 382 static void bcleanbuf_thread_init();
1c79356b
A
383
384 /* Initialize the buffer queues ('freelists') and the hash table */
385 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
386 TAILQ_INIT(dp);
387 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
388
389 simple_lock_init(&bufhashlist_slock );
390
391 metabuf = nbuf/8; /* reserved for meta buf */
392
393 /* Initialize the buffer headers */
394 for (i = 0; i < nbuf; i++) {
395 bp = &buf[i];
765c9de3
A
396 bufhdrinit(bp);
397
1c79356b
A
398 /*
399 * metabuf buffer headers on the meta-data list and
400 * rest of the buffer headers on the empty list
401 */
765c9de3 402 if (--metabuf)
1c79356b
A
403 whichq = BQ_META;
404 else
405 whichq = BQ_EMPTY;
406
407 BLISTNONE(bp);
408 dp = &bufqueues[whichq];
409 binsheadfree(bp, dp, whichq);
410 binshash(bp, &invalhash);
411 }
412
413 for (; i < nbuf + niobuf; i++) {
414 bp = &buf[i];
765c9de3 415 bufhdrinit(bp);
1c79356b
A
416 binsheadfree(bp, &iobufqueue, -1);
417 }
418
419 printf("using %d buffer headers and %d cluster IO buffer headers\n",
420 nbuf, niobuf);
421
765c9de3 422 /* Set up zones used by the buffer cache */
1c79356b 423 bufzoneinit();
1c79356b 424
765c9de3
A
425 /* start the bcleanbuf() thread */
426 bcleanbuf_thread_init();
427
428#if 0 /* notyet */
9bccf70c
A
429 {
430 static void bufq_balance_thread_init();
1c79356b
A
431 /* create a thread to do dynamic buffer queue balancing */
432 bufq_balance_thread_init();
9bccf70c
A
433 }
434#endif /* notyet */
1c79356b
A
435}
436
9bccf70c 437static struct buf *
1c79356b
A
438bio_doread(vp, blkno, size, cred, async, queuetype)
439 struct vnode *vp;
440 daddr_t blkno;
441 int size;
442 struct ucred *cred;
443 int async;
444 int queuetype;
445{
446 register struct buf *bp;
447 struct proc *p = current_proc();
448
449 bp = getblk(vp, blkno, size, 0, 0, queuetype);
450
451 /*
452 * If buffer does not have data valid, start a read.
453 * Note that if buffer is B_INVAL, getblk() won't return it.
454 * Therefore, it's valid if it's I/O has completed or been delayed.
455 */
456 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
457 /* Start I/O for the buffer (keeping credentials). */
458 SET(bp->b_flags, B_READ | async);
459 if (cred != NOCRED && bp->b_rcred == NOCRED) {
0b4e3aa0
A
460 /*
461 * NFS has embedded ucred.
462 * Can not crhold() here as that causes zone corruption
463 */
464 bp->b_rcred = crdup(cred);
1c79356b 465 }
b4c24cb9 466
1c79356b
A
467 VOP_STRATEGY(bp);
468
469 trace(TR_BREADMISS, pack(vp, size), blkno);
470
471 /* Pay for the read. */
472 if (p && p->p_stats)
473 p->p_stats->p_ru.ru_inblock++; /* XXX */
474 } else if (async) {
475 brelse(bp);
476 }
477
478 trace(TR_BREADHIT, pack(vp, size), blkno);
479
480 return (bp);
481}
482/*
483 * Read a disk block.
484 * This algorithm described in Bach (p.54).
485 */
486int
487bread(vp, blkno, size, cred, bpp)
488 struct vnode *vp;
489 daddr_t blkno;
490 int size;
491 struct ucred *cred;
492 struct buf **bpp;
493{
494 register struct buf *bp;
495
496 /* Get buffer for block. */
497 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
498
499 /* Wait for the read to complete, and return result. */
500 return (biowait(bp));
501}
502
503/*
504 * Read a disk block. [bread() for meta-data]
505 * This algorithm described in Bach (p.54).
506 */
507int
508meta_bread(vp, blkno, size, cred, bpp)
509 struct vnode *vp;
510 daddr_t blkno;
511 int size;
512 struct ucred *cred;
513 struct buf **bpp;
514{
515 register struct buf *bp;
516
517 /* Get buffer for block. */
518 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
519
520 /* Wait for the read to complete, and return result. */
521 return (biowait(bp));
522}
523
524/*
525 * Read-ahead multiple disk blocks. The first is sync, the rest async.
1c79356b
A
526 */
527int
528breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
529 struct vnode *vp;
530 daddr_t blkno; int size;
531 daddr_t rablks[]; int rasizes[];
532 int nrablks;
533 struct ucred *cred;
534 struct buf **bpp;
55e303ae
A
535{
536 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
537}
538
539/*
540 * Read-ahead multiple disk blocks. The first is sync, the rest async.
541 * [breadn() for meta-data]
542 */
543int
544meta_breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
545 struct vnode *vp;
546 daddr_t blkno; int size;
547 daddr_t rablks[]; int rasizes[];
548 int nrablks;
549 struct ucred *cred;
550 struct buf **bpp;
551{
552 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
553}
554
555/*
556 * Perform the reads for breadn() and meta_breadn().
557 * Trivial modification to the breada algorithm presented in Bach (p.55).
558 */
559static int
560do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, int *rasizes,
561 int nrablks, struct ucred *cred, struct buf **bpp, int queuetype)
1c79356b
A
562{
563 register struct buf *bp;
564 int i;
565
55e303ae 566 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
1c79356b
A
567
568 /*
569 * For each of the read-ahead blocks, start a read, if necessary.
570 */
571 for (i = 0; i < nrablks; i++) {
572 /* If it's in the cache, just go on to next one. */
573 if (incore(vp, rablks[i]))
574 continue;
575
576 /* Get a buffer for the read-ahead block */
55e303ae 577 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
1c79356b
A
578 }
579
580 /* Otherwise, we had to start a read for it; wait until it's valid. */
581 return (biowait(bp));
582}
583
584/*
585 * Read with single-block read-ahead. Defined in Bach (p.55), but
586 * implemented as a call to breadn().
587 * XXX for compatibility with old file systems.
588 */
589int
590breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
591 struct vnode *vp;
592 daddr_t blkno; int size;
593 daddr_t rablkno; int rabsize;
594 struct ucred *cred;
595 struct buf **bpp;
596{
597
598 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
599}
600
601/*
602 * Block write. Described in Bach (p.56)
603 */
604int
605bwrite(bp)
606 struct buf *bp;
607{
608 int rv, sync, wasdelayed;
609 struct proc *p = current_proc();
1c79356b
A
610 struct vnode *vp = bp->b_vp;
611
55e303ae
A
612 if (bp->b_data == 0) {
613 if (brecover_data(bp) == 0)
614 return (0);
615 }
1c79356b
A
616 /* Remember buffer type, to switch on it later. */
617 sync = !ISSET(bp->b_flags, B_ASYNC);
618 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
619 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
9bccf70c 620 if (wasdelayed) {
d52fe63f 621 nbdwrite--;
9bccf70c
A
622 wakeup((caddr_t)&nbdwrite);
623 }
1c79356b
A
624
625 if (!sync) {
626 /*
627 * If not synchronous, pay for the I/O operation and make
628 * sure the buf is on the correct vnode queue. We have
629 * to do this now, because if we don't, the vnode may not
630 * be properly notified that its I/O has completed.
631 */
632 if (wasdelayed)
633 reassignbuf(bp, vp);
634 else
635 if (p && p->p_stats)
636 p->p_stats->p_ru.ru_oublock++; /* XXX */
637 }
638
d52fe63f 639 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
1c79356b
A
640
641 /* Initiate disk write. Make sure the appropriate party is charged. */
642 SET(bp->b_flags, B_WRITEINPROG);
643 vp->v_numoutput++;
644
645 VOP_STRATEGY(bp);
646
647 if (sync) {
648 /*
649 * If I/O was synchronous, wait for it to complete.
650 */
651 rv = biowait(bp);
652
653 /*
654 * Pay for the I/O operation, if it's not been paid for, and
655 * make sure it's on the correct vnode queue. (async operatings
656 * were payed for above.)
657 */
658 if (wasdelayed)
659 reassignbuf(bp, vp);
660 else
661 if (p && p->p_stats)
662 p->p_stats->p_ru.ru_oublock++; /* XXX */
663
664 /* Release the buffer. */
b4c24cb9
A
665 // XXXdbg - only if the unused bit is set
666 if (!ISSET(bp->b_flags, B_NORELSE)) {
667 brelse(bp);
668 } else {
669 CLR(bp->b_flags, B_NORELSE);
670 }
1c79356b
A
671
672 return (rv);
673 } else {
674 return (0);
675 }
676}
677
678int
679vn_bwrite(ap)
680 struct vop_bwrite_args *ap;
681{
682 return (bwrite(ap->a_bp));
683}
684
685/*
686 * Delayed write.
687 *
688 * The buffer is marked dirty, but is not queued for I/O.
689 * This routine should be used when the buffer is expected
690 * to be modified again soon, typically a small write that
691 * partially fills a buffer.
692 *
693 * NB: magnetic tapes cannot be delayed; they must be
694 * written in the order that the writes are requested.
695 *
696 * Described in Leffler, et al. (pp. 208-213).
d52fe63f
A
697 *
698 * Note: With the abilitty to allocate additional buffer
699 * headers, we can get in to the situation where "too" many
700 * bdwrite()s can create situation where the kernel can create
701 * buffers faster than the disks can service. Doing a bawrite() in
702 * cases were we have "too many" outstanding bdwrite()s avoids that.
1c79356b 703 */
9bccf70c
A
704__private_extern__ int
705bdwrite_internal(bp, return_error)
1c79356b 706 struct buf *bp;
9bccf70c 707 int return_error;
1c79356b
A
708{
709 struct proc *p = current_proc();
d52fe63f 710 struct vnode *vp = bp->b_vp;
1c79356b
A
711
712 /*
713 * If the block hasn't been seen before:
714 * (1) Mark it as having been seen,
715 * (2) Charge for the write.
716 * (3) Make sure it's on its vnode's correct block list,
717 */
718 if (!ISSET(bp->b_flags, B_DELWRI)) {
719 SET(bp->b_flags, B_DELWRI);
720 if (p && p->p_stats)
721 p->p_stats->p_ru.ru_oublock++; /* XXX */
d52fe63f
A
722 nbdwrite ++;
723 reassignbuf(bp, vp);
1c79356b
A
724 }
725
1c79356b
A
726 /* If this is a tape block, write it the block now. */
727 if (ISSET(bp->b_flags, B_TAPE)) {
728 /* bwrite(bp); */
9bccf70c
A
729 VOP_BWRITE(bp);
730 return (0);
1c79356b
A
731 }
732
d52fe63f
A
733 /*
734 * If the vnode has "too many" write operations in progress
735 * wait for them to finish the IO
736 */
737 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
738 vp->v_flag |= VTHROTTLED;
739 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
740 }
741
742 /*
743 * If we have too many delayed write buffers,
744 * more than we can "safely" handle, just fall back to
745 * doing the async write
746 */
747 if (nbdwrite < 0)
748 panic("bdwrite: Negative nbdwrite");
749
b4c24cb9
A
750 // can't do a bawrite() if the LOCKED bit is set because the
751 // buffer is part of a transaction and can't go to disk until
752 // the LOCKED bit is cleared.
753 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
9bccf70c
A
754 if (return_error)
755 return (EAGAIN);
756 else
757 bawrite(bp);
758 return (0);
d52fe63f
A
759 }
760
1c79356b
A
761 /* Otherwise, the "write" is done, so mark and release the buffer. */
762 SET(bp->b_flags, B_DONE);
763 brelse(bp);
9bccf70c 764 return (0);
1c79356b
A
765}
766
9bccf70c
A
767void
768bdwrite(bp)
769 struct buf *bp;
770{
771 (void) bdwrite_internal(bp, 0);
772}
773
774
1c79356b
A
775/*
776 * Asynchronous block write; just an asynchronous bwrite().
d52fe63f
A
777 *
778 * Note: With the abilitty to allocate additional buffer
779 * headers, we can get in to the situation where "too" many
780 * bawrite()s can create situation where the kernel can create
781 * buffers faster than the disks can service.
782 * We limit the number of "in flight" writes a vnode can have to
783 * avoid this.
1c79356b 784 */
9bccf70c
A
785static int
786bawrite_internal(bp, throttle)
1c79356b 787 struct buf *bp;
9bccf70c 788 int throttle;
1c79356b 789{
d52fe63f
A
790 struct vnode *vp = bp->b_vp;
791
792 if (vp) {
793 /*
794 * If the vnode has "too many" write operations in progress
795 * wait for them to finish the IO
796 */
797 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
9bccf70c
A
798 if (throttle) {
799 vp->v_flag |= VTHROTTLED;
800 (void)tsleep((caddr_t)&vp->v_numoutput,
801 PRIBIO + 1, "bawrite", 0);
802 } else
803 return (EWOULDBLOCK);
d52fe63f
A
804 }
805 }
1c79356b
A
806
807 SET(bp->b_flags, B_ASYNC);
808 VOP_BWRITE(bp);
9bccf70c
A
809 return (0);
810}
811
812void
813bawrite(bp)
814 struct buf *bp;
815{
816 (void) bawrite_internal(bp, 1);
817}
818
819/*
820 * bwillwrite:
821 *
822 * Called prior to the locking of any vnodes when we are expecting to
823 * write. We do not want to starve the buffer cache with too many
824 * dirty buffers so we block here. By blocking prior to the locking
825 * of any vnodes we attempt to avoid the situation where a locked vnode
826 * prevents the various system daemons from flushing related buffers.
827 */
828
829void
830bwillwrite(void)
831{
832 /* XXX To be implemented later */
1c79356b
A
833}
834
835/*
836 * Release a buffer on to the free lists.
837 * Described in Bach (p. 46).
838 */
839void
840brelse(bp)
841 struct buf *bp;
842{
843 struct bqueues *bufq;
844 int s;
845 long whichq;
846
847 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
fa4905b1
A
848 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
849 bp->b_flags, 0);
1c79356b
A
850
851 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
852
b4c24cb9
A
853 // if we're invalidating a buffer that has the B_CALL bit
854 // set then call the b_iodone function so it gets cleaned
855 // up properly.
856 //
857 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
858 if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
859 panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
860 }
861 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
862 void (*iodone_func)(struct buf *) = bp->b_iodone;
863
864 CLR(bp->b_flags, B_CALL); /* but note callout done */
865 bp->b_iodone = NULL;
866
867 if (iodone_func == NULL) {
868 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
869 }
870 (*iodone_func)(bp);
871 }
872 }
873
1c79356b
A
874 /* IO is done. Cleanup the UPL state */
875 if (!ISSET(bp->b_flags, B_META)
876 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
877 kern_return_t kret;
878 upl_t upl;
1c79356b
A
879 int upl_flags;
880
881 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
882 if ( !ISSET(bp->b_flags, B_INVAL)) {
0b4e3aa0
A
883 kret = ubc_create_upl(bp->b_vp,
884 ubc_blktooff(bp->b_vp, bp->b_lblkno),
885 bp->b_bufsize,
886 &upl,
887 NULL,
888 UPL_PRECIOUS);
1c79356b
A
889 if (kret != KERN_SUCCESS)
890 panic("brelse: Failed to get pagelists");
891#ifdef UBC_DEBUG
892 upl_ubc_alias_set(upl, bp, 5);
893#endif /* UBC_DEBUG */
894 } else
0b4e3aa0 895 upl = (upl_t) 0;
1c79356b 896 } else {
0b4e3aa0 897 upl = bp->b_pagelist;
1c79356b 898
55e303ae
A
899 if (bp->b_data) {
900 kret = ubc_upl_unmap(upl);
901
902 if (kret != KERN_SUCCESS)
903 panic("kernel_upl_unmap failed");
904 bp->b_data = 0;
905 }
1c79356b
A
906 }
907 if (upl) {
1c79356b 908 if (bp->b_flags & (B_ERROR | B_INVAL)) {
0b4e3aa0 909 if (bp->b_flags & (B_READ | B_INVAL))
1c79356b
A
910 upl_flags = UPL_ABORT_DUMP_PAGES;
911 else
912 upl_flags = 0;
0b4e3aa0 913 ubc_upl_abort(upl, upl_flags);
1c79356b 914 } else {
fa4905b1
A
915 if (ISSET(bp->b_flags, B_NEEDCOMMIT))
916 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
917 else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
0b4e3aa0 918 upl_flags = UPL_COMMIT_SET_DIRTY ;
55e303ae 919 else
0b4e3aa0
A
920 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
921 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
922 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1c79356b
A
923 }
924 s = splbio();
925 CLR(bp->b_flags, B_PAGELIST);
926 bp->b_pagelist = 0;
927 splx(s);
928 }
929 } else {
930 if(ISSET(bp->b_flags, B_PAGELIST))
931 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
932 }
933
934 /* Wake up any processes waiting for any buffer to become free. */
935 if (needbuffer) {
936 needbuffer = 0;
937 wakeup(&needbuffer);
938 }
939
940 /* Wake up any proceeses waiting for _this_ buffer to become free. */
941 if (ISSET(bp->b_flags, B_WANTED)) {
942 CLR(bp->b_flags, B_WANTED);
943 wakeup(bp);
944 }
945
946 /* Block disk interrupts. */
947 s = splbio();
948
949 /*
950 * Determine which queue the buffer should be on, then put it there.
951 */
952
953 /* If it's locked, don't report an error; try again later. */
954 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
955 CLR(bp->b_flags, B_ERROR);
956
957 /* If it's not cacheable, or an error, mark it invalid. */
958 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
959 SET(bp->b_flags, B_INVAL);
960
961 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
962 /*
963 * If it's invalid or empty, dissociate it from its vnode
964 * and put on the head of the appropriate queue.
965 */
966 if (bp->b_vp)
967 brelvp(bp);
d52fe63f
A
968 if (ISSET(bp->b_flags, B_DELWRI)) {
969 CLR(bp->b_flags, B_DELWRI);
970 nbdwrite--;
9bccf70c 971 wakeup((caddr_t)&nbdwrite);
d52fe63f 972 }
1c79356b
A
973 if (bp->b_bufsize <= 0)
974 whichq = BQ_EMPTY; /* no data */
9bccf70c
A
975 else if (ISSET(bp->b_flags, B_META))
976 whichq = BQ_META; /* meta-data */
1c79356b
A
977 else
978 whichq = BQ_AGE; /* invalid data */
979
980 bufq = &bufqueues[whichq];
981 binsheadfree(bp, bufq, whichq);
982 } else {
983 /*
984 * It has valid data. Put it on the end of the appropriate
985 * queue, so that it'll stick around for as long as possible.
986 */
987 if (ISSET(bp->b_flags, B_LOCKED))
988 whichq = BQ_LOCKED; /* locked in core */
989 else if (ISSET(bp->b_flags, B_META))
990 whichq = BQ_META; /* meta-data */
991 else if (ISSET(bp->b_flags, B_AGE))
992 whichq = BQ_AGE; /* stale but valid data */
993 else
994 whichq = BQ_LRU; /* valid data */
995
996 bufq = &bufqueues[whichq];
997 binstailfree(bp, bufq, whichq);
998 }
999
1000 /* Unlock the buffer. */
1001 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
1002
1003 /* Allow disk interrupts. */
1004 splx(s);
1005
1006 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
fa4905b1 1007 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1c79356b
A
1008}
1009
1010/*
1011 * Determine if a block is in the cache.
1012 * Just look on what would be its hash chain. If it's there, return
1013 * a pointer to it, unless it's marked invalid. If it's marked invalid,
1014 * we normally don't return the buffer, unless the caller explicitly
1015 * wants us to.
1016 */
1017struct buf *
1018incore(vp, blkno)
1019 struct vnode *vp;
1020 daddr_t blkno;
1021{
1022 struct buf *bp;
1c79356b
A
1023
1024 bp = BUFHASH(vp, blkno)->lh_first;
1025
1026 /* Search hash chain */
9bccf70c 1027 for (; bp != NULL; bp = bp->b_hash.le_next) {
1c79356b
A
1028 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
1029 !ISSET(bp->b_flags, B_INVAL))
1030 return (bp);
1c79356b
A
1031 }
1032
1033 return (0);
1034}
1035
fa4905b1
A
1036
1037/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
1c79356b
A
1038/*
1039 * Get a block of requested size that is associated with
1040 * a given vnode and block offset. If it is found in the
1041 * block cache, mark it as having been found, make it busy
1042 * and return it. Otherwise, return an empty block of the
1043 * correct size. It is up to the caller to insure that the
1044 * cached blocks be of the correct size.
1045 */
1046struct buf *
1047getblk(vp, blkno, size, slpflag, slptimeo, operation)
1048 register struct vnode *vp;
1049 daddr_t blkno;
1050 int size, slpflag, slptimeo, operation;
1051{
1052 struct buf *bp;
1053 int s, err;
1054 upl_t upl;
1055 upl_page_info_t *pl;
1c79356b 1056 kern_return_t kret;
1c79356b
A
1057 int error=0;
1058 int pagedirty = 0;
1059
1c79356b
A
1060 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
1061 blkno * PAGE_SIZE, size, operation, 0, 0);
0b4e3aa0 1062start:
1c79356b
A
1063
1064 s = splbio();
9bccf70c 1065 if ((bp = incore(vp, blkno))) {
1c79356b
A
1066 /* Found in the Buffer Cache */
1067 if (ISSET(bp->b_flags, B_BUSY)) {
1068 /* but is busy */
1069 switch (operation) {
1070 case BLK_READ:
1071 case BLK_WRITE:
1072 case BLK_META:
1073 SET(bp->b_flags, B_WANTED);
1074 bufstats.bufs_busyincore++;
1075 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
1076 slptimeo);
1077 splx(s);
1078 /*
1079 * Callers who call with PCATCH or timeout are
1080 * willing to deal with the NULL pointer
1081 */
1082 if (err && ((slpflag & PCATCH) ||
1083 ((err == EWOULDBLOCK) && slptimeo)))
1084 return (NULL);
1085 goto start;
1086 /*NOTREACHED*/
1087 break;
1088
1089 case BLK_PAGEIN:
1090 /* pagein operation must not use getblk */
1091 panic("getblk: pagein for incore busy buffer");
1092 splx(s);
1093 /*NOTREACHED*/
1094 break;
1095
1096 case BLK_PAGEOUT:
1097 /* pageout operation must not use getblk */
1098 panic("getblk: pageout for incore busy buffer");
1099 splx(s);
1100 /*NOTREACHED*/
1101 break;
1102
1103 default:
1104 panic("getblk: %d unknown operation 1", operation);
1105 /*NOTREACHED*/
1106 break;
1107 }
1108 } else {
1109 /* not busy */
1110 SET(bp->b_flags, (B_BUSY | B_CACHE));
1111 bremfree(bp);
1112 bufstats.bufs_incore++;
1113 splx(s);
1114
1115 allocbuf(bp, size);
1116 if (ISSET(bp->b_flags, B_PAGELIST))
1117 panic("pagelist buffer is not busy");
1118
1119 switch (operation) {
1120 case BLK_READ:
1121 case BLK_WRITE:
1122 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
0b4e3aa0
A
1123 kret = ubc_create_upl(vp,
1124 ubc_blktooff(vp, bp->b_lblkno),
1125 bp->b_bufsize,
1126 &upl,
1127 &pl,
1128 UPL_PRECIOUS);
1c79356b
A
1129 if (kret != KERN_SUCCESS)
1130 panic("Failed to get pagelists");
1131
1132 SET(bp->b_flags, B_PAGELIST);
1133 bp->b_pagelist = upl;
1134
fa4905b1
A
1135 if (!upl_valid_page(pl, 0)) {
1136 if (vp->v_tag != VT_NFS)
1137 panic("getblk: incore buffer without valid page");
1138 CLR(bp->b_flags, B_CACHE);
1139 }
1c79356b
A
1140
1141 if (upl_dirty_page(pl, 0))
1142 SET(bp->b_flags, B_WASDIRTY);
1143 else
1144 CLR(bp->b_flags, B_WASDIRTY);
1145
0b4e3aa0 1146 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
9bccf70c 1147 if (kret != KERN_SUCCESS)
0b4e3aa0
A
1148 panic("getblk: ubc_upl_map() failed with (%d)",
1149 kret);
9bccf70c
A
1150 if (bp->b_data == 0)
1151 panic("ubc_upl_map mapped 0");
1c79356b
A
1152 }
1153 break;
1154
1155 case BLK_META:
1156 /*
1157 * VM is not involved in IO for the meta data
1158 * buffer already has valid data
1159 */
9bccf70c 1160 if(bp->b_data == 0)
1c79356b
A
1161 panic("bp->b_data null incore buf=%x", bp);
1162 break;
1163
1164 case BLK_PAGEIN:
1165 case BLK_PAGEOUT:
1166 panic("getblk: paging operation 1");
1167 break;
1168
1169 default:
1170 panic("getblk: %d unknown operation 2", operation);
1171 /*NOTREACHED*/
1172 break;
1173 }
1174 }
1175 } else { /* not incore() */
1176 int queue = BQ_EMPTY; /* Start with no preference */
1177 splx(s);
1178
1179 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
1180 !(UBCINFOEXISTS(vp))) {
1181 operation = BLK_META;
1182 }
1183 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
1184 goto start;
0b4e3aa0
A
1185 if (incore(vp, blkno)) {
1186 SET(bp->b_flags, B_INVAL);
1187 binshash(bp, &invalhash);
1188 brelse(bp);
1189 goto start;
1190 }
b4c24cb9
A
1191 /*
1192 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
1193 * CALLED! BE CAREFUL.
1194 */
0b4e3aa0 1195
1c79356b
A
1196 /*
1197 * if it is meta, the queue may be set to other
1198 * type so reset as well as mark it to be B_META
1199 * so that when buffer is released it will goto META queue
1200 * Also, if the vnode is not VREG, then it is META
1201 */
1202 if (operation == BLK_META) {
1203 SET(bp->b_flags, B_META);
1204 queue = BQ_META;
1205 }
9bccf70c
A
1206
1207 bp->b_blkno = bp->b_lblkno = blkno;
1208 bp->b_vp = vp;
1209
0b4e3aa0
A
1210 /*
1211 * Insert in the hash so that incore() can find it
1212 */
1213 binshash(bp, BUFHASH(vp, blkno));
1214
9bccf70c
A
1215 s = splbio();
1216 bgetvp(vp, bp);
1217 splx(s);
1218
1c79356b
A
1219 allocbuf(bp, size);
1220
1221 switch (operation) {
1222 case BLK_META:
1223 /* buffer data is invalid */
1224
1c79356b
A
1225 if(bp->b_data == 0)
1226 panic("bp->b_data is null %x",bp);
1227
1c79356b 1228 bufstats.bufs_miss++;
1c79356b
A
1229
1230 /* wakeup the buffer */
1231 CLR(bp->b_flags, B_WANTED);
1232 wakeup(bp);
1233 break;
1234
1235 case BLK_READ:
1236 case BLK_WRITE:
1c79356b
A
1237
1238 if (ISSET(bp->b_flags, B_PAGELIST))
1239 panic("B_PAGELIST in bp=%x",bp);
1240
0b4e3aa0
A
1241 kret = ubc_create_upl(vp,
1242 ubc_blktooff(vp, blkno),
1243 bp->b_bufsize,
1244 &upl,
1245 &pl,
1246 UPL_PRECIOUS);
1c79356b
A
1247 if (kret != KERN_SUCCESS)
1248 panic("Failed to get pagelists");
1249
1250#ifdef UBC_DEBUG
1251 upl_ubc_alias_set(upl, bp, 4);
1252#endif /* UBC_DEBUG */
1c79356b
A
1253 bp->b_pagelist = upl;
1254
1255 SET(bp->b_flags, B_PAGELIST);
1c79356b
A
1256
1257 if (upl_valid_page(pl, 0)) {
1258 SET(bp->b_flags, B_CACHE | B_DONE);
1259 bufstats.bufs_vmhits++;
1260
1261 pagedirty = upl_dirty_page(pl, 0);
1262
1263 if (pagedirty)
1264 SET(bp->b_flags, B_WASDIRTY);
1265
1266 if (vp->v_tag == VT_NFS) {
1267 off_t f_offset;
1268 int valid_size;
1269
1270 bp->b_validoff = 0;
1271 bp->b_dirtyoff = 0;
1272
1273 f_offset = ubc_blktooff(vp, blkno);
1274
1275 if (f_offset > vp->v_ubcinfo->ui_size) {
1276 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1277 bp->b_validend = 0;
1278 bp->b_dirtyend = 0;
1279 } else {
1280 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1281 bp->b_validend = valid_size;
1282
1283 if (pagedirty)
1284 bp->b_dirtyend = valid_size;
1285 else
1286 bp->b_dirtyend = 0;
1287
1288 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1289 bp->b_validend, bp->b_dirtyend,
1290 (int)vp->v_ubcinfo->ui_size, 0, 0);
1291 }
1292 } else {
1293 bp->b_validoff = 0;
1294 bp->b_dirtyoff = 0;
1295
1296 if (pagedirty) {
1297 /* page is dirty */
1298 bp->b_validend = bp->b_bcount;
1299 bp->b_dirtyend = bp->b_bcount;
1300 } else {
1301 /* page is clean */
1302 bp->b_validend = bp->b_bcount;
1303 bp->b_dirtyend = 0;
1304 }
1305 }
9bccf70c
A
1306 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
1307 if(error) {
1308 panic("getblk: VOP_BMAP failed");
1c79356b
A
1309 /*NOTREACHED*/
1310 /*
1311 * XXX: We probably should invalidate the VM Page
1312 */
1313 bp->b_error = error;
1314 SET(bp->b_flags, (B_ERROR | B_INVAL));
1315 /* undo B_DONE that was set before upl_commit() */
1316 CLR(bp->b_flags, B_DONE);
1317 brelse(bp);
1318 return (0);
1319 }
1320 } else {
1321 bufstats.bufs_miss++;
1322 }
0b4e3aa0 1323 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1c79356b 1324 if (kret != KERN_SUCCESS) {
0b4e3aa0 1325 panic("getblk: ubc_upl_map() "
1c79356b
A
1326 "failed with (%d)", kret);
1327 }
9bccf70c
A
1328 if (bp->b_data == 0)
1329 panic("kernel_upl_map mapped 0");
1c79356b
A
1330
1331 break;
1332
1333 case BLK_PAGEIN:
1334 case BLK_PAGEOUT:
1335 panic("getblk: paging operation 2");
1336 break;
1337 default:
1338 panic("getblk: %d unknown operation 3", operation);
1339 /*NOTREACHED*/
1340 break;
1341 }
1342 }
1343
1344 if (bp->b_data == NULL)
1345 panic("getblk: bp->b_addr is null");
1346
1347 if (bp->b_bufsize & 0xfff) {
1c79356b 1348 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1c79356b
A
1349 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1350 }
1351
1352 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
fa4905b1 1353 (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1c79356b
A
1354
1355 return (bp);
1356}
1357
1358/*
1359 * Get an empty, disassociated buffer of given size.
1360 */
1361struct buf *
1362geteblk(size)
1363 int size;
1364{
1365 struct buf *bp;
1366 int queue = BQ_EMPTY;
1c79356b
A
1367
1368 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1369 ;
1c79356b 1370 SET(bp->b_flags, (B_META|B_INVAL));
1c79356b
A
1371
1372#if DIAGNOSTIC
1373 assert(queue == BQ_EMPTY);
1374#endif /* DIAGNOSTIC */
1375 /* XXX need to implement logic to deal with other queues */
1376
1c79356b
A
1377 binshash(bp, &invalhash);
1378 allocbuf(bp, size);
1379 bufstats.bufs_eblk++;
1380
1381 return (bp);
1382}
1383
1c79356b
A
1384/*
1385 * Zones for the meta data buffers
1386 */
1387
1388#define MINMETA 512
1389#define MAXMETA 4096
1390
1391struct meta_zone_entry {
1392 zone_t mz_zone;
1393 vm_size_t mz_size;
1394 vm_size_t mz_max;
1395 char *mz_name;
1396};
1397
1398struct meta_zone_entry meta_zones[] = {
1399 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1400 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1c79356b 1401 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1c79356b
A
1402 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1403 {NULL, 0, 0, "" } /* End */
1404};
765c9de3 1405
1c79356b
A
1406/*
1407 * Initialize the meta data zones
1408 */
1409static void
1410bufzoneinit(void)
1411{
1412 int i;
1413
1414 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1415 meta_zones[i].mz_zone =
1416 zinit(meta_zones[i].mz_size,
1417 meta_zones[i].mz_max,
1418 PAGE_SIZE,
1419 meta_zones[i].mz_name);
1420 }
765c9de3 1421 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1c79356b
A
1422}
1423
9bccf70c 1424static __inline__ zone_t
1c79356b
A
1425getbufzone(size_t size)
1426{
1427 int i;
1428
9bccf70c 1429 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
1c79356b
A
1430 panic("getbufzone: incorect size = %d", size);
1431
9bccf70c
A
1432 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1433 if (meta_zones[i].mz_size >= size)
1434 break;
1435 }
1436
1c79356b
A
1437 return (meta_zones[i].mz_zone);
1438}
1c79356b
A
1439
1440/*
1441 * With UBC, there is no need to expand / shrink the file data
1442 * buffer. The VM uses the same pages, hence no waste.
1443 * All the file data buffers can have one size.
1444 * In fact expand / shrink would be an expensive operation.
1445 *
1446 * Only exception to this is meta-data buffers. Most of the
1447 * meta data operations are smaller than PAGE_SIZE. Having the
1448 * meta-data buffers grow and shrink as needed, optimizes use
1449 * of the kernel wired memory.
1450 */
1451
1452int
1453allocbuf(bp, size)
1454 struct buf *bp;
1455 int size;
1456{
1457 vm_size_t desired_size;
1458
1459 desired_size = roundup(size, CLBYTES);
1460
1461 if(desired_size < PAGE_SIZE)
1462 desired_size = PAGE_SIZE;
1463 if (desired_size > MAXBSIZE)
1464 panic("allocbuf: buffer larger than MAXBSIZE requested");
1465
1c79356b
A
1466 if (ISSET(bp->b_flags, B_META)) {
1467 kern_return_t kret;
1468 zone_t zprev, z;
1469 size_t nsize = roundup(size, MINMETA);
1470
1471 if (bp->b_data) {
1472 vm_offset_t elem = (vm_offset_t)bp->b_data;
1473
1474 if (ISSET(bp->b_flags, B_ZALLOC))
1475 if (bp->b_bufsize <= MAXMETA) {
1476 if (bp->b_bufsize < nsize) {
1477 /* reallocate to a bigger size */
1c79356b
A
1478
1479 zprev = getbufzone(bp->b_bufsize);
55e303ae
A
1480 if (nsize <= MAXMETA) {
1481 desired_size = nsize;
1482 z = getbufzone(nsize);
1483 bp->b_data = (caddr_t)zalloc(z);
1484 if(bp->b_data == 0)
1485 panic("allocbuf: zalloc() returned NULL");
1486 } else {
1487 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1488 if (kret != KERN_SUCCESS)
1489 panic("allocbuf: kmem_alloc() 0 returned %d", kret);
1490 if(bp->b_data == 0)
1491 panic("allocbuf: null b_data 0");
1492 CLR(bp->b_flags, B_ZALLOC);
1493 }
1494 bcopy((const void *)elem, bp->b_data, bp->b_bufsize);
1c79356b
A
1495 zfree(zprev, elem);
1496 } else {
1497 desired_size = bp->b_bufsize;
1498 }
1499 } else
1500 panic("allocbuf: B_ZALLOC set incorrectly");
1501 else
1502 if (bp->b_bufsize < desired_size) {
1503 /* reallocate to a bigger size */
1504 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1505 if (kret != KERN_SUCCESS)
1506 panic("allocbuf: kmem_alloc() returned %d", kret);
1507 if(bp->b_data == 0)
1508 panic("allocbuf: null b_data");
55e303ae 1509 bcopy((const void *)elem, bp->b_data, bp->b_bufsize);
1c79356b
A
1510 kmem_free(kernel_map, elem, bp->b_bufsize);
1511 } else {
1512 desired_size = bp->b_bufsize;
1513 }
1514 } else {
1515 /* new allocation */
1516 if (nsize <= MAXMETA) {
1517 desired_size = nsize;
1518 z = getbufzone(nsize);
1519 bp->b_data = (caddr_t)zalloc(z);
1520 if(bp->b_data == 0)
1521 panic("allocbuf: zalloc() returned NULL 2");
1522 SET(bp->b_flags, B_ZALLOC);
1523 } else {
1524 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1525 if (kret != KERN_SUCCESS)
1526 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1527 if(bp->b_data == 0)
1528 panic("allocbuf: null b_data 2");
1529 }
1530 }
1531 }
1532
1533 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
b4c24cb9 1534 panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);
1c79356b 1535
9bccf70c
A
1536 bp->b_bufsize = desired_size;
1537 bp->b_bcount = size;
1538 return (0);
1c79356b
A
1539}
1540
1541/*
1542 * Get a new buffer from one of the free lists.
1543 *
1544 * Request for a queue is passes in. The queue from which the buffer was taken
1545 * from is returned. Out of range queue requests get BQ_EMPTY. Request for
1546 * BQUEUE means no preference. Use heuristics in that case.
1547 * Heuristics is as follows:
1548 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1549 * If none available block till one is made available.
1550 * If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
1551 * Pick the most stale buffer.
1552 * If found buffer was marked delayed write, start the async. write
1553 * and restart the search.
1554 * Initialize the fields and disassociate the buffer from the vnode.
1555 * Remove the buffer from the hash. Return the buffer and the queue
1556 * on which it was found.
1557 */
1558
1559static struct buf *
1560getnewbuf(slpflag, slptimeo, queue)
1561 int slpflag, slptimeo;
1562 int *queue;
1563{
1564 register struct buf *bp;
1565 register struct buf *lru_bp;
1566 register struct buf *age_bp;
1567 register struct buf *meta_bp;
1568 register int age_time, lru_time, bp_time, meta_time;
1569 int s;
1c79356b
A
1570 int req = *queue; /* save it for restarts */
1571
1572start:
1573 s = splbio();
1574
1575 /* invalid request gets empty queue */
765c9de3
A
1576 if ((*queue > BQUEUES) || (*queue < 0)
1577 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1c79356b
A
1578 *queue = BQ_EMPTY;
1579
1580 /* (*queue == BQUEUES) means no preference */
1581 if (*queue != BQUEUES) {
1582 /* Try for the requested queue first */
1583 bp = bufqueues[*queue].tqh_first;
1584 if (bp)
1585 goto found;
1586 }
1587
1588 /* Unable to use requested queue */
1589 age_bp = bufqueues[BQ_AGE].tqh_first;
1590 lru_bp = bufqueues[BQ_LRU].tqh_first;
1591 meta_bp = bufqueues[BQ_META].tqh_first;
1592
9bccf70c
A
1593 if (!age_bp && !lru_bp && !meta_bp) {
1594 /*
1595 * Unavailble on AGE or LRU or META queues
1596 * Try the empty list first
1597 */
1c79356b
A
1598 bp = bufqueues[BQ_EMPTY].tqh_first;
1599 if (bp) {
1600 *queue = BQ_EMPTY;
1601 goto found;
1602 }
765c9de3
A
1603
1604 /* Create a new temparory buffer header */
1605 bp = (struct buf *)zalloc(buf_hdr_zone);
1606
1607 if (bp) {
1608 bufhdrinit(bp);
1609 BLISTNONE(bp);
1610 binshash(bp, &invalhash);
1611 SET(bp->b_flags, B_HDRALLOC);
1612 *queue = BQ_EMPTY;
1613 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1614 buf_hdr_count++;
1615 goto found;
1616 }
1617
1c79356b
A
1618 /* Log this error condition */
1619 printf("getnewbuf: No useful buffers");
765c9de3 1620
1c79356b
A
1621 /* wait for a free buffer of any kind */
1622 needbuffer = 1;
1623 bufstats.bufs_sleeps++;
1624 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1625 splx(s);
1626 return (0);
1627 }
1628
1629 /* Buffer available either on AGE or LRU or META */
1630 bp = NULL;
1631 *queue = -1;
1632
1633 /* Buffer available either on AGE or LRU */
1634 if (!age_bp) {
1635 bp = lru_bp;
1636 *queue = BQ_LRU;
1637 } else if (!lru_bp) {
1638 bp = age_bp;
1639 *queue = BQ_AGE;
1640 } else { /* buffer available on both AGE and LRU */
1641 age_time = time.tv_sec - age_bp->b_timestamp;
1642 lru_time = time.tv_sec - lru_bp->b_timestamp;
1643 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1644 bp = age_bp;
1645 *queue = BQ_AGE;
1646 /*
1647 * we should probably re-timestamp eveything in the
1648 * queues at this point with the current time
1649 */
1650 } else {
1651 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1652 bp = lru_bp;
1653 *queue = BQ_LRU;
1654 } else {
1655 bp = age_bp;
1656 *queue = BQ_AGE;
1657 }
1658 }
1659 }
1660
1661 if (!bp) { /* Neither on AGE nor on LRU */
1662 bp = meta_bp;
1663 *queue = BQ_META;
1664 } else if (meta_bp) {
1665 bp_time = time.tv_sec - bp->b_timestamp;
1666 meta_time = time.tv_sec - meta_bp->b_timestamp;
1667
1668 if (!(bp_time < 0) && !(meta_time < 0)) {
1669 /* time not set backwards */
1670 int bp_is_stale;
1671 bp_is_stale = (*queue == BQ_LRU) ?
1672 lru_is_stale : age_is_stale;
1673
1674 if ((meta_time >= meta_is_stale) &&
1675 (bp_time < bp_is_stale)) {
1676 bp = meta_bp;
1677 *queue = BQ_META;
1678 }
1679 }
1680 }
1681
1682 if (bp == NULL)
1683 panic("getnewbuf: null bp");
1684
1685found:
b4c24cb9
A
1686 if (ISSET(bp->b_flags, B_LOCKED)) {
1687 panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
1688 }
1689
1c79356b 1690 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
b4c24cb9 1691 panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);
1c79356b
A
1692
1693 if(ISSET(bp->b_flags, B_BUSY))
b4c24cb9 1694 panic("getnewbuf reusing BUSY buf @ 0x%x", bp);
1c79356b
A
1695
1696 /* Clean it */
1697 if (bcleanbuf(bp)) {
1698 /* bawrite() issued, buffer not ready */
1699 splx(s);
1700 *queue = req;
1701 goto start;
1702 }
1703 splx(s);
1704 return (bp);
1705}
9bccf70c 1706
1c79356b
A
1707#include <mach/mach_types.h>
1708#include <mach/memory_object_types.h>
9bccf70c 1709#include <kern/sched_prim.h>
1c79356b
A
1710
1711/*
1712 * Clean a buffer.
1713 * Returns 0 is buffer is ready to use,
1714 * Returns 1 if issued a bawrite() to indicate
1715 * that the buffer is not ready.
1716 */
9bccf70c 1717static int
1c79356b
A
1718bcleanbuf(struct buf *bp)
1719{
1720 int s;
1721 struct ucred *cred;
d52fe63f 1722 int hdralloc = 0;
1c79356b
A
1723
1724 s = splbio();
1725
1726 /* Remove from the queue */
1727 bremfree(bp);
1728
1729 /* Buffer is no longer on free lists. */
1730 SET(bp->b_flags, B_BUSY);
1731
d52fe63f
A
1732 /* Check whether the buffer header was "allocated" */
1733 if (ISSET(bp->b_flags, B_HDRALLOC))
1734 hdralloc = 1;
1735
1c79356b
A
1736 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1737 panic("bcleanbuf: le_prev is deadbeef");
1738
765c9de3
A
1739 /*
1740 * If buffer was a delayed write, start the IO by queuing
1741 * it on the LAUNDRY queue, and return 1
1742 */
1c79356b
A
1743 if (ISSET(bp->b_flags, B_DELWRI)) {
1744 splx(s);
765c9de3
A
1745 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1746 blaundrycnt++;
1747 wakeup(&blaundrycnt);
9bccf70c
A
1748 /* and give it a chance to run */
1749 (void)thread_block(THREAD_CONTINUE_NULL);
1c79356b
A
1750 return (1);
1751 }
1752
1753 if (bp->b_vp)
1754 brelvp(bp);
1755 bremhash(bp);
1756 BLISTNONE(bp);
1757
1758 splx(s);
1759
1760 if (ISSET(bp->b_flags, B_META)) {
1c79356b
A
1761 vm_offset_t elem = (vm_offset_t)bp->b_data;
1762 if (elem == 0)
1763 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1764
1765 if (ISSET(bp->b_flags, B_ZALLOC)) {
1766 if (bp->b_bufsize <= MAXMETA) {
1767 zone_t z;
1768
1769 z = getbufzone(bp->b_bufsize);
1770 bp->b_data = (caddr_t)0xdeadbeef;
1771 zfree(z, elem);
1772 CLR(bp->b_flags, B_ZALLOC);
1773 } else
1774 panic("bcleanbuf: B_ZALLOC set incorrectly");
1775 } else {
1776 bp->b_data = (caddr_t)0xdeadbeef;
1777 kmem_free(kernel_map, elem, bp->b_bufsize);
1778 }
1c79356b
A
1779 }
1780
1781 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1782
1783 /* disassociate us from our vnode, if we had one... */
1784 s = splbio();
1785
1786 /* clear out various other fields */
0b4e3aa0 1787 bp->b_bufsize = 0;
1c79356b
A
1788 bp->b_data = 0;
1789 bp->b_flags = B_BUSY;
d52fe63f
A
1790 if (hdralloc)
1791 SET(bp->b_flags, B_HDRALLOC);
1c79356b
A
1792 bp->b_dev = NODEV;
1793 bp->b_blkno = bp->b_lblkno = 0;
1794 bp->b_iodone = 0;
1795 bp->b_error = 0;
1796 bp->b_resid = 0;
1797 bp->b_bcount = 0;
1798 bp->b_dirtyoff = bp->b_dirtyend = 0;
1799 bp->b_validoff = bp->b_validend = 0;
1800
1801 /* nuke any credentials we were holding */
1802 cred = bp->b_rcred;
1803 if (cred != NOCRED) {
1804 bp->b_rcred = NOCRED;
1805 crfree(cred);
1806 }
1807 cred = bp->b_wcred;
1808 if (cred != NOCRED) {
1809 bp->b_wcred = NOCRED;
1810 crfree(cred);
1811 }
1812 splx(s);
1813 return (0);
1814}
1815
1816
1817/*
1818 * Wait for operations on the buffer to complete.
1819 * When they do, extract and return the I/O's error value.
1820 */
1821int
1822biowait(bp)
1823 struct buf *bp;
1824{
1c79356b 1825 int s;
1c79356b
A
1826
1827 s = splbio();
1828 while (!ISSET(bp->b_flags, B_DONE))
1829 tsleep(bp, PRIBIO + 1, "biowait", 0);
1830 splx(s);
1831
1832 /* check for interruption of I/O (e.g. via NFS), then errors. */
1833 if (ISSET(bp->b_flags, B_EINTR)) {
1834 CLR(bp->b_flags, B_EINTR);
1835 return (EINTR);
1836 } else if (ISSET(bp->b_flags, B_ERROR))
1837 return (bp->b_error ? bp->b_error : EIO);
1838 else
1839 return (0);
1840}
1841
1842/*
1843 * Mark I/O complete on a buffer.
1844 *
1845 * If a callback has been requested, e.g. the pageout
1846 * daemon, do so. Otherwise, awaken waiting processes.
1847 *
1848 * [ Leffler, et al., says on p.247:
1849 * "This routine wakes up the blocked process, frees the buffer
1850 * for an asynchronous write, or, for a request by the pagedaemon
1851 * process, invokes a procedure specified in the buffer structure" ]
1852 *
1853 * In real life, the pagedaemon (or other system processes) wants
1854 * to do async stuff to, and doesn't want the buffer brelse()'d.
1855 * (for swap pager, that puts swap buffers on the free lists (!!!),
1856 * for the vn device, that puts malloc'd buffers on the free lists!)
1857 */
1858void
1859biodone(bp)
1860 struct buf *bp;
1861{
1862 boolean_t funnel_state;
d52fe63f 1863 struct vnode *vp;
55e303ae
A
1864 extern struct timeval priority_IO_timestamp_for_root;
1865 extern int hard_throttle_on_root;
1c79356b
A
1866
1867 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1868
1869 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
fa4905b1 1870 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1c79356b
A
1871
1872 if (ISSET(bp->b_flags, B_DONE))
1873 panic("biodone already");
1874 SET(bp->b_flags, B_DONE); /* note that it's done */
1875 /*
1876 * I/O was done, so don't believe
1877 * the DIRTY state from VM anymore
1878 */
1879 CLR(bp->b_flags, B_WASDIRTY);
1880
1881 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1882 vwakeup(bp); /* wake up reader */
9bccf70c
A
1883
1884 if (kdebug_enable) {
1885 int code = DKIO_DONE;
1886
1887 if (bp->b_flags & B_READ)
1888 code |= DKIO_READ;
1889 if (bp->b_flags & B_ASYNC)
1890 code |= DKIO_ASYNC;
1891
1892 if (bp->b_flags & B_META)
1893 code |= DKIO_META;
1894 else if (bp->b_flags & (B_PGIN | B_PAGEOUT))
1895 code |= DKIO_PAGING;
1896
1897 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
55e303ae
A
1898 (unsigned int)bp, (unsigned int)bp->b_vp,
1899 bp->b_resid, bp->b_error, 0);
9bccf70c
A
1900 }
1901
d52fe63f
A
1902 /* Wakeup the throttled write operations as needed */
1903 vp = bp->b_vp;
1904 if (vp
1905 && (vp->v_flag & VTHROTTLED)
1906 && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
1907 vp->v_flag &= ~VTHROTTLED;
1908 wakeup((caddr_t)&vp->v_numoutput);
1909 }
55e303ae
A
1910 if ((bp->b_flags & B_PGIN) && (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
1911 priority_IO_timestamp_for_root = time;
1912 hard_throttle_on_root = 0;
1913 }
1c79356b 1914 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
b4c24cb9
A
1915 void (*iodone_func)(struct buf *) = bp->b_iodone;
1916
1c79356b 1917 CLR(bp->b_flags, B_CALL); /* but note callout done */
b4c24cb9
A
1918 bp->b_iodone = NULL;
1919
1920 if (iodone_func == NULL) {
1921 panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
1922 } else {
1923 (*iodone_func)(bp);
1924 }
1c79356b
A
1925 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1926 brelse(bp);
1927 else { /* or just wakeup the buffer */
1928 CLR(bp->b_flags, B_WANTED);
1929 wakeup(bp);
1930 }
1931
1932 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
fa4905b1 1933 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1c79356b
A
1934
1935 thread_funnel_set(kernel_flock, funnel_state);
1936}
1937
1938/*
1939 * Return a count of buffers on the "locked" queue.
1940 */
1941int
1942count_lock_queue()
1943{
1944 register struct buf *bp;
1945 register int n = 0;
1946
1947 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1948 bp = bp->b_freelist.tqe_next)
1949 n++;
1950 return (n);
1951}
1952
1953/*
1954 * Return a count of 'busy' buffers. Used at the time of shutdown.
1955 */
1956int
1957count_busy_buffers()
1958{
1959 register struct buf *bp;
1960 register int nbusy = 0;
1961
1962 for (bp = &buf[nbuf]; --bp >= buf; )
1963 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1964 nbusy++;
1965 return (nbusy);
1966}
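/*
 * Hedged sketch (not in the original source): the kind of shutdown-time drain
 * loop that count_busy_buffers() above is meant to serve.  The function name
 * and the retry/timeout policy are assumptions for illustration.
 */
#if 0
static void
example_wait_for_busy_buffers(void)
{
	int iter, nbusy;

	for (iter = 0; iter < 20; iter++) {
		nbusy = count_busy_buffers();
		if (nbusy == 0)
			break;
		printf("%d buffers still busy\n", nbusy);
		/* give in-flight I/O a moment to complete before rechecking */
		(void) tsleep(&nbusy, PRIBIO, "bufdrain", hz / 4);
	}
}
#endif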
1967
9bccf70c 1968#if DIAGNOSTIC
1969/*
1970 * Print out statistics on the current allocation of the buffer pool.
1971 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1972 * in vfs_syscalls.c using sysctl.
1973 */
1974void
1975vfs_bufstats()
1976{
1977 int s, i, j, count;
1978 register struct buf *bp;
1979 register struct bqueues *dp;
1980 int counts[MAXBSIZE/CLBYTES+1];
1981 static char *bname[BQUEUES] =
1982 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1983
1984 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1985 count = 0;
1986 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1987 counts[j] = 0;
1988 s = splbio();
1989 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1990 counts[bp->b_bufsize/CLBYTES]++;
1991 count++;
1992 }
1993 splx(s);
1994 printf("%s: total-%d", bname[i], count);
1995 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1996 if (counts[j] != 0)
1997 printf(", %d-%d", j * CLBYTES, counts[j]);
1998 printf("\n");
1999 }
2000}
2001#endif /* DIAGNOSTIC */
2002
9bccf70c 2003#define NRESERVEDIOBUFS 64
1c79356b 2004
9bccf70c 2005__private_extern__ struct buf *
0b4e3aa0 2006alloc_io_buf(vp, priv)
1c79356b 2007 struct vnode *vp;
0b4e3aa0 2008 int priv;
2009{
2010 register struct buf *bp;
2011 int s;
2012
2013 s = splbio();
2014
2015 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
2016 need_iobuffer = 1;
2017 bufstats.bufs_iobufsleeps++;
2018 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
2019 }
2020
2021 while ((bp = iobufqueue.tqh_first) == NULL) {
2022 need_iobuffer = 1;
2023 bufstats.bufs_iobufsleeps++;
0b4e3aa0 2024 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1c79356b 2025 }
0b4e3aa0 2026
2027 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
2028 bp->b_timestamp = 0;
2029
2030 /* clear out various fields */
2031 bp->b_flags = B_BUSY;
2032 bp->b_blkno = bp->b_lblkno = 0;
b4c24cb9 2033
2034 bp->b_iodone = 0;
2035 bp->b_error = 0;
2036 bp->b_resid = 0;
2037 bp->b_bcount = 0;
2038 bp->b_bufsize = 0;
2039 bp->b_vp = vp;
2040
2041 if (vp->v_type == VBLK || vp->v_type == VCHR)
2042 bp->b_dev = vp->v_rdev;
2043 else
2044 bp->b_dev = NODEV;
2045 bufstats.bufs_iobufinuse++;
2046 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
2047 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
2048 splx(s);
2049
2050 return (bp);
2051}
2052
9bccf70c 2053__private_extern__ void
2054free_io_buf(bp)
2055 struct buf *bp;
2056{
2057 int s;
2058
2059 s = splbio();
2060 /* put buffer back on the head of the iobufqueue */
2061 bp->b_vp = NULL;
2062 bp->b_flags = B_INVAL;
2063
2064 binsheadfree(bp, &iobufqueue, -1);
2065
2066 /* Wake up any processes waiting for any buffer to become free. */
2067 if (need_iobuffer) {
2068 need_iobuffer = 0;
2069 wakeup(&need_iobuffer);
2070 }
2071 bufstats.bufs_iobufinuse--;
2072 splx(s);
2073}
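/*
 * Illustrative sketch (an assumption, not taken from the original callers):
 * the intended pairing of alloc_io_buf()/free_io_buf() above -- borrow a
 * private I/O buffer header, describe the transfer, and return the header
 * from the completion callback.  The example_* names are hypothetical.
 */
#if 0
static void
example_io_done(struct buf *bp)
{
	free_io_buf(bp);		/* hand the header back to the iobufqueue */
}

static void
example_io_start(struct vnode *vp, caddr_t data, int size, daddr_t blkno)
{
	struct buf *bp = alloc_io_buf(vp, 0);	/* may sleep until a header is free */

	bp->b_data = data;
	bp->b_bcount = bp->b_bufsize = size;
	bp->b_blkno = bp->b_lblkno = blkno;
	SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
	bp->b_iodone = example_io_done;
	VOP_STRATEGY(bp);
}
#endif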
2074
9bccf70c 2075/* disabled for now */
2076
2077/* XXX move this to a separate file */
2078/*
2079 * Dynamic Scaling of the Buffer Queues
2080 */
2081
2082typedef long long blsize_t;
2083
55e303ae 2084blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
2085/* Global tunable limits */
2086blsize_t nbufh; /* number of buffer headers */
2087blsize_t nbuflow; /* minimum number of buffer headers required */
2088blsize_t nbufhigh; /* maximum number of buffer headers allowed */
2089blsize_t nbuftarget; /* preferred number of buffer headers */
2090
2091/*
2092 * assertions:
2093 *
2094 * 1. 0 < nbuflow <= nbufh <= nbufhigh
2095 * 2. nbufhigh <= MAXNBUF
2096 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
2097 * 4. nbufh can not be set by sysctl().
2098 */
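/*
 * Hedged sketch (not in the original): the assertions above written out as a
 * checker that a hypothetical sysctl handler for nbuftarget could run before
 * accepting a new value.
 */
#if 0
static int
example_nbuf_limits_ok(blsize_t target)
{
	if (!(0 < nbuflow && nbuflow <= nbufh && nbufh <= nbufhigh))
		return (0);	/* assertion 1 violated */
	if (nbufhigh > MAXNBUF)
		return (0);	/* assertion 2 violated */
	if (!(nbuflow <= target && target <= nbufhigh))
		return (0);	/* assertion 3 violated */
	return (1);		/* nbufh itself is never set this way (assertion 4) */
}
#endif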
2099
2100/* Per queue tunable limits */
2101
2102struct bufqlim {
2103 blsize_t bl_nlow; /* minimum number of buffer headers required */
2104 blsize_t bl_num; /* number of buffer headers on the queue */
2105 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
2106 blsize_t bl_target; /* preferred number of buffer headers */
2107 long bl_stale; /* Seconds after which a buffer is considered stale */
2108} bufqlim[BQUEUES];
2109
2110/*
2111 * assertions:
2112 *
2113 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
2114 * 2. bl_nlhigh <= MAXNBUF
2115 * 3. bufqlim[BQ_META].bl_nlow != 0
2116 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
2117 * file system IO operations)
2118 * 5. bl_num can not be set by sysctl().
2119 * 6. bl_nlhigh <= nbufhigh
2120 */
2121
2122/*
2123 * Rationale:
2124 * ----------
2125 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
2126 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
2127 *
2128 * These limits are exported by means of sysctl().
2129 * It was decided to define blsize_t as a 64 bit quantity instead.
2130 * This ensures that we will not be required to change it
2131 * as long as we do not exceed a 64 bit address space for the kernel.
2132 *
2133 * The low and high limits are initialized at compile time,
2134 * and boot arguments can be used to override them. sysctl()
2135 * will not change those values. sysctl() can get all of the values
2136 * but can set only the target. num is the current level.
2137 *
2138 * Advantages of having a "bufqscan" thread doing the balancing are:
2139 * It keeps enough bufs on BQ_EMPTY.
2140 * getnewbuf() by default will always select a buffer from BQ_EMPTY,
2141 * and getnewbuf() performs best if a buffer is found there.
2142 * This also minimizes the possibility of starting IO
2143 * from getnewbuf(). That's a performance win, too.
2144 *
2145 * It localizes the complex logic [balancing as well as time aging]
2146 * to balancebufq().
2147 *
2148 * It simplifies getnewbuf() logic by eliminating the time aging code.
2149 */
2150
2151/*
2152 * Algorithm:
2153 * -----------
2154 * The goal of the dynamic scaling of the buffer queues is to keep
2155 * the size of the LRU close to bl_target. Buffers on a queue are
2156 * time aged.
2157 *
2158 * A dedicated thread is responsible for "balancing"
2159 * the buffer cache queues.
2160 *
2161 * The scan order is: AGE, LRU, META, EMPTY.
2162 */
2163
2164long bufqscanwait = 0;
2165
2166static void bufqscan_thread();
2167static int balancebufq(int q);
2168static int btrimempty(int n);
2169static __inline__ int initbufqscan(void);
2170static __inline__ int nextbufq(int q);
2171static void buqlimprt(int all);
1c79356b 2172
9bccf70c 2173static void
2174bufq_balance_thread_init()
2175{
2176
2177 if (bufqscanwait++ == 0) {
2178
2179 /* Initialize globals */
55e303ae 2180 MAXNBUF = (sane_size / PAGE_SIZE);
2181 nbufh = nbuf;
2182 nbuflow = min(nbufh, 100);
2183 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
55e303ae 2184 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
2185 nbuftarget = max(nbuflow, nbuftarget);
2186 nbuftarget = min(nbufhigh, nbuftarget);
2187
2188 /*
2189 * Initialize the bufqlim
2190 */
2191
2192 /* LOCKED queue */
2193 bufqlim[BQ_LOCKED].bl_nlow = 0;
2194 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2195 bufqlim[BQ_LOCKED].bl_target = 0;
2196 bufqlim[BQ_LOCKED].bl_stale = 30;
2197
2198 /* LRU queue */
2199 bufqlim[BQ_LRU].bl_nlow = 0;
2200 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
2201 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
2202 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
2203
2204 /* AGE queue */
2205 bufqlim[BQ_AGE].bl_nlow = 0;
2206 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
2207 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
2208 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
2209
2210 /* EMPTY queue */
2211 bufqlim[BQ_EMPTY].bl_nlow = 0;
2212 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
2213 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
2214 bufqlim[BQ_EMPTY].bl_stale = 600000;
2215
2216 /* META queue */
2217 bufqlim[BQ_META].bl_nlow = 0;
2218 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
2219 bufqlim[BQ_META].bl_target = nbuftarget/4;
2220 bufqlim[BQ_META].bl_stale = META_IS_STALE;
2221
2222 /* LAUNDRY queue */
2223 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
2224 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
2225 bufqlim[BQ_LAUNDRY].bl_target = 0;
2226 bufqlim[BQ_LAUNDRY].bl_stale = 30;
2227
2228 buqlimprt(1);
2229 }
2230
2231 /* create worker thread */
2232 kernel_thread(kernel_task, bufqscan_thread);
2233}
2234
2235/* The workloop for the buffer balancing thread */
9bccf70c 2236static void
2237bufqscan_thread()
2238{
2239 boolean_t funnel_state;
2240 int moretodo = 0;
2241
2242 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2243
2244 for(;;) {
2245 do {
2246 int q; /* buffer queue to process */
2247
2248 q = initbufqscan();
2249 for (; q; ) {
2250 moretodo |= balancebufq(q);
2251 q = nextbufq(q);
2252 }
2253 } while (moretodo);
2254
9bccf70c 2255#if DIAGNOSTIC
2256 vfs_bufstats();
2257 buqlimprt(0);
2258#endif
2259 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2260 moretodo = 0;
2261 }
2262
2263 (void) thread_funnel_set(kernel_flock, FALSE);
2264}
2265
2266/* Seed for the buffer queue balancing */
9bccf70c 2267static __inline__ int
2268initbufqscan()
2269{
2270 /* Start with AGE queue */
2271 return (BQ_AGE);
2272}
2273
2274/* Pick next buffer queue to balance */
9bccf70c 2275static __inline__ int
2276nextbufq(int q)
2277{
2278 static const int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2279 int i;
2280 /* walk the scan order to the current queue; its successor (or 0) is next */
2281 for (i = 0; order[i] && order[i] != q; i++);
2282 return (order[i] ? order[i + 1] : 0);
2283}
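/*
 * One full balancing pass thus visits the queues in the documented order:
 * initbufqscan() == BQ_AGE, then successive nextbufq() calls yield BQ_LRU,
 * BQ_META, BQ_EMPTY, and finally 0, which terminates the scan.
 */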
2284
2285/* function to balance the buffer queues */
9bccf70c 2286static int
2287balancebufq(int q)
2288{
2289 int moretodo = 0;
2290 int s = splbio();
2291 int n;
2292
2293 /* reject invalid q */
2294 if ((q < 0) || (q >= BQUEUES))
2295 goto out;
2296
2297 /* LOCKED or LAUNDRY queue MUST not be balanced */
2298 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2299 goto out;
2300
2301 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2302
2303 /* If queue has less than target nothing more to do */
2304 if (n < 0)
2305 goto out;
2306
2307 if ( n > 8 ) {
2308 /* Balance only a small amount (12.5%) at a time */
2309 n >>= 3;
2310 }
2311
2312 /* EMPTY queue needs special handling */
2313 if (q == BQ_EMPTY) {
2314 moretodo |= btrimempty(n);
2315 goto out;
2316 }
2317
2318 for (; n > 0; n--) {
2319 struct buf *bp = bufqueues[q].tqh_first;
2320 if (!bp)
2321 break;
2322
2323 /* check if it's stale */
2324 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2325 if (bcleanbuf(bp)) {
2326 /* bawrite() issued, bp not ready */
2327 moretodo = 1;
2328 } else {
2329 /* release the cleaned buffer to BQ_EMPTY */
2330 SET(bp->b_flags, B_INVAL);
2331 brelse(bp);
2332 }
2333 } else
2334 break;
2335 }
2336
2337out:
2338 splx(s);
2339 return (moretodo);
2340}
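/*
 * Worked example for the 12.5% rule above (illustrative numbers): if BQ_LRU
 * holds bl_num = 500 buffers against a bl_target of 100, then n = 400; since
 * n > 8 it is shifted right by 3, so at most 400 >> 3 = 50 stale buffers are
 * cleaned in this pass, letting the queue converge toward its target over
 * several passes rather than all at once.
 */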
2341
9bccf70c 2342static int
2343btrimempty(int n)
2344{
2345 /*
2346 * When struct bufs are allocated dynamically, this would
2347 * reclaim up to 'n' struct bufs from the empty queue.
2348 */
2349
2350 return (0);
2351}
2352
9bccf70c 2353static __inline__ void
2354bufqinc(int q)
2355{
2356 if ((q < 0) || (q >= BQUEUES))
2357 return;
2358
2359 bufqlim[q].bl_num++;
2360 return;
2361}
2362
9bccf70c 2363static __inline__ void
2364bufqdec(int q)
2365{
2366 if ((q < 0) || (q >= BQUEUES))
2367 return;
2368
2369 bufqlim[q].bl_num--;
2370 return;
2371}
2372
9bccf70c 2373static void
2374buqlimprt(int all)
2375{
2376 int i;
2377 static char *bname[BQUEUES] =
2378 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2379
2380 if (all)
2381 for (i = 0; i < BQUEUES; i++) {
2382 printf("%s : ", bname[i]);
2383 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
2384 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2385 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
2386 printf("target = %ld, ", (long)bufqlim[i].bl_target);
2387 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
2388 }
2389 else
2390 for (i = 0; i < BQUEUES; i++) {
2391 printf("%s : ", bname[i]);
9bccf70c 2392 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2393 }
2394}
2395
2396/*
2397 * If getnewbuf() calls bcleanbuf() on the same thread,
2398 * there is a potential for stack overrun and deadlock.
2399 * So we always hand off the work to a worker thread for completion.
2400 */
2401
2402static void
2403bcleanbuf_thread_init()
2404{
2405 static void bcleanbuf_thread();
2406
2407 /* create worker thread */
2408 kernel_thread(kernel_task, bcleanbuf_thread);
2409}
2410
2411static void
2412bcleanbuf_thread()
2413{
2414 boolean_t funnel_state;
2415 struct buf *bp;
2416 int error = 0;
2417 int loopcnt = 0;
2418
2419 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2420
2421doit:
2422 while (blaundrycnt == 0)
2423 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2424 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2425 /* Remove from the queue */
2426 bremfree(bp);
2427 blaundrycnt--;
55e303ae 2428
765c9de3 2429 /* do the IO */
2430 error = bawrite_internal(bp, 0);
2431 if (error) {
2432 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2433 blaundrycnt++;
2434 if (loopcnt > 10) {
2435 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
2436 loopcnt = 0;
2437 } else {
2438 (void)thread_block(THREAD_CONTINUE_NULL);
2439 loopcnt++;
2440 }
2441 }
2442 /* start again */
2443 goto doit;
2444
2445 (void) thread_funnel_set(kernel_flock, funnel_state);
2446}
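/*
 * Illustrative sketch (an assumption, not the original producer code): the
 * other half of the handoff described above.  Rather than cleaning a dirty
 * buffer itself, getnewbuf()-style code would park it on BQ_LAUNDRY and wake
 * the laundry thread, which tsleeps on &blaundrycnt.
 */
#if 0
static void
example_defer_to_laundry(struct buf *bp)
{
	/* assumes the caller has already removed bp from its free queue */
	binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
	blaundrycnt++;
	wakeup(&blaundrycnt);
}
#endif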
2447
2448
2449static int
2450brecover_data(struct buf *bp)
2451{
2452 upl_t upl;
2453 upl_page_info_t *pl;
2454 int upl_offset;
2455 kern_return_t kret;
2456 struct vnode *vp = bp->b_vp;
2457
2458 if (vp->v_tag == VT_NFS)
2459 /*
2460 * NFS currently deals with this case
2461 * in a slightly different manner...
2462 * continue to let it do so
2463 */
2464 return(1);
2465
2466 if (!UBCISVALID(vp) || bp->b_bufsize == 0)
2467 goto dump_buffer;
2468
2469 kret = ubc_create_upl(vp,
2470 ubc_blktooff(vp, bp->b_lblkno),
2471 bp->b_bufsize,
2472 &upl,
2473 &pl,
2474 UPL_PRECIOUS);
2475 if (kret != KERN_SUCCESS)
2476 panic("Failed to get pagelists");
2477
2478 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
2479
2480 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
2481 ubc_upl_abort(upl, 0);
2482 goto dump_buffer;
2483 }
2484 }
2485 SET(bp->b_flags, B_PAGELIST);
2486 bp->b_pagelist = upl;
2487
2488 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
2489 if (kret != KERN_SUCCESS)
2490 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2491 if (bp->b_data == 0)
2492 panic("ubc_upl_map mapped 0");
2493
2494 return (1);
2495
2496dump_buffer:
2497 bp->b_bufsize = 0;
2498 SET(bp->b_flags, B_INVAL);
2499 brelse(bp);
2500
2501 return(0);
2502}
2503
2504
2505static int
2506bp_cmp(void *a, void *b)
2507{
2508 struct buf *bp_a = *(struct buf **)a,
2509 *bp_b = *(struct buf **)b;
2510 daddr_t res;
2511
2512 // don't have to worry about negative block
2513 // numbers so this is ok to do.
2514 //
2515 res = (bp_a->b_blkno - bp_b->b_blkno);
2516
2517 return (int)res;
2518}
2519
2520#define NFLUSH 32
2521
2522int
2523bflushq(int whichq, struct mount *mp)
2524{
2525 struct buf *bp, *next;
2526 int i, buf_count, s;
2527 int counter=0, total_writes=0;
2528 static struct buf *flush_table[NFLUSH];
2529
2530 if (whichq < 0 || whichq >= BQUEUES) {
2531 return (0);
2532 }
2533
2534
2535 restart:
2536 bp = TAILQ_FIRST(&bufqueues[whichq]);
2537 for(buf_count=0; bp; bp=next) {
2538 next = bp->b_freelist.tqe_next;
2539
2540 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
2541 continue;
2542 }
2543
2544 if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
2545 if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
2546 panic("bflushq: bp @ 0x%x is locked!\n", bp);
2547 }
2548
2549 bremfree(bp);
2550 bp->b_flags |= B_BUSY;
2551 flush_table[buf_count] = bp;
2552 buf_count++;
2553 total_writes++;
2554
2555 if (buf_count >= NFLUSH) {
2556 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2557
2558 for(i=0; i < buf_count; i++) {
2559 bawrite(flush_table[i]);
2560 }
2561
2562 goto restart;
2563 }
2564 }
2565 }
2566
2567 if (buf_count > 0) {
2568 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2569 for(i=0; i < buf_count; i++) {
2570 bawrite(flush_table[i]);
2571 }
2572 }
2573
2574 return total_writes;
2575}
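/*
 * Hedged usage sketch (the caller below is hypothetical, not from the
 * original source): a sync or unmount path could use bflushq() above to push
 * every delayed-write buffer belonging to one mount, walking each queue that
 * can hold dirty buffers.
 */
#if 0
static int
example_flush_mount(struct mount *mp)
{
	int total = 0;

	total += bflushq(BQ_LRU, mp);
	total += bflushq(BQ_AGE, mp);
	total += bflushq(BQ_META, mp);
	total += bflushq(BQ_LAUNDRY, mp);

	return (total);		/* number of delayed writes issued */
}
#endif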