1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*-
24 * Copyright (c) 1994 Christopher G. Demetriou
25 * Copyright (c) 1982, 1986, 1989, 1993
26 * The Regents of the University of California. All rights reserved.
27 * (c) UNIX System Laboratories, Inc.
28 * All or some portions of this file are derived from material licensed
29 * to the University of California by American Telephone and Telegraph
30 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
31 * the permission of UNIX System Laboratories, Inc.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * The NEXTSTEP Software License Agreement specifies the terms
62 * and conditions for redistribution.
63 *
64 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
65 */
66
67 /*
68 * Some references:
69 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
70 * Leffler, et al.: The Design and Implementation of the 4.3BSD
71 * UNIX Operating System (Addison-Wesley, 1989)
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/proc.h>
77 #include <sys/buf.h>
78 #include <sys/vnode.h>
79 #include <sys/mount.h>
80 #include <sys/trace.h>
81 #include <sys/malloc.h>
82 #include <sys/resourcevar.h>
83 #include <miscfs/specfs/specdev.h>
84 #include <sys/ubc.h>
85 #include <vm/vm_pageout.h>
86 #if DIAGNOSTIC
87 #include <kern/assert.h>
88 #endif /* DIAGNOSTIC */
89 #include <kern/task.h>
90 #include <kern/zalloc.h>
91
92 #include <sys/kdebug.h>
93 #include <machine/spl.h>
94
95 static __inline__ void bufqinc(int q);
96 static __inline__ void bufqdec(int q);
97
98 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
99 static int bcleanbuf(struct buf *bp);
100 extern void vwakeup();
101
102 extern int niobuf; /* The number of IO buffer headers for cluster IO */
103 int blaundrycnt;
104
105 /* zone allocated buffer headers */
106 static zone_t buf_hdr_zone;
107 static int buf_hdr_count;
108
109 #if TRACE
110 struct proc *traceproc;
111 int tracewhich, tracebuf[TRCSIZ];
112 u_int tracex;
113 char traceflags[TR_NFLAGS];
114 #endif /* TRACE */
115
116 /*
117 * Definitions for the buffer hash lists.
118 */
119 #define BUFHASH(dvp, lbn) \
120 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
121 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
122 u_long bufhash;
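/*
 * Illustrative lookup (this is what incore(), below, does): a cached
 * block is found by walking the chain selected by BUFHASH(vp, blkno)
 * and matching on (b_vp, b_lblkno), e.g.:
 *
 *	for (bp = BUFHASH(vp, blkno)->lh_first; bp; bp = bp->b_hash.le_next)
 *		if (bp->b_vp == vp && bp->b_lblkno == blkno)
 *			break;
 */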
123
124 /* Definitions for the buffer stats. */
125 struct bufstats bufstats;
126
127 /* Number of delayed write buffers */
128 int nbdwrite = 0;
129
130 /*
131 * Insq/Remq for the buffer hash lists.
132 */
133 #if 0
134 #define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
135 #define bremhash(bp) LIST_REMOVE(bp, b_hash)
136 #endif /* 0 */
137
138
139 TAILQ_HEAD(ioqueue, buf) iobufqueue;
140 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
141 static int needbuffer;
142 static int need_iobuffer;
143
144 /*
145 * Insq/Remq for the buffer free lists.
146 */
147 #define binsheadfree(bp, dp, whichq) do { \
148 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
149 bufqinc((whichq)); \
150 (bp)->b_whichq = whichq; \
151 (bp)->b_timestamp = time.tv_sec; \
152 } while (0)
153
154 #define binstailfree(bp, dp, whichq) do { \
155 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
156 bufqinc((whichq)); \
157 (bp)->b_whichq = whichq; \
158 (bp)->b_timestamp = time.tv_sec; \
159 } while (0)
160
161 #define BHASHENTCHECK(bp) \
162 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
163 panic("%x: b_hash.le_prev is not deadbeef", (bp));
164
165 #define BLISTNONE(bp) \
166 (bp)->b_hash.le_next = (struct buf *)0; \
167 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
168
169 /*
170 * Insq/Remq for the vnode usage lists.
171 */
172 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
173 #define bufremvn(bp) { \
174 LIST_REMOVE(bp, b_vnbufs); \
175 (bp)->b_vnbufs.le_next = NOLIST; \
176 }
177
178 simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
179
180 /* number of per vnode, "in flight" buffer writes */
181 #define BUFWRITE_THROTTLE 9
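/*
 * bdwrite_internal() and bawrite_internal() sleep on &vp->v_numoutput
 * once a vnode has this many writes in flight; biodone() clears
 * VTHROTTLED and wakes the sleepers once the count drops to a third
 * of this limit.
 */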
182
183 /*
184 * Time in seconds before a buffer on a list is
185 * considered as a stale buffer
186 */
187 #define LRU_IS_STALE 120 /* default value for the LRU */
188 #define AGE_IS_STALE 60 /* default value for the AGE */
189 #define META_IS_STALE 180 /* default value for the BQ_META */
190
191 int lru_is_stale = LRU_IS_STALE;
192 int age_is_stale = AGE_IS_STALE;
193 int meta_is_stale = META_IS_STALE;
194
195 /* LIST_INSERT_HEAD() with assertions */
196 static __inline__ void
197 blistenterhead(struct bufhashhdr * head, struct buf * bp)
198 {
199 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
200 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
201 (head)->lh_first = bp;
202 bp->b_hash.le_prev = &(head)->lh_first;
203 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
204 panic("blistenterhead: le_prev is deadbeef");
205 }
206
207 static __inline__ void
208 binshash(struct buf *bp, struct bufhashhdr *dp)
209 {
210 struct buf *nbp;
211
212 simple_lock(&bufhashlist_slock);
213
214 #if 0
215 if(incore(bp->b_vp, bp->b_lblkno))
216 panic("binshash: already incore");
217 #endif /* 0 */
218
219 BHASHENTCHECK(bp);
220
221 nbp = dp->lh_first;
222 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
223 if(nbp == bp)
224 panic("buf already in hashlist");
225 }
226
227 blistenterhead(dp, bp);
228 simple_unlock(&bufhashlist_slock);
229 }
230
231 static __inline__ void
232 bremhash(struct buf *bp)
233 {
234 simple_lock(&bufhashlist_slock);
235 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
236 panic("bremhash le_prev is deadbeef");
237 if (bp->b_hash.le_next == bp)
238 panic("bremhash: next points to self");
239
240 if (bp->b_hash.le_next != NULL)
241 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
242 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
243 simple_unlock(&bufhashlist_slock);
244 }
245
246 /*
247 * Remove a buffer from the free list it's on
248 */
249 void
250 bremfree(bp)
251 struct buf *bp;
252 {
253 struct bqueues *dp = NULL;
254 int whichq = -1;
255
256 /*
257 * We only calculate the head of the freelist when removing
258 * the last element of the list as that is the only time that
259 * it is needed (e.g. to reset the tail pointer).
260 *
261 * NB: This makes an assumption about how tailq's are implemented.
262 */
263 if (bp->b_freelist.tqe_next == NULL) {
264 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
265 if (dp->tqh_last == &bp->b_freelist.tqe_next)
266 break;
267 if (dp == &bufqueues[BQUEUES])
268 panic("bremfree: lost tail");
269 }
270 TAILQ_REMOVE(dp, bp, b_freelist);
271 whichq = bp->b_whichq;
272 bufqdec(whichq);
273 bp->b_whichq = -1;
274 bp->b_timestamp = 0;
275 }
276
277 /*
278 * Associate a buffer with a vnode.
279 */
280 static void
281 bgetvp(vp, bp)
282 register struct vnode *vp;
283 register struct buf *bp;
284 {
285
286 if (bp->b_vp != vp)
287 panic("bgetvp: not free");
288 VHOLD(vp);
289 bp->b_vp = vp;
290 if (vp->v_type == VBLK || vp->v_type == VCHR)
291 bp->b_dev = vp->v_rdev;
292 else
293 bp->b_dev = NODEV;
294 /*
295 * Insert onto list for new vnode.
296 */
297 bufinsvn(bp, &vp->v_cleanblkhd);
298 }
299
300 /*
301 * Disassociate a buffer from a vnode.
302 */
303 static void
304 brelvp(bp)
305 register struct buf *bp;
306 {
307 struct vnode *vp;
308
309 if (bp->b_vp == (struct vnode *) 0)
310 panic("brelvp: NULL vp");
311 /*
312 * Delete from old vnode list, if on one.
313 */
314 if (bp->b_vnbufs.le_next != NOLIST)
315 bufremvn(bp);
316 vp = bp->b_vp;
317 bp->b_vp = (struct vnode *) 0;
318 HOLDRELE(vp);
319 }
320
321 /*
322 * Reassign a buffer from one vnode to another.
323 * Used to assign file specific control information
324 * (indirect blocks) to the vnode to which they belong.
325 */
326 void
327 reassignbuf(bp, newvp)
328 register struct buf *bp;
329 register struct vnode *newvp;
330 {
331 register struct buflists *listheadp;
332
333 if (newvp == NULL) {
334 printf("reassignbuf: NULL");
335 return;
336 }
337 /*
338 * Delete from old vnode list, if on one.
339 */
340 if (bp->b_vnbufs.le_next != NOLIST)
341 bufremvn(bp);
342 /*
343 * If dirty, put on list of dirty buffers;
344 * otherwise insert onto list of clean buffers.
345 */
346 if (ISSET(bp->b_flags, B_DELWRI))
347 listheadp = &newvp->v_dirtyblkhd;
348 else
349 listheadp = &newvp->v_cleanblkhd;
350 bufinsvn(bp, listheadp);
351 }
352
353 static __inline__ void
354 bufhdrinit(struct buf *bp)
355 {
356 bzero((char *)bp, sizeof *bp);
357 bp->b_dev = NODEV;
358 bp->b_rcred = NOCRED;
359 bp->b_wcred = NOCRED;
360 bp->b_vnbufs.le_next = NOLIST;
361 bp->b_flags = B_INVAL;
362
363 return;
364 }
365
366 /*
367 * Initialize buffers and hash links for buffers.
368 */
369 __private_extern__ void
370 bufinit()
371 {
372 register struct buf *bp;
373 register struct bqueues *dp;
374 register int i;
375 int metabuf;
376 long whichq;
377 static void bufzoneinit();
378 static void bcleanbuf_thread_init();
379
380 /* Initialize the buffer queues ('freelists') and the hash table */
381 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
382 TAILQ_INIT(dp);
383 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
384
385 simple_lock_init(&bufhashlist_slock );
386
387 metabuf = nbuf/8; /* reserved for meta buf */
388
389 /* Initialize the buffer headers */
390 for (i = 0; i < nbuf; i++) {
391 bp = &buf[i];
392 bufhdrinit(bp);
393
394 /*
395 * metabuf buffer headers on the meta-data list and
396 * rest of the buffer headers on the empty list
397 */
398 if (--metabuf)
399 whichq = BQ_META;
400 else
401 whichq = BQ_EMPTY;
402
403 BLISTNONE(bp);
404 dp = &bufqueues[whichq];
405 binsheadfree(bp, dp, whichq);
406 binshash(bp, &invalhash);
407 }
408
409 for (; i < nbuf + niobuf; i++) {
410 bp = &buf[i];
411 bufhdrinit(bp);
412 binsheadfree(bp, &iobufqueue, -1);
413 }
414
415 printf("using %d buffer headers and %d cluster IO buffer headers\n",
416 nbuf, niobuf);
417
418 /* Set up zones used by the buffer cache */
419 bufzoneinit();
420
421 /* start the bcleanbuf() thread */
422 bcleanbuf_thread_init();
423
424 #if 0 /* notyet */
425 {
426 static void bufq_balance_thread_init();
427 /* create a thread to do dynamic buffer queue balancing */
428 bufq_balance_thread_init();
429 }
430 #endif /* notyet */
431 }
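/*
 * At this point the first nbuf headers are hashed into invalhash and
 * sit on the BQ_META/BQ_EMPTY free lists; the additional niobuf
 * headers live on iobufqueue and are handed out only by alloc_io_buf()
 * for cluster IO.
 */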
432
433 static struct buf *
434 bio_doread(vp, blkno, size, cred, async, queuetype)
435 struct vnode *vp;
436 daddr_t blkno;
437 int size;
438 struct ucred *cred;
439 int async;
440 int queuetype;
441 {
442 register struct buf *bp;
443 struct proc *p = current_proc();
444
445 bp = getblk(vp, blkno, size, 0, 0, queuetype);
446
447 /*
448 * If buffer does not have data valid, start a read.
449 * Note that if buffer is B_INVAL, getblk() won't return it.
450 * Therefore, it's valid if its I/O has completed or been delayed.
451 */
452 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
453 /* Start I/O for the buffer (keeping credentials). */
454 SET(bp->b_flags, B_READ | async);
455 if (cred != NOCRED && bp->b_rcred == NOCRED) {
456 /*
457 * NFS has embedded ucred.
458 * Can not crhold() here as that causes zone corruption
459 */
460 bp->b_rcred = crdup(cred);
461 }
462 VOP_STRATEGY(bp);
463
464 trace(TR_BREADMISS, pack(vp, size), blkno);
465
466 /* Pay for the read. */
467 if (p && p->p_stats)
468 p->p_stats->p_ru.ru_inblock++; /* XXX */
469 } else if (async) {
470 brelse(bp);
471 }
472
473 trace(TR_BREADHIT, pack(vp, size), blkno);
474
475 return (bp);
476 }
477 /*
478 * Read a disk block.
479 * This algorithm described in Bach (p.54).
480 */
481 int
482 bread(vp, blkno, size, cred, bpp)
483 struct vnode *vp;
484 daddr_t blkno;
485 int size;
486 struct ucred *cred;
487 struct buf **bpp;
488 {
489 register struct buf *bp;
490
491 /* Get buffer for block. */
492 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
493
494 /* Wait for the read to complete, and return result. */
495 return (biowait(bp));
496 }
497
498 /*
499 * Read a disk block. [bread() for meta-data]
500 * This algorithm described in Bach (p.54).
501 */
502 int
503 meta_bread(vp, blkno, size, cred, bpp)
504 struct vnode *vp;
505 daddr_t blkno;
506 int size;
507 struct ucred *cred;
508 struct buf **bpp;
509 {
510 register struct buf *bp;
511
512 /* Get buffer for block. */
513 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
514
515 /* Wait for the read to complete, and return result. */
516 return (biowait(bp));
517 }
518
519 /*
520 * Read-ahead multiple disk blocks. The first is sync, the rest async.
521 * Trivial modification to the breada algorithm presented in Bach (p.55).
522 */
523 int
524 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
525 struct vnode *vp;
526 daddr_t blkno; int size;
527 daddr_t rablks[]; int rasizes[];
528 int nrablks;
529 struct ucred *cred;
530 struct buf **bpp;
531 {
532 register struct buf *bp;
533 int i;
534
535 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
536
537 /*
538 * For each of the read-ahead blocks, start a read, if necessary.
539 */
540 for (i = 0; i < nrablks; i++) {
541 /* If it's in the cache, just go on to next one. */
542 if (incore(vp, rablks[i]))
543 continue;
544
545 /* Get a buffer for the read-ahead block */
546 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
547 }
548
549 /* Otherwise, we had to start a read for it; wait until it's valid. */
550 return (biowait(bp));
551 }
552
553 /*
554 * Read with single-block read-ahead. Defined in Bach (p.55), but
555 * implemented as a call to breadn().
556 * XXX for compatibility with old file systems.
557 */
558 int
559 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
560 struct vnode *vp;
561 daddr_t blkno; int size;
562 daddr_t rablkno; int rabsize;
563 struct ucred *cred;
564 struct buf **bpp;
565 {
566
567 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
568 }
569
570 /*
571 * Block write. Described in Bach (p.56)
572 */
573 int
574 bwrite(bp)
575 struct buf *bp;
576 {
577 int rv, sync, wasdelayed;
578 struct proc *p = current_proc();
579 struct vnode *vp = bp->b_vp;
580
581 /* Remember buffer type, to switch on it later. */
582 sync = !ISSET(bp->b_flags, B_ASYNC);
583 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
584 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
585 if (wasdelayed) {
586 nbdwrite--;
587 wakeup((caddr_t)&nbdwrite);
588 }
589
590 if (!sync) {
591 /*
592 * If not synchronous, pay for the I/O operation and make
593 * sure the buf is on the correct vnode queue. We have
594 * to do this now, because if we don't, the vnode may not
595 * be properly notified that its I/O has completed.
596 */
597 if (wasdelayed)
598 reassignbuf(bp, vp);
599 else
600 if (p && p->p_stats)
601 p->p_stats->p_ru.ru_oublock++; /* XXX */
602 }
603
604 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
605
606 /* Initiate disk write. Make sure the appropriate party is charged. */
607 SET(bp->b_flags, B_WRITEINPROG);
608 vp->v_numoutput++;
609
610 VOP_STRATEGY(bp);
611
612 if (sync) {
613 /*
614 * If I/O was synchronous, wait for it to complete.
615 */
616 rv = biowait(bp);
617
618 /*
619 * Pay for the I/O operation, if it's not been paid for, and
620 * make sure it's on the correct vnode queue. (async operations
621 * were paid for above.)
622 */
623 if (wasdelayed)
624 reassignbuf(bp, vp);
625 else
626 if (p && p->p_stats)
627 p->p_stats->p_ru.ru_oublock++; /* XXX */
628
629 /* Release the buffer. */
630 brelse(bp);
631
632 return (rv);
633 } else {
634 return (0);
635 }
636 }
637
638 int
639 vn_bwrite(ap)
640 struct vop_bwrite_args *ap;
641 {
642 return (bwrite(ap->a_bp));
643 }
644
645 /*
646 * Delayed write.
647 *
648 * The buffer is marked dirty, but is not queued for I/O.
649 * This routine should be used when the buffer is expected
650 * to be modified again soon, typically a small write that
651 * partially fills a buffer.
652 *
653 * NB: magnetic tapes cannot be delayed; they must be
654 * written in the order that the writes are requested.
655 *
656 * Described in Leffler, et al. (pp. 208-213).
657 *
658 * Note: With the ability to allocate additional buffer
659 * headers, we can get into a situation where "too" many
660 * bdwrite()s can cause the kernel to create buffers faster
661 * than the disks can service them. Doing a bawrite() in
662 * cases where we have "too many" outstanding bdwrite()s avoids that.
663 */
664 __private_extern__ int
665 bdwrite_internal(bp, return_error)
666 struct buf *bp;
667 int return_error;
668 {
669 struct proc *p = current_proc();
670 struct vnode *vp = bp->b_vp;
671
672 /*
673 * If the block hasn't been seen before:
674 * (1) Mark it as having been seen,
675 * (2) Charge for the write.
676 * (3) Make sure it's on its vnode's correct block list,
677 */
678 if (!ISSET(bp->b_flags, B_DELWRI)) {
679 SET(bp->b_flags, B_DELWRI);
680 if (p && p->p_stats)
681 p->p_stats->p_ru.ru_oublock++; /* XXX */
682 nbdwrite ++;
683 reassignbuf(bp, vp);
684 }
685
686 /* If this is a tape block, write the block out now. */
687 if (ISSET(bp->b_flags, B_TAPE)) {
688 /* bwrite(bp); */
689 VOP_BWRITE(bp);
690 return (0);
691 }
692
693 /*
694 * If the vnode has "too many" write operations in progress
695 * wait for them to finish the IO
696 */
697 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
698 vp->v_flag |= VTHROTTLED;
699 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
700 }
701
702 /*
703 * If we have too many delayed write buffers,
704 * more than we can "safely" handle, just fall back to
705 * doing the async write
706 */
707 if (nbdwrite < 0)
708 panic("bdwrite: Negative nbdwrite");
709
710 if (nbdwrite > ((nbuf/4)*3)) {
711 if (return_error)
712 return (EAGAIN);
713 else
714 bawrite(bp);
715 return (0);
716 }
717
718 /* Otherwise, the "write" is done, so mark and release the buffer. */
719 SET(bp->b_flags, B_DONE);
720 brelse(bp);
721 return (0);
722 }
723
724 void
725 bdwrite(bp)
726 struct buf *bp;
727 {
728 (void) bdwrite_internal(bp, 0);
729 }
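/*
 * Illustrative (hypothetical) use of the delayed-write path by a file
 * system that expects to touch the same block again soon; the names
 * here (vp, lbn, size, cred) are placeholders:
 *
 *	if ((error = bread(vp, lbn, size, cred, &bp)))
 *		return (error);
 *	... modify part of bp->b_data ...
 *	bdwrite(bp);	marks B_DELWRI and releases the buffer; the write
 *			happens later, or is pushed out via bawrite() when
 *			too many delayed writes accumulate
 */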
730
731
732 /*
733 * Asynchronous block write; just an asynchronous bwrite().
734 *
735 * Note: With the ability to allocate additional buffer
736 * headers, we can get into a situation where "too" many
737 * bawrite()s can cause the kernel to create buffers faster
738 * than the disks can service them.
739 * We limit the number of "in flight" writes a vnode can have to
740 * avoid this.
741 */
742 static int
743 bawrite_internal(bp, throttle)
744 struct buf *bp;
745 int throttle;
746 {
747 struct vnode *vp = bp->b_vp;
748
749 if (vp) {
750 /*
751 * If the vnode has "too many" write operations in progress
752 * wait for them to finish the IO
753 */
754 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
755 if (throttle) {
756 vp->v_flag |= VTHROTTLED;
757 (void)tsleep((caddr_t)&vp->v_numoutput,
758 PRIBIO + 1, "bawrite", 0);
759 } else
760 return (EWOULDBLOCK);
761 }
762 }
763
764 SET(bp->b_flags, B_ASYNC);
765 VOP_BWRITE(bp);
766 return (0);
767 }
768
769 void
770 bawrite(bp)
771 struct buf *bp;
772 {
773 (void) bawrite_internal(bp, 1);
774 }
775
776 /*
777 * bwillwrite:
778 *
779 * Called prior to the locking of any vnodes when we are expecting to
780 * write. We do not want to starve the buffer cache with too many
781 * dirty buffers so we block here. By blocking prior to the locking
782 * of any vnodes we attempt to avoid the situation where a locked vnode
783 * prevents the various system daemons from flushing related buffers.
784 */
785
786 void
787 bwillwrite(void)
788 {
789 /* XXX To be implemented later */
790 }
791
792 /*
793 * Release a buffer on to the free lists.
794 * Described in Bach (p. 46).
795 */
796 void
797 brelse(bp)
798 struct buf *bp;
799 {
800 struct bqueues *bufq;
801 int s;
802 long whichq;
803
804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
805 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
806 bp->b_flags, 0);
807
808 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
809
810 /* IO is done. Cleanup the UPL state */
811 if (!ISSET(bp->b_flags, B_META)
812 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
813 kern_return_t kret;
814 upl_t upl;
815 int upl_flags;
816
817 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
818 if ( !ISSET(bp->b_flags, B_INVAL)) {
819 kret = ubc_create_upl(bp->b_vp,
820 ubc_blktooff(bp->b_vp, bp->b_lblkno),
821 bp->b_bufsize,
822 &upl,
823 NULL,
824 UPL_PRECIOUS);
825 if (kret != KERN_SUCCESS)
826 panic("brelse: Failed to get pagelists");
827 #ifdef UBC_DEBUG
828 upl_ubc_alias_set(upl, bp, 5);
829 #endif /* UBC_DEBUG */
830 } else
831 upl = (upl_t) 0;
832 } else {
833 upl = bp->b_pagelist;
834 kret = ubc_upl_unmap(upl);
835
836 if (kret != KERN_SUCCESS)
837 panic("kernel_upl_unmap failed");
838 bp->b_data = 0;
839 }
840 if (upl) {
841 if (bp->b_flags & (B_ERROR | B_INVAL)) {
842 if (bp->b_flags & (B_READ | B_INVAL))
843 upl_flags = UPL_ABORT_DUMP_PAGES;
844 else
845 upl_flags = 0;
846 ubc_upl_abort(upl, upl_flags);
847 } else {
848 if (ISSET(bp->b_flags, B_NEEDCOMMIT))
849 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
850 else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
851 upl_flags = UPL_COMMIT_SET_DIRTY ;
852 else
853 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
854 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
855 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
856 }
857 s = splbio();
858 CLR(bp->b_flags, B_PAGELIST);
859 bp->b_pagelist = 0;
860 splx(s);
861 }
862 } else {
863 if(ISSET(bp->b_flags, B_PAGELIST))
864 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
865 }
866
867 /* Wake up any processes waiting for any buffer to become free. */
868 if (needbuffer) {
869 needbuffer = 0;
870 wakeup(&needbuffer);
871 }
872
873 /* Wake up any processes waiting for _this_ buffer to become free. */
874 if (ISSET(bp->b_flags, B_WANTED)) {
875 CLR(bp->b_flags, B_WANTED);
876 wakeup(bp);
877 }
878
879 /* Block disk interrupts. */
880 s = splbio();
881
882 /*
883 * Determine which queue the buffer should be on, then put it there.
884 */
885
886 /* If it's locked, don't report an error; try again later. */
887 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
888 CLR(bp->b_flags, B_ERROR);
889
890 /* If it's not cacheable, or an error, mark it invalid. */
891 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
892 SET(bp->b_flags, B_INVAL);
893
894 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
895 /*
896 * If it's invalid or empty, dissociate it from its vnode
897 * and put on the head of the appropriate queue.
898 */
899 if (bp->b_vp)
900 brelvp(bp);
901 if (ISSET(bp->b_flags, B_DELWRI)) {
902 CLR(bp->b_flags, B_DELWRI);
903 nbdwrite--;
904 wakeup((caddr_t)&nbdwrite);
905 }
906 if (bp->b_bufsize <= 0)
907 whichq = BQ_EMPTY; /* no data */
908 else if (ISSET(bp->b_flags, B_META))
909 whichq = BQ_META; /* meta-data */
910 else
911 whichq = BQ_AGE; /* invalid data */
912
913 bufq = &bufqueues[whichq];
914 binsheadfree(bp, bufq, whichq);
915 } else {
916 /*
917 * It has valid data. Put it on the end of the appropriate
918 * queue, so that it'll stick around for as long as possible.
919 */
920 if (ISSET(bp->b_flags, B_LOCKED))
921 whichq = BQ_LOCKED; /* locked in core */
922 else if (ISSET(bp->b_flags, B_META))
923 whichq = BQ_META; /* meta-data */
924 else if (ISSET(bp->b_flags, B_AGE))
925 whichq = BQ_AGE; /* stale but valid data */
926 else
927 whichq = BQ_LRU; /* valid data */
928
929 bufq = &bufqueues[whichq];
930 binstailfree(bp, bufq, whichq);
931 }
932
933 /* Unlock the buffer. */
934 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
935
936 /* Allow disk interrupts. */
937 splx(s);
938
939 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
940 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
941 }
942
943 /*
944 * Determine if a block is in the cache.
945 * Just look on what would be its hash chain. If it's there, return
946 * a pointer to it, unless it's marked invalid. If it's marked invalid,
947 * we normally don't return the buffer, unless the caller explicitly
948 * wants us to.
949 */
950 struct buf *
951 incore(vp, blkno)
952 struct vnode *vp;
953 daddr_t blkno;
954 {
955 struct buf *bp;
956
957 bp = BUFHASH(vp, blkno)->lh_first;
958
959 /* Search hash chain */
960 for (; bp != NULL; bp = bp->b_hash.le_next) {
961 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
962 !ISSET(bp->b_flags, B_INVAL))
963 return (bp);
964 }
965
966 return (0);
967 }
968
969
970 /*
971 * Get a block of requested size that is associated with
972 * a given vnode and block offset. If it is found in the
973 * block cache, mark it as having been found, make it busy
974 * and return it. Otherwise, return an empty block of the
975 * correct size. It is up to the caller to ensure that the
976 * cached blocks are of the correct size. With UBC, file-data
977 * buffers get their pages from the vnode's pager via a UPL;
978 * meta-data (BLK_META) buffers carry their own storage. */
979 struct buf *
980 getblk(vp, blkno, size, slpflag, slptimeo, operation)
981 register struct vnode *vp;
982 daddr_t blkno;
983 int size, slpflag, slptimeo, operation;
984 {
985 struct buf *bp;
986 int s, err;
987 upl_t upl;
988 upl_page_info_t *pl;
989 kern_return_t kret;
990 int error=0;
991 int pagedirty = 0;
992
993 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
994 blkno * PAGE_SIZE, size, operation, 0, 0);
995 start:
996
997 s = splbio();
998 if ((bp = incore(vp, blkno))) {
999 /* Found in the Buffer Cache */
1000 if (ISSET(bp->b_flags, B_BUSY)) {
1001 /* but is busy */
1002 switch (operation) {
1003 case BLK_READ:
1004 case BLK_WRITE:
1005 case BLK_META:
1006 SET(bp->b_flags, B_WANTED);
1007 bufstats.bufs_busyincore++;
1008 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
1009 slptimeo);
1010 splx(s);
1011 /*
1012 * Callers who call with PCATCH or timeout are
1013 * willing to deal with the NULL pointer
1014 */
1015 if (err && ((slpflag & PCATCH) ||
1016 ((err == EWOULDBLOCK) && slptimeo)))
1017 return (NULL);
1018 goto start;
1019 /*NOTREACHED*/
1020 break;
1021
1022 case BLK_PAGEIN:
1023 /* pagein operation must not use getblk */
1024 panic("getblk: pagein for incore busy buffer");
1025 splx(s);
1026 /*NOTREACHED*/
1027 break;
1028
1029 case BLK_PAGEOUT:
1030 /* pageout operation must not use getblk */
1031 panic("getblk: pageout for incore busy buffer");
1032 splx(s);
1033 /*NOTREACHED*/
1034 break;
1035
1036 default:
1037 panic("getblk: %d unknown operation 1", operation);
1038 /*NOTREACHED*/
1039 break;
1040 }
1041 } else {
1042 /* not busy */
1043 SET(bp->b_flags, (B_BUSY | B_CACHE));
1044 bremfree(bp);
1045 bufstats.bufs_incore++;
1046 splx(s);
1047
1048 allocbuf(bp, size);
1049 if (ISSET(bp->b_flags, B_PAGELIST))
1050 panic("pagelist buffer is not busy");
1051
1052 switch (operation) {
1053 case BLK_READ:
1054 case BLK_WRITE:
1055 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
1056 kret = ubc_create_upl(vp,
1057 ubc_blktooff(vp, bp->b_lblkno),
1058 bp->b_bufsize,
1059 &upl,
1060 &pl,
1061 UPL_PRECIOUS);
1062 if (kret != KERN_SUCCESS)
1063 panic("Failed to get pagelists");
1064
1065 SET(bp->b_flags, B_PAGELIST);
1066 bp->b_pagelist = upl;
1067
1068 if (!upl_valid_page(pl, 0)) {
1069 if (vp->v_tag != VT_NFS)
1070 panic("getblk: incore buffer without valid page");
1071 CLR(bp->b_flags, B_CACHE);
1072 }
1073
1074 if (upl_dirty_page(pl, 0))
1075 SET(bp->b_flags, B_WASDIRTY);
1076 else
1077 CLR(bp->b_flags, B_WASDIRTY);
1078
1079 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1080 if (kret != KERN_SUCCESS)
1081 panic("getblk: ubc_upl_map() failed with (%d)",
1082 kret);
1083 if (bp->b_data == 0)
1084 panic("ubc_upl_map mapped 0");
1085 }
1086 break;
1087
1088 case BLK_META:
1089 /*
1090 * VM is not involved in I/O for the meta data;
1091 * the buffer already has valid data.
1092 */
1093 if(bp->b_data == 0)
1094 panic("bp->b_data null incore buf=%x", bp);
1095 break;
1096
1097 case BLK_PAGEIN:
1098 case BLK_PAGEOUT:
1099 panic("getblk: paging operation 1");
1100 break;
1101
1102 default:
1103 panic("getblk: %d unknown operation 2", operation);
1104 /*NOTREACHED*/
1105 break;
1106 }
1107 }
1108 } else { /* not incore() */
1109 int queue = BQ_EMPTY; /* Start with no preference */
1110 splx(s);
1111
1112 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
1113 !(UBCINFOEXISTS(vp))) {
1114 operation = BLK_META;
1115 }
1116 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
1117 goto start;
1118 if (incore(vp, blkno)) {
1119 SET(bp->b_flags, B_INVAL);
1120 binshash(bp, &invalhash);
1121 brelse(bp);
1122 goto start;
1123 }
1124
1125 /*
1126 * If it is meta, the queue may have been set to another type,
1127 * so reset it and mark the buffer B_META so that when the
1128 * buffer is released it will go to the META queue.
1129 * Also, if the vnode is not VREG, then it is META.
1130 */
1131 if (operation == BLK_META) {
1132 SET(bp->b_flags, B_META);
1133 queue = BQ_META;
1134 }
1135
1136 bp->b_blkno = bp->b_lblkno = blkno;
1137 bp->b_vp = vp;
1138
1139 /*
1140 * Insert in the hash so that incore() can find it
1141 */
1142 binshash(bp, BUFHASH(vp, blkno));
1143
1144 s = splbio();
1145 bgetvp(vp, bp);
1146 splx(s);
1147
1148 allocbuf(bp, size);
1149
1150 switch (operation) {
1151 case BLK_META:
1152 /* buffer data is invalid */
1153
1154 if(bp->b_data == 0)
1155 panic("bp->b_data is null %x",bp);
1156
1157 bufstats.bufs_miss++;
1158
1159 /* wakeup the buffer */
1160 CLR(bp->b_flags, B_WANTED);
1161 wakeup(bp);
1162 break;
1163
1164 case BLK_READ:
1165 case BLK_WRITE:
1166
1167 if (ISSET(bp->b_flags, B_PAGELIST))
1168 panic("B_PAGELIST in bp=%x",bp);
1169
1170 kret = ubc_create_upl(vp,
1171 ubc_blktooff(vp, blkno),
1172 bp->b_bufsize,
1173 &upl,
1174 &pl,
1175 UPL_PRECIOUS);
1176 if (kret != KERN_SUCCESS)
1177 panic("Failed to get pagelists");
1178
1179 #ifdef UBC_DEBUG
1180 upl_ubc_alias_set(upl, bp, 4);
1181 #endif /* UBC_DEBUG */
1182 bp->b_pagelist = upl;
1183
1184 SET(bp->b_flags, B_PAGELIST);
1185
1186 if (upl_valid_page(pl, 0)) {
1187 SET(bp->b_flags, B_CACHE | B_DONE);
1188 bufstats.bufs_vmhits++;
1189
1190 pagedirty = upl_dirty_page(pl, 0);
1191
1192 if (pagedirty)
1193 SET(bp->b_flags, B_WASDIRTY);
1194
1195 if (vp->v_tag == VT_NFS) {
1196 off_t f_offset;
1197 int valid_size;
1198
1199 bp->b_validoff = 0;
1200 bp->b_dirtyoff = 0;
1201
1202 f_offset = ubc_blktooff(vp, blkno);
1203
1204 if (f_offset > vp->v_ubcinfo->ui_size) {
1205 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1206 bp->b_validend = 0;
1207 bp->b_dirtyend = 0;
1208 } else {
1209 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1210 bp->b_validend = valid_size;
1211
1212 if (pagedirty)
1213 bp->b_dirtyend = valid_size;
1214 else
1215 bp->b_dirtyend = 0;
1216
1217 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1218 bp->b_validend, bp->b_dirtyend,
1219 (int)vp->v_ubcinfo->ui_size, 0, 0);
1220 }
1221 } else {
1222 bp->b_validoff = 0;
1223 bp->b_dirtyoff = 0;
1224
1225 if (pagedirty) {
1226 /* page is dirty */
1227 bp->b_validend = bp->b_bcount;
1228 bp->b_dirtyend = bp->b_bcount;
1229 } else {
1230 /* page is clean */
1231 bp->b_validend = bp->b_bcount;
1232 bp->b_dirtyend = 0;
1233 }
1234 }
1235 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
1236 if(error) {
1237 panic("getblk: VOP_BMAP failed");
1238 /*NOTREACHED*/
1239 /*
1240 * XXX: We probably should invalidate the VM Page
1241 */
1242 bp->b_error = error;
1243 SET(bp->b_flags, (B_ERROR | B_INVAL));
1244 /* undo B_DONE that was set before upl_commit() */
1245 CLR(bp->b_flags, B_DONE);
1246 brelse(bp);
1247 return (0);
1248 }
1249 } else {
1250 bufstats.bufs_miss++;
1251 }
1252 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1253 if (kret != KERN_SUCCESS) {
1254 panic("getblk: ubc_upl_map() "
1255 "failed with (%d)", kret);
1256 }
1257 if (bp->b_data == 0)
1258 panic("kernel_upl_map mapped 0");
1259
1260 break;
1261
1262 case BLK_PAGEIN:
1263 case BLK_PAGEOUT:
1264 panic("getblk: paging operation 2");
1265 break;
1266 default:
1267 panic("getblk: %d unknown operation 3", operation);
1268 /*NOTREACHED*/
1269 break;
1270 }
1271 }
1272
1273 if (bp->b_data == NULL)
1274 panic("getblk: bp->b_addr is null");
1275
1276 if (bp->b_bufsize & 0xfff) {
1277 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1278 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1279 }
1280
1281 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1282 (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1283
1284 return (bp);
1285 }
1286
1287 /*
1288 * Get an empty, disassociated buffer of given size.
1289 */
1290 struct buf *
1291 geteblk(size)
1292 int size;
1293 {
1294 struct buf *bp;
1295 int queue = BQ_EMPTY;
1296
1297 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1298 ;
1299 SET(bp->b_flags, (B_META|B_INVAL));
1300
1301 #if DIAGNOSTIC
1302 assert(queue == BQ_EMPTY);
1303 #endif /* DIAGNOSTIC */
1304 /* XXX need to implement logic to deal with other queues */
1305
1306 binshash(bp, &invalhash);
1307 allocbuf(bp, size);
1308 bufstats.bufs_eblk++;
1309
1310 return (bp);
1311 }
1312
1313 /*
1314 * Zones for the meta data buffers
1315 */
1316
1317 #define MINMETA 512
1318 #define MAXMETA 4096
1319
1320 struct meta_zone_entry {
1321 zone_t mz_zone;
1322 vm_size_t mz_size;
1323 vm_size_t mz_max;
1324 char *mz_name;
1325 };
1326
1327 struct meta_zone_entry meta_zones[] = {
1328 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1329 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1330 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1331 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1332 {NULL, 0, 0, "" } /* End */
1333 };
1334
1335 /*
1336 * Initialize the meta data zones
1337 */
1338 static void
1339 bufzoneinit(void)
1340 {
1341 int i;
1342
1343 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1344 meta_zones[i].mz_zone =
1345 zinit(meta_zones[i].mz_size,
1346 meta_zones[i].mz_max,
1347 PAGE_SIZE,
1348 meta_zones[i].mz_name);
1349 }
1350 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1351 }
1352
1353 static __inline__ zone_t
1354 getbufzone(size_t size)
1355 {
1356 int i;
1357
1358 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
1359 panic("getbufzone: incorrect size = %d", size);
1360
1361 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1362 if (meta_zones[i].mz_size >= size)
1363 break;
1364 }
1365
1366 return (meta_zones[i].mz_zone);
1367 }
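/*
 * Worked example: allocbuf() below first rounds a meta-data request up
 * to a MINMETA multiple, so a 1536-byte request stays 1536 and the loop
 * above returns the "buf.2048" zone (the first zone whose element size
 * covers it); a 1024-byte request comes from "buf.1024".
 */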
1368
1369 /*
1370 * With UBC, there is no need to expand / shrink the file data
1371 * buffer. The VM uses the same pages, hence no waste.
1372 * All the file data buffers can have one size.
1373 * In fact expand / shrink would be an expensive operation.
1374 *
1375 * Only exception to this is meta-data buffers. Most of the
1376 * meta data operations are smaller than PAGE_SIZE. Having the
1377 * meta-data buffers grow and shrink as needed, optimizes use
1378 * of the kernel wired memory.
1379 */
1380
1381 int
1382 allocbuf(bp, size)
1383 struct buf *bp;
1384 int size;
1385 {
1386 vm_size_t desired_size;
1387
1388 desired_size = roundup(size, CLBYTES);
1389
1390 if(desired_size < PAGE_SIZE)
1391 desired_size = PAGE_SIZE;
1392 if (desired_size > MAXBSIZE)
1393 panic("allocbuf: buffer larger than MAXBSIZE requested");
1394
1395 if (ISSET(bp->b_flags, B_META)) {
1396 kern_return_t kret;
1397 zone_t zprev, z;
1398 size_t nsize = roundup(size, MINMETA);
1399
1400 if (bp->b_data) {
1401 vm_offset_t elem = (vm_offset_t)bp->b_data;
1402
1403 if (ISSET(bp->b_flags, B_ZALLOC))
1404 if (bp->b_bufsize <= MAXMETA) {
1405 if (bp->b_bufsize < nsize) {
1406 /* reallocate to a bigger size */
1407 desired_size = nsize;
1408
1409 zprev = getbufzone(bp->b_bufsize);
1410 z = getbufzone(nsize);
1411 bp->b_data = (caddr_t)zalloc(z);
1412 if(bp->b_data == 0)
1413 panic("allocbuf: zalloc() returned NULL");
1414 bcopy(elem, bp->b_data, bp->b_bufsize);
1415 zfree(zprev, elem);
1416 } else {
1417 desired_size = bp->b_bufsize;
1418 }
1419 } else
1420 panic("allocbuf: B_ZALLOC set incorrectly");
1421 else
1422 if (bp->b_bufsize < desired_size) {
1423 /* reallocate to a bigger size */
1424 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1425 if (kret != KERN_SUCCESS)
1426 panic("allocbuf: kmem_alloc() returned %d", kret);
1427 if(bp->b_data == 0)
1428 panic("allocbuf: null b_data");
1429 bcopy(elem, bp->b_data, bp->b_bufsize);
1430 kmem_free(kernel_map, elem, bp->b_bufsize);
1431 } else {
1432 desired_size = bp->b_bufsize;
1433 }
1434 } else {
1435 /* new allocation */
1436 if (nsize <= MAXMETA) {
1437 desired_size = nsize;
1438 z = getbufzone(nsize);
1439 bp->b_data = (caddr_t)zalloc(z);
1440 if(bp->b_data == 0)
1441 panic("allocbuf: zalloc() returned NULL 2");
1442 SET(bp->b_flags, B_ZALLOC);
1443 } else {
1444 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1445 if (kret != KERN_SUCCESS)
1446 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1447 if(bp->b_data == 0)
1448 panic("allocbuf: null b_data 2");
1449 }
1450 }
1451 }
1452
1453 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1454 panic("allocbuf: bp->b_data is NULL");
1455
1456 bp->b_bufsize = desired_size;
1457 bp->b_bcount = size;
1458 return (0);
1459 }
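/*
 * Example of the paths above: growing a 512-byte B_ZALLOC meta buffer
 * to 2048 bytes takes an element from the "buf.2048" zone, bcopy()s the
 * old 512 bytes across and zfree()s the old element.  For non-meta
 * (file data) buffers allocbuf() only records b_bufsize/b_bcount; the
 * backing pages are mapped from the UPL in getblk().
 */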
1460
1461 /*
1462 * Get a new buffer from one of the free lists.
1463 *
1464 * A request for a queue is passed in. The queue from which the buffer
1465 * was taken is returned. Out-of-range queue requests get BQ_EMPTY. A
1466 * request for BQUEUES means no preference; use heuristics in that case.
1467 * The heuristic is as follows:
1468 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1469 * If none are available, block until one is made available.
1470 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
1471 * Pick the most stale buffer.
1472 * If the found buffer was marked delayed write, start the async write
1473 * and restart the search.
1474 * Initialize the fields and disassociate the buffer from the vnode.
1475 * Remove the buffer from the hash. Return the buffer and the queue
1476 * on which it was found.
1477 */
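/*
 * Example of the staleness check with the defaults above
 * (lru_is_stale 120, age_is_stale 60): an LRU buffer idle for 130
 * seconds is taken in preference to an AGE buffer that is only 30
 * seconds old; in every other combination the AGE buffer wins.
 */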
1478
1479 static struct buf *
1480 getnewbuf(slpflag, slptimeo, queue)
1481 int slpflag, slptimeo;
1482 int *queue;
1483 {
1484 register struct buf *bp;
1485 register struct buf *lru_bp;
1486 register struct buf *age_bp;
1487 register struct buf *meta_bp;
1488 register int age_time, lru_time, bp_time, meta_time;
1489 int s;
1490 int req = *queue; /* save it for restarts */
1491
1492 start:
1493 s = splbio();
1494
1495 /* invalid request gets empty queue */
1496 if ((*queue > BQUEUES) || (*queue < 0)
1497 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1498 *queue = BQ_EMPTY;
1499
1500 /* (*queue == BQUEUES) means no preference */
1501 if (*queue != BQUEUES) {
1502 /* Try for the requested queue first */
1503 bp = bufqueues[*queue].tqh_first;
1504 if (bp)
1505 goto found;
1506 }
1507
1508 /* Unable to use requested queue */
1509 age_bp = bufqueues[BQ_AGE].tqh_first;
1510 lru_bp = bufqueues[BQ_LRU].tqh_first;
1511 meta_bp = bufqueues[BQ_META].tqh_first;
1512
1513 if (!age_bp && !lru_bp && !meta_bp) {
1514 /*
1515 * Unavailable on the AGE, LRU and META queues
1516 * Try the empty list first
1517 */
1518 bp = bufqueues[BQ_EMPTY].tqh_first;
1519 if (bp) {
1520 *queue = BQ_EMPTY;
1521 goto found;
1522 }
1523
1524 /* Create a new temporary buffer header */
1525 bp = (struct buf *)zalloc(buf_hdr_zone);
1526
1527 if (bp) {
1528 bufhdrinit(bp);
1529 BLISTNONE(bp);
1530 binshash(bp, &invalhash);
1531 SET(bp->b_flags, B_HDRALLOC);
1532 *queue = BQ_EMPTY;
1533 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1534 buf_hdr_count++;
1535 goto found;
1536 }
1537
1538 /* Log this error condition */
1539 printf("getnewbuf: No useful buffers\n");
1540
1541 /* wait for a free buffer of any kind */
1542 needbuffer = 1;
1543 bufstats.bufs_sleeps++;
1544 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1545 splx(s);
1546 return (0);
1547 }
1548
1549 /* Buffer available either on AGE or LRU or META */
1550 bp = NULL;
1551 *queue = -1;
1552
1553 /* Buffer available either on AGE or LRU */
1554 if (!age_bp) {
1555 bp = lru_bp;
1556 *queue = BQ_LRU;
1557 } else if (!lru_bp) {
1558 bp = age_bp;
1559 *queue = BQ_AGE;
1560 } else { /* buffer available on both AGE and LRU */
1561 age_time = time.tv_sec - age_bp->b_timestamp;
1562 lru_time = time.tv_sec - lru_bp->b_timestamp;
1563 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1564 bp = age_bp;
1565 *queue = BQ_AGE;
1566 /*
1567 * we should probably re-timestamp everything in the
1568 * queues at this point with the current time
1569 */
1570 } else {
1571 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1572 bp = lru_bp;
1573 *queue = BQ_LRU;
1574 } else {
1575 bp = age_bp;
1576 *queue = BQ_AGE;
1577 }
1578 }
1579 }
1580
1581 if (!bp) { /* Neither on AGE nor on LRU */
1582 bp = meta_bp;
1583 *queue = BQ_META;
1584 } else if (meta_bp) {
1585 bp_time = time.tv_sec - bp->b_timestamp;
1586 meta_time = time.tv_sec - meta_bp->b_timestamp;
1587
1588 if (!(bp_time < 0) && !(meta_time < 0)) {
1589 /* time not set backwards */
1590 int bp_is_stale;
1591 bp_is_stale = (*queue == BQ_LRU) ?
1592 lru_is_stale : age_is_stale;
1593
1594 if ((meta_time >= meta_is_stale) &&
1595 (bp_time < bp_is_stale)) {
1596 bp = meta_bp;
1597 *queue = BQ_META;
1598 }
1599 }
1600 }
1601
1602 if (bp == NULL)
1603 panic("getnewbuf: null bp");
1604
1605 found:
1606 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1607 panic("getnewbuf: le_prev is deadbeef");
1608
1609 if(ISSET(bp->b_flags, B_BUSY))
1610 panic("getnewbuf reusing BUSY buf");
1611
1612 /* Clean it */
1613 if (bcleanbuf(bp)) {
1614 /* bawrite() issued, buffer not ready */
1615 splx(s);
1616 *queue = req;
1617 goto start;
1618 }
1619 splx(s);
1620 return (bp);
1621 }
1622
1623 #include <mach/mach_types.h>
1624 #include <mach/memory_object_types.h>
1625 #include <kern/sched_prim.h>
1626
1627 /*
1628 * Clean a buffer.
1629 * Returns 0 if the buffer is ready to use,
1630 * Returns 1 if issued a bawrite() to indicate
1631 * that the buffer is not ready.
1632 */
1633 static int
1634 bcleanbuf(struct buf *bp)
1635 {
1636 int s;
1637 struct ucred *cred;
1638 int hdralloc = 0;
1639
1640 s = splbio();
1641
1642 /* Remove from the queue */
1643 bremfree(bp);
1644
1645 /* Buffer is no longer on free lists. */
1646 SET(bp->b_flags, B_BUSY);
1647
1648 /* Check whether the buffer header was "allocated" */
1649 if (ISSET(bp->b_flags, B_HDRALLOC))
1650 hdralloc = 1;
1651
1652 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1653 panic("bcleanbuf: le_prev is deadbeef");
1654
1655 /*
1656 * If buffer was a delayed write, start the IO by queuing
1657 * it on the LAUNDRY queue, and return 1
1658 */
1659 if (ISSET(bp->b_flags, B_DELWRI)) {
1660 splx(s);
1661 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1662 blaundrycnt++;
1663 wakeup(&blaundrycnt);
1664 /* and give it a chance to run */
1665 (void)thread_block(THREAD_CONTINUE_NULL);
1666 return (1);
1667 }
1668
1669 if (bp->b_vp)
1670 brelvp(bp);
1671 bremhash(bp);
1672 BLISTNONE(bp);
1673
1674 splx(s);
1675
1676 if (ISSET(bp->b_flags, B_META)) {
1677 vm_offset_t elem = (vm_offset_t)bp->b_data;
1678 if (elem == 0)
1679 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1680
1681 if (ISSET(bp->b_flags, B_ZALLOC)) {
1682 if (bp->b_bufsize <= MAXMETA) {
1683 zone_t z;
1684
1685 z = getbufzone(bp->b_bufsize);
1686 bp->b_data = (caddr_t)0xdeadbeef;
1687 zfree(z, elem);
1688 CLR(bp->b_flags, B_ZALLOC);
1689 } else
1690 panic("bcleanbuf: B_ZALLOC set incorrectly");
1691 } else {
1692 bp->b_data = (caddr_t)0xdeadbeef;
1693 kmem_free(kernel_map, elem, bp->b_bufsize);
1694 }
1695 }
1696
1697 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1698
1699 /* disassociate us from our vnode, if we had one... */
1700 s = splbio();
1701
1702 /* clear out various other fields */
1703 bp->b_bufsize = 0;
1704 bp->b_data = 0;
1705 bp->b_flags = B_BUSY;
1706 if (hdralloc)
1707 SET(bp->b_flags, B_HDRALLOC);
1708 bp->b_dev = NODEV;
1709 bp->b_blkno = bp->b_lblkno = 0;
1710 bp->b_iodone = 0;
1711 bp->b_error = 0;
1712 bp->b_resid = 0;
1713 bp->b_bcount = 0;
1714 bp->b_dirtyoff = bp->b_dirtyend = 0;
1715 bp->b_validoff = bp->b_validend = 0;
1716
1717 /* nuke any credentials we were holding */
1718 cred = bp->b_rcred;
1719 if (cred != NOCRED) {
1720 bp->b_rcred = NOCRED;
1721 crfree(cred);
1722 }
1723 cred = bp->b_wcred;
1724 if (cred != NOCRED) {
1725 bp->b_wcred = NOCRED;
1726 crfree(cred);
1727 }
1728 splx(s);
1729 return (0);
1730 }
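/*
 * Note that a B_DELWRI buffer is not written here: it is parked on
 * BQ_LAUNDRY and bcleanbuf_thread() (at the end of this file) issues
 * the write, which is why getnewbuf() restarts its scan whenever
 * bcleanbuf() returns 1.
 */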
1731
1732
1733 /*
1734 * Wait for operations on the buffer to complete.
1735 * When they do, extract and return the I/O's error value.
1736 */
1737 int
1738 biowait(bp)
1739 struct buf *bp;
1740 {
1741 int s;
1742
1743 s = splbio();
1744 while (!ISSET(bp->b_flags, B_DONE))
1745 tsleep(bp, PRIBIO + 1, "biowait", 0);
1746 splx(s);
1747
1748 /* check for interruption of I/O (e.g. via NFS), then errors. */
1749 if (ISSET(bp->b_flags, B_EINTR)) {
1750 CLR(bp->b_flags, B_EINTR);
1751 return (EINTR);
1752 } else if (ISSET(bp->b_flags, B_ERROR))
1753 return (bp->b_error ? bp->b_error : EIO);
1754 else
1755 return (0);
1756 }
1757
1758 /*
1759 * Mark I/O complete on a buffer.
1760 *
1761 * If a callback has been requested, e.g. the pageout
1762 * daemon, do so. Otherwise, awaken waiting processes.
1763 *
1764 * [ Leffler, et al., says on p.247:
1765 * "This routine wakes up the blocked process, frees the buffer
1766 * for an asynchronous write, or, for a request by the pagedaemon
1767 * process, invokes a procedure specified in the buffer structure" ]
1768 *
1769 * In real life, the pagedaemon (or other system processes) wants
1770 * to do async stuff too, and doesn't want the buffer brelse()'d.
1771 * (for swap pager, that puts swap buffers on the free lists (!!!),
1772 * for the vn device, that puts malloc'd buffers on the free lists!)
1773 */
1774 void
1775 biodone(bp)
1776 struct buf *bp;
1777 {
1778 boolean_t funnel_state;
1779 struct vnode *vp;
1780
1781 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1782
1783 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1784 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1785
1786 if (ISSET(bp->b_flags, B_DONE))
1787 panic("biodone already");
1788 SET(bp->b_flags, B_DONE); /* note that it's done */
1789 /*
1790 * I/O was done, so don't believe
1791 * the DIRTY state from VM anymore
1792 */
1793 CLR(bp->b_flags, B_WASDIRTY);
1794
1795 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1796 vwakeup(bp); /* wake up reader */
1797
1798 if (kdebug_enable) {
1799 int code = DKIO_DONE;
1800
1801 if (bp->b_flags & B_READ)
1802 code |= DKIO_READ;
1803 if (bp->b_flags & B_ASYNC)
1804 code |= DKIO_ASYNC;
1805
1806 if (bp->b_flags & B_META)
1807 code |= DKIO_META;
1808 else if (bp->b_flags & (B_PGIN | B_PAGEOUT))
1809 code |= DKIO_PAGING;
1810
1811 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1812 bp, bp->b_vp, bp->b_resid, bp->b_error, 0);
1813 }
1814
1815 /* Wakeup the throttled write operations as needed */
1816 vp = bp->b_vp;
1817 if (vp
1818 && (vp->v_flag & VTHROTTLED)
1819 && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
1820 vp->v_flag &= ~VTHROTTLED;
1821 wakeup((caddr_t)&vp->v_numoutput);
1822 }
1823
1824 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1825 CLR(bp->b_flags, B_CALL); /* but note callout done */
1826 (*bp->b_iodone)(bp);
1827 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1828 brelse(bp);
1829 else { /* or just wakeup the buffer */
1830 CLR(bp->b_flags, B_WANTED);
1831 wakeup(bp);
1832 }
1833
1834 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1835 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1836
1837 thread_funnel_set(kernel_flock, funnel_state);
1838 }
1839
1840 /*
1841 * Return a count of buffers on the "locked" queue.
1842 */
1843 int
1844 count_lock_queue()
1845 {
1846 register struct buf *bp;
1847 register int n = 0;
1848
1849 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1850 bp = bp->b_freelist.tqe_next)
1851 n++;
1852 return (n);
1853 }
1854
1855 /*
1856 * Return a count of 'busy' buffers. Used at the time of shutdown.
1857 */
1858 int
1859 count_busy_buffers()
1860 {
1861 register struct buf *bp;
1862 register int nbusy = 0;
1863
1864 for (bp = &buf[nbuf]; --bp >= buf; )
1865 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1866 nbusy++;
1867 return (nbusy);
1868 }
1869
1870 #if DIAGNOSTIC
1871 /*
1872 * Print out statistics on the current allocation of the buffer pool.
1873 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1874 * in vfs_syscalls.c using sysctl.
1875 */
1876 void
1877 vfs_bufstats()
1878 {
1879 int s, i, j, count;
1880 register struct buf *bp;
1881 register struct bqueues *dp;
1882 int counts[MAXBSIZE/CLBYTES+1];
1883 static char *bname[BQUEUES] =
1884 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1885
1886 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1887 count = 0;
1888 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1889 counts[j] = 0;
1890 s = splbio();
1891 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1892 counts[bp->b_bufsize/CLBYTES]++;
1893 count++;
1894 }
1895 splx(s);
1896 printf("%s: total-%d", bname[i], count);
1897 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1898 if (counts[j] != 0)
1899 printf(", %d-%d", j * CLBYTES, counts[j]);
1900 printf("\n");
1901 }
1902 }
1903 #endif /* DIAGNOSTIC */
1904
1905 #define NRESERVEDIOBUFS 64
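/*
 * alloc_io_buf() below makes unprivileged callers sleep once fewer than
 * NRESERVEDIOBUFS of the niobuf headers remain free; callers that pass
 * priv may dip into that reserve.
 */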
1906
1907 __private_extern__ struct buf *
1908 alloc_io_buf(vp, priv)
1909 struct vnode *vp;
1910 int priv;
1911 {
1912 register struct buf *bp;
1913 int s;
1914
1915 s = splbio();
1916
1917 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1918 need_iobuffer = 1;
1919 bufstats.bufs_iobufsleeps++;
1920 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1921 }
1922
1923 while ((bp = iobufqueue.tqh_first) == NULL) {
1924 need_iobuffer = 1;
1925 bufstats.bufs_iobufsleeps++;
1926 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1927 }
1928
1929 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1930 bp->b_timestamp = 0;
1931
1932 /* clear out various fields */
1933 bp->b_flags = B_BUSY;
1934 bp->b_blkno = bp->b_lblkno = 0;
1935 bp->b_iodone = 0;
1936 bp->b_error = 0;
1937 bp->b_resid = 0;
1938 bp->b_bcount = 0;
1939 bp->b_bufsize = 0;
1940 bp->b_vp = vp;
1941
1942 if (vp->v_type == VBLK || vp->v_type == VCHR)
1943 bp->b_dev = vp->v_rdev;
1944 else
1945 bp->b_dev = NODEV;
1946 bufstats.bufs_iobufinuse++;
1947 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1948 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1949 splx(s);
1950
1951 return (bp);
1952 }
1953
1954 __private_extern__ void
1955 free_io_buf(bp)
1956 struct buf *bp;
1957 {
1958 int s;
1959
1960 s = splbio();
1961 /* put buffer back on the head of the iobufqueue */
1962 bp->b_vp = NULL;
1963 bp->b_flags = B_INVAL;
1964
1965 binsheadfree(bp, &iobufqueue, -1);
1966
1967 /* Wake up any processes waiting for any buffer to become free. */
1968 if (need_iobuffer) {
1969 need_iobuffer = 0;
1970 wakeup(&need_iobuffer);
1971 }
1972 bufstats.bufs_iobufinuse--;
1973 splx(s);
1974 }
1975
1976 /* disabled for now */
1977
1978 /* XXX move this to a separate file */
1979 /*
1980 * Dynamic Scaling of the Buffer Queues
1981 */
1982
1983 typedef long long blsize_t;
1984
1985 blsize_t MAXNBUF; /* initialize to (mem_size / PAGE_SIZE) */
1986 /* Global tunable limits */
1987 blsize_t nbufh; /* number of buffer headers */
1988 blsize_t nbuflow; /* minimum number of buffer headers required */
1989 blsize_t nbufhigh; /* maximum number of buffer headers allowed */
1990 blsize_t nbuftarget; /* preferred number of buffer headers */
1991
1992 /*
1993 * assertions:
1994 *
1995 * 1. 0 < nbuflow <= nbufh <= nbufhigh
1996 * 2. nbufhigh <= MAXNBUF
1997 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
1998 * 4. nbufh can not be set by sysctl().
1999 */
2000
2001 /* Per queue tunable limits */
2002
2003 struct bufqlim {
2004 blsize_t bl_nlow; /* minimum number of buffer headers required */
2005 blsize_t bl_num; /* number of buffer headers on the queue */
2006 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
2007 blsize_t bl_target; /* preferred number of buffer headers */
2008 long bl_stale; /* Seconds after which a buffer is considered stale */
2009 } bufqlim[BQUEUES];
2010
2011 /*
2012 * assertions:
2013 *
2014 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
2015 * 2. bl_nlhigh <= MAXNBUF
2016 * 3. bufqlim[BQ_META].bl_nlow != 0
2017 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
2018 * file system IO operations)
2019 * 5. bl_num can not be set by sysctl().
2020 * 6. bl_nhigh <= nbufhigh
2021 */
2022
2023 /*
2024 * Rationale:
2025 * ----------
2026 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
2027 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
2028 *
2029 * These limits are exported by means of sysctl().
2030 * It was decided to define blsize_t as a 64 bit quantity.
2031 * This will make sure that we will not be required to change it
2032 * as long as we do not exceed 64 bit address space for the kernel.
2033 *
2034 * The low and high limits are initialized at compile time,
2035 * and boot arguments can be used to override them. sysctl()
2036 * does not change them; it can read all of the values but
2037 * can set only the target. num is the current level.
2038 *
2039 * Advantages of having a "bufqscan" thread doing the balancing are:
2040 * Keep enough bufs on BQ_EMPTY.
2041 * getnewbuf() by default will always select a buffer from the BQ_EMPTY.
2042 * getnewbuf() performs best if a buffer was found there.
2043 * Also this minimizes the possibility of starting IO
2044 * from getnewbuf(). That's a performance win, too.
2045 *
2046 * Localize complex logic [balancing as well as time aging]
2047 * to balancebufq().
2048 *
2049 * Simplify getnewbuf() logic by elimination of time aging code.
2050 */
2051
2052 /*
2053 * Algorithm:
2054 * -----------
2055 * The goal of the dynamic scaling of the buffer queues is to keep
2056 * the size of the LRU close to bl_target. Buffers on a queue would
2057 * be time aged.
2058 *
2059 * There would be a thread which will be responsible for "balancing"
2060 * the buffer cache queues.
2061 *
2062 * The scan order would be: AGE, LRU, META, EMPTY.
2063 */
2064
2065 long bufqscanwait = 0;
2066
2067 static void bufqscan_thread();
2068 static int balancebufq(int q);
2069 static int btrimempty(int n);
2070 static __inline__ int initbufqscan(void);
2071 static __inline__ int nextbufq(int q);
2072 static void buqlimprt(int all);
2073
2074 static void
2075 bufq_balance_thread_init()
2076 {
2077
2078 if (bufqscanwait++ == 0) {
2079
2080 /* Initialize globals */
2081 MAXNBUF = (mem_size / PAGE_SIZE);
2082 nbufh = nbuf;
2083 nbuflow = min(nbufh, 100);
2084 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
2085 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
2086 nbuftarget = max(nbuflow, nbuftarget);
2087 nbuftarget = min(nbufhigh, nbuftarget);
2088
2089 /*
2090 * Initialize the bufqlim
2091 */
2092
2093 /* LOCKED queue */
2094 bufqlim[BQ_LOCKED].bl_nlow = 0;
2095 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2096 bufqlim[BQ_LOCKED].bl_target = 0;
2097 bufqlim[BQ_LOCKED].bl_stale = 30;
2098
2099 /* LRU queue */
2100 bufqlim[BQ_LRU].bl_nlow = 0;
2101 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
2102 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
2103 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
2104
2105 /* AGE queue */
2106 bufqlim[BQ_AGE].bl_nlow = 0;
2107 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
2108 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
2109 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
2110
2111 /* EMPTY queue */
2112 bufqlim[BQ_EMPTY].bl_nlow = 0;
2113 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
2114 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
2115 bufqlim[BQ_EMPTY].bl_stale = 600000;
2116
2117 /* META queue */
2118 bufqlim[BQ_META].bl_nlow = 0;
2119 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
2120 bufqlim[BQ_META].bl_target = nbuftarget/4;
2121 bufqlim[BQ_META].bl_stale = META_IS_STALE;
2122
2123 /* LAUNDRY queue */
2124 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
2125 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
2126 bufqlim[BQ_LAUNDRY].bl_target = 0;
2127 bufqlim[BQ_LAUNDRY].bl_stale = 30;
2128
2129 buqlimprt(1);
2130 }
2131
2132 /* create worker thread */
2133 kernel_thread(kernel_task, bufqscan_thread);
2134 }
2135
2136 /* The workloop for the buffer balancing thread */
2137 static void
2138 bufqscan_thread()
2139 {
2140 boolean_t funnel_state;
2141 int moretodo = 0;
2142
2143 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2144
2145 for(;;) {
2146 do {
2147 int q; /* buffer queue to process */
2148
2149 q = initbufqscan();
2150 for (; q; ) {
2151 moretodo |= balancebufq(q);
2152 q = nextbufq(q);
2153 }
2154 } while (moretodo);
2155
2156 #if DIAGNOSTIC
2157 vfs_bufstats();
2158 buqlimprt(0);
2159 #endif
2160 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2161 moretodo = 0;
2162 }
2163
2164 (void) thread_funnel_set(kernel_flock, FALSE);
2165 }
2166
2167 /* Seed for the buffer queue balancing */
2168 static __inline__ int
2169 initbufqscan()
2170 {
2171 /* Start with AGE queue */
2172 return (BQ_AGE);
2173 }
2174
2175 /* Pick next buffer queue to balance */
2176 static __inline__ int
2177 nextbufq(int q)
2178 {
2179 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2180 int i;
2181 
2182 for (i = 0; order[i] && order[i] != q; i++);	/* locate q in the scan order */
2183 return (order[i] ? order[i + 1] : 0);	/* return its successor; 0 ends one pass */
2184 }
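/*
 * The order[] array above is terminated with a 0 entry, and
 * bufqscan_thread()'s "for (; q; )" loop relies on that 0 to finish
 * one pass over the queues.
 */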
2185
2186 /* function to balance the buffer queues */
2187 static int
2188 balancebufq(int q)
2189 {
2190 int moretodo = 0;
2191 int s = splbio();
2192 int n;
2193
2194 /* reject invalid q */
2195 if ((q < 0) || (q >= BQUEUES))
2196 goto out;
2197
2198 /* LOCKED or LAUNDRY queue MUST not be balanced */
2199 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2200 goto out;
2201
2202 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2203
2204 /* If queue has less than target nothing more to do */
2205 if (n < 0)
2206 goto out;
2207
2208 if ( n > 8 ) {
2209 /* Balance only a small amount (12.5%) at a time */
2210 n >>= 3;
2211 }
2212
2213 /* EMPTY queue needs special handling */
2214 if (q == BQ_EMPTY) {
2215 moretodo |= btrimempty(n);
2216 goto out;
2217 }
2218
2219 for (; n > 0; n--) {
2220 struct buf *bp = bufqueues[q].tqh_first;
2221 if (!bp)
2222 break;
2223
2224 /* check if it's stale */
2225 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2226 if (bcleanbuf(bp)) {
2227 /* bawrite() issued, bp not ready */
2228 moretodo = 1;
2229 } else {
2230 /* release the cleaned buffer to BQ_EMPTY */
2231 SET(bp->b_flags, B_INVAL);
2232 brelse(bp);
2233 }
2234 } else
2235 break;
2236 }
2237
2238 out:
2239 splx(s);
2240 return (moretodo);
2241 }
2242
2243 static int
2244 btrimempty(int n)
2245 {
2246 /*
2247 * When struct bufs are allocated dynamically, this would
2248 * reclaim up to 'n' struct bufs from the empty queue.
2249 */
2250
2251 return (0);
2252 }
2253
2254 static __inline__ void
2255 bufqinc(int q)
2256 {
2257 if ((q < 0) || (q >= BQUEUES))
2258 return;
2259
2260 bufqlim[q].bl_num++;
2261 return;
2262 }
2263
2264 static __inline__ void
2265 bufqdec(int q)
2266 {
2267 if ((q < 0) || (q >= BQUEUES))
2268 return;
2269
2270 bufqlim[q].bl_num--;
2271 return;
2272 }
2273
2274 static void
2275 buqlimprt(int all)
2276 {
2277 int i;
2278 static char *bname[BQUEUES] =
2279 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2280
2281 if (all)
2282 for (i = 0; i < BQUEUES; i++) {
2283 printf("%s : ", bname[i]);
2284 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
2285 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2286 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
2287 printf("target = %ld, ", (long)bufqlim[i].bl_target);
2288 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
2289 }
2290 else
2291 for (i = 0; i < BQUEUES; i++) {
2292 printf("%s : ", bname[i]);
2293 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2294 }
2295 }
2296
2297 /*
2298 * If bcleanbuf(), called from getnewbuf(), issued the delayed write
2299 * on the same thread, there would be a potential for stack overrun
2300 * and deadlocks. So we always hand the work off to a worker thread.
2301 */
2302
2303 static void
2304 bcleanbuf_thread_init()
2305 {
2306 static void bcleanbuf_thread();
2307
2308 /* create worker thread */
2309 kernel_thread(kernel_task, bcleanbuf_thread);
2310 }
2311
2312 static void
2313 bcleanbuf_thread()
2314 {
2315 boolean_t funnel_state;
2316 struct buf *bp;
2317 int error = 0;
2318 int loopcnt = 0;
2319
2320 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2321
2322 doit:
2323 while (blaundrycnt == 0)
2324 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2325 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2326 /* Remove from the queue */
2327 bremfree(bp);
2328 blaundrycnt--;
2329 /* do the IO */
2330 error = bawrite_internal(bp, 0);
2331 if (error) {
2332 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2333 blaundrycnt++;
2334 if (loopcnt > 10) {
2335 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
2336 loopcnt = 0;
2337 } else {
2338 (void)thread_block(THREAD_CONTINUE_NULL);
2339 loopcnt++;
2340 }
2341 }
2342 /* start again */
2343 goto doit;
2344
2345 (void) thread_funnel_set(kernel_flock, funnel_state);
2346 }