bsd/vfs/vfs_bio.c

   1 /*
   2  * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*-
  24  * Copyright (c) 1994 Christopher G. Demetriou
  25  * Copyright (c) 1982, 1986, 1989, 1993
  26  *      The Regents of the University of California.  All rights reserved.
  27  * (c) UNIX System Laboratories, Inc.
  28  * All or some portions of this file are derived from material licensed
  29  * to the University of California by American Telephone and Telegraph
  30  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  31  * the permission of UNIX System Laboratories, Inc.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  * The NEXTSTEP Software License Agreement specifies the terms
  62  * and conditions for redistribution.
  63  *
  64  *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
  65  */
  66
  67
  68 /*
  69  * Some references:
  70  *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
  71  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
   72  *              UNIX Operating System (Addison-Wesley, 1989)
  73  */
  74 #define ZALLOC_METADATA 1
  75
  76 #include <sys/param.h>
  77 #include <sys/systm.h>
  78 #include <sys/proc.h>
  79 #include <sys/buf.h>
  80 #include <sys/vnode.h>
  81 #include <sys/mount.h>
  82 #include <sys/trace.h>
  83 #include <sys/malloc.h>
  84 #include <sys/resourcevar.h>
  85 #include <miscfs/specfs/specdev.h>
  86 #include <sys/ubc.h>
  87 #include <vm/vm_pageout.h>
  88 #if DIAGNOSTIC
  89 #include <kern/assert.h>
  90 #endif /* DIAGNOSTIC */
  91 #include <kern/task.h>
  92 #include <kern/zalloc.h>
  93
  94 #include <sys/kdebug.h>
  95
  96 extern void bufqinc(int q);
  97 extern void bufqdec(int q);
  98 extern void bufq_balance_thread_init();
  99
 100 extern void reassignbuf(struct buf *, struct vnode *);
 101 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
 102
 103 extern int niobuf;      /* The number of IO buffer headers for cluster IO */
 104 int blaundrycnt;
 105
 106 /* zone allocated buffer headers */
 107 static zone_t buf_hdr_zone;
 108 static int buf_hdr_count;
 109
 110 #if TRACE
 111 struct  proc *traceproc;
 112 int     tracewhich, tracebuf[TRCSIZ];
 113 u_int   tracex;
 114 char    traceflags[TR_NFLAGS];
 115 #endif /* TRACE */
 116
 117 /*
 118  * Definitions for the buffer hash lists.
 119  */
 120 #define BUFHASH(dvp, lbn)       \
 121         (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
 122 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
 123 u_long  bufhash;
 124
 125 /* Definitions for the buffer stats. */
 126 struct bufstats bufstats;
 127
 128 /* Number of delayed write buffers */
 129 int nbdwrite = 0;
 130
 131 /*
 132  * Insq/Remq for the buffer hash lists.
 133  */
 134 #if 0
 135 #define binshash(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_hash)
 136 #define bremhash(bp)            LIST_REMOVE(bp, b_hash)
 137 #endif /* 0 */
 138
 139
 140 TAILQ_HEAD(ioqueue, buf) iobufqueue;
 141 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
 142 static int needbuffer;
 143 static int need_iobuffer;
 144
 145 /*
 146  * Insq/Remq for the buffer free lists.
 147  */
 148 #define binsheadfree(bp, dp, whichq)    do { \
 149                                     TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
 150                                         bufqinc((whichq));      \
 151                                         (bp)->b_whichq = whichq; \
 152                                     (bp)->b_timestamp = time.tv_sec; \
 153                                 } while (0)
 154
 155 #define binstailfree(bp, dp, whichq)    do { \
 156                                     TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
 157                                         bufqinc((whichq));      \
 158                                         (bp)->b_whichq = whichq; \
 159                                     (bp)->b_timestamp = time.tv_sec; \
 160                                 } while (0)
 161
 162 #define BHASHENTCHECK(bp)       \
 163         if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)  \
 164                 panic("%x: b_hash.le_prev is not deadbeef", (bp));
 165
 166 #define BLISTNONE(bp)   \
 167         (bp)->b_hash.le_next = (struct buf *)0; \
 168         (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
 169
 170 simple_lock_data_t bufhashlist_slock;           /* lock on buffer hash list */
 171
 172 /* number of per vnode, "in flight" buffer writes */
 173 #define BUFWRITE_THROTTLE       9
 174
 175 /*
 176  * Time in seconds before a buffer on a list is
 177  * considered as a stale buffer
 178  */
 179 #define LRU_IS_STALE 120 /* default value for the LRU */
 180 #define AGE_IS_STALE 60  /* default value for the AGE */
 181 #define META_IS_STALE 180 /* default value for the BQ_META */
 182
 183 int lru_is_stale = LRU_IS_STALE;
 184 int age_is_stale = AGE_IS_STALE;
 185 int meta_is_stale = META_IS_STALE;
 186
 187 #if 1
 188 void
 189 blistenterhead(struct bufhashhdr * head, struct buf * bp)
 190 {
 191         if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
 192                 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
 193         (head)->lh_first = bp;
 194         bp->b_hash.le_prev = &(head)->lh_first;
 195         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 196                 panic("blistenterhead: le_prev is deadbeef");
 197
 198 }
 199 #endif
 200
 201 #if 1
 202 void
 203 binshash(struct buf *bp, struct bufhashhdr *dp)
 204 {
 205 int s;
 206
 207 struct buf *nbp;
 208
 209         simple_lock(&bufhashlist_slock);
 210 #if 0
 211         if(incore(bp->b_vp, bp->b_lblkno)) {
 212                 panic("adding to queue already existing element");
 213         }
 214 #endif /* 0 */
 215         BHASHENTCHECK(bp);
 216
 217         nbp = dp->lh_first;
 218         for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
 219                 if(nbp == bp)
 220                         panic("buf already in hashlist");
 221         }
 222
 223 #if 0
 224         LIST_INSERT_HEAD(dp, bp, b_hash);
 225 #else
 226         blistenterhead(dp, bp);
 227 #endif
 228         simple_unlock(&bufhashlist_slock);
 229 }
 230
 231 void
 232 bremhash(struct buf *bp)
 233 {
 234         int s;
 235
 236         simple_lock(&bufhashlist_slock);
 237         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 238                 panic("bremhash le_prev is deadbeef");
 239         if (bp->b_hash.le_next == bp)
 240                 panic("bremhash: next points to self");
 241
 242         if (bp->b_hash.le_next != NULL)
 243                 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
 244         *bp->b_hash.le_prev = (bp)->b_hash.le_next;
 245         simple_unlock(&bufhashlist_slock);
 246 }
 247
 248 #endif /* 1 */
 249
 250
 251 /*
 252  * Remove a buffer from the free list it's on
 253  */
 254 void
 255 bremfree(bp)
 256         struct buf *bp;
 257 {
 258         struct bqueues *dp = NULL;
 259         int whichq = -1;
 260
 261         /*
 262          * We only calculate the head of the freelist when removing
 263          * the last element of the list as that is the only time that
 264          * it is needed (e.g. to reset the tail pointer).
 265          *
 266          * NB: This makes an assumption about how tailq's are implemented.
 267          */
 268         if (bp->b_freelist.tqe_next == NULL) {
 269                 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
 270                         if (dp->tqh_last == &bp->b_freelist.tqe_next)
 271                                 break;
 272                 if (dp == &bufqueues[BQUEUES])
 273                         panic("bremfree: lost tail");
 274         }
 275         TAILQ_REMOVE(dp, bp, b_freelist);
 276         whichq = bp->b_whichq;
 277         bufqdec(whichq);
 278         bp->b_whichq = -1;
 279         bp->b_timestamp = 0;
 280 }
 281
 282 static __inline__ void
 283 bufhdrinit(struct buf *bp)
 284 {
 285         bzero((char *)bp, sizeof *bp);
 286         bp->b_dev = NODEV;
 287         bp->b_rcred = NOCRED;
 288         bp->b_wcred = NOCRED;
 289         bp->b_vnbufs.le_next = NOLIST;
 290         bp->b_flags = B_INVAL;
 291
 292         return;
 293 }
 294
 295 /*
 296  * Initialize buffers and hash links for buffers.
 297  */
 298 void
 299 bufinit()
 300 {
 301         register struct buf *bp;
 302         register struct bqueues *dp;
 303         register int i;
 304         int metabuf;
 305         long whichq;
 306         static void bufzoneinit();
 307         static void bcleanbuf_thread_init();
 308
 309         /* Initialize the buffer queues ('freelists') and the hash table */
 310         for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
 311                 TAILQ_INIT(dp);
 312         bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
 313
 314         simple_lock_init(&bufhashlist_slock );
 315
 316         metabuf = nbuf/8; /* reserved for meta buf */
 317
 318         /* Initialize the buffer headers */
 319         for (i = 0; i < nbuf; i++) {
 320                 bp = &buf[i];
 321                 bufhdrinit(bp);
 322
 323                 /*
 324                  * metabuf buffer headers on the meta-data list and
 325                  * rest of the buffer headers on the empty list
 326                  */
 327                 if (--metabuf)
 328                         whichq = BQ_META;
 329                 else
 330                         whichq = BQ_EMPTY;
 331
 332                 BLISTNONE(bp);
 333                 dp = &bufqueues[whichq];
 334                 binsheadfree(bp, dp, whichq);
 335                 binshash(bp, &invalhash);
 336         }
 337
 338         for (; i < nbuf + niobuf; i++) {
 339                 bp = &buf[i];
 340                 bufhdrinit(bp);
 341                 binsheadfree(bp, &iobufqueue, -1);
 342         }
 343
 344         printf("using %d buffer headers and %d cluster IO buffer headers\n",
 345                 nbuf, niobuf);
 346
 347         /* Set up zones used by the buffer cache */
 348         bufzoneinit();
 349
 350         /* start the bcleanbuf() thread */
 351         bcleanbuf_thread_init();
 352
 353 #if 0   /* notyet */
 354         /* create a thread to do dynamic buffer queue balancing */
 355         bufq_balance_thread_init();
 356 #endif /* XXX */
 357 }
 358
 359 /* __inline  */
 360 struct buf *
 361 bio_doread(vp, blkno, size, cred, async, queuetype)
 362         struct vnode *vp;
 363         daddr_t blkno;
 364         int size;
 365         struct ucred *cred;
 366         int async;
 367         int queuetype;
 368 {
 369         register struct buf *bp;
 370         struct proc     *p = current_proc();
 371
 372         bp = getblk(vp, blkno, size, 0, 0, queuetype);
 373
 374         /*
 375          * If buffer does not have data valid, start a read.
 376          * Note that if buffer is B_INVAL, getblk() won't return it.
 377          * Therefore, it's valid if it's I/O has completed or been delayed.
 378          */
 379         if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
 380                 /* Start I/O for the buffer (keeping credentials). */
 381                 SET(bp->b_flags, B_READ | async);
 382                 if (cred != NOCRED && bp->b_rcred == NOCRED) {
 383                         /*
 384                          * NFS has embedded ucred.
 385                          * Can not crhold() here as that causes zone corruption
 386                          */
 387                         bp->b_rcred = crdup(cred);
 388                 }
 389                 VOP_STRATEGY(bp);
 390
 391                 trace(TR_BREADMISS, pack(vp, size), blkno);
 392
 393                 /* Pay for the read. */
 394                 if (p && p->p_stats)
 395                         p->p_stats->p_ru.ru_inblock++;          /* XXX */
 396         } else if (async) {
 397                 brelse(bp);
 398         }
 399
 400         trace(TR_BREADHIT, pack(vp, size), blkno);
 401
 402         return (bp);
 403 }
 404 /*
 405  * Read a disk block.
 406  * This algorithm described in Bach (p.54).
 407  */
 408 int
 409 bread(vp, blkno, size, cred, bpp)
 410         struct vnode *vp;
 411         daddr_t blkno;
 412         int size;
 413         struct ucred *cred;
 414         struct buf **bpp;
 415 {
 416         register struct buf *bp;
 417
 418         /* Get buffer for block. */
 419         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
 420
 421         /* Wait for the read to complete, and return result. */
 422         return (biowait(bp));
 423 }
 424
 425 /*
 426  * Read a disk block. [bread() for meta-data]
 427  * This algorithm described in Bach (p.54).
 428  */
 429 int
 430 meta_bread(vp, blkno, size, cred, bpp)
 431         struct vnode *vp;
 432         daddr_t blkno;
 433         int size;
 434         struct ucred *cred;
 435         struct buf **bpp;
 436 {
 437         register struct buf *bp;
 438
 439         /* Get buffer for block. */
 440         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
 441
 442         /* Wait for the read to complete, and return result. */
 443         return (biowait(bp));
 444 }
 445
 446 /*
 447  * Read-ahead multiple disk blocks. The first is sync, the rest async.
 448  * Trivial modification to the breada algorithm presented in Bach (p.55).
 449  */
 450 int
 451 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
 452         struct vnode *vp;
 453         daddr_t blkno; int size;
 454         daddr_t rablks[]; int rasizes[];
 455         int nrablks;
 456         struct ucred *cred;
 457         struct buf **bpp;
 458 {
 459         register struct buf *bp;
 460         int i;
 461
 462         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
 463
 464         /*
 465          * For each of the read-ahead blocks, start a read, if necessary.
 466          */
 467         for (i = 0; i < nrablks; i++) {
 468                 /* If it's in the cache, just go on to next one. */
 469                 if (incore(vp, rablks[i]))
 470                         continue;
 471
 472                 /* Get a buffer for the read-ahead block */
 473                 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
 474         }
 475
 476         /* Otherwise, we had to start a read for it; wait until it's valid. */
 477         return (biowait(bp));
 478 }
 479
 480 /*
 481  * Read with single-block read-ahead.  Defined in Bach (p.55), but
 482  * implemented as a call to breadn().
 483  * XXX for compatibility with old file systems.
 484  */
 485 int
 486 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
 487         struct vnode *vp;
 488         daddr_t blkno; int size;
 489         daddr_t rablkno; int rabsize;
 490         struct ucred *cred;
 491         struct buf **bpp;
 492 {
 493
 494         return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
 495 }
 496
 497 /*
 498  * Block write.  Described in Bach (p.56)
 499  */
 500 int
 501 bwrite(bp)
 502         struct buf *bp;
 503 {
 504         int rv, sync, wasdelayed;
 505         struct proc     *p = current_proc();
 506         upl_t  upl;
 507         upl_page_info_t *pl;
 508         void * object;
 509         kern_return_t kret;
 510         struct vnode *vp = bp->b_vp;
 511
 512         /* Remember buffer type, to switch on it later. */
 513         sync = !ISSET(bp->b_flags, B_ASYNC);
 514         wasdelayed = ISSET(bp->b_flags, B_DELWRI);
 515         CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
 516         if (wasdelayed)
 517                 nbdwrite--;
 518
 519         if (!sync) {
 520                 /*
 521                  * If not synchronous, pay for the I/O operation and make
 522                  * sure the buf is on the correct vnode queue.  We have
 523                  * to do this now, because if we don't, the vnode may not
 524                  * be properly notified that its I/O has completed.
 525                  */
 526                 if (wasdelayed)
 527                         reassignbuf(bp, vp);
 528                 else
 529                 if (p && p->p_stats)
 530                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 531         }
 532
 533         trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
 534
 535         /* Initiate disk write.  Make sure the appropriate party is charged. */
 536         SET(bp->b_flags, B_WRITEINPROG);
 537         vp->v_numoutput++;
 538
 539         VOP_STRATEGY(bp);
 540
 541         if (sync) {
 542                 /*
 543                  * If I/O was synchronous, wait for it to complete.
 544                  */
 545                 rv = biowait(bp);
 546
 547                 /*
 548                  * Pay for the I/O operation, if it's not been paid for, and
 549                  * make sure it's on the correct vnode queue. (async operatings
 550                  * were payed for above.)
 551                  */
 552                 if (wasdelayed)
 553                         reassignbuf(bp, vp);
 554                 else
 555                 if (p && p->p_stats)
 556                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 557
 558                 /* Release the buffer. */
 559                 brelse(bp);
 560
 561                 return (rv);
 562         } else {
 563                 return (0);
 564         }
 565 }
 566
 567 int
 568 vn_bwrite(ap)
 569         struct vop_bwrite_args *ap;
 570 {
 571         return (bwrite(ap->a_bp));
 572 }
 573
 574 /*
 575  * Delayed write.
 576  *
 577  * The buffer is marked dirty, but is not queued for I/O.
 578  * This routine should be used when the buffer is expected
 579  * to be modified again soon, typically a small write that
 580  * partially fills a buffer.
 581  *
 582  * NB: magnetic tapes cannot be delayed; they must be
 583  * written in the order that the writes are requested.
 584  *
 585  * Described in Leffler, et al. (pp. 208-213).
 586  *
 587  * Note: With the abilitty to allocate additional buffer
 588  * headers, we can get in to the situation where "too" many
 589  * bdwrite()s can create situation where the kernel can create
 590  * buffers faster than the disks can service. Doing a bawrite() in
 591  * cases were we have "too many" outstanding bdwrite()s avoids that.
 592  */
 593 void
 594 bdwrite(bp)
 595         struct buf *bp;
 596 {
 597         struct proc *p = current_proc();
 598         struct vnode *vp = bp->b_vp;
 599
 600         /*
 601          * If the block hasn't been seen before:
 602          *      (1) Mark it as having been seen,
 603          *      (2) Charge for the write.
 604          *      (3) Make sure it's on its vnode's correct block list,
 605          */
 606         if (!ISSET(bp->b_flags, B_DELWRI)) {
 607                 SET(bp->b_flags, B_DELWRI);
 608                 if (p && p->p_stats)
 609                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 610                 nbdwrite ++;
 611                 reassignbuf(bp, vp);
 612         }
 613
 614
 615         /* If this is a tape block, write it the block now. */
 616         if (ISSET(bp->b_flags, B_TAPE)) {
 617                 /* bwrite(bp); */
 618         VOP_BWRITE(bp);
 619                 return;
 620         }
 621
 622         /*
 623          * If the vnode has "too many" write operations in progress
 624          * wait for them to finish the IO
 625          */
 626         while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
 627                 vp->v_flag |= VTHROTTLED;
 628                 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
 629         }
 630
 631         /*
 632          * If we have too many delayed write buffers,
 633          * more than we can "safely" handle, just fall back to
 634          * doing the async write
 635          */
 636         if (nbdwrite < 0)
 637                 panic("bdwrite: Negative nbdwrite");
 638
 639         if (nbdwrite > ((nbuf/4)*3)) {
 640                 bawrite(bp);
 641                 return;
 642         }
 643
 644         /* Otherwise, the "write" is done, so mark and release the buffer. */
 645         SET(bp->b_flags, B_DONE);
 646         brelse(bp);
 647 }
 648
 649 /*
 650  * Asynchronous block write; just an asynchronous bwrite().
 651  *
 652  * Note: With the abilitty to allocate additional buffer
 653  * headers, we can get in to the situation where "too" many
 654  * bawrite()s can create situation where the kernel can create
 655  * buffers faster than the disks can service.
 656  * We limit the number of "in flight" writes a vnode can have to
 657  * avoid this.
 658  */
 659 void
 660 bawrite(bp)
 661         struct buf *bp;
 662 {
 663         struct vnode *vp = bp->b_vp;
 664
 665         if (vp) {
 666                 /*
 667                  * If the vnode has "too many" write operations in progress
 668                  * wait for them to finish the IO
 669                  */
 670                 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
 671                         vp->v_flag |= VTHROTTLED;
 672                         (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bawrite", 0);
 673                 }
 674         }
 675
 676         SET(bp->b_flags, B_ASYNC);
 677         VOP_BWRITE(bp);
 678 }
 679
 680 /*
 681  * Release a buffer on to the free lists.
 682  * Described in Bach (p. 46).
 683  */
 684 void
 685 brelse(bp)
 686         struct buf *bp;
 687 {
 688         struct bqueues *bufq;
 689         int s;
 690         long whichq;
 691
 692         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
 693                      bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
 694                      bp->b_flags, 0);
 695
 696         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
 697
 698         /* IO is done. Cleanup the UPL state */
 699         if (!ISSET(bp->b_flags, B_META)
 700                 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
 701                 kern_return_t kret;
 702                 upl_t         upl;
 703                 int           upl_flags;
 704
 705                 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
 706                         if ( !ISSET(bp->b_flags, B_INVAL)) {
 707                                 kret = ubc_create_upl(bp->b_vp,
 708                                                                 ubc_blktooff(bp->b_vp, bp->b_lblkno),
 709                                                                 bp->b_bufsize,
 710                                                             &upl,
 711                                                                 NULL,
 712                                                                 UPL_PRECIOUS);
 713                                 if (kret != KERN_SUCCESS)
 714                                         panic("brelse: Failed to get pagelists");
 715 #ifdef  UBC_DEBUG
 716                                 upl_ubc_alias_set(upl, bp, 5);
 717 #endif /* UBC_DEBUG */
 718                         } else
 719                                 upl = (upl_t) 0;
 720                 } else {
 721                         upl = bp->b_pagelist;
 722                         kret = ubc_upl_unmap(upl);
 723
 724                         if (kret != KERN_SUCCESS)
 725                                 panic("kernel_upl_unmap failed");
 726                         bp->b_data = 0;
 727                 }
 728                 if (upl) {
 729                         if (bp->b_flags & (B_ERROR | B_INVAL)) {
 730                             if (bp->b_flags & (B_READ | B_INVAL))
 731                                         upl_flags = UPL_ABORT_DUMP_PAGES;
 732                                 else
 733                                         upl_flags = 0;
 734                                 ubc_upl_abort(upl, upl_flags);
 735                         } else {
 736                             if (ISSET(bp->b_flags, B_NEEDCOMMIT))
 737                                     upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
 738                             else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
 739                                         upl_flags = UPL_COMMIT_SET_DIRTY ;
 740                                 else
 741                                     upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
 742                                 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
 743                                         UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
 744                         }
 745                         s = splbio();
 746                         CLR(bp->b_flags, B_PAGELIST);
 747                         bp->b_pagelist = 0;
 748                         splx(s);
 749                 }
 750         } else {
 751                 if(ISSET(bp->b_flags, B_PAGELIST))
 752                         panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
 753         }
 754
 755         /* Wake up any processes waiting for any buffer to become free. */
 756         if (needbuffer) {
 757                 needbuffer = 0;
 758                 wakeup(&needbuffer);
 759         }
 760
 761         /* Wake up any proceeses waiting for _this_ buffer to become free. */
 762         if (ISSET(bp->b_flags, B_WANTED)) {
 763                 CLR(bp->b_flags, B_WANTED);
 764                 wakeup(bp);
 765         }
 766
 767         /* Block disk interrupts. */
 768         s = splbio();
 769
 770         /*
 771          * Determine which queue the buffer should be on, then put it there.
 772          */
 773
 774         /* If it's locked, don't report an error; try again later. */
 775         if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
 776                 CLR(bp->b_flags, B_ERROR);
 777
 778         /* If it's not cacheable, or an error, mark it invalid. */
 779         if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
 780                 SET(bp->b_flags, B_INVAL);
 781
 782         if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
 783                 /*
 784                  * If it's invalid or empty, dissociate it from its vnode
 785                  * and put on the head of the appropriate queue.
 786                  */
 787                 if (bp->b_vp)
 788                         brelvp(bp);
 789                 if (ISSET(bp->b_flags, B_DELWRI)) {
 790                         CLR(bp->b_flags, B_DELWRI);
 791                         nbdwrite--;
 792                 }
 793                 if (bp->b_bufsize <= 0)
 794                         whichq = BQ_EMPTY;      /* no data */
 795                 else
 796                         whichq = BQ_AGE;        /* invalid data */
 797
 798                 bufq = &bufqueues[whichq];
 799                 binsheadfree(bp, bufq, whichq);
 800         } else {
 801                 /*
 802                  * It has valid data.  Put it on the end of the appropriate
 803                  * queue, so that it'll stick around for as long as possible.
 804                  */
 805                 if (ISSET(bp->b_flags, B_LOCKED))
 806                         whichq = BQ_LOCKED;             /* locked in core */
 807                 else if (ISSET(bp->b_flags, B_META))
 808                         whichq = BQ_META;               /* meta-data */
 809                 else if (ISSET(bp->b_flags, B_AGE))
 810                         whichq = BQ_AGE;                /* stale but valid data */
 811                 else
 812                         whichq = BQ_LRU;                /* valid data */
 813
 814                 bufq = &bufqueues[whichq];
 815                 binstailfree(bp, bufq, whichq);
 816         }
 817
 818         /* Unlock the buffer. */
 819         CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
 820
 821         /* Allow disk interrupts. */
 822         splx(s);
 823
 824         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
 825                      (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
 826 }
 827
 828 /*
 829  * Determine if a block is in the cache.
 830  * Just look on what would be its hash chain.  If it's there, return
 831  * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 832  * we normally don't return the buffer, unless the caller explicitly
 833  * wants us to.
 834  */
 835 struct buf *
 836 incore(vp, blkno)
 837         struct vnode *vp;
 838         daddr_t blkno;
 839 {
 840         struct buf *bp;
 841         int bufseen = 0;
 842
 843         bp = BUFHASH(vp, blkno)->lh_first;
 844
 845         /* Search hash chain */
 846         for (; bp != NULL; bp = bp->b_hash.le_next, bufseen++) {
 847                 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
 848                     !ISSET(bp->b_flags, B_INVAL))
 849                         return (bp);
 850         if(bufseen >= nbuf)
 851                 panic("walked more than nbuf in incore");
 852
 853         }
 854
 855         return (0);
 856 }
 857
 858
 859 /* getblk: find or set up the buffer for block `blkno' of vnode `vp' (UBC aware). */
 860 /*
      * Get a block of requested size that is associated with
      * a given vnode and block offset.
      *
      * Parameters:
      *   vp        vnode the block belongs to
      *   blkno     logical block number within vp
      *   size      requested size, handed to allocbuf()
      *   slpflag,
      *   slptimeo  passed to tsleep() when the buffer is found busy, and
      *             to getnewbuf() when a fresh buffer is needed
      *   operation BLK_READ, BLK_WRITE or BLK_META.  BLK_PAGEIN and
      *             BLK_PAGEOUT panic; any other value panics as unknown.
      *
      * If the block is found by incore():
      *   - busy buffer: mark it B_WANTED, sleep, then retry from the top.
      *     NULL is returned instead when tsleep() reports an error and the
      *     caller passed PCATCH, or the error is EWOULDBLOCK and slptimeo
      *     is non-zero.
      *   - otherwise: mark it B_BUSY | B_CACHE, take it off its free list
      *     and call allocbuf().  For BLK_READ / BLK_WRITE, when
      *     UBCISVALID(bp->b_vp) and b_bufsize is non-zero, a UPL is created
      *     for the block, recorded in b_pagelist (B_PAGELIST set) and
      *     mapped into b_data.  For BLK_META only b_data is checked.
      *
      * If the block is not in the cache a buffer is obtained from
      * getnewbuf(), entered in the hash for (vp, blkno) and sized with
      * allocbuf().  The operation is forced to BLK_META when
      * UBCINVALID(vp) is true or UBCINFOEXISTS(vp) is false.
      *   - BLK_META: buffer is flagged B_META, gets kernel memory for its
      *     data (kmem_alloc() here unless ZALLOC_METADATA is set) and is
      *     attached to the vnode with bgetvp().  Its contents are invalid.
      *   - BLK_READ / BLK_WRITE: a UPL is created and mapped as above.  If
      *     upl_valid_page(pl, 0) reports a valid page the buffer is marked
      *     B_CACHE | B_DONE, its valid/dirty ranges are filled in and
      *     VOP_BMAP() translates the block number; otherwise the lookup is
      *     counted as a miss.
      *
      * It is up to the caller to insure that the cached blocks be of the
      * correct size.
      */
 868 struct buf *
 869 getblk(vp, blkno, size, slpflag, slptimeo, operation)
 870         register struct vnode *vp;
 871         daddr_t blkno;
 872         int size, slpflag, slptimeo, operation;
 873 {
 874         struct buf *bp;
 875         int s, err;
 876         upl_t upl;
 877         upl_page_info_t *pl;
 878         kern_return_t kret;
 879         int error=0;
 880         int pagedirty = 0;
 881
 882         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
 883                      blkno * PAGE_SIZE, size, operation, 0, 0);
 884 start:
             /*
              * Retry point: re-entered after sleeping on a busy buffer, when
              * getnewbuf() returns NULL, or when the block shows up in the
              * hash while a new buffer was being obtained.
              */
 885
 886         s = splbio();
 887         if (bp = incore(vp, blkno)) {
 888                 /* Found in the Buffer Cache */
 889                 if (ISSET(bp->b_flags, B_BUSY)) {
 890                         /* but is busy */
 891                         switch (operation) {
 892                         case BLK_READ:
 893                         case BLK_WRITE:
 894                         case BLK_META:
 895                                 SET(bp->b_flags, B_WANTED);
 896                                 bufstats.bufs_busyincore++;
 897                                 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
 898                                     slptimeo);
 899                                 splx(s);
 900                                 /*
 901                                  * Callers who call with PCATCH or timeout are
 902                                  * willing to deal with the NULL pointer
 903                                  */
 904                                 if (err && ((slpflag & PCATCH) ||
 905                                                          ((err == EWOULDBLOCK) && slptimeo)))
 906                                         return (NULL);
 907                                 goto start;
 908                                 /*NOTREACHED*/
 909                                 break;
 910
 911                         case BLK_PAGEIN:
 912                                 /* pagein operation must not use getblk */
 913                                 panic("getblk: pagein for incore busy buffer");
 914                                 splx(s);
 915                                 /*NOTREACHED*/
 916                                 break;
 917
 918                         case BLK_PAGEOUT:
 919                                 /* pageout operation must not use getblk */
 920                                 panic("getblk: pageout for incore busy buffer");
 921                                 splx(s);
 922                                 /*NOTREACHED*/
 923                                 break;
 924
 925                         default:
 926                                 panic("getblk: %d unknown operation 1", operation);
 927                                 /*NOTREACHED*/
 928                                 break;
 929                         }
 930                 } else {
 931                         /* not busy */
 932                         SET(bp->b_flags, (B_BUSY | B_CACHE));
 933                         bremfree(bp);
 934                         bufstats.bufs_incore++;
 935                         splx(s);
 936
 937                         allocbuf(bp, size);
                             /* A buffer that was sitting on a free list must not own a page list. */
 938                         if (ISSET(bp->b_flags, B_PAGELIST))
 939                                         panic("pagelist buffer is not busy");
 940
 941                         switch (operation) {
 942                         case BLK_READ:
 943                         case BLK_WRITE:
 944                                 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
 945                                         kret = ubc_create_upl(vp,
 946                                                                         ubc_blktooff(vp, bp->b_lblkno),
 947                                                                         bp->b_bufsize,
 948                                                                         &upl,
 949                                                                         &pl,
 950                                                                         UPL_PRECIOUS);
 951                                         if (kret != KERN_SUCCESS)
 952                                                 panic("Failed to get pagelists");
 953
 954                                         SET(bp->b_flags, B_PAGELIST);
 955                                         bp->b_pagelist = upl;
 956
                                             /*
                                              * An incore buffer whose first page is not valid is
                                              * tolerated only for NFS vnodes; there the buffer is
                                              * simply no longer treated as cached.
                                              */
 957                                         if (!upl_valid_page(pl, 0)) {
 958                                                 if (vp->v_tag != VT_NFS)
 959                                                         panic("getblk: incore buffer without valid page");
 960                                                 CLR(bp->b_flags, B_CACHE);
 961                                         }
 962
 963                                         if (upl_dirty_page(pl, 0))
 964                                                 SET(bp->b_flags, B_WASDIRTY);
 965                                         else
 966                                                 CLR(bp->b_flags, B_WASDIRTY);
 967
 968                                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
 969                                         if (kret != KERN_SUCCESS) {
 970                                                 panic("getblk: ubc_upl_map() failed with (%d)",
 971                                                                   kret);
 972                                         }
 973                                         if (bp->b_data == 0) panic("ubc_upl_map mapped 0");
 974                                 }
 975                                 break;
 976
 977                         case BLK_META:
 978                                 /*
 979                                  * VM is not involved in IO for the meta data
 980                                  * buffer already has valid data
 981                                  */
 982                         if(bp->b_data == 0)
 983                                         panic("bp->b_data null incore buf=%x", bp);
 984                                 break;
 985
 986                         case BLK_PAGEIN:
 987                         case BLK_PAGEOUT:
 988                                 panic("getblk: paging operation 1");
 989                                 break;
 990
 991                         default:
 992                                 panic("getblk: %d unknown operation 2", operation);
 993                                 /*NOTREACHED*/
 994                                 break;
 995                         }
 996                 }
 997         } else { /* not incore() */
 998                 int queue = BQ_EMPTY; /* Start with no preference */
 999                 splx(s);
1000
                     /*
                      * Vnodes for which UBCINVALID() is true, or which have no UBC
                      * info, take the meta-data path regardless of what the
                      * caller asked for.
                      */
1001                 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
1002                         !(UBCINFOEXISTS(vp))) {
1003                         operation = BLK_META;
1004                 }
1005                 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
1006                         goto start;
                     /*
                      * getnewbuf() can sleep, so the block may have been entered
                      * in the cache in the meantime.  If so, hand the fresh buffer
                      * back as invalid and start over to pick up the cached one.
                      */
1007                 if (incore(vp, blkno)) {
1008                         SET(bp->b_flags, B_INVAL);
1009                         binshash(bp, &invalhash);
1010                         brelse(bp);
1011                         goto start;
1012                 }
1013
1014                 /*
1015                  * if it is meta, the queue may be set to other
1016                  * type so reset as well as mark it to be B_META
1017                  * so that when buffer is released it will goto META queue
1018                  * Also, if the vnode is not VREG, then it is META
1019                  */
1020                 if (operation == BLK_META) {
1021                         SET(bp->b_flags, B_META);
1022                         queue = BQ_META;
1023                 }
1024                 /*
1025                  * Insert in the hash so that incore() can find it
1026                  */
1027                 binshash(bp, BUFHASH(vp, blkno));
1028
1029                 allocbuf(bp, size);
1030
1031                 switch (operation) {
1032                 case BLK_META:
1033                         /* buffer data is invalid */
1034
1035 #if !ZALLOC_METADATA
1036                         if (bp->b_data)
1037                                 panic("bp->b_data is not nul; %x",bp);
1038                         kret = kmem_alloc(kernel_map,
1039                                                 &bp->b_data, bp->b_bufsize);
1040                         if (kret != KERN_SUCCESS)
1041                                 panic("getblk: kmem_alloc() returned %d", kret);
1042 #endif /* ZALLOC_METADATA */
1043
1044                         if(bp->b_data == 0)
1045                                 panic("bp->b_data is null %x",bp);
1046
1047                         bp->b_blkno = bp->b_lblkno = blkno;
1048                         s = splbio();
1049                         bgetvp(vp, bp);
1050                         bufstats.bufs_miss++;
1051                         splx(s);
1052                         if (bp->b_data == 0)
1053                                 panic("b_data is 0: 2");
1054
1055                         /* wakeup the buffer */
1056                         CLR(bp->b_flags, B_WANTED);
1057                         wakeup(bp);
1058                         break;
1059
1060                 case BLK_READ:
1061                 case BLK_WRITE:
1062
1063                         if (ISSET(bp->b_flags, B_PAGELIST))
1064                                 panic("B_PAGELIST in bp=%x",bp);
1065
1066                         kret = ubc_create_upl(vp,
1067                                                         ubc_blktooff(vp, blkno),
1068                                                         bp->b_bufsize,
1069                                                         &upl,
1070                                                         &pl,
1071                                                         UPL_PRECIOUS);
1072                         if (kret != KERN_SUCCESS)
1073                                 panic("Failed to get pagelists");
1074
1075 #ifdef  UBC_DEBUG
1076                         upl_ubc_alias_set(upl, bp, 4);
1077 #endif /* UBC_DEBUG */
1078                         bp->b_blkno = bp->b_lblkno = blkno;
1079                         bp->b_pagelist = upl;
1080
1081                         SET(bp->b_flags, B_PAGELIST);
1082
                             /*
                              * A valid first page counts as a cache hit supplied by
                              * the VM: the buffer is marked B_CACHE | B_DONE.
                              */
1083                         if (upl_valid_page(pl, 0)) {
1084                                 SET(bp->b_flags, B_CACHE | B_DONE);
1085                                 bufstats.bufs_vmhits++;
1086
1087                                 pagedirty = upl_dirty_page(pl, 0);
1088
1089                                 if (pagedirty)
1090                                         SET(bp->b_flags, B_WASDIRTY);
1091
1092                                 if (vp->v_tag == VT_NFS) {
1093                                         off_t  f_offset;
1094                                         int    valid_size;
1095
1096                                         bp->b_validoff = 0;
1097                                         bp->b_dirtyoff = 0;
1098
1099                                         f_offset = ubc_blktooff(vp, blkno);
1100
                                             /*
                                              * Block starts beyond the size recorded in the
                                              * UBC info: nothing in it is valid, so undo the
                                              * cache-hit flags.  Otherwise the valid range is
                                              * limited to the remaining bytes, capped at
                                              * PAGE_SIZE.
                                              */
1101                                         if (f_offset > vp->v_ubcinfo->ui_size) {
1102                                                 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1103                                                 bp->b_validend = 0;
1104                                                 bp->b_dirtyend = 0;
1105                                         } else {
1106                                                 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1107                                                 bp->b_validend = valid_size;
1108
1109                                                 if (pagedirty)
1110                                                        bp->b_dirtyend = valid_size;
1111                                                 else
1112                                                        bp->b_dirtyend = 0;
1113
1114                                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1115                                                              bp->b_validend, bp->b_dirtyend,
1116                                                              (int)vp->v_ubcinfo->ui_size, 0, 0);
1117                                         }
1118                                 } else {
1119                                         bp->b_validoff = 0;
1120                                         bp->b_dirtyoff = 0;
1121
1122                                         if (pagedirty) {
1123                                                 /* page is dirty */
1124                                                 bp->b_validend = bp->b_bcount;
1125                                                 bp->b_dirtyend = bp->b_bcount;
1126                                         } else {
1127                                                 /* page is clean */
1128                                                 bp->b_validend = bp->b_bcount;
1129                                                 bp->b_dirtyend = 0;
1130                                         }
1131                                 }
                                     /*
                                      * Translate the logical block number into b_blkno.
                                      * A failure panics; the statements after the panic
                                      * are never executed.
                                      */
1132                                 if (error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) {
1133                                         panic("VOP_BMAP failed in getblk");
1134                                         /*NOTREACHED*/
1135                                         /*
1136                                          * XXX:  We probably should invalidate the VM Page
1137                                          */
1138                                         bp->b_error = error;
1139                                         SET(bp->b_flags, (B_ERROR | B_INVAL));
1140                                         /* undo B_DONE that was set before upl_commit() */
1141                                         CLR(bp->b_flags, B_DONE);
1142                                         brelse(bp);
1143                                         return (0);
1144                                 }
1145                         } else {
1146                                 bufstats.bufs_miss++;
1147                         }
1148                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1149                         if (kret != KERN_SUCCESS) {
1150                                 panic("getblk: ubc_upl_map() "
1151                                       "failed with (%d)", kret);
1152                         }
1153                         if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
1154
1155                         s = splbio();
1156                         bgetvp(vp, bp);
1157                         splx(s);
1158
1159                         break;
1160
1161                 case BLK_PAGEIN:
1162                 case BLK_PAGEOUT:
1163                         panic("getblk: paging operation 2");
1164                         break;
1165                 default:
1166                         panic("getblk: %d unknown operation 3", operation);
1167                         /*NOTREACHED*/
1168                         break;
1169                 }
1170         }
1171
             /* Whatever path was taken, the buffer must have data memory by now. */
1172         if (bp->b_data == NULL)
1173                 panic("getblk: bp->b_addr is null");
1174
             /*
              * Size sanity check.  Without ZALLOC_METADATA any b_bufsize that
              * is not a multiple of 4096 panics.  With ZALLOC_METADATA the
              * panic is limited to B_META buffers whose size is not a
              * multiple of 512.
              */
1175         if (bp->b_bufsize & 0xfff) {
1176 #if ZALLOC_METADATA
1177                 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1178 #endif /* ZALLOC_METADATA */
1179                         panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1180         }
1181
1182         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1183                      (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1184
1185         return (bp);
1186 }
1187
1188 /*
1189  * Get an empty, disassociated buffer of given size.
1190  */
1191 struct buf *
1192 geteblk(size)
1193         int size;
1194 {
1195         struct buf *bp;
1196     int queue = BQ_EMPTY;
1197 #if !ZALLOC_METADATA
1198         kern_return_t kret;
1199         vm_size_t desired_size = roundup(size, CLBYTES);
1200
1201         if (desired_size > MAXBSIZE)
1202                 panic("geteblk: buffer larger than MAXBSIZE requested");
1203 #endif /* ZALLOC_METADATA */
1204
1205         while ((bp = getnewbuf(0, 0, &queue)) == 0)
1206                 ;
1207 #if ZALLOC_METADATA
1208         SET(bp->b_flags, (B_META|B_INVAL));
1209 #else
1210         SET(bp->b_flags, B_INVAL);
1211 #endif /* ZALLOC_METADATA */
1212
1213 #if DIAGNOSTIC
1214         assert(queue == BQ_EMPTY);
1215 #endif /* DIAGNOSTIC */
1216         /* XXX need to implement logic to deal with other queues */
1217
1218 #if !ZALLOC_METADATA
1219         /* Empty buffer - allocate pages */
1220         kret = kmem_alloc_aligned(kernel_map, &bp->b_data, desired_size);
1221         if (kret != KERN_SUCCESS)
1222                 panic("geteblk: kmem_alloc_aligned returned %d", kret);
1223 #endif /* ZALLOC_METADATA */
1224
1225         binshash(bp, &invalhash);
1226         allocbuf(bp, size);
1227         bufstats.bufs_eblk++;
1228
1229         return (bp);
1230 }
1231
1232 #if ZALLOC_METADATA
1233 /*
1234  * Zones for the meta data buffers
      *
      * One zone per meta-data buffer size, in steps of MINMETA bytes from
      * MINMETA up to MAXMETA.  The table is ordered by size, so the entry
      * for a size of (k * MINMETA) bytes sits at index (k - 1); getbufzone()
      * relies on that layout.
1235  */
1236
1237 #define MINMETA 512     /* smallest zone element size; also the size step */
1238 #define MAXMETA 4096    /* largest zone element size */
1239
1240 struct meta_zone_entry {
1241         zone_t mz_zone;         /* zone handle; filled in by bufzoneinit() */
1242         vm_size_t mz_size;      /* element size handed to zinit(); 0 ends the table */
1243         vm_size_t mz_max;       /* second zinit() argument -- presumably the zone's maximum size in bytes; confirm against zinit() */
1244         char *mz_name;          /* zone name handed to zinit() */
1245 };
1246
1247 struct meta_zone_entry meta_zones[] = {
1248         {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1249         {NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
1250         {NULL, (MINMETA * 3),  16 * (MINMETA * 3), "buf.1536" },
1251         {NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
1252         {NULL, (MINMETA * 5),  16 * (MINMETA * 5), "buf.2560" },
1253         {NULL, (MINMETA * 6),  16 * (MINMETA * 6), "buf.3072" },
1254         {NULL, (MINMETA * 7),  16 * (MINMETA * 7), "buf.3584" },
1255         {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1256         {NULL, 0, 0, "" } /* End */
1257 };
1258 #endif /* ZALLOC_METADATA */
1259
1260 /*
1261  * Initialize the meta data zones
1262  */
1263 static void
1264 bufzoneinit(void)
1265 {
1266 #if ZALLOC_METADATA
1267         int i;
1268
1269         for (i = 0; meta_zones[i].mz_size != 0; i++) {
1270                 meta_zones[i].mz_zone =
1271                                 zinit(meta_zones[i].mz_size,
1272                                         meta_zones[i].mz_max,
1273                                         PAGE_SIZE,
1274                                         meta_zones[i].mz_name);
1275         }
1276 #endif /* ZALLOC_METADATA */
1277         buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1278 }
1279
#if ZALLOC_METADATA
/*
 * getbufzone: return the meta-data zone whose element size is `size'.
 *
 * `size' must match one of the element sizes listed in meta_zones[].
 * Any other value panics: zero, a size that is not a multiple of the
 * zone step, or a size larger than the biggest zone.  The previous
 * implementation only rejected misaligned sizes and computed
 * (size / 512) - 1 as an index, so a size of zero or one above the
 * largest zone read outside the table or returned the terminator's
 * null zone.
 */
static zone_t
getbufzone(size_t size)
{
	int i;

	/* The zero-size terminator can never match a real request. */
	for (i = 0; meta_zones[i].mz_size != 0; i++) {
		if (meta_zones[i].mz_size == size)
			return (meta_zones[i].mz_zone);
	}

	panic("getbufzone: incorect size = %d", (int)size);
	/*NOTREACHED*/
	return (NULL);
}
#endif /* ZALLOC_METADATA */
1293
1294 /*
1295  * With UBC, there is no need to expand / shrink the file data
1296  * buffer. The VM uses the same pages, hence no waste.
1297  * All the file data buffers can have one size.
1298  * In fact expand / shrink would be an expensive operation.
1299  *
1300  * Only exception to this is meta-data buffers. Most of the
1301  * meta data operations are smaller than PAGE_SIZE. Having the
1302  * meta-data buffers grow and shrink as needed, optimizes use
1303  * of the kernel wired memory.
1304  */
1305
1306 int
1307 allocbuf(bp, size)
1308         struct buf *bp;
1309         int size;
1310 {
1311         vm_size_t desired_size;
1312
1313         desired_size = roundup(size, CLBYTES);
1314
1315         if(desired_size < PAGE_SIZE)
1316                 desired_size = PAGE_SIZE;
1317         if (desired_size > MAXBSIZE)
1318                 panic("allocbuf: buffer larger than MAXBSIZE requested");
1319
1320 #if ZALLOC_METADATA
1321         if (ISSET(bp->b_flags, B_META)) {
1322                 kern_return_t kret;
1323                 zone_t zprev, z;
1324                 size_t nsize = roundup(size, MINMETA);
1325
1326                 if (bp->b_data) {
1327                         vm_offset_t elem = (vm_offset_t)bp->b_data;
1328
1329                         if (ISSET(bp->b_flags, B_ZALLOC))
1330                                 if (bp->b_bufsize <= MAXMETA) {
1331                                         if (bp->b_bufsize < nsize) {
1332                                                 /* reallocate to a bigger size */
1333                                                 desired_size = nsize;
1334
1335                                                 zprev = getbufzone(bp->b_bufsize);
1336                                                 z = getbufzone(nsize);
1337                                                 bp->b_data = (caddr_t)zalloc(z);
1338                                                 if(bp->b_data == 0)
1339                                                         panic("allocbuf: zalloc() returned NULL");
1340                                                 bcopy(elem, bp->b_data, bp->b_bufsize);
1341                                                 zfree(zprev, elem);
1342                                         } else {
1343                                                 desired_size = bp->b_bufsize;
1344                                         }
1345                                 } else
1346                                         panic("allocbuf: B_ZALLOC set incorrectly");
1347                         else
1348                                 if (bp->b_bufsize < desired_size) {
1349                                         /* reallocate to a bigger size */
1350                                         kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1351                                         if (kret != KERN_SUCCESS)
1352                                                 panic("allocbuf: kmem_alloc() returned %d", kret);
1353                                         if(bp->b_data == 0)
1354                                                 panic("allocbuf: null b_data");
1355                                         bcopy(elem, bp->b_data, bp->b_bufsize);
1356                                         kmem_free(kernel_map, elem, bp->b_bufsize);
1357                                 } else {
1358                                         desired_size = bp->b_bufsize;
1359                                 }
1360                 } else {
1361                         /* new allocation */
1362                         if (nsize <= MAXMETA) {
1363                                 desired_size = nsize;
1364                                 z = getbufzone(nsize);
1365                                 bp->b_data = (caddr_t)zalloc(z);
1366                                 if(bp->b_data == 0)
1367                                         panic("allocbuf: zalloc() returned NULL 2");
1368                                 SET(bp->b_flags, B_ZALLOC);
1369                         } else {
1370                                 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1371                                 if (kret != KERN_SUCCESS)
1372                                         panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1373                                 if(bp->b_data == 0)
1374                                         panic("allocbuf: null b_data 2");
1375                         }
1376                 }
1377         }
1378
1379         if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1380                 panic("allocbuf: bp->b_data is NULL");
1381 #endif /* ZALLOC_METADATA */
1382
1383                 bp->b_bufsize = desired_size;
1384                 bp->b_bcount = size;
1385 }
1386
1387 /*
1388  *      Get a new buffer from one of the free lists.
1389  *
1390  *      Request for a queue is passes in. The queue from which the buffer was taken
1391  *      from is returned. Out of range queue requests get BQ_EMPTY. Request for
1392  *      BQUEUE means no preference. Use heuristics in that case.
1393  *      Heuristics is as follows:
1394  *      Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1395  *      If none available block till one is made available.
1396  *      If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
1397  *      Pick the most stale buffer.
1398  *      If found buffer was marked delayed write, start the async. write
1399  *      and restart the search.
1400  *      Initialize the fields and disassociate the buffer from the vnode.
1401  *      Remove the buffer from the hash. Return the buffer and the queue
1402  *      on which it was found.
1403  */
1404
1405 static struct buf *
1406 getnewbuf(slpflag, slptimeo, queue)
1407         int slpflag, slptimeo;
1408         int *queue;
1409 {
1410         register struct buf *bp;
1411         register struct buf *lru_bp;
1412         register struct buf *age_bp;
1413         register struct buf *meta_bp;
1414         register int age_time, lru_time, bp_time, meta_time;
1415         int s;
1416         struct ucred *cred;
1417         int req = *queue; /* save it for restarts */
1418
1419 start:
1420         s = splbio();
1421
1422         /* invalid request gets empty queue */
1423         if ((*queue > BQUEUES) || (*queue < 0)
1424                 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1425                 *queue = BQ_EMPTY;
1426
1427         /* (*queue == BQUEUES) means no preference */
1428         if (*queue != BQUEUES) {
1429                 /* Try for the requested queue first */
1430                 bp = bufqueues[*queue].tqh_first;
1431                 if (bp)
1432                         goto found;
1433         }
1434
1435         /* Unable to use requested queue */
1436         age_bp = bufqueues[BQ_AGE].tqh_first;
1437         lru_bp = bufqueues[BQ_LRU].tqh_first;
1438         meta_bp = bufqueues[BQ_META].tqh_first;
1439
1440         if (!age_bp && !lru_bp && !meta_bp) { /* Unavailble on AGE or LRU */
1441                 /* Try the empty list first */
1442                 bp = bufqueues[BQ_EMPTY].tqh_first;
1443                 if (bp) {
1444                         *queue = BQ_EMPTY;
1445                         goto found;
1446                 }
1447
1448                 /* Create a new temparory buffer header */
1449                 bp = (struct buf *)zalloc(buf_hdr_zone);
1450
1451                 if (bp) {
1452                         bufhdrinit(bp);
1453                         BLISTNONE(bp);
1454                         binshash(bp, &invalhash);
1455                         SET(bp->b_flags, B_HDRALLOC);
1456                         *queue = BQ_EMPTY;
1457                         binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1458                         buf_hdr_count++;
1459                         goto found;
1460                 }
1461
1462                 /* Log this error condition */
1463                 printf("getnewbuf: No useful buffers");
1464
1465                 /* wait for a free buffer of any kind */
1466                 needbuffer = 1;
1467                 bufstats.bufs_sleeps++;
1468                 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1469                 splx(s);
1470                 return (0);
1471         }
1472
1473         /* Buffer available either on AGE or LRU or META */
1474         bp = NULL;
1475         *queue = -1;
1476
1477         /* Buffer available either on AGE or LRU */
1478         if (!age_bp) {
1479                 bp = lru_bp;
1480                 *queue = BQ_LRU;
1481         } else if (!lru_bp) {
1482                 bp = age_bp;
1483                 *queue = BQ_AGE;
1484         } else { /* buffer available on both AGE and LRU */
1485                 age_time = time.tv_sec - age_bp->b_timestamp;
1486                 lru_time = time.tv_sec - lru_bp->b_timestamp;
1487                 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1488                         bp = age_bp;
1489                         *queue = BQ_AGE;
1490                         /*
1491                          * we should probably re-timestamp eveything in the
1492                          * queues at this point with the current time
1493                          */
1494                 } else {
1495                         if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1496                                 bp = lru_bp;
1497                                 *queue = BQ_LRU;
1498                         } else {
1499                                 bp = age_bp;
1500                                 *queue = BQ_AGE;
1501                         }
1502                 }
1503         }
1504
1505         if (!bp) { /* Neither on AGE nor on LRU */
1506                 bp = meta_bp;
1507                 *queue = BQ_META;
1508         }  else if (meta_bp) {
1509                 bp_time = time.tv_sec - bp->b_timestamp;
1510                 meta_time = time.tv_sec - meta_bp->b_timestamp;
1511
1512                 if (!(bp_time < 0) && !(meta_time < 0)) {
1513                         /* time not set backwards */
1514                         int bp_is_stale;
1515                         bp_is_stale = (*queue == BQ_LRU) ?
1516                                         lru_is_stale : age_is_stale;
1517
1518                         if ((meta_time >= meta_is_stale) &&
1519                                         (bp_time < bp_is_stale)) {
1520                                 bp = meta_bp;
1521                                 *queue = BQ_META;
1522                         }
1523                 }
1524         }
1525
1526         if (bp == NULL)
1527                 panic("getnewbuf: null bp");
1528
1529 found:
1530         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1531                 panic("getnewbuf: le_prev is deadbeef");
1532
1533         if(ISSET(bp->b_flags, B_BUSY))
1534                 panic("getnewbuf reusing BUSY buf");
1535
1536         /* Clean it */
1537         if (bcleanbuf(bp)) {
1538                 /* bawrite() issued, buffer not ready */
1539                 splx(s);
1540                 *queue = req;
1541                 goto start;
1542         }
1543         splx(s);
1544         return (bp);
1545 }
1546 #include <mach/mach_types.h>
1547 #include <mach/memory_object_types.h>
1548
1549 /*
1550  * Clean a buffer.
1551  * Returns 0 is buffer is ready to use,
1552  * Returns 1 if issued a bawrite() to indicate
1553  * that the buffer is not ready.
1554  */
1555 int
1556 bcleanbuf(struct buf *bp)
1557 {
1558         int s;
1559         struct ucred *cred;
1560         int     hdralloc = 0;
1561
1562         s = splbio();
1563
1564         /* Remove from the queue */
1565         bremfree(bp);
1566
1567         /* Buffer is no longer on free lists. */
1568         SET(bp->b_flags, B_BUSY);
1569
1570         /* Check whether the buffer header was "allocated" */
1571         if (ISSET(bp->b_flags, B_HDRALLOC))
1572                 hdralloc = 1;
1573
1574         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1575                 panic("bcleanbuf: le_prev is deadbeef");
1576
1577         /*
1578          * If buffer was a delayed write, start the IO by queuing
1579          * it on the LAUNDRY queue, and return 1
1580          */
1581         if (ISSET(bp->b_flags, B_DELWRI)) {
1582                 splx(s);
1583                 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1584                 blaundrycnt++;
1585                 wakeup(&blaundrycnt);
1586                 return (1);
1587         }
1588
1589         if (bp->b_vp)
1590                 brelvp(bp);
1591         bremhash(bp);
1592         BLISTNONE(bp);
1593
1594         splx(s);
1595
1596         if (ISSET(bp->b_flags, B_META)) {
1597 #if ZALLOC_METADATA
1598                 vm_offset_t elem = (vm_offset_t)bp->b_data;
1599                 if (elem == 0)
1600                         panic("bcleanbuf: NULL bp->b_data B_META buffer");
1601
1602                 if (ISSET(bp->b_flags, B_ZALLOC)) {
1603                         if (bp->b_bufsize <= MAXMETA) {
1604                                 zone_t z;
1605
1606                                 z = getbufzone(bp->b_bufsize);
1607                                 bp->b_data = (caddr_t)0xdeadbeef;
1608                                 zfree(z, elem);
1609                                 CLR(bp->b_flags, B_ZALLOC);
1610                         } else
1611                                 panic("bcleanbuf: B_ZALLOC set incorrectly");
1612                 } else {
1613                         bp->b_data = (caddr_t)0xdeadbeef;
1614                         kmem_free(kernel_map, elem, bp->b_bufsize);
1615                 }
1616 #else
1617            if (bp->b_data == 0)
1618                    panic("bcleanbuf: bp->b_data == NULL for B_META buffer");
1619
1620            kmem_free(kernel_map, bp->b_data, bp->b_bufsize);
1621 #endif /* ZALLOC_METADATA */
1622         }
1623
1624         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1625
1626         /* disassociate us from our vnode, if we had one... */
1627         s = splbio();
1628
1629         /* clear out various other fields */
1630         bp->b_bufsize = 0;
1631         bp->b_data = 0;
1632         bp->b_flags = B_BUSY;
1633         if (hdralloc)
1634                 SET(bp->b_flags, B_HDRALLOC);
1635         bp->b_dev = NODEV;
1636         bp->b_blkno = bp->b_lblkno = 0;
1637         bp->b_iodone = 0;
1638         bp->b_error = 0;
1639         bp->b_resid = 0;
1640         bp->b_bcount = 0;
1641         bp->b_dirtyoff = bp->b_dirtyend = 0;
1642         bp->b_validoff = bp->b_validend = 0;
1643
1644         /* nuke any credentials we were holding */
1645         cred = bp->b_rcred;
1646         if (cred != NOCRED) {
1647                 bp->b_rcred = NOCRED;
1648                 crfree(cred);
1649         }
1650         cred = bp->b_wcred;
1651         if (cred != NOCRED) {
1652                 bp->b_wcred = NOCRED;
1653                 crfree(cred);
1654         }
1655         splx(s);
1656         return (0);
1657 }
1658
1659
1660 /*
1661  * Wait for operations on the buffer to complete.
1662  * When they do, extract and return the I/O's error value.
1663  */
1664 int
1665 biowait(bp)
1666         struct buf *bp;
1667 {
1668         upl_t           upl;
1669         upl_page_info_t *pl;
1670         int s;
1671         kern_return_t kret;
1672
1673         s = splbio();
1674         while (!ISSET(bp->b_flags, B_DONE))
1675                 tsleep(bp, PRIBIO + 1, "biowait", 0);
1676         splx(s);
1677
1678         /* check for interruption of I/O (e.g. via NFS), then errors. */
1679         if (ISSET(bp->b_flags, B_EINTR)) {
1680                 CLR(bp->b_flags, B_EINTR);
1681                 return (EINTR);
1682         } else if (ISSET(bp->b_flags, B_ERROR))
1683                 return (bp->b_error ? bp->b_error : EIO);
1684         else
1685                 return (0);
1686 }
1687
1688 /*
1689  * Mark I/O complete on a buffer.
1690  *
1691  * If a callback has been requested, e.g. the pageout
1692  * daemon, do so. Otherwise, awaken waiting processes.
1693  *
1694  * [ Leffler, et al., says on p.247:
1695  *      "This routine wakes up the blocked process, frees the buffer
1696  *      for an asynchronous write, or, for a request by the pagedaemon
1697  *      process, invokes a procedure specified in the buffer structure" ]
1698  *
1699  * In real life, the pagedaemon (or other system processes) wants
1700  * to do async stuff to, and doesn't want the buffer brelse()'d.
1701  * (for swap pager, that puts swap buffers on the free lists (!!!),
1702  * for the vn device, that puts malloc'd buffers on the free lists!)
1703  */
1704 void
1705 biodone(bp)
1706         struct buf *bp;
1707 {
1708         boolean_t       funnel_state;
1709         struct vnode *vp;
1710
1711         funnel_state = thread_funnel_set(kernel_flock, TRUE);
1712
1713         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1714                      (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1715
1716         if (ISSET(bp->b_flags, B_DONE))
1717                 panic("biodone already");
1718         SET(bp->b_flags, B_DONE);               /* note that it's done */
1719         /*
1720          * I/O was done, so don't believe
1721          * the DIRTY state from VM anymore
1722          */
1723         CLR(bp->b_flags, B_WASDIRTY);
1724
1725         if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1726                 vwakeup(bp);     /* wake up reader */
1727
1728         /* Wakeup the throttled write operations as needed */
1729         vp = bp->b_vp;
1730         if (vp
1731                 && (vp->v_flag & VTHROTTLED)
1732                 && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
1733                 vp->v_flag &= ~VTHROTTLED;
1734                 wakeup((caddr_t)&vp->v_numoutput);
1735         }
1736
1737         if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
1738                 CLR(bp->b_flags, B_CALL);       /* but note callout done */
1739                 (*bp->b_iodone)(bp);
1740         } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1741                 brelse(bp);
1742         else {                                  /* or just wakeup the buffer */
1743                 CLR(bp->b_flags, B_WANTED);
1744                 wakeup(bp);
1745         }
1746
1747         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1748                      (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1749
1750         thread_funnel_set(kernel_flock, funnel_state);
1751 }
1752
1753 /*
1754  * Return a count of buffers on the "locked" queue.
1755  */
1756 int
1757 count_lock_queue()
1758 {
1759         register struct buf *bp;
1760         register int n = 0;
1761
1762         for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1763             bp = bp->b_freelist.tqe_next)
1764                 n++;
1765         return (n);
1766 }
1767
1768 /*
1769  * Return a count of 'busy' buffers. Used at the time of shutdown.
1770  */
1771 int
1772 count_busy_buffers()
1773 {
1774         register struct buf *bp;
1775         register int nbusy = 0;
1776
1777         for (bp = &buf[nbuf]; --bp >= buf; )
1778                 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1779                         nbusy++;
1780         return (nbusy);
1781 }
1782
1783 #if 1 /*DIAGNOSTIC */
1784 /*
1785  * Print out statistics on the current allocation of the buffer pool.
1786  * Can be enabled to print out on every ``sync'' by setting "syncprt"
1787  * in vfs_syscalls.c using sysctl.
1788  */
1789 void
1790 vfs_bufstats()
1791 {
1792         int s, i, j, count;
1793         register struct buf *bp;
1794         register struct bqueues *dp;
1795         int counts[MAXBSIZE/CLBYTES+1];
1796         static char *bname[BQUEUES] =
1797                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1798
1799         for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1800                 count = 0;
1801                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1802                         counts[j] = 0;
1803                 s = splbio();
1804                 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1805                         counts[bp->b_bufsize/CLBYTES]++;
1806                         count++;
1807                 }
1808                 splx(s);
1809                 printf("%s: total-%d", bname[i], count);
1810                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1811                         if (counts[j] != 0)
1812                                 printf(", %d-%d", j * CLBYTES, counts[j]);
1813                 printf("\n");
1814         }
1815 }
1816 #endif /* DIAGNOSTIC */
1817
1818 #define NRESERVEDIOBUFS 16
1819
1820 struct buf *
1821 alloc_io_buf(vp, priv)
1822         struct vnode *vp;
1823         int priv;
1824 {
1825         register struct buf *bp;
1826         int s;
1827
1828         s = splbio();
1829
1830         while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1831                 need_iobuffer = 1;
1832                 bufstats.bufs_iobufsleeps++;
1833                 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1834         }
1835
1836         while ((bp = iobufqueue.tqh_first) == NULL) {
1837                 need_iobuffer = 1;
1838                 bufstats.bufs_iobufsleeps++;
1839                 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1840         }
1841
1842         TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1843         bp->b_timestamp = 0;
1844
1845         /* clear out various fields */
1846         bp->b_flags = B_BUSY;
1847         bp->b_blkno = bp->b_lblkno = 0;
1848         bp->b_iodone = 0;
1849         bp->b_error = 0;
1850         bp->b_resid = 0;
1851         bp->b_bcount = 0;
1852         bp->b_bufsize = 0;
1853         bp->b_vp = vp;
1854
1855         if (vp->v_type == VBLK || vp->v_type == VCHR)
1856                 bp->b_dev = vp->v_rdev;
1857         else
1858                 bp->b_dev = NODEV;
1859         bufstats.bufs_iobufinuse++;
1860         if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1861                 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1862         splx(s);
1863
1864         return (bp);
1865 }
1866
1867 void
1868 free_io_buf(bp)
1869         struct buf *bp;
1870 {
1871         int s;
1872
1873         s = splbio();
1874         /* put buffer back on the head of the iobufqueue */
1875         bp->b_vp = NULL;
1876         bp->b_flags = B_INVAL;
1877
1878         binsheadfree(bp, &iobufqueue, -1);
1879
1880         /* Wake up any processes waiting for any buffer to become free. */
1881         if (need_iobuffer) {
1882                 need_iobuffer = 0;
1883                 wakeup(&need_iobuffer);
1884         }
1885         bufstats.bufs_iobufinuse--;
1886         splx(s);
1887 }
1888
1889
1890 /* not hookedup yet */
1891
1892 /* XXX move this to a separate file */
1893 /*
1894  * Dynamic Scaling of the Buffer Queues
1895  */
1896
/* Type used for every buffer header count and limit */
typedef long long blsize_t;

/* initialize to (mem_size / PAGE_SIZE) */
blsize_t MAXNBUF;

/* Global tunable limits */

/* number of buffer headers */
blsize_t nbufh;
/* minimum number of buffer headers required */
blsize_t nbuflow;
/* maximum number of buffer headers allowed */
blsize_t nbufhigh;
/* preferred number of buffer headers */
blsize_t nbuftarget;
1905
1906 /*
1907  * assertions:
1908  *
1909  * 1.   0 < nbuflow <= nbufh <= nbufhigh
1910  * 2.   nbufhigh <= MAXNBUF
1911  * 3.   0 < nbuflow <= nbuftarget <= nbufhigh
1912  * 4.   nbufh can not be set by sysctl().
1913  */
1914
1915 /* Per queue tunable limits */
1916
1917 struct bufqlim {
1918         blsize_t        bl_nlow;        /* minimum number of buffer headers required */
1919         blsize_t        bl_num;         /* number of buffer headers on the queue */
1920         blsize_t        bl_nlhigh;      /* maximum number of buffer headers allowed */
1921         blsize_t        bl_target;      /* preferred number of buffer headers */
1922         long    bl_stale;       /* Seconds after which a buffer is considered stale */
1923 } bufqlim[BQUEUES];
1924
/*
 * assertions:
 *
 * 1.   0 <= bl_nlow <= bl_num <= bl_nlhigh
 * 2.   bl_nlhigh <= MAXNBUF
 * 3.   bufqlim[BQ_META].bl_nlow != 0
 * 4.   bufqlim[BQ_META].bl_nlow > (number of possible concurrent
 *      file system IO operations)
 * 5.   bl_num can not be set by sysctl().
 * 6.   bl_nlhigh <= nbufhigh
 */
1936
/*
 * Rationale:
 * ----------
 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
 *
 * These limits are exported by means of sysctl().
 * It was decided to define blsize_t as a 64 bit quantity.
 * This will make sure that we will not be required to change it
 * as long as we do not exceed 64 bit address space for the kernel.
 *
 * The low and high parameters are initialized at compile time
 * and boot arguments can be used to override them. sysctl()
 * would not change these values. sysctl() can get all the values
 * but can set only target. num is the current level.
 *
 * Advantages of having a "bufqscan" thread doing the balancing are:
 * Keep enough bufs on BQ_EMPTY.
 *      getnewbuf() by default will always select a buffer from the BQ_EMPTY.
 *              getnewbuf() performs best if a buffer was found there.
 *              Also this minimizes the possibility of starting IO
 *              from getnewbuf(). That's a performance win, too.
 *
 *      Localize complex logic [balancing as well as time aging]
 *              to balancebufq().
 *
 *      Simplify getnewbuf() logic by elimination of time aging code.
 */
1965
/*
 * Algorithm:
 * -----------
 * The goal of the dynamic scaling of the buffer queues is to keep
 * the size of the LRU close to bl_target. Buffers on a queue would
 * be time aged.
 *
 * There would be a thread which will be responsible for "balancing"
 * the buffer cache queues.
 *
 * The scan order would be:     AGE, LRU, META, EMPTY.
 */
1978
/*
 * Incremented by bufq_balance_thread_init(); its address is also the
 * sleep channel of the balancing thread.
 */
long bufqscanwait = 0;

/* Routines of the buffer queue balancing code */
void bufqscan_thread();
int balancebufq(int q);
int btrimempty(int n);
int initbufqscan(void);
int nextbufq(int q);
void buqlimprt(int all);
1987
1988 void
1989 bufq_balance_thread_init()
1990 {
1991
1992         if (bufqscanwait++ == 0) {
1993                 int i;
1994
1995                 /* Initalize globals */
1996                 MAXNBUF = (mem_size / PAGE_SIZE);
1997                 nbufh = nbuf;
1998                 nbuflow = min(nbufh, 100);
1999                 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
2000                 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
2001                 nbuftarget = max(nbuflow, nbuftarget);
2002                 nbuftarget = min(nbufhigh, nbuftarget);
2003
2004                 /*
2005                  * Initialize the bufqlim
2006                  */
2007
2008                 /* LOCKED queue */
2009                 bufqlim[BQ_LOCKED].bl_nlow = 0;
2010                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2011                 bufqlim[BQ_LOCKED].bl_target = 0;
2012                 bufqlim[BQ_LOCKED].bl_stale = 30;
2013
2014                 /* LRU queue */
2015                 bufqlim[BQ_LRU].bl_nlow = 0;
2016                 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
2017                 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
2018                 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
2019
2020                 /* AGE queue */
2021                 bufqlim[BQ_AGE].bl_nlow = 0;
2022                 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
2023                 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
2024                 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
2025
2026                 /* EMPTY queue */
2027                 bufqlim[BQ_EMPTY].bl_nlow = 0;
2028                 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
2029                 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
2030                 bufqlim[BQ_EMPTY].bl_stale = 600000;
2031
2032                 /* META queue */
2033                 bufqlim[BQ_META].bl_nlow = 0;
2034                 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
2035                 bufqlim[BQ_META].bl_target = nbuftarget/4;
2036                 bufqlim[BQ_META].bl_stale = META_IS_STALE;
2037
2038                 /* LAUNDRY queue */
2039                 bufqlim[BQ_LOCKED].bl_nlow = 0;
2040                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2041                 bufqlim[BQ_LOCKED].bl_target = 0;
2042                 bufqlim[BQ_LOCKED].bl_stale = 30;
2043
2044                 buqlimprt(1);
2045         }
2046
2047         /* create worker thread */
2048         kernel_thread(kernel_task, bufqscan_thread);
2049 }
2050
2051 /* The workloop for the buffer balancing thread */
2052 void
2053 bufqscan_thread()
2054 {
2055         boolean_t       funnel_state;
2056         int moretodo = 0;
2057
2058         funnel_state = thread_funnel_set(kernel_flock, TRUE);
2059
2060         for(;;) {
2061                 do {
2062                         int q;  /* buffer queue to process */
2063
2064                         for (q = initbufqscan(); q; ) {
2065                                 moretodo |= balancebufq(q);
2066                                 q = nextbufq(q);
2067                         }
2068                 } while (moretodo);
2069
2070 #if 1 || DIAGNOSTIC
2071                 vfs_bufstats();
2072                 buqlimprt(0);
2073 #endif
2074                 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2075                 moretodo = 0;
2076         }
2077
2078         (void) thread_funnel_set(kernel_flock, FALSE);
2079 }
2080
2081 /* Seed for the buffer queue balancing */
2082 int
2083 initbufqscan()
2084 {
2085         /* Start with AGE queue */
2086         return (BQ_AGE);
2087 }
2088
2089 /* Pick next buffer queue to balance */
2090 int
2091 nextbufq(int q)
2092 {
2093         int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2094
2095         q++;
2096         q %= sizeof(order);
2097         return (order[q]);
2098 }
2099
2100 /* function to balance the buffer queues */
2101 int
2102 balancebufq(int q)
2103 {
2104         int moretodo = 0;
2105         int s = splbio();
2106         int n;
2107
2108         /* reject invalid q */
2109         if ((q < 0) || (q >= BQUEUES))
2110                 goto out;
2111
2112         /* LOCKED or LAUNDRY queue MUST not be balanced */
2113         if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2114                 goto out;
2115
2116         n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2117
2118         /* If queue has less than target nothing more to do */
2119         if (n < 0)
2120                 goto out;
2121
2122         if ( n > 8 ) {
2123                 /* Balance only a small amount (12.5%) at a time */
2124                 n >>= 3;
2125         }
2126
2127         /* EMPTY queue needs special handling */
2128         if (q == BQ_EMPTY) {
2129                 moretodo |= btrimempty(n);
2130                 goto out;
2131         }
2132
2133         for (; n > 0; n--) {
2134                 struct buf *bp = bufqueues[q].tqh_first;
2135                 if (!bp)
2136                         break;
2137
2138                 /* check if it's stale */
2139                 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2140                         if (bcleanbuf(bp)) {
2141                                 /* bawrite() issued, bp not ready */
2142                                 moretodo = 1;
2143                         } else {
2144                                 /* release the cleaned buffer to BQ_EMPTY */
2145                                 SET(bp->b_flags, B_INVAL);
2146                                 brelse(bp);
2147                         }
2148                 } else
2149                         break;
2150         }
2151
2152 out:
2153         splx(s);
2154         return (moretodo);
2155 }
2156
/*
 * Placeholder: once struct buf is allocated dynamically this is
 * where up to 'n' of them would be reclaimed from the empty queue.
 * For now nothing is reclaimed and 0 is always returned.
 */
int
btrimempty(int n)
{
        int reclaimed = 0;

        return (reclaimed);
}
2167
2168 void
2169 bufqinc(int q)
2170 {
2171         if ((q < 0) || (q >= BQUEUES))
2172                 return;
2173
2174         bufqlim[q].bl_num++;
2175         return;
2176 }
2177
2178 void
2179 bufqdec(int q)
2180 {
2181         if ((q < 0) || (q >= BQUEUES))
2182                 return;
2183
2184         bufqlim[q].bl_num--;
2185         return;
2186 }
2187
2188 void
2189 buqlimprt(int all)
2190 {
2191         int i;
2192     static char *bname[BQUEUES] =
2193                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2194
2195         if (all)
2196                 for (i = 0; i < BQUEUES; i++) {
2197                         printf("%s : ", bname[i]);
2198                         printf("min = %d, ", (long)bufqlim[i].bl_nlow);
2199                         printf("cur = %d, ", (long)bufqlim[i].bl_num);
2200                         printf("max = %d, ", (long)bufqlim[i].bl_nlhigh);
2201                         printf("target = %d, ", (long)bufqlim[i].bl_target);
2202                         printf("stale after %d seconds\n", bufqlim[i].bl_stale);
2203                 }
2204         else
2205                 for (i = 0; i < BQUEUES; i++) {
2206                         printf("%s : ", bname[i]);
2207                         printf("cur = %d, ", (long)bufqlim[i].bl_num);
2208                 }
2209 }
2210
2211 /*
2212  * If the getnewbuf() calls bcleanbuf() on the same thread
2213  * there is a potential for stack overrun and deadlocks.
2214  * So we always handoff the work to worker thread for completion
2215  */
2216
2217 static void
2218 bcleanbuf_thread_init()
2219 {
2220         static void bcleanbuf_thread();
2221
2222         /* create worker thread */
2223         kernel_thread(kernel_task, bcleanbuf_thread);
2224 }
2225
2226 static void
2227 bcleanbuf_thread()
2228 {
2229         boolean_t       funnel_state;
2230         struct buf *bp;
2231
2232         funnel_state = thread_funnel_set(kernel_flock, TRUE);
2233
2234 doit:
2235         while (blaundrycnt == 0)
2236                 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2237         bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2238         /* Remove from the queue */
2239         bremfree(bp);
2240         blaundrycnt--;
2241         /* do the IO */
2242         bawrite(bp);
2243         /* start again */
2244         goto doit;
2245
2246         (void) thread_funnel_set(kernel_flock, funnel_state);
2247 }
2248