bsd/vfs/vfs_bio.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*-
  24  * Copyright (c) 1994 Christopher G. Demetriou
  25  * Copyright (c) 1982, 1986, 1989, 1993
  26  *      The Regents of the University of California.  All rights reserved.
  27  * (c) UNIX System Laboratories, Inc.
  28  * All or some portions of this file are derived from material licensed
  29  * to the University of California by American Telephone and Telegraph
  30  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  31  * the permission of UNIX System Laboratories, Inc.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  * The NEXTSTEP Software License Agreement specifies the terms
  62  * and conditions for redistribution.
  63  *
  64  *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
  65  */
  66
  67 /*
  68  * Some references:
  69  *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
  70  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
   71  *              UNIX Operating System (Addison-Wesley, 1989)
  72  */
  73 #define ZALLOC_METADATA 1
  74
  75 #include <sys/param.h>
  76 #include <sys/systm.h>
  77 #include <sys/proc.h>
  78 #include <sys/buf.h>
  79 #include <sys/vnode.h>
  80 #include <sys/mount.h>
  81 #include <sys/trace.h>
  82 #include <sys/malloc.h>
  83 #include <sys/resourcevar.h>
  84 #include <miscfs/specfs/specdev.h>
  85 #include <sys/ubc.h>
  86 #include <vm/vm_pageout.h>
  87 #if DIAGNOSTIC
  88 #include <kern/assert.h>
  89 #endif /* DIAGNOSTIC */
  90 #include <kern/task.h>
  91 #include <kern/zalloc.h>
  92
  93 #include <sys/kdebug.h>
  94
  95 extern void bufqinc(int q);
  96 extern void bufqdec(int q);
  97 extern void bufq_balance_thread_init();
  98
  99 extern void reassignbuf(struct buf *, struct vnode *);
 100 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
 101
 102 extern int niobuf;              /* The number of IO buffer headers for cluster IO */
 103 int blaundrycnt;
 104
 105 #if TRACE
 106 struct  proc *traceproc;
 107 int     tracewhich, tracebuf[TRCSIZ];
 108 u_int   tracex;
 109 char    traceflags[TR_NFLAGS];
 110 #endif /* TRACE */
 111
 112 /*
 113  * Definitions for the buffer hash lists.
 114  */
 115 #define BUFHASH(dvp, lbn)       \
 116         (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
 117 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
 118 u_long  bufhash;
 119
 120 /* Definitions for the buffer stats. */
 121 struct bufstats bufstats;
 122
 123 /*
 124  * Insq/Remq for the buffer hash lists.
 125  */
 126 #if 0
 127 #define binshash(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_hash)
 128 #define bremhash(bp)            LIST_REMOVE(bp, b_hash)
 129 #endif /* 0 */
 130
 131
 132 TAILQ_HEAD(ioqueue, buf) iobufqueue;
 133 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
 134 int needbuffer;
 135 int need_iobuffer;
 136
 137 /*
 138  * Insq/Remq for the buffer free lists.
 139  */
 140 #define binsheadfree(bp, dp, whichq)    do { \
 141                                     TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
 142                                         bufqinc((whichq));      \
 143                                         (bp)->b_whichq = whichq; \
 144                                     (bp)->b_timestamp = time.tv_sec; \
 145                                 } while (0)
 146
 147 #define binstailfree(bp, dp, whichq)    do { \
 148                                     TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
 149                                         bufqinc((whichq));      \
 150                                         (bp)->b_whichq = whichq; \
 151                                     (bp)->b_timestamp = time.tv_sec; \
 152                                 } while (0)
 153
 154 #define BHASHENTCHECK(bp)       \
 155         if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)  \
 156                 panic("%x: b_hash.le_prev is not deadbeef", (bp));
 157
 158 #define BLISTNONE(bp)   \
 159         (bp)->b_hash.le_next = (struct buf *)0; \
 160         (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
 161
 162 simple_lock_data_t bufhashlist_slock;           /* lock on buffer hash list */
 163
 164 /*
 165  * Time in seconds before a buffer on a list is
 166  * considered as a stale buffer
 167  */
 168 #define LRU_IS_STALE 120 /* default value for the LRU */
 169 #define AGE_IS_STALE 60  /* default value for the AGE */
 170 #define META_IS_STALE 180 /* default value for the BQ_META */
 171
 172 int lru_is_stale = LRU_IS_STALE;
 173 int age_is_stale = AGE_IS_STALE;
 174 int meta_is_stale = META_IS_STALE;
 175
 176 #if 1
 177 void
 178 blistenterhead(struct bufhashhdr * head, struct buf * bp)
 179 {
 180         if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
 181                 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
 182         (head)->lh_first = bp;
 183         bp->b_hash.le_prev = &(head)->lh_first;
 184         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 185                 panic("blistenterhead: le_prev is deadbeef");
 186
 187 }
 188 #endif
 189
 190 #if 1
 191 void
 192 binshash(struct buf *bp, struct bufhashhdr *dp)
 193 {
 194 int s;
 195
 196 struct buf *nbp;
 197
 198         simple_lock(&bufhashlist_slock);
 199 #if 0
 200         if(incore(bp->b_vp, bp->b_lblkno)) {
 201                 panic("adding to queue already existing element");
 202         }
 203 #endif /* 0 */
 204         BHASHENTCHECK(bp);
 205
 206         nbp = dp->lh_first;
 207         for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
 208                 if(nbp == bp)
 209                         panic("buf already in hashlist");
 210         }
 211
 212 #if 0
 213         LIST_INSERT_HEAD(dp, bp, b_hash);
 214 #else
 215         blistenterhead(dp, bp);
 216 #endif
 217         simple_unlock(&bufhashlist_slock);
 218 }
 219
 220 void
 221 bremhash(struct buf *bp)
 222 {
 223         int s;
 224
 225         simple_lock(&bufhashlist_slock);
 226         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 227                 panic("bremhash le_prev is deadbeef");
 228         if (bp->b_hash.le_next == bp)
 229                 panic("bremhash: next points to self");
 230
 231         if (bp->b_hash.le_next != NULL)
 232                 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
 233         *bp->b_hash.le_prev = (bp)->b_hash.le_next;
 234         simple_unlock(&bufhashlist_slock);
 235 }
 236
 237 #endif /* 1 */
 238
 239
 240 /*
 241  * Remove a buffer from the free list it's on
 242  */
 243 void
 244 bremfree(bp)
 245         struct buf *bp;
 246 {
 247         struct bqueues *dp = NULL;
 248         int whichq = -1;
 249
 250         /*
 251          * We only calculate the head of the freelist when removing
 252          * the last element of the list as that is the only time that
 253          * it is needed (e.g. to reset the tail pointer).
 254          *
 255          * NB: This makes an assumption about how tailq's are implemented.
 256          */
 257         if (bp->b_freelist.tqe_next == NULL) {
 258                 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
 259                         if (dp->tqh_last == &bp->b_freelist.tqe_next)
 260                                 break;
 261                 if (dp == &bufqueues[BQUEUES])
 262                         panic("bremfree: lost tail");
 263         }
 264         TAILQ_REMOVE(dp, bp, b_freelist);
 265         whichq = bp->b_whichq;
 266         bufqdec(whichq);
 267         bp->b_whichq = -1;
 268         bp->b_timestamp = 0;
 269 }
 270
 271 static __inline__ void
 272 bufhdrinit(struct buf *bp)
 273 {
 274         bzero((char *)bp, sizeof *bp);
 275         bp->b_dev = NODEV;
 276         bp->b_rcred = NOCRED;
 277         bp->b_wcred = NOCRED;
 278         bp->b_vnbufs.le_next = NOLIST;
 279         bp->b_flags = B_INVAL;
 280
 281         return;
 282 }
 283
 284 /*
 285  * Initialize buffers and hash links for buffers.
 286  */
 287 void
 288 bufinit()
 289 {
 290         register struct buf *bp;
 291         register struct bqueues *dp;
 292         register int i;
 293         int metabuf;
 294         long whichq;
 295         static void bufzoneinit();
 296         static void bcleanbuf_thread_init();
 297
 298         /* Initialize the buffer queues ('freelists') and the hash table */
 299         for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
 300                 TAILQ_INIT(dp);
 301         bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
 302
 303         simple_lock_init(&bufhashlist_slock );
 304
 305         metabuf = nbuf/8; /* reserved for meta buf */
 306
 307         /* Initialize the buffer headers */
 308         for (i = 0; i < nbuf; i++) {
 309                 bp = &buf[i];
 310                 bufhdrinit(bp);
 311
 312                 /*
 313                  * metabuf buffer headers on the meta-data list and
 314                  * rest of the buffer headers on the empty list
 315                  */
 316                 if (--metabuf)
 317                         whichq = BQ_META;
 318                 else
 319                         whichq = BQ_EMPTY;
 320
 321                 BLISTNONE(bp);
 322                 dp = &bufqueues[whichq];
 323                 binsheadfree(bp, dp, whichq);
 324                 binshash(bp, &invalhash);
 325         }
 326
 327         for (; i < nbuf + niobuf; i++) {
 328                 bp = &buf[i];
 329                 bufhdrinit(bp);
 330                 binsheadfree(bp, &iobufqueue, -1);
 331         }
 332
 333         printf("using %d buffer headers and %d cluster IO buffer headers\n",
 334                 nbuf, niobuf);
 335
 336         /* Set up zones used by the buffer cache */
 337         bufzoneinit();
 338
 339         /* start the bcleanbuf() thread */
 340         bcleanbuf_thread_init();
 341
 342 #if 0   /* notyet */
 343         /* create a thread to do dynamic buffer queue balancing */
 344         bufq_balance_thread_init();
 345 #endif /* XXX */
 346 }
 347
 348 /* __inline  */
 349 struct buf *
 350 bio_doread(vp, blkno, size, cred, async, queuetype)
 351         struct vnode *vp;
 352         daddr_t blkno;
 353         int size;
 354         struct ucred *cred;
 355         int async;
 356         int queuetype;
 357 {
 358         register struct buf *bp;
 359         struct proc     *p = current_proc();
 360
 361         bp = getblk(vp, blkno, size, 0, 0, queuetype);
 362
 363         /*
 364          * If buffer does not have data valid, start a read.
 365          * Note that if buffer is B_INVAL, getblk() won't return it.
 366          * Therefore, it's valid if it's I/O has completed or been delayed.
 367          */
 368         if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
 369                 /* Start I/O for the buffer (keeping credentials). */
 370                 SET(bp->b_flags, B_READ | async);
 371                 if (cred != NOCRED && bp->b_rcred == NOCRED) {
 372                         /*
 373                          * NFS has embedded ucred.
 374                          * Can not crhold() here as that causes zone corruption
 375                          */
 376                         bp->b_rcred = crdup(cred);
 377                 }
 378                 VOP_STRATEGY(bp);
 379
 380                 trace(TR_BREADMISS, pack(vp, size), blkno);
 381
 382                 /* Pay for the read. */
 383                 if (p && p->p_stats)
 384                         p->p_stats->p_ru.ru_inblock++;          /* XXX */
 385         } else if (async) {
 386                 brelse(bp);
 387         }
 388
 389         trace(TR_BREADHIT, pack(vp, size), blkno);
 390
 391         return (bp);
 392 }
 393 /*
 394  * Read a disk block.
 395  * This algorithm described in Bach (p.54).
 396  */
 397 int
 398 bread(vp, blkno, size, cred, bpp)
 399         struct vnode *vp;
 400         daddr_t blkno;
 401         int size;
 402         struct ucred *cred;
 403         struct buf **bpp;
 404 {
 405         register struct buf *bp;
 406
 407         /* Get buffer for block. */
 408         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
 409
 410         /* Wait for the read to complete, and return result. */
 411         return (biowait(bp));
 412 }
 413
 414 /*
 415  * Read a disk block. [bread() for meta-data]
 416  * This algorithm described in Bach (p.54).
 417  */
 418 int
 419 meta_bread(vp, blkno, size, cred, bpp)
 420         struct vnode *vp;
 421         daddr_t blkno;
 422         int size;
 423         struct ucred *cred;
 424         struct buf **bpp;
 425 {
 426         register struct buf *bp;
 427
 428         /* Get buffer for block. */
 429         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
 430
 431         /* Wait for the read to complete, and return result. */
 432         return (biowait(bp));
 433 }
 434
 435 /*
 436  * Read-ahead multiple disk blocks. The first is sync, the rest async.
 437  * Trivial modification to the breada algorithm presented in Bach (p.55).
 438  */
 439 int
 440 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
 441         struct vnode *vp;
 442         daddr_t blkno; int size;
 443         daddr_t rablks[]; int rasizes[];
 444         int nrablks;
 445         struct ucred *cred;
 446         struct buf **bpp;
 447 {
 448         register struct buf *bp;
 449         int i;
 450
 451         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
 452
 453         /*
 454          * For each of the read-ahead blocks, start a read, if necessary.
 455          */
 456         for (i = 0; i < nrablks; i++) {
 457                 /* If it's in the cache, just go on to next one. */
 458                 if (incore(vp, rablks[i]))
 459                         continue;
 460
 461                 /* Get a buffer for the read-ahead block */
 462                 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
 463         }
 464
 465         /* Otherwise, we had to start a read for it; wait until it's valid. */
 466         return (biowait(bp));
 467 }
 468
 469 /*
 470  * Read with single-block read-ahead.  Defined in Bach (p.55), but
 471  * implemented as a call to breadn().
 472  * XXX for compatibility with old file systems.
 473  */
 474 int
 475 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
 476         struct vnode *vp;
 477         daddr_t blkno; int size;
 478         daddr_t rablkno; int rabsize;
 479         struct ucred *cred;
 480         struct buf **bpp;
 481 {
 482
 483         return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
 484 }
 485
 486 /*
 487  * Block write.  Described in Bach (p.56)
 488  */
 489 int
 490 bwrite(bp)
 491         struct buf *bp;
 492 {
 493         int rv, sync, wasdelayed;
 494         struct proc     *p = current_proc();
 495         upl_t  upl;
 496         upl_page_info_t *pl;
 497         void * object;
 498         kern_return_t kret;
 499         struct vnode *vp = bp->b_vp;
 500
 501         /* Remember buffer type, to switch on it later. */
 502         sync = !ISSET(bp->b_flags, B_ASYNC);
 503         wasdelayed = ISSET(bp->b_flags, B_DELWRI);
 504         CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
 505
 506         if (!sync) {
 507                 /*
 508                  * If not synchronous, pay for the I/O operation and make
 509                  * sure the buf is on the correct vnode queue.  We have
 510                  * to do this now, because if we don't, the vnode may not
 511                  * be properly notified that its I/O has completed.
 512                  */
 513                 if (wasdelayed)
 514                         reassignbuf(bp, vp);
 515                 else
 516                 if (p && p->p_stats)
 517                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 518         }
 519
 520         trace(TR_BWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
 521
 522         /* Initiate disk write.  Make sure the appropriate party is charged. */
 523         SET(bp->b_flags, B_WRITEINPROG);
 524         vp->v_numoutput++;
 525
 526         VOP_STRATEGY(bp);
 527
 528         if (sync) {
 529                 /*
 530                  * If I/O was synchronous, wait for it to complete.
 531                  */
 532                 rv = biowait(bp);
 533
 534                 /*
 535                  * Pay for the I/O operation, if it's not been paid for, and
 536                  * make sure it's on the correct vnode queue. (async operatings
 537                  * were payed for above.)
 538                  */
 539                 if (wasdelayed)
 540                         reassignbuf(bp, vp);
 541                 else
 542                 if (p && p->p_stats)
 543                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 544
 545                 /* Release the buffer. */
 546                 brelse(bp);
 547
 548                 return (rv);
 549         } else {
 550                 return (0);
 551         }
 552 }
 553
 554 int
 555 vn_bwrite(ap)
 556         struct vop_bwrite_args *ap;
 557 {
 558         return (bwrite(ap->a_bp));
 559 }
 560
 561 /*
 562  * Delayed write.
 563  *
 564  * The buffer is marked dirty, but is not queued for I/O.
 565  * This routine should be used when the buffer is expected
 566  * to be modified again soon, typically a small write that
 567  * partially fills a buffer.
 568  *
 569  * NB: magnetic tapes cannot be delayed; they must be
 570  * written in the order that the writes are requested.
 571  *
 572  * Described in Leffler, et al. (pp. 208-213).
 573  */
 574 void
 575 bdwrite(bp)
 576         struct buf *bp;
 577 {
 578         struct proc *p = current_proc();
 579         kern_return_t kret;
 580         upl_t upl;
 581         upl_page_info_t *pl;
 582
 583         /*
 584          * If the block hasn't been seen before:
 585          *      (1) Mark it as having been seen,
 586          *      (2) Charge for the write.
 587          *      (3) Make sure it's on its vnode's correct block list,
 588          */
 589         if (!ISSET(bp->b_flags, B_DELWRI)) {
 590                 SET(bp->b_flags, B_DELWRI);
 591                 if (p && p->p_stats)
 592                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 593
 594                 reassignbuf(bp, bp->b_vp);
 595         }
 596
 597
 598         /* If this is a tape block, write it the block now. */
 599         if (ISSET(bp->b_flags, B_TAPE)) {
 600                 /* bwrite(bp); */
 601         VOP_BWRITE(bp);
 602                 return;
 603         }
 604
 605         /* Otherwise, the "write" is done, so mark and release the buffer. */
 606         SET(bp->b_flags, B_DONE);
 607         brelse(bp);
 608 }
 609
 610 /*
 611  * Asynchronous block write; just an asynchronous bwrite().
 612  */
 613 void
 614 bawrite(bp)
 615         struct buf *bp;
 616 {
 617
 618         SET(bp->b_flags, B_ASYNC);
 619         VOP_BWRITE(bp);
 620 }
 621
  622 /*
  623  * Release a buffer on to the free lists.
  624  * Described in Bach (p. 46).
       *
       * Steps, in order:
       *   1. For a non-meta buffer whose vnode has UBC info and whose
       *      b_bufsize is non-zero, commit or abort the buffer's UPL.
       *   2. Wake up sleepers waiting for any buffer (needbuffer) and for
       *      this particular buffer (B_WANTED).
       *   3. At splbio(), choose a free queue from the buffer's flags and
       *      size, insert the buffer, and clear B_AGE, B_ASYNC, B_BUSY and
       *      B_NOCACHE.
  625  */
  626 void
  627 brelse(bp)
  628         struct buf *bp;
  629 {
  630         struct bqueues *bufq;
  631         int s;
  632         long whichq;
  633
  634         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
  635                      bp->b_lblkno * PAGE_SIZE, bp, bp->b_data, bp->b_flags, 0);
  636
  637         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
  638
  639         /* IO is done. Clean up the UPL state */
  640         if (!ISSET(bp->b_flags, B_META)
  641                 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
  642                 kern_return_t kret;
  643                 upl_t         upl;
  644                 int           upl_flags;
  645
  646                 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
                              /*
                               * No page list attached.  A valid buffer gets a
                               * UPL created over its range so it can be
                               * committed below; an invalid one gets none.
                               */
  647                         if ( !ISSET(bp->b_flags, B_INVAL)) {
  648                                 kret = ubc_create_upl(bp->b_vp,
  649                                                                 ubc_blktooff(bp->b_vp, bp->b_lblkno),
  650                                                                 bp->b_bufsize,
  651                                                             &upl,
  652                                                                 NULL,
  653                                                                 UPL_PRECIOUS);
  654                                 if (kret != KERN_SUCCESS)
  655                                         panic("brelse: Failed to get pagelists");
  656 #ifdef  UBC_DEBUG
  657                                 upl_ubc_alias_set(upl, bp, 5);
  658 #endif /* UBC_DEBUG */
  659                         } else
  660                                 upl = (upl_t) 0;
  661                 } else {
                              /*
                               * A page list is attached: unmap it and drop the
                               * buffer's data pointer.
                               */
  662                         upl = bp->b_pagelist;
  663                         kret = ubc_upl_unmap(upl);
  664
  665                         if (kret != KERN_SUCCESS)
  666                                 panic("kernel_upl_unmap failed");
  667                         bp->b_data = 0;
  668                 }
  669                 if (upl) {
                              /*
                               * Error or invalid buffer: abort the UPL, dumping
                               * the pages when the buffer is a read or is
                               * invalid.  Otherwise commit the whole range,
                               * marking the pages dirty if the buffer is a
                               * delayed write or was dirty.
                               */
  670                         if (bp->b_flags & (B_ERROR | B_INVAL)) {
  671                             if (bp->b_flags & (B_READ | B_INVAL))
  672                                         upl_flags = UPL_ABORT_DUMP_PAGES;
  673                                 else
  674                                         upl_flags = 0;
  675                                 ubc_upl_abort(upl, upl_flags);
  676                         } else {
  677                             if (ISSET(bp->b_flags, (B_DELWRI | B_WASDIRTY)))
  678                                         upl_flags = UPL_COMMIT_SET_DIRTY ;
  679                                 else
  680                                     upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
  681                                 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
  682                                         UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
  683                         }
                              /* The buffer no longer owns a page list. */
  684                         s = splbio();
  685                         CLR(bp->b_flags, B_PAGELIST);
  686                         bp->b_pagelist = 0;
  687                         splx(s);
  688                 }
  689         } else {
                      /*
                       * Meta buffer, vnode without UBC info, or zero-sized
                       * buffer: it must not carry a page list.
                       */
  690                 if(ISSET(bp->b_flags, B_PAGELIST))
  691                         panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
  692         }
  693
  694         /* Wake up any processes waiting for any buffer to become free. */
  695         if (needbuffer) {
  696                 needbuffer = 0;
  697                 wakeup(&needbuffer);
  698         }
  699
  700         /* Wake up any processes waiting for _this_ buffer to become free. */
  701         if (ISSET(bp->b_flags, B_WANTED)) {
  702                 CLR(bp->b_flags, B_WANTED);
  703                 wakeup(bp);
  704         }
  705
  706         /* Block disk interrupts. */
  707         s = splbio();
  708
  709         /*
  710          * Determine which queue the buffer should be on, then put it there.
  711          */
  712
  713         /* If it's locked, don't report an error; try again later. */
  714         if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
  715                 CLR(bp->b_flags, B_ERROR);
  716
  717         /* If it's not cacheable, or an error, mark it invalid. */
  718         if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
  719                 SET(bp->b_flags, B_INVAL);
  720
  721         if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
  722                 /*
  723                  * If it's invalid or empty, dissociate it from its vnode
  724                  * and put on the head of the appropriate queue.
  725                  */
  726                 if (bp->b_vp)
  727                         brelvp(bp);
  728                 CLR(bp->b_flags, B_DELWRI);
  729                 if (bp->b_bufsize <= 0)
  730                         whichq = BQ_EMPTY;      /* no data */
  731                 else
  732                         whichq = BQ_AGE;        /* invalid data */
  733
  734                 bufq = &bufqueues[whichq];
  735                 binsheadfree(bp, bufq, whichq);
  736         } else {
  737                 /*
  738                  * It has valid data.  Put it on the end of the appropriate
  739                  * queue, so that it'll stick around for as long as possible.
  740                  */
  741                 if (ISSET(bp->b_flags, B_LOCKED))
  742                         whichq = BQ_LOCKED;             /* locked in core */
  743                 else if (ISSET(bp->b_flags, B_META))
  744                         whichq = BQ_META;               /* meta-data */
  745                 else if (ISSET(bp->b_flags, B_AGE))
  746                         whichq = BQ_AGE;                /* stale but valid data */
  747                 else
  748                         whichq = BQ_LRU;                /* valid data */
  749
  750                 bufq = &bufqueues[whichq];
  751                 binstailfree(bp, bufq, whichq);
  752         }
  753
  754         /* Unlock the buffer. */
  755         CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
  756
  757         /* Allow disk interrupts. */
  758         splx(s);
  759
  760         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
  761                      bp, bp->b_data, bp->b_flags, 0, 0);
  762 }
 763
 764 /*
 765  * Determine if a block is in the cache.
 766  * Just look on what would be its hash chain.  If it's there, return
 767  * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 768  * we normally don't return the buffer, unless the caller explicitly
 769  * wants us to.
 770  */
 771 struct buf *
 772 incore(vp, blkno)
 773         struct vnode *vp;
 774         daddr_t blkno;
 775 {
 776         struct buf *bp;
 777         int bufseen = 0;
 778
 779         bp = BUFHASH(vp, blkno)->lh_first;
 780
 781         /* Search hash chain */
 782         for (; bp != NULL; bp = bp->b_hash.le_next, bufseen++) {
 783                 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
 784                     !ISSET(bp->b_flags, B_INVAL))
 785                         return (bp);
 786         if(bufseen >= nbuf)
 787                 panic("walked more than nbuf in incore");
 788
 789         }
 790
 791         return (0);
 792 }
 793
 794 /* XXX FIXME -- Update the comment to reflect the UBC changes -- */
 795 /*
 796  * Get a block of requested size that is associated with
 797  * a given vnode and block offset. If it is found in the
 798  * block cache, mark it as having been found, make it busy
 799  * and return it. Otherwise, return an empty block of the
  800  * correct size. It is up to the caller to ensure that the
 801  * cached blocks be of the correct size.
 802  */
 803 struct buf *
 804 getblk(vp, blkno, size, slpflag, slptimeo, operation)
 805         register struct vnode *vp;
 806         daddr_t blkno;
 807         int size, slpflag, slptimeo, operation;
 808 {
 809         struct buf *bp;
 810         int s, err;
 811         upl_t upl;
 812         upl_page_info_t *pl;
 813         kern_return_t kret;
 814         int error=0;
 815         int pagedirty = 0;
 816
 817         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
 818                      blkno * PAGE_SIZE, size, operation, 0, 0);
 819 start:
 820
 821         s = splbio();
 822         if (bp = incore(vp, blkno)) {
 823                 /* Found in the Buffer Cache */
 824                 if (ISSET(bp->b_flags, B_BUSY)) {
 825                         /* but is busy */
 826                         switch (operation) {
 827                         case BLK_READ:
 828                         case BLK_WRITE:
 829                         case BLK_META:
 830                                 SET(bp->b_flags, B_WANTED);
 831                                 bufstats.bufs_busyincore++;
 832                                 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
 833                                     slptimeo);
 834                                 splx(s);
 835                                 /*
 836                                  * Callers who call with PCATCH or timeout are
 837                                  * willing to deal with the NULL pointer
 838                                  */
 839                                 if (err && ((slpflag & PCATCH) ||
 840                                                          ((err == EWOULDBLOCK) && slptimeo)))
 841                                         return (NULL);
 842                                 goto start;
 843                                 /*NOTREACHED*/
 844                                 break;
 845
 846                         case BLK_PAGEIN:
 847                                 /* pagein operation must not use getblk */
 848                                 panic("getblk: pagein for incore busy buffer");
 849                                 splx(s);
 850                                 /*NOTREACHED*/
 851                                 break;
 852
 853                         case BLK_PAGEOUT:
 854                                 /* pageout operation must not use getblk */
 855                                 panic("getblk: pageout for incore busy buffer");
 856                                 splx(s);
 857                                 /*NOTREACHED*/
 858                                 break;
 859
 860                         default:
 861                                 panic("getblk: %d unknown operation 1", operation);
 862                                 /*NOTREACHED*/
 863                                 break;
 864                         }
 865                 } else {
 866                         /* not busy */
 867                         SET(bp->b_flags, (B_BUSY | B_CACHE));
 868                         bremfree(bp);
 869                         bufstats.bufs_incore++;
 870                         splx(s);
 871
 872                         allocbuf(bp, size);
 873                         if (ISSET(bp->b_flags, B_PAGELIST))
 874                                         panic("pagelist buffer is not busy");
 875
 876                         switch (operation) {
 877                         case BLK_READ:
 878                         case BLK_WRITE:
 879                                 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
 880                                         kret = ubc_create_upl(vp,
 881                                                                         ubc_blktooff(vp, bp->b_lblkno),
 882                                                                         bp->b_bufsize,
 883                                                                         &upl,
 884                                                                         &pl,
 885                                                                         UPL_PRECIOUS);
 886                                         if (kret != KERN_SUCCESS)
 887                                                 panic("Failed to get pagelists");
 888
 889                                         SET(bp->b_flags, B_PAGELIST);
 890                                         bp->b_pagelist = upl;
 891
 892                                         if ( !upl_valid_page(pl, 0))
 893                                                 panic("getblk: incore buffer without valid page");
 894
 895                                         if (upl_dirty_page(pl, 0))
 896                                                 SET(bp->b_flags, B_WASDIRTY);
 897                                         else
 898                                                 CLR(bp->b_flags, B_WASDIRTY);
 899
 900                                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
 901                                         if (kret != KERN_SUCCESS) {
 902                                                 panic("getblk: ubc_upl_map() failed with (%d)",
 903                                                                   kret);
 904                                         }
 905                                         if (bp->b_data == 0) panic("ubc_upl_map mapped 0");
 906                                 }
 907                                 break;
 908
 909                         case BLK_META:
 910                                 /*
 911                                  * VM is not involved in IO for the meta data
 912                                  * buffer already has valid data
 913                                  */
 914                         if(bp->b_data == 0)
 915                                         panic("bp->b_data null incore buf=%x", bp);
 916                                 break;
 917
 918                         case BLK_PAGEIN:
 919                         case BLK_PAGEOUT:
 920                                 panic("getblk: paging operation 1");
 921                                 break;
 922
 923                         default:
 924                                 panic("getblk: %d unknown operation 2", operation);
 925                                 /*NOTREACHED*/
 926                                 break;
 927                         }
 928                 }
 929         } else { /* not incore() */
 930                 int queue = BQ_EMPTY; /* Start with no preference */
 931                 splx(s);
 932
 933                 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
 934                         !(UBCINFOEXISTS(vp))) {
 935                         operation = BLK_META;
 936                 }
 937                 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
 938                         goto start;
 939                 if (incore(vp, blkno)) {
 940                         SET(bp->b_flags, B_INVAL);
 941                         binshash(bp, &invalhash);
 942                         brelse(bp);
 943                         goto start;
 944                 }
 945
 946                 /*
 947                  * if it is meta, the queue may be set to other
 948                  * type so reset as well as mark it to be B_META
 949                  * so that when buffer is released it will goto META queue
 950                  * Also, if the vnode is not VREG, then it is META
 951                  */
 952                 if (operation == BLK_META) {
 953                         SET(bp->b_flags, B_META);
 954                         queue = BQ_META;
 955                 }
 956                 /*
 957                  * Insert in the hash so that incore() can find it
 958                  */
 959                 binshash(bp, BUFHASH(vp, blkno));
 960
 961                 allocbuf(bp, size);
 962
 963                 switch (operation) {
 964                 case BLK_META:
 965                         /* buffer data is invalid */
 966
 967 #if !ZALLOC_METADATA
 968                         if (bp->b_data)
 969                                 panic("bp->b_data is not nul; %x",bp);
 970                         kret = kmem_alloc(kernel_map,
 971                                                 &bp->b_data, bp->b_bufsize);
 972                         if (kret != KERN_SUCCESS)
 973                                 panic("getblk: kmem_alloc() returned %d", kret);
 974 #endif /* ZALLOC_METADATA */
 975
 976                         if(bp->b_data == 0)
 977                                 panic("bp->b_data is null %x",bp);
 978
 979                         bp->b_blkno = bp->b_lblkno = blkno;
 980                         s = splbio();
 981                         bgetvp(vp, bp);
 982                         bufstats.bufs_miss++;
 983                         splx(s);
 984                         if (bp->b_data == 0)
 985                                 panic("b_data is 0: 2");
 986
 987                         /* wakeup the buffer */
 988                         CLR(bp->b_flags, B_WANTED);
 989                         wakeup(bp);
 990                         break;
 991
 992                 case BLK_READ:
 993                 case BLK_WRITE:
 994
 995                         if (ISSET(bp->b_flags, B_PAGELIST))
 996                                 panic("B_PAGELIST in bp=%x",bp);
 997
 998                         kret = ubc_create_upl(vp,
 999                                                         ubc_blktooff(vp, blkno),
1000                                                         bp->b_bufsize,
1001                                                         &upl,
1002                                                         &pl,
1003                                                         UPL_PRECIOUS);
1004                         if (kret != KERN_SUCCESS)
1005                                 panic("Failed to get pagelists");
1006
1007 #ifdef  UBC_DEBUG
1008                         upl_ubc_alias_set(upl, bp, 4);
1009 #endif /* UBC_DEBUG */
1010                         bp->b_blkno = bp->b_lblkno = blkno;
1011                         bp->b_pagelist = upl;
1012
1013                         SET(bp->b_flags, B_PAGELIST);
1014
1015                         if (upl_valid_page(pl, 0)) {
1016                                 SET(bp->b_flags, B_CACHE | B_DONE);
1017                                 bufstats.bufs_vmhits++;
1018
1019                                 pagedirty = upl_dirty_page(pl, 0);
1020
1021                                 if (pagedirty)
1022                                         SET(bp->b_flags, B_WASDIRTY);
1023
1024                                 if (vp->v_tag == VT_NFS) {
1025                                         off_t  f_offset;
1026                                         int    valid_size;
1027
1028                                         bp->b_validoff = 0;
1029                                         bp->b_dirtyoff = 0;
1030
1031                                         f_offset = ubc_blktooff(vp, blkno);
1032
1033                                         if (f_offset > vp->v_ubcinfo->ui_size) {
1034                                                 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1035                                                 bp->b_validend = 0;
1036                                                 bp->b_dirtyend = 0;
1037                                         } else {
1038                                                 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1039                                                 bp->b_validend = valid_size;
1040
1041                                                 if (pagedirty)
1042                                                        bp->b_dirtyend = valid_size;
1043                                                 else
1044                                                        bp->b_dirtyend = 0;
1045
1046                                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1047                                                              bp->b_validend, bp->b_dirtyend,
1048                                                              (int)vp->v_ubcinfo->ui_size, 0, 0);
1049                                         }
1050                                 } else {
1051                                         bp->b_validoff = 0;
1052                                         bp->b_dirtyoff = 0;
1053
1054                                         if (pagedirty) {
1055                                                 /* page is dirty */
1056                                                 bp->b_validend = bp->b_bcount;
1057                                                 bp->b_dirtyend = bp->b_bcount;
1058                                         } else {
1059                                                 /* page is clean */
1060                                                 bp->b_validend = bp->b_bcount;
1061                                                 bp->b_dirtyend = 0;
1062                                         }
1063                                 }
1064                                 if (error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) {
1065                                         panic("VOP_BMAP failed in getblk");
1066                                         /*NOTREACHED*/
1067                                         /*
1068                                          * XXX:  We probably should invalidate the VM Page
1069                                          */
1070                                         bp->b_error = error;
1071                                         SET(bp->b_flags, (B_ERROR | B_INVAL));
1072                                         /* undo B_DONE that was set before upl_commit() */
1073                                         CLR(bp->b_flags, B_DONE);
1074                                         brelse(bp);
1075                                         return (0);
1076                                 }
1077                         } else {
1078                                 bufstats.bufs_miss++;
1079                         }
1080                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1081                         if (kret != KERN_SUCCESS) {
1082                                 panic("getblk: ubc_upl_map() "
1083                                       "failed with (%d)", kret);
1084                         }
1085                         if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
1086
1087                         s = splbio();
1088                         bgetvp(vp, bp);
1089                         splx(s);
1090
1091                         break;
1092
1093                 case BLK_PAGEIN:
1094                 case BLK_PAGEOUT:
1095                         panic("getblk: paging operation 2");
1096                         break;
1097                 default:
1098                         panic("getblk: %d unknown operation 3", operation);
1099                         /*NOTREACHED*/
1100                         break;
1101                 }
1102         }
1103
1104         if (bp->b_data == NULL)
1105                 panic("getblk: bp->b_addr is null");
1106
1107         if (bp->b_bufsize & 0xfff) {
1108 #if ZALLOC_METADATA
1109                 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1110 #endif /* ZALLOC_METADATA */
1111                         panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1112         }
1113
1114         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1115                      bp, bp->b_data, bp->b_flags, 3, 0);
1116
1117         return (bp);
1118 }
1119
 1120 /*
 1121  * Get an empty, disassociated buffer of given size.
       *
       * The header comes from getnewbuf(), which is retried until it hands
       * one back.  The buffer is marked B_INVAL (and B_META when
       * ZALLOC_METADATA is set), entered on invalhash and sized with
       * allocbuf().  When ZALLOC_METADATA is not set, the data area is
       * allocated here with kmem_alloc_aligned(), and a request that
       * rounds up to more than MAXBSIZE panics.  No vnode is attached here.
 1122  */
 1123 struct buf *
 1124 geteblk(size)
 1125         int size;
 1126 {
 1127         struct buf *bp;
 1128     int queue = BQ_EMPTY;      /* queue preference passed to getnewbuf() */
 1129 #if !ZALLOC_METADATA
 1130         kern_return_t kret;
 1131         vm_size_t desired_size = roundup(size, CLBYTES);
 1132
 1133         if (desired_size > MAXBSIZE)
 1134                 panic("geteblk: buffer larger than MAXBSIZE requested");
 1135 #endif /* ZALLOC_METADATA */
 1136
 1137         while ((bp = getnewbuf(0, 0, &queue)) == 0)
 1138                 ;       /* getnewbuf() returns 0 only after sleeping for a free buffer */
 1139 #if ZALLOC_METADATA
              /* B_META lets allocbuf() below obtain the data area */
 1140         SET(bp->b_flags, (B_META|B_INVAL));
 1141 #else
 1142         SET(bp->b_flags, B_INVAL);
 1143 #endif /* ZALLOC_METADATA */
 1144
 1145 #if DIAGNOSTIC
 1146         assert(queue == BQ_EMPTY);
 1147 #endif /* DIAGNOSTIC */
 1148         /* XXX need to implement logic to deal with other queues */
 1149
 1150 #if !ZALLOC_METADATA
 1151         /* Empty buffer - allocate pages */
 1152         kret = kmem_alloc_aligned(kernel_map, &bp->b_data, desired_size);
 1153         if (kret != KERN_SUCCESS)
 1154                 panic("geteblk: kmem_alloc_aligned returned %d", kret);
 1155 #endif /* ZALLOC_METADATA */
 1156
              /* no vnode/block identity, so the buffer is hashed on invalhash */
 1157         binshash(bp, &invalhash);
              /* records b_bufsize / b_bcount for the requested size */
 1158         allocbuf(bp, size);
 1159         bufstats.bufs_eblk++;
 1160
 1161         return (bp);
 1162 }
1163
 1164 #if ZALLOC_METADATA
 1165 /*
 1166  * Zones for the meta data buffers
       *
       * There is one zone per multiple of MINMETA, up to MAXMETA.
       * getbufzone() maps a size to entry (size / 512) - 1, so the table
       * must stay in ascending size order with no gaps.  The zero-size
       * entry terminates the table for bufzoneinit().
 1167  */
 1168
 1169 #define MINMETA 512     /* smallest zone element size; allocbuf() rounds meta sizes up to a multiple of this */
 1170 #define MAXMETA 4096    /* largest zone element size; allocbuf() uses kmem_alloc() for new buffers above this */
 1171
 1172 struct meta_zone_entry {
 1173         zone_t mz_zone;         /* zone handle, filled in by bufzoneinit() */
 1174         vm_size_t mz_size;      /* element size; first argument to zinit() */
 1175         vm_size_t mz_max;       /* second argument to zinit() */
 1176         char *mz_name;          /* zone name; last argument to zinit() */
 1177 };
 1178
 1179 struct meta_zone_entry meta_zones[] = {
 1180         {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
 1181         {NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
 1182         {NULL, (MINMETA * 3),  16 * (MINMETA * 3), "buf.1536" },
 1183         {NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
 1184         {NULL, (MINMETA * 5),  16 * (MINMETA * 5), "buf.2560" },
 1185         {NULL, (MINMETA * 6),  16 * (MINMETA * 6), "buf.3072" },
 1186         {NULL, (MINMETA * 7),  16 * (MINMETA * 7), "buf.3584" },
 1187         {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
 1188         {NULL, 0, 0, "" } /* End */
 1189 };
 1190 #endif /* ZALLOC_METADATA */
1191
 1192 zone_t buf_hdr_zone;    /* zone of struct buf headers; created in bufzoneinit() */
 1193 int buf_hdr_count;      /* incremented by getnewbuf() for each header it takes from buf_hdr_zone */
1194
 1195 /*
 1196  * Initialize the meta data zones
       * (only when ZALLOC_METADATA is set) and buf_hdr_zone, the zone that
       * supplies struct buf headers.  Every meta_zones[] entry up to the
       * zero-size terminator gets its own zone from zinit().
 1197  */
 1198 static void
 1199 bufzoneinit(void)
 1200 {
 1201 #if ZALLOC_METADATA
 1202         int i;
 1203
              /* walk the table until the terminating entry (mz_size == 0) */
 1204         for (i = 0; meta_zones[i].mz_size != 0; i++) {
 1205                 meta_zones[i].mz_zone =
 1206                                 zinit(meta_zones[i].mz_size,
 1207                                         meta_zones[i].mz_max,
 1208                                         PAGE_SIZE,
 1209                                         meta_zones[i].mz_name);
 1210         }
 1211 #endif /* ZALLOC_METADATA */
              /* zone for the buffer headers that getnewbuf() allocates on demand */
 1212         buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
 1213 }
1214
1215 #if ZALLOC_METADATA
1216 static zone_t
1217 getbufzone(size_t size)
1218 {
1219         int i;
1220
1221         if (size % 512)
1222                 panic("getbufzone: incorect size = %d", size);
1223
1224         i = (size / 512) - 1;
1225         return (meta_zones[i].mz_zone);
1226 }
1227 #endif /* ZALLOC_METADATA */
1228
1229 /*
1230  * With UBC, there is no need to expand / shrink the file data
1231  * buffer. The VM uses the same pages, hence no waste.
1232  * All the file data buffers can have one size.
1233  * In fact expand / shrink would be an expensive operation.
1234  *
1235  * Only exception to this is meta-data buffers. Most of the
1236  * meta data operations are smaller than PAGE_SIZE. Having the
1237  * meta-data buffers grow and shrink as needed, optimizes use
1238  * of the kernel wired memory.
1239  */
1240
1241 int
1242 allocbuf(bp, size)
1243         struct buf *bp;
1244         int size;
1245 {
1246         vm_size_t desired_size;
1247
1248         desired_size = roundup(size, CLBYTES);
1249
1250         if(desired_size < PAGE_SIZE)
1251                 desired_size = PAGE_SIZE;
1252         if (desired_size > MAXBSIZE)
1253                 panic("allocbuf: buffer larger than MAXBSIZE requested");
1254
1255 #if ZALLOC_METADATA
1256         if (ISSET(bp->b_flags, B_META)) {
1257                 kern_return_t kret;
1258                 zone_t zprev, z;
1259                 size_t nsize = roundup(size, MINMETA);
1260
1261                 if (bp->b_data) {
1262                         vm_offset_t elem = (vm_offset_t)bp->b_data;
1263
1264                         if (ISSET(bp->b_flags, B_ZALLOC))
1265                                 if (bp->b_bufsize <= MAXMETA) {
1266                                         if (bp->b_bufsize < nsize) {
1267                                                 /* reallocate to a bigger size */
1268                                                 desired_size = nsize;
1269
1270                                                 zprev = getbufzone(bp->b_bufsize);
1271                                                 z = getbufzone(nsize);
1272                                                 bp->b_data = (caddr_t)zalloc(z);
1273                                                 if(bp->b_data == 0)
1274                                                         panic("allocbuf: zalloc() returned NULL");
1275                                                 bcopy(elem, bp->b_data, bp->b_bufsize);
1276                                                 zfree(zprev, elem);
1277                                         } else {
1278                                                 desired_size = bp->b_bufsize;
1279                                         }
1280                                 } else
1281                                         panic("allocbuf: B_ZALLOC set incorrectly");
1282                         else
1283                                 if (bp->b_bufsize < desired_size) {
1284                                         /* reallocate to a bigger size */
1285                                         kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1286                                         if (kret != KERN_SUCCESS)
1287                                                 panic("allocbuf: kmem_alloc() returned %d", kret);
1288                                         if(bp->b_data == 0)
1289                                                 panic("allocbuf: null b_data");
1290                                         bcopy(elem, bp->b_data, bp->b_bufsize);
1291                                         kmem_free(kernel_map, elem, bp->b_bufsize);
1292                                 } else {
1293                                         desired_size = bp->b_bufsize;
1294                                 }
1295                 } else {
1296                         /* new allocation */
1297                         if (nsize <= MAXMETA) {
1298                                 desired_size = nsize;
1299                                 z = getbufzone(nsize);
1300                                 bp->b_data = (caddr_t)zalloc(z);
1301                                 if(bp->b_data == 0)
1302                                         panic("allocbuf: zalloc() returned NULL 2");
1303                                 SET(bp->b_flags, B_ZALLOC);
1304                         } else {
1305                                 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1306                                 if (kret != KERN_SUCCESS)
1307                                         panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1308                                 if(bp->b_data == 0)
1309                                         panic("allocbuf: null b_data 2");
1310                         }
1311                 }
1312         }
1313
1314         if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1315                 panic("allocbuf: bp->b_data is NULL");
1316 #endif /* ZALLOC_METADATA */
1317
1318                 bp->b_bufsize = desired_size;
1319                 bp->b_bcount = size;
1320 }
1321
 1322 /*
 1323  *      Get a new buffer from one of the free lists.
 1324  *
 1325  *      The preferred queue is passed in through *queue, and the queue the
 1326  *      buffer was actually taken from is returned there. Out of range
 1327  *      requests (and BQ_LAUNDRY / BQ_LOCKED) are treated as BQ_EMPTY.
 1328  *      BQUEUES means no preference. The requested queue is tried first;
 1329  *      if it has nothing, the following heuristics apply:
 1330  *      If BQ_AGE, BQ_LRU and BQ_META are all empty, use BQ_EMPTY, else
 1331  *      allocate a new header from buf_hdr_zone, else sleep for a free
 1332  *      buffer and return 0 so that the caller can retry.
 1333  *      Otherwise pick between BQ_AGE and BQ_LRU by timestamp, then weigh
 1334  *      that choice against BQ_META using the *_is_stale thresholds.
 1335  *      The chosen buffer is handed to bcleanbuf(); if that queued it for a
 1336  *      delayed write (returned 1), the search restarts with the original
 1337  *      request. Otherwise the cleaned, B_BUSY buffer is returned.
 1338  */
 1339
 1340 static struct buf *
 1341 getnewbuf(slpflag, slptimeo, queue)
 1342         int slpflag, slptimeo;
 1343         int *queue;
 1344 {
 1345         register struct buf *bp;
 1346         register struct buf *lru_bp;
 1347         register struct buf *age_bp;
 1348         register struct buf *meta_bp;
 1349         register int age_time, lru_time, bp_time, meta_time;
 1350         int s;
 1351         struct ucred *cred;     /* NOTE(review): not referenced in this function */
 1352         int req = *queue; /* save it for restarts */
 1353
 1354 start:
 1355         s = splbio();
 1356
 1357         /* invalid request gets empty queue */
 1358         if ((*queue > BQUEUES) || (*queue < 0)
 1359                 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
 1360                 *queue = BQ_EMPTY;
 1361
 1362         /* (*queue == BQUEUES) means no preference */
 1363         if (*queue != BQUEUES) {
 1364                 /* Try for the requested queue first */
 1365                 bp = bufqueues[*queue].tqh_first;
 1366                 if (bp)
 1367                         goto found;
 1368         }
 1369
 1370         /* Unable to use requested queue */
 1371         age_bp = bufqueues[BQ_AGE].tqh_first;
 1372         lru_bp = bufqueues[BQ_LRU].tqh_first;
 1373         meta_bp = bufqueues[BQ_META].tqh_first;
 1374
 1375         if (!age_bp && !lru_bp && !meta_bp) { /* Unavailable on AGE, LRU and META */
 1376                 /* Try the empty list first */
 1377                 bp = bufqueues[BQ_EMPTY].tqh_first;
 1378                 if (bp) {
 1379                         *queue = BQ_EMPTY;
 1380                         goto found;
 1381                 }
 1382
 1383                 /* Create a new temporary buffer header */
 1384                 bp = (struct buf *)zalloc(buf_hdr_zone);
 1385
 1386                 if (bp) {
                              /*
                               * Put the new header on invalhash and at the head of
                               * BQ_EMPTY so that the "found" path below can treat it
                               * like any other free buffer.
                               */
 1387                         bufhdrinit(bp);
 1388                         BLISTNONE(bp);
 1389                         binshash(bp, &invalhash);
 1390                         SET(bp->b_flags, B_HDRALLOC);
 1391                         *queue = BQ_EMPTY;
 1392                         binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
 1393                         buf_hdr_count++;
 1394                         goto found;
 1395                 }
 1396
 1397                 /* Log this error condition */
 1398                 printf("getnewbuf: No useful buffers");
 1399
 1400                 /* wait for a free buffer of any kind */
 1401                 needbuffer = 1;
 1402                 bufstats.bufs_sleeps++;
 1403                 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
 1404                 splx(s);
                      /* no buffer is returned after the sleep; the caller has to call again */
 1405                 return (0);
 1406         }
 1407
 1408         /* Buffer available either on AGE or LRU or META */
 1409         bp = NULL;
 1410         *queue = -1;
 1411
 1412         /* Choose between AGE and LRU; bp stays NULL if both are empty */
 1413         if (!age_bp) {
 1414                 bp = lru_bp;
 1415                 *queue = BQ_LRU;
 1416         } else if (!lru_bp) {
 1417                 bp = age_bp;
 1418                 *queue = BQ_AGE;
 1419         } else { /* buffer available on both AGE and LRU */
 1420                 age_time = time.tv_sec - age_bp->b_timestamp;
 1421                 lru_time = time.tv_sec - lru_bp->b_timestamp;
 1422                 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
 1423                         bp = age_bp;
 1424                         *queue = BQ_AGE;
 1425                         /*
 1426                          * we should probably re-timestamp everything in the
 1427                          * queues at this point with the current time
 1428                          */
 1429                 } else {
                              /* take LRU only when it is stale and AGE is not */
 1430                         if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
 1431                                 bp = lru_bp;
 1432                                 *queue = BQ_LRU;
 1433                         } else {
 1434                                 bp = age_bp;
 1435                                 *queue = BQ_AGE;
 1436                         }
 1437                 }
 1438         }
 1439
 1440         if (!bp) { /* Neither on AGE nor on LRU */
 1441                 bp = meta_bp;
 1442                 *queue = BQ_META;
 1443         }  else if (meta_bp) {
 1444                 bp_time = time.tv_sec - bp->b_timestamp;
 1445                 meta_time = time.tv_sec - meta_bp->b_timestamp;
 1446
 1447                 if (!(bp_time < 0) && !(meta_time < 0)) {
 1448                         /* time not set backwards */
 1449                         int bp_is_stale;
 1450                         bp_is_stale = (*queue == BQ_LRU) ?
 1451                                         lru_is_stale : age_is_stale;
 1452
                              /* switch to META only when it is stale and the earlier choice is not */
 1453                         if ((meta_time >= meta_is_stale) &&
 1454                                         (bp_time < bp_is_stale)) {
 1455                                 bp = meta_bp;
 1456                                 *queue = BQ_META;
 1457                         }
 1458                 }
 1459         }
 1460
 1461         if (bp == NULL)
 1462                 panic("getnewbuf: null bp");
 1463
 1464 found:
 1465         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 1466                 panic("getnewbuf: le_prev is deadbeef");
 1467
 1468         if(ISSET(bp->b_flags, B_BUSY))
 1469                 panic("getnewbuf reusing BUSY buf");
 1470
 1471         /* Clean it */
 1472         if (bcleanbuf(bp)) {
 1473                 /* bcleanbuf() queued it as a delayed write; buffer not ready */
 1474                 splx(s);
 1475                 *queue = req;
 1476                 goto start;
 1477         }
 1478         splx(s);
 1479         return (bp);
 1480 }
1481 #include <mach/mach_types.h>
1482 #include <mach/memory_object_types.h>
1483
 1484 /*
 1485  * Clean a buffer.
 1486  * Returns 0 if the buffer is ready to use,
 1487  * Returns 1 if the buffer was a delayed write and has been queued
 1488  * on BQ_LAUNDRY, to indicate that the buffer is not ready.
       *
       * In both cases the buffer is taken off its free list and marked
       * B_BUSY.  On the 0 path it is also released from its vnode and hash
       * chain, a B_META data area is freed, and the header fields and
       * credentials are reset so that only B_BUSY remains in b_flags.
 1489  */
 1490 int
 1491 bcleanbuf(struct buf *bp)
 1492 {
 1493         int s;
 1494         struct ucred *cred;
 1495
 1496         s = splbio();
 1497
 1498         /* Remove from the queue */
 1499         bremfree(bp);
 1500
 1501         /* Buffer is no longer on free lists. */
 1502         SET(bp->b_flags, B_BUSY);
 1503
 1504         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 1505                 panic("bcleanbuf: le_prev is deadbeef");
 1506
 1507         /*
 1508          * If buffer was a delayed write, start the IO by queuing
 1509          * it on the LAUNDRY queue, and return 1
 1510          */
 1511         if (ISSET(bp->b_flags, B_DELWRI)) {
 1512                 splx(s);
 1513                 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
 1514                 blaundrycnt++;
                      /* wake whoever sleeps on blaundrycnt (presumably the laundry thread) */
 1515                 wakeup(&blaundrycnt);
 1516                 return (1);
 1517         }
 1518
              /* drop the vnode association and the hash chain entry */
 1519         if (bp->b_vp)
 1520                 brelvp(bp);
 1521         bremhash(bp);
 1522         BLISTNONE(bp);
 1523
 1524         splx(s);
 1525
              /* only B_META buffers have their data area freed here */
 1526         if (ISSET(bp->b_flags, B_META)) {
 1527 #if ZALLOC_METADATA
 1528                 vm_offset_t elem = (vm_offset_t)bp->b_data;
 1529                 if (elem == 0)
 1530                         panic("bcleanbuf: NULL bp->b_data B_META buffer");
 1531
                      /*
                       * B_ZALLOC areas go back to their zone, the rest to
                       * kernel_map.  b_data is set to the 0xdeadbeef marker
                       * before the free and reset to 0 further down.
                       */
 1532                 if (ISSET(bp->b_flags, B_ZALLOC)) {
 1533                         if (bp->b_bufsize <= MAXMETA) {
 1534                                 zone_t z;
 1535
 1536                                 z = getbufzone(bp->b_bufsize);
 1537                                 bp->b_data = (caddr_t)0xdeadbeef;
 1538                                 zfree(z, elem);
 1539                                 CLR(bp->b_flags, B_ZALLOC);
 1540                         } else
 1541                                 panic("bcleanbuf: B_ZALLOC set incorrectly");
 1542                 } else {
 1543                         bp->b_data = (caddr_t)0xdeadbeef;
 1544                         kmem_free(kernel_map, elem, bp->b_bufsize);
 1545                 }
 1546 #else
 1547            if (bp->b_data == 0)
 1548                    panic("bcleanbuf: bp->b_data == NULL for B_META buffer");
 1549
 1550            kmem_free(kernel_map, bp->b_data, bp->b_bufsize);
 1551 #endif /* ZALLOC_METADATA */
 1552         }
 1553
              /* traced before b_bufsize and b_lblkno are reset below */
 1554         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
 1555
 1556         /* the vnode, if we had one, was already released by brelvp() above */
 1557         s = splbio();
 1558
 1559         /* clear out various other fields */
 1560         bp->b_bufsize = 0;
 1561         bp->b_data = 0;
 1562         bp->b_flags = B_BUSY;
 1563         bp->b_dev = NODEV;
 1564         bp->b_blkno = bp->b_lblkno = 0;
 1565         bp->b_iodone = 0;
 1566         bp->b_error = 0;
 1567         bp->b_resid = 0;
 1568         bp->b_bcount = 0;
 1569         bp->b_dirtyoff = bp->b_dirtyend = 0;
 1570         bp->b_validoff = bp->b_validend = 0;
 1571
 1572         /* nuke any credentials we were holding */
 1573         cred = bp->b_rcred;
 1574         if (cred != NOCRED) {
 1575                 bp->b_rcred = NOCRED;
 1576                 crfree(cred);
 1577         }
 1578         cred = bp->b_wcred;
 1579         if (cred != NOCRED) {
 1580                 bp->b_wcred = NOCRED;
 1581                 crfree(cred);
 1582         }
 1583         splx(s);
 1584         return (0);
 1585 }
1586
1587
1588 /*
1589  * Wait for operations on the buffer to complete.
1590  * When they do, extract and return the I/O's error value.
1591  */
1592 int
1593 biowait(bp)
1594         struct buf *bp;
1595 {
1596         upl_t           upl;
1597         upl_page_info_t *pl;
1598         int s;
1599         kern_return_t kret;
1600
1601         s = splbio();
1602         while (!ISSET(bp->b_flags, B_DONE))
1603                 tsleep(bp, PRIBIO + 1, "biowait", 0);
1604         splx(s);
1605
1606         /* check for interruption of I/O (e.g. via NFS), then errors. */
1607         if (ISSET(bp->b_flags, B_EINTR)) {
1608                 CLR(bp->b_flags, B_EINTR);
1609                 return (EINTR);
1610         } else if (ISSET(bp->b_flags, B_ERROR))
1611                 return (bp->b_error ? bp->b_error : EIO);
1612         else
1613                 return (0);
1614 }
1615
1616 /*
1617  * Mark I/O complete on a buffer.
1618  *
1619  * If a callback has been requested, e.g. the pageout
1620  * daemon, do so. Otherwise, awaken waiting processes.
1621  *
1622  * [ Leffler, et al., says on p.247:
1623  *      "This routine wakes up the blocked process, frees the buffer
1624  *      for an asynchronous write, or, for a request by the pagedaemon
1625  *      process, invokes a procedure specified in the buffer structure" ]
1626  *
1627  * In real life, the pagedaemon (or other system processes) wants
1628  * to do async stuff to, and doesn't want the buffer brelse()'d.
1629  * (for swap pager, that puts swap buffers on the free lists (!!!),
1630  * for the vn device, that puts malloc'd buffers on the free lists!)
1631  */
1632 void
1633 biodone(bp)
1634         struct buf *bp;
1635 {
1636         boolean_t       funnel_state;
1637         int s;
1638
1639         funnel_state = thread_funnel_set(kernel_flock, TRUE);
1640
1641         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1642                      bp, bp->b_data, bp->b_flags, 0, 0);
1643
1644         if (ISSET(bp->b_flags, B_DONE))
1645                 panic("biodone already");
1646         SET(bp->b_flags, B_DONE);               /* note that it's done */
1647         /*
1648          * I/O was done, so don't believe
1649          * the DIRTY state from VM anymore
1650          */
1651         CLR(bp->b_flags, B_WASDIRTY);
1652
1653         if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1654                 vwakeup(bp);     /* wake up reader */
1655
1656         if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
1657                 CLR(bp->b_flags, B_CALL);       /* but note callout done */
1658                 (*bp->b_iodone)(bp);
1659         } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1660                 brelse(bp);
1661         else {                                  /* or just wakeup the buffer */
1662                 CLR(bp->b_flags, B_WANTED);
1663                 wakeup(bp);
1664         }
1665
1666         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1667                      bp, bp->b_data, bp->b_flags, 0, 0);
1668
1669         thread_funnel_set(kernel_flock, funnel_state);
1670 }
1671
1672 /*
1673  * Return a count of buffers on the "locked" queue.
1674  */
1675 int
1676 count_lock_queue()
1677 {
1678         register struct buf *bp;
1679         register int n = 0;
1680
1681         for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1682             bp = bp->b_freelist.tqe_next)
1683                 n++;
1684         return (n);
1685 }
1686
1687 /*
1688  * Return a count of 'busy' buffers. Used at the time of shutdown.
1689  */
1690 int
1691 count_busy_buffers()
1692 {
1693         register struct buf *bp;
1694         register int nbusy = 0;
1695
1696         for (bp = &buf[nbuf]; --bp >= buf; )
1697                 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1698                         nbusy++;
1699         return (nbusy);
1700 }
1701
1702 #if 1 /*DIAGNOSTIC */
1703 /*
1704  * Print out statistics on the current allocation of the buffer pool.
1705  * Can be enabled to print out on every ``sync'' by setting "syncprt"
1706  * in vfs_syscalls.c using sysctl.
1707  */
1708 void
1709 vfs_bufstats()
1710 {
1711         int s, i, j, count;
1712         register struct buf *bp;
1713         register struct bqueues *dp;
1714         int counts[MAXBSIZE/CLBYTES+1];
1715         static char *bname[BQUEUES] =
1716                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1717
1718         for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1719                 count = 0;
1720                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1721                         counts[j] = 0;
1722                 s = splbio();
1723                 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1724                         counts[bp->b_bufsize/CLBYTES]++;
1725                         count++;
1726                 }
1727                 splx(s);
1728                 printf("%s: total-%d", bname[i], count);
1729                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1730                         if (counts[j] != 0)
1731                                 printf(", %d-%d", j * CLBYTES, counts[j]);
1732                 printf("\n");
1733         }
1734 }
1735 #endif /* DIAGNOSTIC */
1736
1737 #define NRESERVEDIOBUFS 16
1738
1739 struct buf *
1740 alloc_io_buf(vp, priv)
1741         struct vnode *vp;
1742         int priv;
1743 {
1744         register struct buf *bp;
1745         int s;
1746
1747         s = splbio();
1748
1749         while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1750                 need_iobuffer = 1;
1751                 bufstats.bufs_iobufsleeps++;
1752                 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1753         }
1754
1755         while ((bp = iobufqueue.tqh_first) == NULL) {
1756                 need_iobuffer = 1;
1757                 bufstats.bufs_iobufsleeps++;
1758                 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1759         }
1760
1761         TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1762         bp->b_timestamp = 0;
1763
1764         /* clear out various fields */
1765         bp->b_flags = B_BUSY;
1766         bp->b_blkno = bp->b_lblkno = 0;
1767         bp->b_iodone = 0;
1768         bp->b_error = 0;
1769         bp->b_resid = 0;
1770         bp->b_bcount = 0;
1771         bp->b_bufsize = 0;
1772         bp->b_vp = vp;
1773
1774         if (vp->v_type == VBLK || vp->v_type == VCHR)
1775                 bp->b_dev = vp->v_rdev;
1776         else
1777                 bp->b_dev = NODEV;
1778         bufstats.bufs_iobufinuse++;
1779         if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1780                 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1781         splx(s);
1782
1783         return (bp);
1784 }
1785
1786 void
1787 free_io_buf(bp)
1788         struct buf *bp;
1789 {
1790         int s;
1791
1792         s = splbio();
1793         /* put buffer back on the head of the iobufqueue */
1794         bp->b_vp = NULL;
1795         bp->b_flags = B_INVAL;
1796
1797         binsheadfree(bp, &iobufqueue, -1);
1798
1799         /* Wake up any processes waiting for any buffer to become free. */
1800         if (need_iobuffer) {
1801                 need_iobuffer = 0;
1802                 wakeup(&need_iobuffer);
1803         }
1804         bufstats.bufs_iobufinuse--;
1805         splx(s);
1806 }
1807
1808
/* not hooked up yet */
1810
1811 /* XXX move this to a separate file */
1812 /*
1813  * Dynamic Scaling of the Buffer Queues
1814  */
1815
/* Type used for every buffer-header count and limit below. */
typedef long long blsize_t;

/* initialize to (mem_size / PAGE_SIZE) */
blsize_t MAXNBUF;

/* Global tunable limits */

/* number of buffer headers */
blsize_t nbufh;

/* minimum number of buffer headers required */
blsize_t nbuflow;

/* maximum number of buffer headers allowed */
blsize_t nbufhigh;

/* preferred number of buffer headers */
blsize_t nbuftarget;
1824
1825 /*
1826  * assertions:
1827  *
1828  * 1.   0 < nbuflow <= nbufh <= nbufhigh
1829  * 2.   nbufhigh <= MAXNBUF
1830  * 3.   0 < nbuflow <= nbuftarget <= nbufhigh
1831  * 4.   nbufh can not be set by sysctl().
1832  */
1833
1834 /* Per queue tunable limits */
1835
1836 struct bufqlim {
1837         blsize_t        bl_nlow;        /* minimum number of buffer headers required */
1838         blsize_t        bl_num;         /* number of buffer headers on the queue */
1839         blsize_t        bl_nlhigh;      /* maximum number of buffer headers allowed */
1840         blsize_t        bl_target;      /* preferred number of buffer headers */
1841         long    bl_stale;       /* Seconds after which a buffer is considered stale */
1842 } bufqlim[BQUEUES];
1843
/*
 * assertions:
 *
 * 1.   0 <= bl_nlow <= bl_num <= bl_nlhigh
 * 2.   bl_nlhigh <= MAXNBUF
 * 3.   bufqlim[BQ_META].bl_nlow != 0
 * 4.   bufqlim[BQ_META].bl_nlow > (number of possible concurrent
 *                                  file system IO operations)
 * 5.   bl_num can not be set by sysctl().
 * 6.   bl_nlhigh <= nbufhigh
 */
1855
/*
 * Rationale:
 * ----------
 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
 * which can describe (2^31 * PAGE_SIZE) memory per queue.
 *
 * These limits are exported by means of sysctl().
 * It was decided to define blsize_t as a 64 bit quantity.
 * This will make sure that we will not be required to change it
 * as long as we do not exceed 64 bit address space for the kernel.
 *
 * The low and high parameters are initialized at compile time,
 * and boot arguments can be used to override them. sysctl()
 * would not change those values. sysctl() can get all the values
 * but can set only target. num is the current level.
 *
 * Advantages of having a "bufqscan" thread doing the balancing are:
 * Keep enough bufs on BQ_EMPTY.
 *      getnewbuf() by default will always select a buffer from the BQ_EMPTY.
 *              getnewbuf() performs best if a buffer was found there.
 *              Also this minimizes the possibility of starting IO
 *              from getnewbuf(). That's a performance win, too.
 *
 *      Localize complex logic [balancing as well as time aging]
 *              to balancebufq().
 *
 *      Simplify getnewbuf() logic by elimination of time aging code.
 */
1884
/*
 * Algorithm:
 * -----------
 * The goal of the dynamic scaling of the buffer queues is to keep
 * the size of the LRU close to bl_target. Buffers on a queue would
 * be time aged.
 *
 * There would be a thread which will be responsible for "balancing"
 * the buffer cache queues.
 *
 * The scan order would be:     AGE, LRU, META, EMPTY.
 */
1897
/*
 * Bumped by bufq_balance_thread_init() to detect its first call;
 * its address is also the channel the balancing thread sleeps on.
 */
long bufqscanwait = 0;

/* Forward declarations for the queue balancing code below */
void bufqscan_thread();
int initbufqscan(void);
int nextbufq(int q);
int balancebufq(int q);
int btrimempty(int n);
void buqlimprt(int all);
1906
1907 void
1908 bufq_balance_thread_init()
1909 {
1910
1911         if (bufqscanwait++ == 0) {
1912                 int i;
1913
1914                 /* Initalize globals */
1915                 MAXNBUF = (mem_size / PAGE_SIZE);
1916                 nbufh = nbuf;
1917                 nbuflow = min(nbufh, 100);
1918                 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
1919                 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
1920                 nbuftarget = max(nbuflow, nbuftarget);
1921                 nbuftarget = min(nbufhigh, nbuftarget);
1922
1923                 /*
1924                  * Initialize the bufqlim
1925                  */
1926
1927                 /* LOCKED queue */
1928                 bufqlim[BQ_LOCKED].bl_nlow = 0;
1929                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
1930                 bufqlim[BQ_LOCKED].bl_target = 0;
1931                 bufqlim[BQ_LOCKED].bl_stale = 30;
1932
1933                 /* LRU queue */
1934                 bufqlim[BQ_LRU].bl_nlow = 0;
1935                 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
1936                 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
1937                 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
1938
1939                 /* AGE queue */
1940                 bufqlim[BQ_AGE].bl_nlow = 0;
1941                 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
1942                 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
1943                 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
1944
1945                 /* EMPTY queue */
1946                 bufqlim[BQ_EMPTY].bl_nlow = 0;
1947                 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
1948                 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
1949                 bufqlim[BQ_EMPTY].bl_stale = 600000;
1950
1951                 /* META queue */
1952                 bufqlim[BQ_META].bl_nlow = 0;
1953                 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
1954                 bufqlim[BQ_META].bl_target = nbuftarget/4;
1955                 bufqlim[BQ_META].bl_stale = META_IS_STALE;
1956
1957                 /* LAUNDRY queue */
1958                 bufqlim[BQ_LOCKED].bl_nlow = 0;
1959                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
1960                 bufqlim[BQ_LOCKED].bl_target = 0;
1961                 bufqlim[BQ_LOCKED].bl_stale = 30;
1962
1963                 buqlimprt(1);
1964         }
1965
1966         /* create worker thread */
1967         kernel_thread(kernel_task, bufqscan_thread);
1968 }
1969
1970 /* The workloop for the buffer balancing thread */
1971 void
1972 bufqscan_thread()
1973 {
1974         boolean_t       funnel_state;
1975         int moretodo = 0;
1976
1977         funnel_state = thread_funnel_set(kernel_flock, TRUE);
1978
1979         for(;;) {
1980                 do {
1981                         int q;  /* buffer queue to process */
1982
1983                         for (q = initbufqscan(); q; ) {
1984                                 moretodo |= balancebufq(q);
1985                                 q = nextbufq(q);
1986                         }
1987                 } while (moretodo);
1988
1989 #if 1 || DIAGNOSTIC
1990                 vfs_bufstats();
1991                 buqlimprt(0);
1992 #endif
1993                 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
1994                 moretodo = 0;
1995         }
1996
1997         (void) thread_funnel_set(kernel_flock, FALSE);
1998 }
1999
2000 /* Seed for the buffer queue balancing */
2001 int
2002 initbufqscan()
2003 {
2004         /* Start with AGE queue */
2005         return (BQ_AGE);
2006 }
2007
2008 /* Pick next buffer queue to balance */
2009 int
2010 nextbufq(int q)
2011 {
2012         int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2013
2014         q++;
2015         q %= sizeof(order);
2016         return (order[q]);
2017 }
2018
2019 /* function to balance the buffer queues */
2020 int
2021 balancebufq(int q)
2022 {
2023         int moretodo = 0;
2024         int s = splbio();
2025         int n;
2026
2027         /* reject invalid q */
2028         if ((q < 0) || (q >= BQUEUES))
2029                 goto out;
2030
2031         /* LOCKED or LAUNDRY queue MUST not be balanced */
2032         if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2033                 goto out;
2034
2035         n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2036
2037         /* If queue has less than target nothing more to do */
2038         if (n < 0)
2039                 goto out;
2040
2041         if ( n > 8 ) {
2042                 /* Balance only a small amount (12.5%) at a time */
2043                 n >>= 3;
2044         }
2045
2046         /* EMPTY queue needs special handling */
2047         if (q == BQ_EMPTY) {
2048                 moretodo |= btrimempty(n);
2049                 goto out;
2050         }
2051
2052         for (; n > 0; n--) {
2053                 struct buf *bp = bufqueues[q].tqh_first;
2054                 if (!bp)
2055                         break;
2056
2057                 /* check if it's stale */
2058                 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2059                         if (bcleanbuf(bp)) {
2060                                 /* bawrite() issued, bp not ready */
2061                                 moretodo = 1;
2062                         } else {
2063                                 /* release the cleaned buffer to BQ_EMPTY */
2064                                 SET(bp->b_flags, B_INVAL);
2065                                 brelse(bp);
2066                         }
2067                 } else
2068                         break;
2069         }
2070
2071 out:
2072         splx(s);
2073         return (moretodo);
2074 }
2075
/*
 * Placeholder for trimming the empty queue.
 *
 * When struct buf are allocated dynamically, this would
 * reclaim up to 'n' struct buf from the empty queue.
 * For now nothing is reclaimed and 0 is always returned.
 */
int
btrimempty(int n)
{
        int moretodo = 0;

        (void)n;        /* not used until reclaiming is implemented */

        return (moretodo);
}
2086
2087 void
2088 bufqinc(int q)
2089 {
2090         if ((q < 0) || (q >= BQUEUES))
2091                 return;
2092
2093         bufqlim[q].bl_num++;
2094         return;
2095 }
2096
2097 void
2098 bufqdec(int q)
2099 {
2100         if ((q < 0) || (q >= BQUEUES))
2101                 return;
2102
2103         bufqlim[q].bl_num--;
2104         return;
2105 }
2106
2107 void
2108 buqlimprt(int all)
2109 {
2110         int i;
2111     static char *bname[BQUEUES] =
2112                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2113
2114         if (all)
2115                 for (i = 0; i < BQUEUES; i++) {
2116                         printf("%s : ", bname[i]);
2117                         printf("min = %d, ", (long)bufqlim[i].bl_nlow);
2118                         printf("cur = %d, ", (long)bufqlim[i].bl_num);
2119                         printf("max = %d, ", (long)bufqlim[i].bl_nlhigh);
2120                         printf("target = %d, ", (long)bufqlim[i].bl_target);
2121                         printf("stale after %d seconds\n", bufqlim[i].bl_stale);
2122                 }
2123         else
2124                 for (i = 0; i < BQUEUES; i++) {
2125                         printf("%s : ", bname[i]);
2126                         printf("cur = %d, ", (long)bufqlim[i].bl_num);
2127                 }
2128 }
2129
/*
 * If getnewbuf() calls bcleanbuf() on the same thread
 * there is a potential for stack overrun and deadlocks.
 * So we always hand off the work to a worker thread for completion.
 */
2135
2136 static void
2137 bcleanbuf_thread_init()
2138 {
2139         static void bcleanbuf_thread();
2140
2141         /* create worker thread */
2142         kernel_thread(kernel_task, bcleanbuf_thread);
2143 }
2144
2145 static void
2146 bcleanbuf_thread()
2147 {
2148         boolean_t       funnel_state;
2149         struct buf *bp;
2150
2151         funnel_state = thread_funnel_set(kernel_flock, TRUE);
2152
2153 doit:
2154         while (blaundrycnt == 0)
2155                 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2156         bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2157         /* Remove from the queue */
2158         bremfree(bp);
2159         blaundrycnt--;
2160         /* do the IO */
2161         bawrite(bp);
2162         /* start again */
2163         goto doit;
2164
2165         (void) thread_funnel_set(kernel_flock, funnel_state);
2166 }
2167