bsd/vfs/vfs_bio.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*-
  24  * Copyright (c) 1994 Christopher G. Demetriou
  25  * Copyright (c) 1982, 1986, 1989, 1993
  26  *      The Regents of the University of California.  All rights reserved.
  27  * (c) UNIX System Laboratories, Inc.
  28  * All or some portions of this file are derived from material licensed
  29  * to the University of California by American Telephone and Telegraph
  30  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  31  * the permission of UNIX System Laboratories, Inc.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  * The NEXTSTEP Software License Agreement specifies the terms
  62  * and conditions for redistribution.
  63  *
  64  *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
  65  */
  66
  67 /*
  68  * Some references:
  69  *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
  70  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
   71  *              UNIX Operating System (Addison-Wesley, 1989)
  72  */
  73 #define ZALLOC_METADATA 1
  74
  75 #include <sys/param.h>
  76 #include <sys/systm.h>
  77 #include <sys/proc.h>
  78 #include <sys/buf.h>
  79 #include <sys/vnode.h>
  80 #include <sys/mount.h>
  81 #include <sys/trace.h>
  82 #include <sys/malloc.h>
  83 #include <sys/resourcevar.h>
  84 #include <miscfs/specfs/specdev.h>
  85 #include <sys/ubc.h>
  86 #include <vm/vm_pageout.h>
  87 #if DIAGNOSTIC
  88 #include <kern/assert.h>
  89 #endif /* DIAGNOSTIC */
  90 #include <kern/task.h>
  91 #include <kern/zalloc.h>
  92
  93 #include <sys/kdebug.h>
  94
  95 extern void bufqinc(int q);
  96 extern void bufqdec(int q);
  97 extern void bufq_balance_thread_init();
  98
  99 extern void reassignbuf(struct buf *, struct vnode *);
 100 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
 101
 102 extern int niobuf;              /* The number of IO buffer headers for cluster IO */
 103
 104 #if TRACE
 105 struct  proc *traceproc;
 106 int     tracewhich, tracebuf[TRCSIZ];
 107 u_int   tracex;
 108 char    traceflags[TR_NFLAGS];
 109 #endif /* TRACE */
 110
 111 /*
 112  * Definitions for the buffer hash lists.
 113  */
 114 #define BUFHASH(dvp, lbn)       \
 115         (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
 116 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
 117 u_long  bufhash;
 118
 119 /* Definitions for the buffer stats. */
 120 struct bufstats bufstats;
 121
 122 /*
 123  * Insq/Remq for the buffer hash lists.
 124  */
 125 #if 0
 126 #define binshash(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_hash)
 127 #define bremhash(bp)            LIST_REMOVE(bp, b_hash)
 128 #endif /* 0 */
 129
 130
 131 TAILQ_HEAD(ioqueue, buf) iobufqueue;
 132 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
 133 int needbuffer;
 134 int need_iobuffer;
 135
 136 /*
 137  * Insq/Remq for the buffer free lists.
 138  */
 139 #define binsheadfree(bp, dp, whichq)    do { \
 140                                     TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
 141                                         bufqinc((whichq));      \
 142                                         (bp)->b_whichq = whichq; \
 143                                     (bp)->b_timestamp = time.tv_sec; \
 144                                 } while (0)
 145
 146 #define binstailfree(bp, dp, whichq)    do { \
 147                                     TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
 148                                         bufqinc((whichq));      \
 149                                         (bp)->b_whichq = whichq; \
 150                                     (bp)->b_timestamp = time.tv_sec; \
 151                                 } while (0)
 152
 153 #define BHASHENTCHECK(bp)       \
 154         if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)  \
 155                 panic("%x: b_hash.le_prev is deadb", (bp));
 156
 157 #define BLISTNONE(bp)   \
 158         (bp)->b_hash.le_next = (struct buf *)0; \
 159         (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
 160
 161 simple_lock_data_t bufhashlist_slock;           /* lock on buffer hash list */
 162
 163 /*
 164  * Time in seconds before a buffer on a list is
 165  * considered as a stale buffer
 166  */
 167 #define LRU_IS_STALE 120 /* default value for the LRU */
 168 #define AGE_IS_STALE 60  /* default value for the AGE */
 169 #define META_IS_STALE 180 /* default value for the BQ_META */
 170
 171 int lru_is_stale = LRU_IS_STALE;
 172 int age_is_stale = AGE_IS_STALE;
 173 int meta_is_stale = META_IS_STALE;
 174
 175 #if 1
 176 void
 177 blistenterhead(struct bufhashhdr * head, struct buf * bp)
 178 {
 179         if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
 180                 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
 181         (head)->lh_first = bp;
 182         bp->b_hash.le_prev = &(head)->lh_first;
 183         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 184                 panic("blistenterhead: le_prev is deadbeef");
 185
 186 }
 187 #endif
 188
 189 #if 1
 190 void
 191 binshash(struct buf *bp, struct bufhashhdr *dp)
 192 {
 193 int s;
 194
 195 struct buf *nbp;
 196
 197         simple_lock(&bufhashlist_slock);
 198 #if 0
 199         if(incore(bp->b_vp, bp->b_lblkno)) {
 200                 panic("adding to queue already existing element");
 201         }
 202 #endif /* 0 */
 203         BHASHENTCHECK(bp);
 204
 205         nbp = dp->lh_first;
 206         for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
 207                 if(nbp == bp)
 208                         panic("buf already in hashlist");
 209         }
 210
 211 #if 0
 212         LIST_INSERT_HEAD(dp, bp, b_hash);
 213 #else
 214         blistenterhead(dp, bp);
 215 #endif
 216         simple_unlock(&bufhashlist_slock);
 217 }
 218
 219 void
 220 bremhash(struct buf *bp)
 221 {
 222         int s;
 223
 224         simple_lock(&bufhashlist_slock);
 225         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 226                 panic("bremhash le_prev is deadbeef");
 227         if (bp->b_hash.le_next == bp)
 228                 panic("bremhash: next points to self");
 229
 230         if (bp->b_hash.le_next != NULL)
 231                 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
 232         *bp->b_hash.le_prev = (bp)->b_hash.le_next;
 233         simple_unlock(&bufhashlist_slock);
 234 }
 235
 236 #endif /* 1 */
 237
 238
 239 /*
 240  * Remove a buffer from the free list it's on
 241  */
 242 void
 243 bremfree(bp)
 244         struct buf *bp;
 245 {
 246         struct bqueues *dp = NULL;
 247         int whichq = -1;
 248
 249         /*
 250          * We only calculate the head of the freelist when removing
 251          * the last element of the list as that is the only time that
 252          * it is needed (e.g. to reset the tail pointer).
 253          *
 254          * NB: This makes an assumption about how tailq's are implemented.
 255          */
 256         if (bp->b_freelist.tqe_next == NULL) {
 257                 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
 258                         if (dp->tqh_last == &bp->b_freelist.tqe_next)
 259                                 break;
 260                 if (dp == &bufqueues[BQUEUES])
 261                         panic("bremfree: lost tail");
 262         }
 263         TAILQ_REMOVE(dp, bp, b_freelist);
 264         whichq = bp->b_whichq;
 265         bufqdec(whichq);
 266         bp->b_whichq = -1;
 267         bp->b_timestamp = 0;
 268 }
 269
 270 /*
 271  * Initialize buffers and hash links for buffers.
 272  */
 273 void
 274 bufinit()
 275 {
 276         register struct buf *bp;
 277         register struct bqueues *dp;
 278         register int i;
 279         int metabuf;
 280         long whichq;
 281 #if ZALLOC_METADATA
 282         static void bufzoneinit();
 283 #endif /* ZALLOC_METADATA */
 284
 285         /* Initialize the buffer queues ('freelists') and the hash table */
 286         for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
 287                 TAILQ_INIT(dp);
 288         bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
 289
 290         simple_lock_init(&bufhashlist_slock );
 291
 292         metabuf = nbuf/8; /* reserved for meta buf */
 293
 294         /* Initialize the buffer headers */
 295         for (i = 0; i < nbuf; i++) {
 296                 bp = &buf[i];
 297                 bzero((char *)bp, sizeof *bp);
 298                 bp->b_dev = NODEV;
 299                 bp->b_rcred = NOCRED;
 300                 bp->b_wcred = NOCRED;
 301                 bp->b_vnbufs.le_next = NOLIST;
 302                 bp->b_flags = B_INVAL;
 303                 /*
 304                  * metabuf buffer headers on the meta-data list and
 305                  * rest of the buffer headers on the empty list
 306                  */
 307                 if (--metabuf )
 308                         whichq = BQ_META;
 309                 else
 310                         whichq = BQ_EMPTY;
 311
 312                 BLISTNONE(bp);
 313                 dp = &bufqueues[whichq];
 314                 binsheadfree(bp, dp, whichq);
 315                 binshash(bp, &invalhash);
 316         }
 317
 318         for (; i < nbuf + niobuf; i++) {
 319                 bp = &buf[i];
 320                 bzero((char *)bp, sizeof *bp);
 321                 bp->b_dev = NODEV;
 322                 bp->b_rcred = NOCRED;
 323                 bp->b_wcred = NOCRED;
 324                 bp->b_vnbufs.le_next = NOLIST;
 325                 bp->b_flags = B_INVAL;
 326                 binsheadfree(bp, &iobufqueue, -1);
 327         }
 328
 329         printf("using %d buffer headers and %d cluster IO buffer headers\n",
 330                 nbuf, niobuf);
 331
 332 #if ZALLOC_METADATA
 333         /* Set up zones for meta-data */
 334         bufzoneinit();
 335 #endif
 336
 337 #if XXX
 338         /* create a thread to do dynamic buffer queue balancing */
 339         bufq_balance_thread_init();
 340 #endif /* XXX */
 341 }
 342
 343 /* __inline  */
 344 struct buf *
 345 bio_doread(vp, blkno, size, cred, async, queuetype)
 346         struct vnode *vp;
 347         daddr_t blkno;
 348         int size;
 349         struct ucred *cred;
 350         int async;
 351         int queuetype;
 352 {
 353         register struct buf *bp;
 354         struct proc     *p = current_proc();
 355
 356         bp = getblk(vp, blkno, size, 0, 0, queuetype);
 357
 358         /*
 359          * If buffer does not have data valid, start a read.
 360          * Note that if buffer is B_INVAL, getblk() won't return it.
 361          * Therefore, it's valid if it's I/O has completed or been delayed.
 362          */
 363         if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
 364                 /* Start I/O for the buffer (keeping credentials). */
 365                 SET(bp->b_flags, B_READ | async);
 366                 if (cred != NOCRED && bp->b_rcred == NOCRED) {
 367                         /*
 368                          * NFS has embedded ucred.
 369                          * Can not crhold() here as that causes zone corruption
 370                          */
 371                         bp->b_rcred = crdup(cred);
 372                 }
 373                 VOP_STRATEGY(bp);
 374
 375                 trace(TR_BREADMISS, pack(vp, size), blkno);
 376
 377                 /* Pay for the read. */
 378                 if (p && p->p_stats)
 379                         p->p_stats->p_ru.ru_inblock++;          /* XXX */
 380         } else if (async) {
 381                 brelse(bp);
 382         }
 383
 384         trace(TR_BREADHIT, pack(vp, size), blkno);
 385
 386         return (bp);
 387 }
 388 /*
 389  * Read a disk block.
 390  * This algorithm described in Bach (p.54).
 391  */
 392 int
 393 bread(vp, blkno, size, cred, bpp)
 394         struct vnode *vp;
 395         daddr_t blkno;
 396         int size;
 397         struct ucred *cred;
 398         struct buf **bpp;
 399 {
 400         register struct buf *bp;
 401
 402         /* Get buffer for block. */
 403         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
 404
 405         /* Wait for the read to complete, and return result. */
 406         return (biowait(bp));
 407 }
 408
 409 /*
 410  * Read a disk block. [bread() for meta-data]
 411  * This algorithm described in Bach (p.54).
 412  */
 413 int
 414 meta_bread(vp, blkno, size, cred, bpp)
 415         struct vnode *vp;
 416         daddr_t blkno;
 417         int size;
 418         struct ucred *cred;
 419         struct buf **bpp;
 420 {
 421         register struct buf *bp;
 422
 423         /* Get buffer for block. */
 424         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
 425
 426         /* Wait for the read to complete, and return result. */
 427         return (biowait(bp));
 428 }
 429
 430 /*
 431  * Read-ahead multiple disk blocks. The first is sync, the rest async.
 432  * Trivial modification to the breada algorithm presented in Bach (p.55).
 433  */
 434 int
 435 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
 436         struct vnode *vp;
 437         daddr_t blkno; int size;
 438         daddr_t rablks[]; int rasizes[];
 439         int nrablks;
 440         struct ucred *cred;
 441         struct buf **bpp;
 442 {
 443         register struct buf *bp;
 444         int i;
 445
 446         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
 447
 448         /*
 449          * For each of the read-ahead blocks, start a read, if necessary.
 450          */
 451         for (i = 0; i < nrablks; i++) {
 452                 /* If it's in the cache, just go on to next one. */
 453                 if (incore(vp, rablks[i]))
 454                         continue;
 455
 456                 /* Get a buffer for the read-ahead block */
 457                 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
 458         }
 459
 460         /* Otherwise, we had to start a read for it; wait until it's valid. */
 461         return (biowait(bp));
 462 }
 463
 464 /*
 465  * Read with single-block read-ahead.  Defined in Bach (p.55), but
 466  * implemented as a call to breadn().
 467  * XXX for compatibility with old file systems.
 468  */
 469 int
 470 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
 471         struct vnode *vp;
 472         daddr_t blkno; int size;
 473         daddr_t rablkno; int rabsize;
 474         struct ucred *cred;
 475         struct buf **bpp;
 476 {
 477
 478         return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
 479 }
 480
 481 /*
 482  * Block write.  Described in Bach (p.56)
 483  */
 484 int
 485 bwrite(bp)
 486         struct buf *bp;
 487 {
 488         int rv, sync, wasdelayed;
 489         struct proc     *p = current_proc();
 490         upl_t  upl;
 491         upl_page_info_t *pl;
 492         void * object;
 493         kern_return_t kret;
 494         struct vnode *vp = bp->b_vp;
 495
 496         /* Remember buffer type, to switch on it later. */
 497         sync = !ISSET(bp->b_flags, B_ASYNC);
 498         wasdelayed = ISSET(bp->b_flags, B_DELWRI);
 499         CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
 500
 501         if (!sync) {
 502                 /*
 503                  * If not synchronous, pay for the I/O operation and make
 504                  * sure the buf is on the correct vnode queue.  We have
 505                  * to do this now, because if we don't, the vnode may not
 506                  * be properly notified that its I/O has completed.
 507                  */
 508                 if (wasdelayed)
 509                         reassignbuf(bp, vp);
 510                 else
 511                 if (p && p->p_stats)
 512                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 513         }
 514
 515         trace(TR_BWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
 516
 517         /* Initiate disk write.  Make sure the appropriate party is charged. */
 518         SET(bp->b_flags, B_WRITEINPROG);
 519         vp->v_numoutput++;
 520
 521         VOP_STRATEGY(bp);
 522
 523         if (sync) {
 524                 /*
 525                  * If I/O was synchronous, wait for it to complete.
 526                  */
 527                 rv = biowait(bp);
 528
 529                 /*
 530                  * Pay for the I/O operation, if it's not been paid for, and
 531                  * make sure it's on the correct vnode queue. (async operatings
 532                  * were payed for above.)
 533                  */
 534                 if (wasdelayed)
 535                         reassignbuf(bp, vp);
 536                 else
 537                 if (p && p->p_stats)
 538                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 539
 540                 /* Release the buffer. */
 541                 brelse(bp);
 542
 543                 return (rv);
 544         } else {
 545                 return (0);
 546         }
 547 }
 548
 549 int
 550 vn_bwrite(ap)
 551         struct vop_bwrite_args *ap;
 552 {
 553         return (bwrite(ap->a_bp));
 554 }
 555
 556 /*
 557  * Delayed write.
 558  *
 559  * The buffer is marked dirty, but is not queued for I/O.
 560  * This routine should be used when the buffer is expected
 561  * to be modified again soon, typically a small write that
 562  * partially fills a buffer.
 563  *
 564  * NB: magnetic tapes cannot be delayed; they must be
 565  * written in the order that the writes are requested.
 566  *
 567  * Described in Leffler, et al. (pp. 208-213).
 568  */
 569 void
 570 bdwrite(bp)
 571         struct buf *bp;
 572 {
 573         struct proc *p = current_proc();
 574         kern_return_t kret;
 575         upl_t upl;
 576         upl_page_info_t *pl;
 577
 578         /*
 579          * If the block hasn't been seen before:
 580          *      (1) Mark it as having been seen,
 581          *      (2) Charge for the write.
 582          *      (3) Make sure it's on its vnode's correct block list,
 583          */
 584         if (!ISSET(bp->b_flags, B_DELWRI)) {
 585                 SET(bp->b_flags, B_DELWRI);
 586                 if (p && p->p_stats)
 587                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 588
 589                 reassignbuf(bp, bp->b_vp);
 590         }
 591
 592
 593         /* If this is a tape block, write it the block now. */
 594         if (ISSET(bp->b_flags, B_TAPE)) {
 595                 /* bwrite(bp); */
 596         VOP_BWRITE(bp);
 597                 return;
 598         }
 599
 600         /* Otherwise, the "write" is done, so mark and release the buffer. */
 601         SET(bp->b_flags, B_DONE);
 602         brelse(bp);
 603 }
 604
 605 /*
 606  * Asynchronous block write; just an asynchronous bwrite().
 607  */
 608 void
 609 bawrite(bp)
 610         struct buf *bp;
 611 {
 612
 613         SET(bp->b_flags, B_ASYNC);
 614         VOP_BWRITE(bp);
 615 }
 616
 617 /*
 618  * Release a buffer on to the free lists.
 619  * Described in Bach (p. 46).
 620  */
 621 void
 622 brelse(bp)
 623         struct buf *bp;
 624 {
 625         struct bqueues *bufq;
 626         int s;
 627         long whichq;
 628
 629         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
 630                      bp->b_lblkno * PAGE_SIZE, bp, bp->b_data, bp->b_flags, 0);
 631
 632         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
 633
 634         /* IO is done. Cleanup the UPL state */
 635         if (!ISSET(bp->b_flags, B_META)
 636                 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
 637                 kern_return_t kret;
 638                 upl_t         upl;
 639                 int           upl_flags;
 640
 641                 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
 642                         if ( !ISSET(bp->b_flags, B_INVAL)) {
 643                                 kret = ubc_create_upl(bp->b_vp,
 644                                                                 ubc_blktooff(bp->b_vp, bp->b_lblkno),
 645                                                                 bp->b_bufsize,
 646                                                             &upl,
 647                                                                 NULL,
 648                                                                 UPL_PRECIOUS);
 649                                 if (kret != KERN_SUCCESS)
 650                                         panic("brelse: Failed to get pagelists");
 651 #ifdef  UBC_DEBUG
 652                                 upl_ubc_alias_set(upl, bp, 5);
 653 #endif /* UBC_DEBUG */
 654                         } else
 655                                 upl = (upl_t) 0;
 656                 } else {
 657                         upl = bp->b_pagelist;
 658                         kret = ubc_upl_unmap(upl);
 659
 660                         if (kret != KERN_SUCCESS)
 661                                 panic("kernel_upl_unmap failed");
 662                         bp->b_data = 0;
 663                 }
 664                 if (upl) {
 665                         if (bp->b_flags & (B_ERROR | B_INVAL)) {
 666                             if (bp->b_flags & (B_READ | B_INVAL))
 667                                         upl_flags = UPL_ABORT_DUMP_PAGES;
 668                                 else
 669                                         upl_flags = 0;
 670                                 ubc_upl_abort(upl, upl_flags);
 671                         } else {
 672                             if (ISSET(bp->b_flags, (B_DELWRI | B_WASDIRTY)))
 673                                         upl_flags = UPL_COMMIT_SET_DIRTY ;
 674                                 else
 675                                     upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
 676                                 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
 677                                         UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
 678                         }
 679                         s = splbio();
 680                         CLR(bp->b_flags, B_PAGELIST);
 681                         bp->b_pagelist = 0;
 682                         splx(s);
 683                 }
 684         } else {
 685                 if(ISSET(bp->b_flags, B_PAGELIST))
 686                         panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
 687         }
 688
 689         /* Wake up any processes waiting for any buffer to become free. */
 690         if (needbuffer) {
 691                 needbuffer = 0;
 692                 wakeup(&needbuffer);
 693         }
 694
 695         /* Wake up any proceeses waiting for _this_ buffer to become free. */
 696         if (ISSET(bp->b_flags, B_WANTED)) {
 697                 CLR(bp->b_flags, B_WANTED);
 698                 wakeup(bp);
 699         }
 700
 701         /* Block disk interrupts. */
 702         s = splbio();
 703
 704         /*
 705          * Determine which queue the buffer should be on, then put it there.
 706          */
 707
 708         /* If it's locked, don't report an error; try again later. */
 709         if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
 710                 CLR(bp->b_flags, B_ERROR);
 711
 712         /* If it's not cacheable, or an error, mark it invalid. */
 713         if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
 714                 SET(bp->b_flags, B_INVAL);
 715
 716         if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
 717                 /*
 718                  * If it's invalid or empty, dissociate it from its vnode
 719                  * and put on the head of the appropriate queue.
 720                  */
 721                 if (bp->b_vp)
 722                         brelvp(bp);
 723                 CLR(bp->b_flags, B_DELWRI);
 724                 if (bp->b_bufsize <= 0)
 725                         whichq = BQ_EMPTY;      /* no data */
 726                 else
 727                         whichq = BQ_AGE;        /* invalid data */
 728
 729                 bufq = &bufqueues[whichq];
 730                 binsheadfree(bp, bufq, whichq);
 731         } else {
 732                 /*
 733                  * It has valid data.  Put it on the end of the appropriate
 734                  * queue, so that it'll stick around for as long as possible.
 735                  */
 736                 if (ISSET(bp->b_flags, B_LOCKED))
 737                         whichq = BQ_LOCKED;             /* locked in core */
 738                 else if (ISSET(bp->b_flags, B_META))
 739                         whichq = BQ_META;               /* meta-data */
 740                 else if (ISSET(bp->b_flags, B_AGE))
 741                         whichq = BQ_AGE;                /* stale but valid data */
 742                 else
 743                         whichq = BQ_LRU;                /* valid data */
 744
 745                 bufq = &bufqueues[whichq];
 746                 binstailfree(bp, bufq, whichq);
 747         }
 748
 749         /* Unlock the buffer. */
 750         CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
 751
 752         /* Allow disk interrupts. */
 753         splx(s);
 754
 755         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
 756                      bp, bp->b_data, bp->b_flags, 0, 0);
 757 }
 758
 759 /*
 760  * Determine if a block is in the cache.
 761  * Just look on what would be its hash chain.  If it's there, return
 762  * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 763  * we normally don't return the buffer, unless the caller explicitly
 764  * wants us to.
 765  */
 766 struct buf *
 767 incore(vp, blkno)
 768         struct vnode *vp;
 769         daddr_t blkno;
 770 {
 771         struct buf *bp;
 772         int bufseen = 0;
 773
 774         bp = BUFHASH(vp, blkno)->lh_first;
 775
 776         /* Search hash chain */
 777         for (; bp != NULL; bp = bp->b_hash.le_next, bufseen++) {
 778                 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
 779                     !ISSET(bp->b_flags, B_INVAL))
 780                         return (bp);
 781         if(bufseen >= nbuf)
 782                 panic("walked more than nbuf in incore");
 783
 784         }
 785
 786         return (0);
 787 }
 788
 789 /* XXX FIXME -- Update the comment to reflect the UBC changes -- */
 790 /*
 791  * Get a block of requested size that is associated with
 792  * a given vnode and block offset. If it is found in the
 793  * block cache, mark it as having been found, make it busy
 794  * and return it. Otherwise, return an empty block of the
  795  * correct size. It is up to the caller to ensure that the
  796  * cached blocks are of the correct size.
 797  */
 798 struct buf *
 799 getblk(vp, blkno, size, slpflag, slptimeo, operation)
 800         register struct vnode *vp;
 801         daddr_t blkno;
 802         int size, slpflag, slptimeo, operation;
 803 {
 804         struct buf *bp;
 805         int s, err;
 806         upl_t upl;
 807         upl_page_info_t *pl;
 808         kern_return_t kret;
 809         int error=0;
 810         int pagedirty = 0;
 811
 812         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
 813                      blkno * PAGE_SIZE, size, operation, 0, 0);
 814 start:
 815
 816         s = splbio();
 817         if (bp = incore(vp, blkno)) {
 818                 /* Found in the Buffer Cache */
 819                 if (ISSET(bp->b_flags, B_BUSY)) {
 820                         /* but is busy */
 821                         switch (operation) {
 822                         case BLK_READ:
 823                         case BLK_WRITE:
 824                         case BLK_META:
 825                                 SET(bp->b_flags, B_WANTED);
 826                                 bufstats.bufs_busyincore++;
 827                                 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
 828                                     slptimeo);
 829                                 splx(s);
 830                                 /*
 831                                  * Callers who call with PCATCH or timeout are
 832                                  * willing to deal with the NULL pointer
 833                                  */
 834                                 if (err && ((slpflag & PCATCH) ||
 835                                                          ((err == EWOULDBLOCK) && slptimeo)))
 836                                         return (NULL);
 837                                 goto start;
 838                                 /*NOTREACHED*/
 839                                 break;
 840
 841                         case BLK_PAGEIN:
 842                                 /* pagein operation must not use getblk */
 843                                 panic("getblk: pagein for incore busy buffer");
 844                                 splx(s);
 845                                 /*NOTREACHED*/
 846                                 break;
 847
 848                         case BLK_PAGEOUT:
 849                                 /* pageout operation must not use getblk */
 850                                 panic("getblk: pageout for incore busy buffer");
 851                                 splx(s);
 852                                 /*NOTREACHED*/
 853                                 break;
 854
 855                         default:
 856                                 panic("getblk: %d unknown operation 1", operation);
 857                                 /*NOTREACHED*/
 858                                 break;
 859                         }
 860                 } else {
 861                         /* not busy */
 862                         SET(bp->b_flags, (B_BUSY | B_CACHE));
 863                         bremfree(bp);
 864                         bufstats.bufs_incore++;
 865                         splx(s);
 866
 867                         allocbuf(bp, size);
 868                         if (ISSET(bp->b_flags, B_PAGELIST))
 869                                         panic("pagelist buffer is not busy");
 870
 871                         switch (operation) {
 872                         case BLK_READ:
 873                         case BLK_WRITE:
 874                                 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
 875                                         kret = ubc_create_upl(vp,
 876                                                                         ubc_blktooff(vp, bp->b_lblkno),
 877                                                                         bp->b_bufsize,
 878                                                                         &upl,
 879                                                                         &pl,
 880                                                                         UPL_PRECIOUS);
 881                                         if (kret != KERN_SUCCESS)
 882                                                 panic("Failed to get pagelists");
 883
 884                                         SET(bp->b_flags, B_PAGELIST);
 885                                         bp->b_pagelist = upl;
 886
 887                                         if ( !upl_valid_page(pl, 0))
 888                                                 panic("getblk: incore buffer without valid page");
 889
 890                                         if (upl_dirty_page(pl, 0))
 891                                                 SET(bp->b_flags, B_WASDIRTY);
 892                                         else
 893                                                 CLR(bp->b_flags, B_WASDIRTY);
 894
 895                                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
 896                                         if (kret != KERN_SUCCESS) {
 897                                                 panic("getblk: ubc_upl_map() failed with (%d)",
 898                                                                   kret);
 899                                         }
 900                                         if (bp->b_data == 0) panic("ubc_upl_map mapped 0");
 901                                 }
 902                                 break;
 903
 904                         case BLK_META:
 905                                 /*
 906                                  * VM is not involved in IO for the meta data
 907                                  * buffer already has valid data
 908                                  */
 909                         if(bp->b_data == 0)
 910                                         panic("bp->b_data null incore buf=%x", bp);
 911                                 break;
 912
 913                         case BLK_PAGEIN:
 914                         case BLK_PAGEOUT:
 915                                 panic("getblk: paging operation 1");
 916                                 break;
 917
 918                         default:
 919                                 panic("getblk: %d unknown operation 2", operation);
 920                                 /*NOTREACHED*/
 921                                 break;
 922                         }
 923                 }
 924         } else { /* not incore() */
 925                 int queue = BQ_EMPTY; /* Start with no preference */
 926                 splx(s);
 927
 928                 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
 929                         !(UBCINFOEXISTS(vp))) {
 930                         operation = BLK_META;
 931                 }
 932                 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
 933                         goto start;
 934                 if (incore(vp, blkno)) {
 935                         SET(bp->b_flags, B_INVAL);
 936                         binshash(bp, &invalhash);
 937                         brelse(bp);
 938                         goto start;
 939                 }
 940
 941                 /*
 942                  * if it is meta, the queue may be set to other
 943                  * type so reset as well as mark it to be B_META
 944                  * so that when buffer is released it will goto META queue
 945                  * Also, if the vnode is not VREG, then it is META
 946                  */
 947                 if (operation == BLK_META) {
 948                         SET(bp->b_flags, B_META);
 949                         queue = BQ_META;
 950                 }
 951                 /*
 952                  * Insert in the hash so that incore() can find it
 953                  */
 954                 binshash(bp, BUFHASH(vp, blkno));
 955
 956                 allocbuf(bp, size);
 957
 958                 switch (operation) {
 959                 case BLK_META:
 960                         /* buffer data is invalid */
 961
 962 #if !ZALLOC_METADATA
 963                         if (bp->b_data)
 964                                 panic("bp->b_data is not nul; %x",bp);
 965                         kret = kmem_alloc(kernel_map,
 966                                                 &bp->b_data, bp->b_bufsize);
 967                         if (kret != KERN_SUCCESS)
 968                                 panic("getblk: kmem_alloc() returned %d", kret);
 969 #endif /* ZALLOC_METADATA */
 970
 971                         if(bp->b_data == 0)
 972                                 panic("bp->b_data is null %x",bp);
 973
 974                         bp->b_blkno = bp->b_lblkno = blkno;
 975                         s = splbio();
 976                         bgetvp(vp, bp);
 977                         bufstats.bufs_miss++;
 978                         splx(s);
 979                         if (bp->b_data == 0)
 980                                 panic("b_data is 0: 2");
 981
 982                         /* wakeup the buffer */
 983                         CLR(bp->b_flags, B_WANTED);
 984                         wakeup(bp);
 985                         break;
 986
 987                 case BLK_READ:
 988                 case BLK_WRITE:
 989
 990                         if (ISSET(bp->b_flags, B_PAGELIST))
 991                                 panic("B_PAGELIST in bp=%x",bp);
 992
 993                         kret = ubc_create_upl(vp,
 994                                                         ubc_blktooff(vp, blkno),
 995                                                         bp->b_bufsize,
 996                                                         &upl,
 997                                                         &pl,
 998                                                         UPL_PRECIOUS);
 999                         if (kret != KERN_SUCCESS)
1000                                 panic("Failed to get pagelists");
1001
1002 #ifdef  UBC_DEBUG
1003                         upl_ubc_alias_set(upl, bp, 4);
1004 #endif /* UBC_DEBUG */
1005                         bp->b_blkno = bp->b_lblkno = blkno;
1006                         bp->b_pagelist = upl;
1007
1008                         SET(bp->b_flags, B_PAGELIST);
1009
1010                         if (upl_valid_page(pl, 0)) {
1011                                 SET(bp->b_flags, B_CACHE | B_DONE);
1012                                 bufstats.bufs_vmhits++;
1013
1014                                 pagedirty = upl_dirty_page(pl, 0);
1015
1016                                 if (pagedirty)
1017                                         SET(bp->b_flags, B_WASDIRTY);
1018
1019                                 if (vp->v_tag == VT_NFS) {
1020                                         off_t  f_offset;
1021                                         int    valid_size;
1022
1023                                         bp->b_validoff = 0;
1024                                         bp->b_dirtyoff = 0;
1025
1026                                         f_offset = ubc_blktooff(vp, blkno);
1027
1028                                         if (f_offset > vp->v_ubcinfo->ui_size) {
1029                                                 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1030                                                 bp->b_validend = 0;
1031                                                 bp->b_dirtyend = 0;
1032                                         } else {
1033                                                 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1034                                                 bp->b_validend = valid_size;
1035
1036                                                 if (pagedirty)
1037                                                        bp->b_dirtyend = valid_size;
1038                                                 else
1039                                                        bp->b_dirtyend = 0;
1040
1041                                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1042                                                              bp->b_validend, bp->b_dirtyend,
1043                                                              (int)vp->v_ubcinfo->ui_size, 0, 0);
1044                                         }
1045                                 } else {
1046                                         bp->b_validoff = 0;
1047                                         bp->b_dirtyoff = 0;
1048
1049                                         if (pagedirty) {
1050                                                 /* page is dirty */
1051                                                 bp->b_validend = bp->b_bcount;
1052                                                 bp->b_dirtyend = bp->b_bcount;
1053                                         } else {
1054                                                 /* page is clean */
1055                                                 bp->b_validend = bp->b_bcount;
1056                                                 bp->b_dirtyend = 0;
1057                                         }
1058                                 }
1059                                 if (error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) {
1060                                         panic("VOP_BMAP failed in getblk");
1061                                         /*NOTREACHED*/
1062                                         /*
1063                                          * XXX:  We probably should invalidate the VM Page
1064                                          */
1065                                         bp->b_error = error;
1066                                         SET(bp->b_flags, (B_ERROR | B_INVAL));
1067                                         /* undo B_DONE that was set before upl_commit() */
1068                                         CLR(bp->b_flags, B_DONE);
1069                                         brelse(bp);
1070                                         return (0);
1071                                 }
1072                         } else {
1073                                 bufstats.bufs_miss++;
1074                         }
1075                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1076                         if (kret != KERN_SUCCESS) {
1077                                 panic("getblk: ubc_upl_map() "
1078                                       "failed with (%d)", kret);
1079                         }
1080                         if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
1081
1082                         s = splbio();
1083                         bgetvp(vp, bp);
1084                         splx(s);
1085
1086                         break;
1087
1088                 case BLK_PAGEIN:
1089                 case BLK_PAGEOUT:
1090                         panic("getblk: paging operation 2");
1091                         break;
1092                 default:
1093                         panic("getblk: %d unknown operation 3", operation);
1094                         /*NOTREACHED*/
1095                         break;
1096                 }
1097         }
1098
1099         if (bp->b_data == NULL)
1100                 panic("getblk: bp->b_addr is null");
1101
1102         if (bp->b_bufsize & 0xfff) {
1103 #if ZALLOC_METADATA
1104                 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1105 #endif /* ZALLOC_METADATA */
1106                         panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1107         }
1108
1109         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1110                      bp, bp->b_data, bp->b_flags, 3, 0);
1111
1112         return (bp);
1113 }
1114
1115 /*
1116  * Get an empty, disassociated buffer of given size.
      *
      * Keeps calling getnewbuf() (asking for BQ_EMPTY, no sleep flags, no
      * timeout) until a buffer is handed back, marks it B_INVAL, inserts it
      * on the invalhash chain and sizes it with allocbuf().  No vnode is
      * attached to the buffer here.
      *
      * With ZALLOC_METADATA the buffer is additionally flagged B_META before
      * allocbuf() is called.  Without it, the data area is allocated right
      * here with kmem_alloc_aligned() using the size rounded up to CLBYTES,
      * and a request that rounds up to more than MAXBSIZE panics.
      *
      * Returns the buffer; the retry loop means NULL is never returned.
1117  */
1118 struct buf *
1119 geteblk(size)
1120         int size;
1121 {
1122         struct buf *bp;
1123     int queue = BQ_EMPTY;
1124 #if !ZALLOC_METADATA
1125         kern_return_t kret;
1126         vm_size_t desired_size = roundup(size, CLBYTES);
1127
1128         if (desired_size > MAXBSIZE)
1129                 panic("geteblk: buffer larger than MAXBSIZE requested");
1130 #endif /* ZALLOC_METADATA */
1131
             /* getnewbuf() returns 0 after sleeping for a free buffer; retry */
1132         while ((bp = getnewbuf(0, 0, &queue)) == 0)
1133                 ;
1134 #if ZALLOC_METADATA
1135         SET(bp->b_flags, (B_META|B_INVAL));
1136 #else
1137         SET(bp->b_flags, B_INVAL);
1138 #endif /* ZALLOC_METADATA */
1139
1140 #if DIAGNOSTIC
             /* getnewbuf() reports the queue the buffer really came from */
1141         assert(queue == BQ_EMPTY);
1142 #endif /* DIAGNOSTIC */
1143         /* XXX need to implement logic to deal with other queues */
1144
1145 #if !ZALLOC_METADATA
1146         /* Empty buffer - allocate pages */
1147         kret = kmem_alloc_aligned(kernel_map, &bp->b_data, desired_size);
1148         if (kret != KERN_SUCCESS)
1149                 panic("geteblk: kmem_alloc_aligned returned %d", kret);
1150 #endif /* ZALLOC_METADATA */
1151
             /* not tied to any (vnode, block) pair, so it goes on invalhash */
1152         binshash(bp, &invalhash);
             /* records b_bufsize/b_bcount for the requested size */
1153         allocbuf(bp, size);
1154         bufstats.bufs_eblk++;
1155
1156         return (bp);
1157 }
1158
1159 #if ZALLOC_METADATA
1160 /*
1161  * Zones for the meta data buffers
1162  */
1163
1164 #define MINMETA 512
1165 #define MAXMETA 4096
1166
1167 struct meta_zone_entry {        /* describes one zone of metadata buffers */
1168         zone_t mz_zone;         /* zone handle; filled in by bufzoneinit() */
1169         vm_size_t mz_size;      /* buffer size served by the zone; 0 ends the table */
1170         vm_size_t mz_max;       /* handed to zinit() as its second argument */
1171         char *mz_name;          /* handed to zinit() as its last argument */
1172 };
1173
1174 struct meta_zone_entry meta_zones[] = {
             /*
              * Columns: zone handle (NULL until bufzoneinit() runs), buffer
              * size, mz_max written as a count times the buffer size, name.
              *
              * Entry i serves buffers of (i + 1) * MINMETA bytes, from MINMETA
              * up to MAXMETA; getbufzone() indexes the table with
              * (size / 512) - 1 and therefore depends on this ordering.
              */
1175         {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1176         {NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
1177         {NULL, (MINMETA * 3),  16 * (MINMETA * 3), "buf.1536" },
1178         {NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
1179         {NULL, (MINMETA * 5),  16 * (MINMETA * 5), "buf.2560" },
1180         {NULL, (MINMETA * 6),  16 * (MINMETA * 6), "buf.3072" },
1181         {NULL, (MINMETA * 7),  16 * (MINMETA * 7), "buf.3584" },
1182         {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1183         {NULL, 0, 0, "" } /* End */
1184 };
1185
1186 /*
1187  * Set up one zone per meta_zones[] entry, up to the mz_size == 0 sentinel.
1188  */
1189 static void
1190 bufzoneinit(void)
1191 {
1192         struct meta_zone_entry *mzp;
1193
1194         for (mzp = meta_zones; mzp->mz_size != 0; mzp++) {
1195                 mzp->mz_zone = zinit(
1196                                 mzp->mz_size,
1197                                 mzp->mz_max,
1198                                 PAGE_SIZE,
1199                                 mzp->mz_name);
1200         }
1201 }
1202
1203 static zone_t
1204 getbufzone(size_t size)
1205 {
             /*
              * Return the zone that serves metadata buffers of `size' bytes.
              *
              * `size' must be a multiple of MINMETA in the range
              * MINMETA..MAXMETA; anything else panics.  The range check matters
              * because the size is turned directly into a meta_zones[] index:
              * 0 would give index -1 and anything above MAXMETA would read past
              * the last real entry of the table.
              */
1206         int i;
1207
1208         if ((size % MINMETA) || (size < MINMETA) || (size > MAXMETA))
1209                 panic("getbufzone: incorect size = %d", size);
1210
1211         i = (size / MINMETA) - 1;
1212         return (meta_zones[i].mz_zone);
1213 }
1214 #endif /* ZALLOC_METADATA */
1215
1216 /*
1217  * With UBC, there is no need to expand / shrink the file data
1218  * buffer. The VM uses the same pages, hence no waste.
1219  * All the file data buffers can have one size.
1220  * In fact expand / shrink would be an expensive operation.
1221  *
1222  * Only exception to this is meta-data buffers. Most of the
1223  * meta data operations are smaller than PAGE_SIZE. Having the
1224  * meta-data buffers grow and shrink as needed, optimizes use
1225  * of the kernel wired memory.
      *
      * allocbuf(bp, size) records `size' in bp->b_bcount and the size of the
      * backing store in bp->b_bufsize.  By default the backing size is `size'
      * rounded up to CLBYTES and raised to at least PAGE_SIZE; a result larger
      * than MAXBSIZE panics.
      *
      * Memory is only touched when ZALLOC_METADATA is set and the buffer is
      * B_META:
      *  - no data yet: allocate from the matching zone when the size rounded
      *    up to MINMETA fits in MAXMETA (B_ZALLOC is set), otherwise from
      *    kernel_map with kmem_alloc();
      *  - data present but too small: allocate a larger area, copy the old
      *    contents over and free the old area.  A zone buffer that outgrows
      *    MAXMETA moves to kmem_alloc() and loses B_ZALLOC, so that
      *    bcleanbuf() later frees it with kmem_free();
      *  - data present and large enough: keep it; the buffer is never shrunk
      *    here.
      *
      * Always returns 0.
1226  */
1227
1228 int
1229 allocbuf(bp, size)
1230         struct buf *bp;
1231         int size;
1232 {
1233         vm_size_t desired_size;
1234
1235         desired_size = roundup(size, CLBYTES);
1236
1237         if(desired_size < PAGE_SIZE)
1238                 desired_size = PAGE_SIZE;
1239         if (desired_size > MAXBSIZE)
1240                 panic("allocbuf: buffer larger than MAXBSIZE requested");
1241
1242 #if ZALLOC_METADATA
1243         if (ISSET(bp->b_flags, B_META)) {
1244                 kern_return_t kret;
1245                 zone_t zprev, z;
1246                 size_t nsize = roundup(size, MINMETA);
1247
1248                 if (bp->b_data) {
1249                         vm_offset_t elem = (vm_offset_t)bp->b_data;
1250
1251                         if (ISSET(bp->b_flags, B_ZALLOC)) {
1252                                 if (bp->b_bufsize <= MAXMETA) {
1253                                         if (bp->b_bufsize < nsize) {
1254                                                 /* reallocate to a bigger size */
1257                                                 zprev = getbufzone(bp->b_bufsize);
                                                     if (nsize <= MAXMETA) {
                                                             /* still small enough for a zone */
                                                             desired_size = nsize;
1258                                                         z = getbufzone(nsize);
1259                                                         bp->b_data = (caddr_t)zalloc(z);
1260                                                         if(bp->b_data == 0)
1261                                                                 panic("allocbuf: zalloc() returned NULL");
                                                     } else {
                                                             /*
                                                              * No zone is this large: switch to
                                                              * kernel_map and drop B_ZALLOC so the
                                                              * buffer is released with kmem_free().
                                                              */
                                                             kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
                                                             if (kret != KERN_SUCCESS)
                                                                     panic("allocbuf: kmem_alloc() returned %d", kret);
                                                             if(bp->b_data == 0)
                                                                     panic("allocbuf: null b_data");
                                                             CLR(bp->b_flags, B_ZALLOC);
                                                     }
1262                                                 bcopy(elem, bp->b_data, bp->b_bufsize);
1263                                                 zfree(zprev, elem);
1264                                         } else {
1265                                                 desired_size = bp->b_bufsize;
1266                                         }
1267                                 } else
1268                                         panic("allocbuf: B_ZALLOC set incorrectly");
1269                         } else {
1270                                 if (bp->b_bufsize < desired_size) {
1271                                         /* reallocate to a bigger size */
1272                                         kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1273                                         if (kret != KERN_SUCCESS)
1274                                                 panic("allocbuf: kmem_alloc() returned %d", kret);
1275                                         if(bp->b_data == 0)
1276                                                 panic("allocbuf: null b_data");
1277                                         bcopy(elem, bp->b_data, bp->b_bufsize);
1278                                         kmem_free(kernel_map, elem, bp->b_bufsize);
1279                                 } else {
1280                                         desired_size = bp->b_bufsize;
1281                                 }
                             }
1282                 } else {
1283                         /* new allocation */
1284                         if (nsize <= MAXMETA) {
1285                                 desired_size = nsize;
1286                                 z = getbufzone(nsize);
1287                                 bp->b_data = (caddr_t)zalloc(z);
1288                                 if(bp->b_data == 0)
1289                                         panic("allocbuf: zalloc() returned NULL 2");
1290                                 SET(bp->b_flags, B_ZALLOC);
1291                         } else {
1292                                 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1293                                 if (kret != KERN_SUCCESS)
1294                                         panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1295                                 if(bp->b_data == 0)
1296                                         panic("allocbuf: null b_data 2");
1297                         }
1298                 }
1299         }
1300
1301         if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1302                 panic("allocbuf: bp->b_data is NULL");
1303 #endif /* ZALLOC_METADATA */
1304
1305         bp->b_bufsize = desired_size;
1306         bp->b_bcount = size;
             /* declared int: give callers a defined value instead of falling off the end */
             return (0);
1307 }
1308
1309 /*
      *      Get a new buffer from one of the free lists.
      *
      *      slpflag, slptimeo: handed to tsleep() when nothing is available.
      *      queue: on entry, the queue the caller would like the buffer to
      *      come from; on a successful return, the queue it was taken from.
      *      Out of range requests are treated as BQ_EMPTY.  A request of
      *      BQUEUES means no preference.
      *
      *      Selection:
      *      1. Unless the request is BQUEUES, take the head of the requested
      *         queue if it has one.
      *      2. Otherwise look at the heads of BQ_AGE, BQ_LRU and BQ_META.
      *         If all three are empty, fall back to the head of BQ_EMPTY.
      *         If that is empty as well: panic under DIAGNOSTIC, else log,
      *         set needbuffer, sleep on it and return 0 once the sleep ends.
      *         The caller is expected to retry.
      *      3. If only one of BQ_AGE / BQ_LRU has a buffer, use it.  If both
      *         do, use the LRU buffer only when it is stale and the AGE buffer
      *         is not; in every other case (including the clock having been
      *         set backwards) use the AGE buffer.
      *      4. If neither had a buffer, use the BQ_META head.  If a BQ_META
      *         head exists alongside the step 3 choice, prefer it only when it
      *         is stale and the step 3 choice is not.
      *
      *      The chosen buffer is passed to bcleanbuf().  If that reports that
      *      a delayed write had to be started, the original request is
      *      restored in *queue and the search starts over.  Otherwise the
      *      buffer, as left by bcleanbuf(), is returned.
1325  */
1326
1327 static struct buf *
1328 getnewbuf(slpflag, slptimeo, queue)
1329         int slpflag, slptimeo;
1330         int *queue;
1331 {
1332         register struct buf *bp;
1333         register struct buf *lru_bp;
1334         register struct buf *age_bp;
1335         register struct buf *meta_bp;
1336         register int age_time, lru_time, bp_time, meta_time;
1337         int s;
1339         int req = *queue; /* save it for restarts */
1340
1341 start:
1342         s = splbio();
1343
1344         /* invalid request gets empty queue */
1345         if ((*queue > BQUEUES) || (*queue < 0))
1346                 *queue = BQ_EMPTY;
1347
1348         /* (*queue == BQUEUES) means no preference */
1349         if (*queue != BQUEUES) {
1350                 /* Try for the requested queue first */
1351                 bp = bufqueues[*queue].tqh_first;
1352                 if (bp)
1353                         goto found;
1354         }
1355
1356         /* Unable to use requested queue */
1357         age_bp = bufqueues[BQ_AGE].tqh_first;
1358         lru_bp = bufqueues[BQ_LRU].tqh_first;
1359         meta_bp = bufqueues[BQ_META].tqh_first;
1360
1361         if (!age_bp && !lru_bp && !meta_bp) { /* Unavailable on AGE, LRU and META */
1362                 /* Try the empty list first */
1363                 bp = bufqueues[BQ_EMPTY].tqh_first;
1364                 if (bp) {
1365                         *queue = BQ_EMPTY;
1366                         goto found;
1367                 }
1368 #if DIAGNOSTIC
1369                 /* with UBC this is a fatal condition */
1370                 panic("getnewbuf: No useful buffers");
1371 #else
1372                 /* Log this error condition */
1373                 printf("getnewbuf: No useful buffers");
1374 #endif  /* DIAGNOSTIC */
1375
1376                 /* wait for a free buffer of any kind */
1377                 needbuffer = 1;
1378                 bufstats.bufs_sleeps++;
1379                 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1380                 splx(s);
                     /* the sleep result is not examined; the caller has to retry */
1381                 return (0);
1382         }
1383
1384         /* Buffer available either on AGE or LRU or META */
1385         bp = NULL;
1386         *queue = -1;
1387
1388         /* Buffer available either on AGE or LRU */
1389         if (!age_bp) {
                     /* lru_bp may be NULL too; the META fallback below covers that */
1390                 bp = lru_bp;
1391                 *queue = BQ_LRU;
1392         } else if (!lru_bp) {
1393                 bp = age_bp;
1394                 *queue = BQ_AGE;
1395         } else { /* buffer available on both AGE and LRU */
1396                 age_time = time.tv_sec - age_bp->b_timestamp;
1397                 lru_time = time.tv_sec - lru_bp->b_timestamp;
1398                 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1399                         bp = age_bp;
1400                         *queue = BQ_AGE;
1401                         /*
1402                          * we should probably re-timestamp everything in the
1403                          * queues at this point with the current time
1404                          */
1405                 } else {
1406                         if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1407                                 bp = lru_bp;
1408                                 *queue = BQ_LRU;
1409                         } else {
1410                                 bp = age_bp;
1411                                 *queue = BQ_AGE;
1412                         }
1413                 }
1414         }
1415
1416         if (!bp) { /* Neither on AGE nor on LRU */
1417                 bp = meta_bp;
1418                 *queue = BQ_META;
1419         }  else if (meta_bp) {
1420                 bp_time = time.tv_sec - bp->b_timestamp;
1421                 meta_time = time.tv_sec - meta_bp->b_timestamp;
1422
1423                 if (!(bp_time < 0) && !(meta_time < 0)) {
1424                         /* time not set backwards */
1425                         int bp_is_stale;
1426                         bp_is_stale = (*queue == BQ_LRU) ?
1427                                         lru_is_stale : age_is_stale;
1428
1429                         if ((meta_time >= meta_is_stale) &&
1430                                         (bp_time < bp_is_stale)) {
1431                                 bp = meta_bp;
1432                                 *queue = BQ_META;
1433                         }
1434                 }
1435         }
1436
1437         if (bp == NULL)
1438                 panic("getnewbuf: null bp");
1439
1440 found:
1441         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1442                 panic("getnewbuf: le_prev is deadbeef");
1443
1444         if(ISSET(bp->b_flags, B_BUSY))
1445                 panic("getnewbuf reusing BUSY buf");
1446
1447         /* Clean it */
1448         if (bcleanbuf(bp)) {
1449                 /* bawrite() issued, buffer not ready */
1450                 splx(s);
1451                 *queue = req;
1452                 goto start;
1453         }
1454         splx(s);
1455         return (bp);
1456 }
1457 #include <mach/mach_types.h>
1458 #include <mach/memory_object_types.h>
1459
1460 /*
1461  * Clean a buffer.
      *
      * Takes bp off its free list and marks it B_BUSY.  If the buffer holds
      * a delayed write (B_DELWRI) the write is started with bawrite() and
      * nothing else is done.  Otherwise the buffer is released from its
      * vnode (if any) and from its hash chain, the data area of a B_META
      * buffer is freed, the header fields are reset (b_flags becomes exactly
      * B_BUSY) and any read/write credentials are dropped.
      *
1462  * Returns 0 if buffer is ready to use,
1463  * Returns 1 if issued a bawrite() to indicate
1464  * that the buffer is not ready.
1465  */
1466 int
1467 bcleanbuf(struct buf *bp)
1468 {
1469         int s;
1470         struct ucred *cred;
1471
1472         s = splbio();
1473
1474         /* Remove from the queue */
1475         bremfree(bp);
1476
1477         /* Buffer is no longer on free lists. */
1478         SET(bp->b_flags, B_BUSY);
1479
1480         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1481                 panic("bcleanbuf: le_prev is deadbeef");
1482
1483         /* If buffer was a delayed write, start it, and return 1 */
1484         if (ISSET(bp->b_flags, B_DELWRI)) {
1485                 splx(s);
1486                 bawrite (bp);
1487                 return (1);
1488         }
1489
             /* detach from the vnode and the hash chain */
1490         if (bp->b_vp)
1491                 brelvp(bp);
1492         bremhash(bp);
1493         BLISTNONE(bp);
1494
1495         splx(s);
1496
             /* only B_META buffers have their data area freed here */
1497         if (ISSET(bp->b_flags, B_META)) {
1498 #if ZALLOC_METADATA
1499                 vm_offset_t elem = (vm_offset_t)bp->b_data;
1500                 if (elem == 0)
1501                         panic("bcleanbuf: NULL bp->b_data B_META buffer");
1502
                     /* B_ZALLOC says which allocator the data came from */
1503                 if (ISSET(bp->b_flags, B_ZALLOC)) {
1504                         if (bp->b_bufsize <= MAXMETA) {
1505                                 zone_t z;
1506
1507                                 z = getbufzone(bp->b_bufsize);
                                     /* poison the pointer before the memory goes away */
1508                                 bp->b_data = (caddr_t)0xdeadbeef;
1509                                 zfree(z, elem);
1510                                 CLR(bp->b_flags, B_ZALLOC);
1511                         } else
1512                                 panic("bcleanbuf: B_ZALLOC set incorrectly");
1513                 } else {
1514                         bp->b_data = (caddr_t)0xdeadbeef;
1515                         kmem_free(kernel_map, elem, bp->b_bufsize);
1516                 }
1517 #else
1518            if (bp->b_data == 0)
1519                    panic("bcleanbuf: bp->b_data == NULL for B_META buffer");
1520
1521            kmem_free(kernel_map, bp->b_data, bp->b_bufsize);
1522 #endif /* ZALLOC_METADATA */
1523         }
1524
1525         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1526
1527         /* reset the header at splbio; the vnode was already released above */
1528         s = splbio();
1529
1530         /* clear out various other fields */
1531         bp->b_bufsize = 0;
1532         bp->b_data = 0;
1533         bp->b_flags = B_BUSY;
1534         bp->b_dev = NODEV;
1535         bp->b_blkno = bp->b_lblkno = 0;
1536         bp->b_iodone = 0;
1537         bp->b_error = 0;
1538         bp->b_resid = 0;
1539         bp->b_bcount = 0;
1540         bp->b_dirtyoff = bp->b_dirtyend = 0;
1541         bp->b_validoff = bp->b_validend = 0;
1542
1543         /* nuke any credentials we were holding */
1544         cred = bp->b_rcred;
1545         if (cred != NOCRED) {
1546                 bp->b_rcred = NOCRED;
1547                 crfree(cred);
1548         }
1549         cred = bp->b_wcred;
1550         if (cred != NOCRED) {
1551                 bp->b_wcred = NOCRED;
1552                 crfree(cred);
1553         }
1554         splx(s);
1555         return (0);
1556 }
1557
1558
1559 /*
1560  * Wait for operations on the buffer to complete.
1561  * When they do, extract and return the I/O's error value.
      *
      * Sleeps on bp until B_DONE is set.  Then returns:
      *   EINTR       if B_EINTR is set (the flag is cleared first),
      *   bp->b_error if B_ERROR is set, or EIO when b_error is 0,
      *   0           otherwise.
1562  */
1563 int
1564 biowait(bp)
1565         struct buf *bp;
1566 {
1569         int s;
1571
1572         s = splbio();
1573         while (!ISSET(bp->b_flags, B_DONE))
1574                 tsleep(bp, PRIBIO + 1, "biowait", 0);
1575         splx(s);
1576
1577         /* check for interruption of I/O (e.g. via NFS), then errors. */
1578         if (ISSET(bp->b_flags, B_EINTR)) {
1579                 CLR(bp->b_flags, B_EINTR);
1580                 return (EINTR);
1581         } else if (ISSET(bp->b_flags, B_ERROR))
1582                 return (bp->b_error ? bp->b_error : EIO);
1583         else
1584                 return (0);
1585 }
1586
1587 /*
1588  * Mark I/O complete on a buffer.
1589  *
1590  * If a callback has been requested, e.g. the pageout
1591  * daemon, do so. Otherwise, awaken waiting processes.
1592  *
1593  * [ Leffler, et al., says on p.247:
1594  *      "This routine wakes up the blocked process, frees the buffer
1595  *      for an asynchronous write, or, for a request by the pagedaemon
1596  *      process, invokes a procedure specified in the buffer structure" ]
1597  *
1598  * In real life, the pagedaemon (or other system processes) wants
1599  * to do async stuff to, and doesn't want the buffer brelse()'d.
1600  * (for swap pager, that puts swap buffers on the free lists (!!!),
1601  * for the vn device, that puts malloc'd buffers on the free lists!)
1602  */
1603 void
1604 biodone(bp)
1605         struct buf *bp;
1606 {
1607         boolean_t       funnel_state;
1608         int s;
1609
1610         funnel_state = thread_funnel_set(kernel_flock, TRUE);
1611
1612         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1613                      bp, bp->b_data, bp->b_flags, 0, 0);
1614
1615         if (ISSET(bp->b_flags, B_DONE))
1616                 panic("biodone already");
1617         SET(bp->b_flags, B_DONE);               /* note that it's done */
1618         /*
1619          * I/O was done, so don't believe
1620          * the DIRTY state from VM anymore
1621          */
1622         CLR(bp->b_flags, B_WASDIRTY);
1623
1624         if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1625                 vwakeup(bp);     /* wake up reader */
1626
1627         if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
1628                 CLR(bp->b_flags, B_CALL);       /* but note callout done */
1629                 (*bp->b_iodone)(bp);
1630         } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1631                 brelse(bp);
1632         else {                                  /* or just wakeup the buffer */
1633                 CLR(bp->b_flags, B_WANTED);
1634                 wakeup(bp);
1635         }
1636
1637         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1638                      bp, bp->b_data, bp->b_flags, 0, 0);
1639
1640         thread_funnel_set(kernel_flock, funnel_state);
1641 }
1642
1643 /*
1644  * Return a count of buffers on the "locked" queue.
1645  */
1646 int
1647 count_lock_queue()
1648 {
1649         register struct buf *bp;
1650         register int n = 0;
1651
1652         for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1653             bp = bp->b_freelist.tqe_next)
1654                 n++;
1655         return (n);
1656 }
1657
1658 /*
1659  * Return a count of 'busy' buffers. Used at the time of shutdown.
1660  */
1661 int
1662 count_busy_buffers()
1663 {
1664         register struct buf *bp;
1665         register int nbusy = 0;
1666
1667         for (bp = &buf[nbuf]; --bp >= buf; )
1668                 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1669                         nbusy++;
1670         return (nbusy);
1671 }
1672
1673 #if 1 /*DIAGNOSTIC */
1674 /*
1675  * Print out statistics on the current allocation of the buffer pool.
1676  * Can be enabled to print out on every ``sync'' by setting "syncprt"
1677  * in vfs_syscalls.c using sysctl.
1678  */
1679 void
1680 vfs_bufstats()
1681 {
1682         int s, i, j, count;
1683         register struct buf *bp;
1684         register struct bqueues *dp;
1685         int counts[MAXBSIZE/CLBYTES+1];
1686         static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY", "META" };
1687
1688         for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1689                 count = 0;
1690                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1691                         counts[j] = 0;
1692                 s = splbio();
1693                 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1694                         counts[bp->b_bufsize/CLBYTES]++;
1695                         count++;
1696                 }
1697                 splx(s);
1698                 printf("%s: total-%d", bname[i], count);
1699                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1700                         if (counts[j] != 0)
1701                                 printf(", %d-%d", j * CLBYTES, counts[j]);
1702                 printf("\n");
1703         }
1704 }
1705 #endif /* DIAGNOSTIC */
1706
1707 #define NRESERVEDIOBUFS 16
1708
1709 struct buf *
1710 alloc_io_buf(vp, priv)
1711         struct vnode *vp;
1712         int priv;
1713 {
1714         register struct buf *bp;
1715         int s;
1716
1717         s = splbio();
1718
1719         while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1720                 need_iobuffer = 1;
1721                 bufstats.bufs_iobufsleeps++;
1722                 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1723         }
1724
1725         while ((bp = iobufqueue.tqh_first) == NULL) {
1726                 need_iobuffer = 1;
1727                 bufstats.bufs_iobufsleeps++;
1728                 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1729         }
1730
1731         TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1732         bp->b_timestamp = 0;
1733
1734         /* clear out various fields */
1735         bp->b_flags = B_BUSY;
1736         bp->b_blkno = bp->b_lblkno = 0;
1737         bp->b_iodone = 0;
1738         bp->b_error = 0;
1739         bp->b_resid = 0;
1740         bp->b_bcount = 0;
1741         bp->b_bufsize = 0;
1742         bp->b_vp = vp;
1743
1744         if (vp->v_type == VBLK || vp->v_type == VCHR)
1745                 bp->b_dev = vp->v_rdev;
1746         else
1747                 bp->b_dev = NODEV;
1748         bufstats.bufs_iobufinuse++;
1749         if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1750                 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1751         splx(s);
1752
1753         return (bp);
1754 }
1755
1756 void
1757 free_io_buf(bp)
1758         struct buf *bp;
1759 {
1760         int s;
1761
1762         s = splbio();
1763         /* put buffer back on the head of the iobufqueue */
1764         bp->b_vp = NULL;
1765         bp->b_flags = B_INVAL;
1766
1767         binsheadfree(bp, &iobufqueue, -1);
1768
1769         /* Wake up any processes waiting for any buffer to become free. */
1770         if (need_iobuffer) {
1771                 need_iobuffer = 0;
1772                 wakeup(&need_iobuffer);
1773         }
1774         bufstats.bufs_iobufinuse--;
1775         splx(s);
1776 }
1777
1778
1779 /* not hooked up yet */
1780
1781 /* XXX move this to a separate file */
1782 /*
1783  * Dynamic Scaling of the Buffer Queues
1784  */
1785
/* Type used for every buffer-header count and limit */
typedef long long blsize_t;

/* Ceiling for all limits; initialize to (mem_size / PAGE_SIZE) */
blsize_t MAXNBUF;

/* Global tunable limits */
blsize_t nbufh;		/* number of buffer headers */
blsize_t nbuflow;	/* minimum number of buffer headers required */
blsize_t nbufhigh;	/* maximum number of buffer headers allowed */
blsize_t nbuftarget;	/* preferred number of buffer headers */
1794
1795 /*
1796  * assertions:
1797  *
1798  * 1.   0 < nbuflow <= nbufh <= nbufhigh
1799  * 2.   nbufhigh <= MAXNBUF
1800  * 3.   0 < nbuflow <= nbuftarget <= nbufhigh
1801  * 4.   nbufh can not be set by sysctl().
1802  */
1803
1804 /* Per queue tunable limits */
1805
1806 struct bufqlim {
1807         blsize_t        bl_nlow;        /* minimum number of buffer headers required */
1808         blsize_t        bl_num;         /* number of buffer headers on the queue */
1809         blsize_t        bl_nlhigh;      /* maximum number of buffer headers allowed */
1810         blsize_t        bl_target;      /* preferred number of buffer headers */
1811         long    bl_stale;       /* Seconds after which a buffer is considered stale */
1812 } bufqlim[BQUEUES];
1813
1814 /*
1815  * assertions:
1816  *
1817  * 1.   0 <= bl_nlow <= bl_num <= bl_nlhigh
1818  * 2.   bl_nlhigh <= MAXNBUF
1819  * 3.  bufqlim[BQ_META].bl_nlow != 0
1820  * 4.  bufqlim[BQ_META].bl_nlow > (number of possible concurrent
1821  *                                                                      file system IO operations)
1822  * 5.   bl_num can not be set by sysctl().
1823  * 6.   bl_nlhigh <= nbufhigh
1824  */
1825
1826 /*
1827  * Rationale:
1828  * ----------
1829  * Defining blsize_t as long would permit 2^31 buffer headers per queue,
1830  * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
1831  *
1832  * These limits are exported by means of sysctl().
1833  * It was decided to define blsize_t as a 64 bit quantity instead.
1834  * This will make sure that we will not be required to change it
1835  * as long as we do not exceed 64 bit address space for the kernel.
1836  *
1837  * The low and high parameters are initialized at compile time,
1838  * and boot arguments can be used to override them. sysctl()
1839  * cannot change those values: it can get all the values
1840  * but can set only target. num is the current level.
1841  *
1842  * Advantages of having a "bufqscan" thread doing the balancing are,
1843  * Keep enough bufs on BQ_EMPTY.
1844  *      getnewbuf() by default will always select a buffer from the BQ_EMPTY.
1845  *              getnewbuf() performs best if a buffer was found there.
1846  *              Also this minimizes the possibility of starting IO
1847  *              from getnewbuf(). That's a performance win, too.
1848  *
1849  *      Localize complex logic [balancing as well as time aging]
1850  *              to balancebufq().
1851  *
1852  *      Simplify getnewbuf() logic by elimination of time aging code.
1853  */
1854
1855 /*
1856  * Algorithm:
1857  * -----------
1858  * The goal of the dynamic scaling of the buffer queues is to keep
1859  * the size of the LRU close to bl_target. Buffers on a queue would
1860  * be time aged.
1861  *
1862  * There would be a thread which will be responsible for "balancing"
1863  * the buffer cache queues.
1864  *
1865  * The scan order would be:     AGE, LRU, META, EMPTY.
1866  */
1867
/*
 * Incremented on every bufq_balance_thread_init() call (the first call
 * does the one-time setup); its address is also the sleep channel of
 * the balancing thread.
 */
long bufqscanwait = 0;

/* Forward declarations for the buffer queue balancing code */
extern void bufqscan_thread();
extern int balancebufq(int q);
extern int btrimempty(int n);
extern int initbufqscan(void);
extern int nextbufq(int q);
extern void buqlimprt(int all);
1876
1877 void
1878 bufq_balance_thread_init()
1879 {
1880
1881         if (bufqscanwait++ == 0) {
1882                 int i;
1883
1884                 /* Initalize globals */
1885                 MAXNBUF = (mem_size / PAGE_SIZE);
1886                 nbufh = nbuf;
1887                 nbuflow = min(nbufh, 100);
1888                 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
1889                 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
1890                 nbuftarget = max(nbuflow, nbuftarget);
1891                 nbuftarget = min(nbufhigh, nbuftarget);
1892
1893                 /*
1894                  * Initialize the bufqlim
1895                  */
1896
1897                 /* LOCKED queue */
1898                 bufqlim[BQ_LOCKED].bl_nlow = 0;
1899                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
1900                 bufqlim[BQ_LOCKED].bl_target = 0;
1901                 bufqlim[BQ_LOCKED].bl_stale = 30;
1902
1903                 /* LRU queue */
1904                 bufqlim[BQ_LRU].bl_nlow = 0;
1905                 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
1906                 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
1907                 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
1908
1909                 /* AGE queue */
1910                 bufqlim[BQ_AGE].bl_nlow = 0;
1911                 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
1912                 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
1913                 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
1914
1915                 /* EMPTY queue */
1916                 bufqlim[BQ_EMPTY].bl_nlow = 0;
1917                 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
1918                 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
1919                 bufqlim[BQ_EMPTY].bl_stale = 600000;
1920
1921                 /* META queue */
1922                 bufqlim[BQ_META].bl_nlow = 0;
1923                 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
1924                 bufqlim[BQ_META].bl_target = nbuftarget/4;
1925                 bufqlim[BQ_META].bl_stale = META_IS_STALE;
1926
1927                 buqlimprt(1);
1928         }
1929
1930         /* create worker thread */
1931         kernel_thread(kernel_task, bufqscan_thread);
1932 }
1933
1934 /* The workloop for the buffer balancing thread */
1935 void
1936 bufqscan_thread()
1937 {
1938         boolean_t       funnel_state;
1939         int moretodo = 0;
1940
1941         funnel_state = thread_funnel_set(kernel_flock, TRUE);
1942
1943         for(;;) {
1944                 do {
1945                         int q;  /* buffer queue to process */
1946
1947                         for (q = initbufqscan(); q; ) {
1948                                 moretodo |= balancebufq(q);
1949                                 q = nextbufq(q);
1950                         }
1951                 } while (moretodo);
1952
1953 #if 1 || DIAGNOSTIC
1954                 vfs_bufstats();
1955                 buqlimprt(0);
1956 #endif
1957                 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
1958                 moretodo = 0;
1959         }
1960
1961         (void) thread_funnel_set(kernel_flock, FALSE);
1962 }
1963
1964 /* Seed for the buffer queue balancing */
1965 int
1966 initbufqscan()
1967 {
1968         /* Start with AGE queue */
1969         return (BQ_AGE);
1970 }
1971
1972 /* Pick next buffer queue to balance */
1973 int
1974 nextbufq(int q)
1975 {
1976         int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
1977
1978         q++;
1979         q %= sizeof(order);
1980         return (order[q]);
1981 }
1982
1983 /* function to balance the buffer queues */
1984 int
1985 balancebufq(int q)
1986 {
1987         int moretodo = 0;
1988         int s = splbio();
1989         int n;
1990
1991         /* reject invalid q */
1992         if ((q < 0) || (q >= BQUEUES))
1993                 goto out;
1994
1995         /* LOCKED queue MUST not be balanced */
1996         if (q == BQ_LOCKED)
1997                 goto out;
1998
1999         n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2000
2001         /* If queue has less than target nothing more to do */
2002         if (n < 0)
2003                 goto out;
2004
2005         if ( n > 8 ) {
2006                 /* Balance only a small amount (12.5%) at a time */
2007                 n >>= 3;
2008         }
2009
2010         /* EMPTY queue needs special handling */
2011         if (q == BQ_EMPTY) {
2012                 moretodo |= btrimempty(n);
2013                 goto out;
2014         }
2015
2016         for (; n > 0; n--) {
2017                 struct buf *bp = bufqueues[q].tqh_first;
2018                 if (!bp)
2019                         break;
2020
2021                 /* check if it's stale */
2022                 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2023                         if (bcleanbuf(bp)) {
2024                                 /* bawrite() issued, bp not ready */
2025                                 moretodo = 1;
2026                         } else {
2027                                 /* release the cleaned buffer to BQ_EMPTY */
2028                                 SET(bp->b_flags, B_INVAL);
2029                                 brelse(bp);
2030                         }
2031                 } else
2032                         break;
2033         }
2034
2035 out:
2036         splx(s);
2037         return (moretodo);
2038 }
2039
/*
 * Placeholder for trimming the empty queue.
 *
 * When struct buf are allocated dynamically, this would reclaim
 * up to 'n' struct buf from the empty queue.  For now nothing is
 * reclaimed and 0 is always returned.
 */
int
btrimempty(int n)
{
	(void)n;	/* nothing to reclaim yet */

	return (0);
}
2050
2051 void
2052 bufqinc(int q)
2053 {
2054         if ((q < 0) || (q >= BQUEUES))
2055                 return;
2056
2057         bufqlim[q].bl_num++;
2058         return;
2059 }
2060
2061 void
2062 bufqdec(int q)
2063 {
2064         if ((q < 0) || (q >= BQUEUES))
2065                 return;
2066
2067         bufqlim[q].bl_num--;
2068         return;
2069 }
2070
2071 void
2072 buqlimprt(int all)
2073 {
2074         int i;
2075     static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY", "META" };
2076
2077         if (all)
2078                 for (i = 0; i < BQUEUES; i++) {
2079                         printf("%s : ", bname[i]);
2080                         printf("min = %d, ", (long)bufqlim[i].bl_nlow);
2081                         printf("cur = %d, ", (long)bufqlim[i].bl_num);
2082                         printf("max = %d, ", (long)bufqlim[i].bl_nlhigh);
2083                         printf("target = %d, ", (long)bufqlim[i].bl_target);
2084                         printf("stale after %d seconds\n", bufqlim[i].bl_stale);
2085                 }
2086         else
2087                 for (i = 0; i < BQUEUES; i++) {
2088                         printf("%s : ", bname[i]);
2089                         printf("cur = %d, ", (long)bufqlim[i].bl_num);
2090                 }
2091 }