bsd/vfs/vfs_bio.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*-
  24  * Copyright (c) 1994 Christopher G. Demetriou
  25  * Copyright (c) 1982, 1986, 1989, 1993
  26  *      The Regents of the University of California.  All rights reserved.
  27  * (c) UNIX System Laboratories, Inc.
  28  * All or some portions of this file are derived from material licensed
  29  * to the University of California by American Telephone and Telegraph
  30  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  31  * the permission of UNIX System Laboratories, Inc.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  * The NEXTSTEP Software License Agreement specifies the terms
  62  * and conditions for redistribution.
  63  *
  64  *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
  65  */
  66
  67
  68 /*
  69  * Some references:
  70  *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
  71  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
   72  *              UNIX Operating System (Addison-Wesley, 1989)
  73  */
  74 #define ZALLOC_METADATA 1
  75
  76 #include <sys/param.h>
  77 #include <sys/systm.h>
  78 #include <sys/proc.h>
  79 #include <sys/buf.h>
  80 #include <sys/vnode.h>
  81 #include <sys/mount.h>
  82 #include <sys/trace.h>
  83 #include <sys/malloc.h>
  84 #include <sys/resourcevar.h>
  85 #include <miscfs/specfs/specdev.h>
  86 #include <sys/ubc.h>
  87 #include <vm/vm_pageout.h>
  88 #if DIAGNOSTIC
  89 #include <kern/assert.h>
  90 #endif /* DIAGNOSTIC */
  91 #include <kern/task.h>
  92 #include <kern/zalloc.h>
  93
  94 #include <sys/kdebug.h>
  95
  96 extern void bufqinc(int q);
  97 extern void bufqdec(int q);
  98 extern void bufq_balance_thread_init();
  99
 100 extern void reassignbuf(struct buf *, struct vnode *);
 101 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
 102
 103 extern int niobuf;      /* The number of IO buffer headers for cluster IO */
 104 int blaundrycnt;
 105
 106 #if TRACE
 107 struct  proc *traceproc;
 108 int     tracewhich, tracebuf[TRCSIZ];
 109 u_int   tracex;
 110 char    traceflags[TR_NFLAGS];
 111 #endif /* TRACE */
 112
 113 /*
 114  * Definitions for the buffer hash lists.
 115  */
 116 #define BUFHASH(dvp, lbn)       \
 117         (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
 118 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
 119 u_long  bufhash;
 120
 121 /* Definitions for the buffer stats. */
 122 struct bufstats bufstats;
 123
 124 /*
 125  * Insq/Remq for the buffer hash lists.
 126  */
 127 #if 0
 128 #define binshash(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_hash)
 129 #define bremhash(bp)            LIST_REMOVE(bp, b_hash)
 130 #endif /* 0 */
 131
 132
 133 TAILQ_HEAD(ioqueue, buf) iobufqueue;
 134 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
 135 int needbuffer;
 136 int need_iobuffer;
 137
 138 /*
 139  * Insq/Remq for the buffer free lists.
 140  */
 141 #define binsheadfree(bp, dp, whichq)    do { \
 142                                     TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
 143                                         bufqinc((whichq));      \
 144                                         (bp)->b_whichq = whichq; \
 145                                     (bp)->b_timestamp = time.tv_sec; \
 146                                 } while (0)
 147
 148 #define binstailfree(bp, dp, whichq)    do { \
 149                                     TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
 150                                         bufqinc((whichq));      \
 151                                         (bp)->b_whichq = whichq; \
 152                                     (bp)->b_timestamp = time.tv_sec; \
 153                                 } while (0)
 154
 155 #define BHASHENTCHECK(bp)       \
 156         if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)  \
 157                 panic("%x: b_hash.le_prev is not deadbeef", (bp));
 158
 159 #define BLISTNONE(bp)   \
 160         (bp)->b_hash.le_next = (struct buf *)0; \
 161         (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
 162
 163 simple_lock_data_t bufhashlist_slock;           /* lock on buffer hash list */
 164
 165 /*
 166  * Time in seconds before a buffer on a list is
 167  * considered as a stale buffer
 168  */
 169 #define LRU_IS_STALE 120 /* default value for the LRU */
 170 #define AGE_IS_STALE 60  /* default value for the AGE */
 171 #define META_IS_STALE 180 /* default value for the BQ_META */
 172
 173 int lru_is_stale = LRU_IS_STALE;
 174 int age_is_stale = AGE_IS_STALE;
 175 int meta_is_stale = META_IS_STALE;
 176
 177 #if 1
 178 void
 179 blistenterhead(struct bufhashhdr * head, struct buf * bp)
 180 {
 181         if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
 182                 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
 183         (head)->lh_first = bp;
 184         bp->b_hash.le_prev = &(head)->lh_first;
 185         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 186                 panic("blistenterhead: le_prev is deadbeef");
 187
 188 }
 189 #endif
 190
 191 #if 1
 192 void
 193 binshash(struct buf *bp, struct bufhashhdr *dp)
 194 {
 195 int s;
 196
 197 struct buf *nbp;
 198
 199         simple_lock(&bufhashlist_slock);
 200 #if 0
 201         if(incore(bp->b_vp, bp->b_lblkno)) {
 202                 panic("adding to queue already existing element");
 203         }
 204 #endif /* 0 */
 205         BHASHENTCHECK(bp);
 206
 207         nbp = dp->lh_first;
 208         for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
 209                 if(nbp == bp)
 210                         panic("buf already in hashlist");
 211         }
 212
 213 #if 0
 214         LIST_INSERT_HEAD(dp, bp, b_hash);
 215 #else
 216         blistenterhead(dp, bp);
 217 #endif
 218         simple_unlock(&bufhashlist_slock);
 219 }
 220
 221 void
 222 bremhash(struct buf *bp)
 223 {
 224         int s;
 225
 226         simple_lock(&bufhashlist_slock);
 227         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 228                 panic("bremhash le_prev is deadbeef");
 229         if (bp->b_hash.le_next == bp)
 230                 panic("bremhash: next points to self");
 231
 232         if (bp->b_hash.le_next != NULL)
 233                 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
 234         *bp->b_hash.le_prev = (bp)->b_hash.le_next;
 235         simple_unlock(&bufhashlist_slock);
 236 }
 237
 238 #endif /* 1 */
 239
 240
 241 /*
 242  * Remove a buffer from the free list it's on
 243  */
 244 void
 245 bremfree(bp)
 246         struct buf *bp;
 247 {
 248         struct bqueues *dp = NULL;
 249         int whichq = -1;
 250
 251         /*
 252          * We only calculate the head of the freelist when removing
 253          * the last element of the list as that is the only time that
 254          * it is needed (e.g. to reset the tail pointer).
 255          *
 256          * NB: This makes an assumption about how tailq's are implemented.
 257          */
 258         if (bp->b_freelist.tqe_next == NULL) {
 259                 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
 260                         if (dp->tqh_last == &bp->b_freelist.tqe_next)
 261                                 break;
 262                 if (dp == &bufqueues[BQUEUES])
 263                         panic("bremfree: lost tail");
 264         }
 265         TAILQ_REMOVE(dp, bp, b_freelist);
 266         whichq = bp->b_whichq;
 267         bufqdec(whichq);
 268         bp->b_whichq = -1;
 269         bp->b_timestamp = 0;
 270 }
 271
 272 static __inline__ void
 273 bufhdrinit(struct buf *bp)
 274 {
 275         bzero((char *)bp, sizeof *bp);
 276         bp->b_dev = NODEV;
 277         bp->b_rcred = NOCRED;
 278         bp->b_wcred = NOCRED;
 279         bp->b_vnbufs.le_next = NOLIST;
 280         bp->b_flags = B_INVAL;
 281
 282         return;
 283 }
 284
 285 /*
 286  * Initialize buffers and hash links for buffers.
 287  */
 288 void
 289 bufinit()
 290 {
 291         register struct buf *bp;
 292         register struct bqueues *dp;
 293         register int i;
 294         int metabuf;
 295         long whichq;
 296         static void bufzoneinit();
 297         static void bcleanbuf_thread_init();
 298
 299         /* Initialize the buffer queues ('freelists') and the hash table */
 300         for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
 301                 TAILQ_INIT(dp);
 302         bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
 303
 304         simple_lock_init(&bufhashlist_slock );
 305
 306         metabuf = nbuf/8; /* reserved for meta buf */
 307
 308         /* Initialize the buffer headers */
 309         for (i = 0; i < nbuf; i++) {
 310                 bp = &buf[i];
 311                 bufhdrinit(bp);
 312
 313                 /*
 314                  * metabuf buffer headers on the meta-data list and
 315                  * rest of the buffer headers on the empty list
 316                  */
 317                 if (--metabuf)
 318                         whichq = BQ_META;
 319                 else
 320                         whichq = BQ_EMPTY;
 321
 322                 BLISTNONE(bp);
 323                 dp = &bufqueues[whichq];
 324                 binsheadfree(bp, dp, whichq);
 325                 binshash(bp, &invalhash);
 326         }
 327
 328         for (; i < nbuf + niobuf; i++) {
 329                 bp = &buf[i];
 330                 bufhdrinit(bp);
 331                 binsheadfree(bp, &iobufqueue, -1);
 332         }
 333
 334         printf("using %d buffer headers and %d cluster IO buffer headers\n",
 335                 nbuf, niobuf);
 336
 337         /* Set up zones used by the buffer cache */
 338         bufzoneinit();
 339
 340         /* start the bcleanbuf() thread */
 341         bcleanbuf_thread_init();
 342
 343 #if 0   /* notyet */
 344         /* create a thread to do dynamic buffer queue balancing */
 345         bufq_balance_thread_init();
 346 #endif /* XXX */
 347 }
 348
 349 /* __inline  */
 350 struct buf *
 351 bio_doread(vp, blkno, size, cred, async, queuetype)
 352         struct vnode *vp;
 353         daddr_t blkno;
 354         int size;
 355         struct ucred *cred;
 356         int async;
 357         int queuetype;
 358 {
 359         register struct buf *bp;
 360         struct proc     *p = current_proc();
 361
 362         bp = getblk(vp, blkno, size, 0, 0, queuetype);
 363
 364         /*
 365          * If buffer does not have data valid, start a read.
 366          * Note that if buffer is B_INVAL, getblk() won't return it.
 367          * Therefore, it's valid if it's I/O has completed or been delayed.
 368          */
 369         if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
 370                 /* Start I/O for the buffer (keeping credentials). */
 371                 SET(bp->b_flags, B_READ | async);
 372                 if (cred != NOCRED && bp->b_rcred == NOCRED) {
 373                         /*
 374                          * NFS has embedded ucred.
 375                          * Can not crhold() here as that causes zone corruption
 376                          */
 377                         bp->b_rcred = crdup(cred);
 378                 }
 379                 VOP_STRATEGY(bp);
 380
 381                 trace(TR_BREADMISS, pack(vp, size), blkno);
 382
 383                 /* Pay for the read. */
 384                 if (p && p->p_stats)
 385                         p->p_stats->p_ru.ru_inblock++;          /* XXX */
 386         } else if (async) {
 387                 brelse(bp);
 388         }
 389
 390         trace(TR_BREADHIT, pack(vp, size), blkno);
 391
 392         return (bp);
 393 }
 394 /*
 395  * Read a disk block.
 396  * This algorithm described in Bach (p.54).
 397  */
 398 int
 399 bread(vp, blkno, size, cred, bpp)
 400         struct vnode *vp;
 401         daddr_t blkno;
 402         int size;
 403         struct ucred *cred;
 404         struct buf **bpp;
 405 {
 406         register struct buf *bp;
 407
 408         /* Get buffer for block. */
 409         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
 410
 411         /* Wait for the read to complete, and return result. */
 412         return (biowait(bp));
 413 }
 414
 415 /*
 416  * Read a disk block. [bread() for meta-data]
 417  * This algorithm described in Bach (p.54).
 418  */
 419 int
 420 meta_bread(vp, blkno, size, cred, bpp)
 421         struct vnode *vp;
 422         daddr_t blkno;
 423         int size;
 424         struct ucred *cred;
 425         struct buf **bpp;
 426 {
 427         register struct buf *bp;
 428
 429         /* Get buffer for block. */
 430         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
 431
 432         /* Wait for the read to complete, and return result. */
 433         return (biowait(bp));
 434 }
 435
 436 /*
 437  * Read-ahead multiple disk blocks. The first is sync, the rest async.
 438  * Trivial modification to the breada algorithm presented in Bach (p.55).
 439  */
 440 int
 441 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
 442         struct vnode *vp;
 443         daddr_t blkno; int size;
 444         daddr_t rablks[]; int rasizes[];
 445         int nrablks;
 446         struct ucred *cred;
 447         struct buf **bpp;
 448 {
 449         register struct buf *bp;
 450         int i;
 451
 452         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
 453
 454         /*
 455          * For each of the read-ahead blocks, start a read, if necessary.
 456          */
 457         for (i = 0; i < nrablks; i++) {
 458                 /* If it's in the cache, just go on to next one. */
 459                 if (incore(vp, rablks[i]))
 460                         continue;
 461
 462                 /* Get a buffer for the read-ahead block */
 463                 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
 464         }
 465
 466         /* Otherwise, we had to start a read for it; wait until it's valid. */
 467         return (biowait(bp));
 468 }
 469
 470 /*
 471  * Read with single-block read-ahead.  Defined in Bach (p.55), but
 472  * implemented as a call to breadn().
 473  * XXX for compatibility with old file systems.
 474  */
 475 int
 476 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
 477         struct vnode *vp;
 478         daddr_t blkno; int size;
 479         daddr_t rablkno; int rabsize;
 480         struct ucred *cred;
 481         struct buf **bpp;
 482 {
 483
 484         return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
 485 }
 486
 487 /*
 488  * Block write.  Described in Bach (p.56)
 489  */
 490 int
 491 bwrite(bp)
 492         struct buf *bp;
 493 {
 494         int rv, sync, wasdelayed;
 495         struct proc     *p = current_proc();
 496         upl_t  upl;
 497         upl_page_info_t *pl;
 498         void * object;
 499         kern_return_t kret;
 500         struct vnode *vp = bp->b_vp;
 501
 502         /* Remember buffer type, to switch on it later. */
 503         sync = !ISSET(bp->b_flags, B_ASYNC);
 504         wasdelayed = ISSET(bp->b_flags, B_DELWRI);
 505         CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
 506
 507         if (!sync) {
 508                 /*
 509                  * If not synchronous, pay for the I/O operation and make
 510                  * sure the buf is on the correct vnode queue.  We have
 511                  * to do this now, because if we don't, the vnode may not
 512                  * be properly notified that its I/O has completed.
 513                  */
 514                 if (wasdelayed)
 515                         reassignbuf(bp, vp);
 516                 else
 517                 if (p && p->p_stats)
 518                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 519         }
 520
 521         trace(TR_BWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
 522
 523         /* Initiate disk write.  Make sure the appropriate party is charged. */
 524         SET(bp->b_flags, B_WRITEINPROG);
 525         vp->v_numoutput++;
 526
 527         VOP_STRATEGY(bp);
 528
 529         if (sync) {
 530                 /*
 531                  * If I/O was synchronous, wait for it to complete.
 532                  */
 533                 rv = biowait(bp);
 534
 535                 /*
 536                  * Pay for the I/O operation, if it's not been paid for, and
 537                  * make sure it's on the correct vnode queue. (async operatings
 538                  * were payed for above.)
 539                  */
 540                 if (wasdelayed)
 541                         reassignbuf(bp, vp);
 542                 else
 543                 if (p && p->p_stats)
 544                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 545
 546                 /* Release the buffer. */
 547                 brelse(bp);
 548
 549                 return (rv);
 550         } else {
 551                 return (0);
 552         }
 553 }
 554
 555 int
 556 vn_bwrite(ap)
 557         struct vop_bwrite_args *ap;
 558 {
 559         return (bwrite(ap->a_bp));
 560 }
 561
 562 /*
 563  * Delayed write.
 564  *
 565  * The buffer is marked dirty, but is not queued for I/O.
 566  * This routine should be used when the buffer is expected
 567  * to be modified again soon, typically a small write that
 568  * partially fills a buffer.
 569  *
 570  * NB: magnetic tapes cannot be delayed; they must be
 571  * written in the order that the writes are requested.
 572  *
 573  * Described in Leffler, et al. (pp. 208-213).
 574  */
 575 void
 576 bdwrite(bp)
 577         struct buf *bp;
 578 {
 579         struct proc *p = current_proc();
 580         kern_return_t kret;
 581         upl_t upl;
 582         upl_page_info_t *pl;
 583
 584         /*
 585          * If the block hasn't been seen before:
 586          *      (1) Mark it as having been seen,
 587          *      (2) Charge for the write.
 588          *      (3) Make sure it's on its vnode's correct block list,
 589          */
 590         if (!ISSET(bp->b_flags, B_DELWRI)) {
 591                 SET(bp->b_flags, B_DELWRI);
 592                 if (p && p->p_stats)
 593                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
 594
 595                 reassignbuf(bp, bp->b_vp);
 596         }
 597
 598
 599         /* If this is a tape block, write it the block now. */
 600         if (ISSET(bp->b_flags, B_TAPE)) {
 601                 /* bwrite(bp); */
 602         VOP_BWRITE(bp);
 603                 return;
 604         }
 605
 606         /* Otherwise, the "write" is done, so mark and release the buffer. */
 607         SET(bp->b_flags, B_DONE);
 608         brelse(bp);
 609 }
 610
 611 /*
 612  * Asynchronous block write; just an asynchronous bwrite().
 613  */
 614 void
 615 bawrite(bp)
 616         struct buf *bp;
 617 {
 618
 619         SET(bp->b_flags, B_ASYNC);
 620         VOP_BWRITE(bp);
 621 }
 622
 623 /*
 624  * Release a buffer on to the free lists.
 625  * Described in Bach (p. 46).
 626  */
 627 void
 628 brelse(bp)
 629         struct buf *bp;
 630 {
 631         struct bqueues *bufq;
 632         int s;
 633         long whichq;
 634
 635         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
 636                      bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
 637                      bp->b_flags, 0);
 638
 639         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
 640
 641         /* IO is done. Cleanup the UPL state */
 642         if (!ISSET(bp->b_flags, B_META)
 643                 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
 644                 kern_return_t kret;
 645                 upl_t         upl;
 646                 int           upl_flags;
 647
 648                 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
 649                         if ( !ISSET(bp->b_flags, B_INVAL)) {
 650                                 kret = ubc_create_upl(bp->b_vp,
 651                                                                 ubc_blktooff(bp->b_vp, bp->b_lblkno),
 652                                                                 bp->b_bufsize,
 653                                                             &upl,
 654                                                                 NULL,
 655                                                                 UPL_PRECIOUS);
 656                                 if (kret != KERN_SUCCESS)
 657                                         panic("brelse: Failed to get pagelists");
 658 #ifdef  UBC_DEBUG
 659                                 upl_ubc_alias_set(upl, bp, 5);
 660 #endif /* UBC_DEBUG */
 661                         } else
 662                                 upl = (upl_t) 0;
 663                 } else {
 664                         upl = bp->b_pagelist;
 665                         kret = ubc_upl_unmap(upl);
 666
 667                         if (kret != KERN_SUCCESS)
 668                                 panic("kernel_upl_unmap failed");
 669                         bp->b_data = 0;
 670                 }
 671                 if (upl) {
 672                         if (bp->b_flags & (B_ERROR | B_INVAL)) {
 673                             if (bp->b_flags & (B_READ | B_INVAL))
 674                                         upl_flags = UPL_ABORT_DUMP_PAGES;
 675                                 else
 676                                         upl_flags = 0;
 677                                 ubc_upl_abort(upl, upl_flags);
 678                         } else {
 679                             if (ISSET(bp->b_flags, B_NEEDCOMMIT))
 680                                     upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
 681                             else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
 682                                         upl_flags = UPL_COMMIT_SET_DIRTY ;
 683                                 else
 684                                     upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
 685                                 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
 686                                         UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
 687                         }
 688                         s = splbio();
 689                         CLR(bp->b_flags, B_PAGELIST);
 690                         bp->b_pagelist = 0;
 691                         splx(s);
 692                 }
 693         } else {
 694                 if(ISSET(bp->b_flags, B_PAGELIST))
 695                         panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
 696         }
 697
 698         /* Wake up any processes waiting for any buffer to become free. */
 699         if (needbuffer) {
 700                 needbuffer = 0;
 701                 wakeup(&needbuffer);
 702         }
 703
 704         /* Wake up any proceeses waiting for _this_ buffer to become free. */
 705         if (ISSET(bp->b_flags, B_WANTED)) {
 706                 CLR(bp->b_flags, B_WANTED);
 707                 wakeup(bp);
 708         }
 709
 710         /* Block disk interrupts. */
 711         s = splbio();
 712
 713         /*
 714          * Determine which queue the buffer should be on, then put it there.
 715          */
 716
 717         /* If it's locked, don't report an error; try again later. */
 718         if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
 719                 CLR(bp->b_flags, B_ERROR);
 720
 721         /* If it's not cacheable, or an error, mark it invalid. */
 722         if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
 723                 SET(bp->b_flags, B_INVAL);
 724
 725         if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
 726                 /*
 727                  * If it's invalid or empty, dissociate it from its vnode
 728                  * and put on the head of the appropriate queue.
 729                  */
 730                 if (bp->b_vp)
 731                         brelvp(bp);
 732                 CLR(bp->b_flags, B_DELWRI);
 733                 if (bp->b_bufsize <= 0)
 734                         whichq = BQ_EMPTY;      /* no data */
 735                 else
 736                         whichq = BQ_AGE;        /* invalid data */
 737
 738                 bufq = &bufqueues[whichq];
 739                 binsheadfree(bp, bufq, whichq);
 740         } else {
 741                 /*
 742                  * It has valid data.  Put it on the end of the appropriate
 743                  * queue, so that it'll stick around for as long as possible.
 744                  */
 745                 if (ISSET(bp->b_flags, B_LOCKED))
 746                         whichq = BQ_LOCKED;             /* locked in core */
 747                 else if (ISSET(bp->b_flags, B_META))
 748                         whichq = BQ_META;               /* meta-data */
 749                 else if (ISSET(bp->b_flags, B_AGE))
 750                         whichq = BQ_AGE;                /* stale but valid data */
 751                 else
 752                         whichq = BQ_LRU;                /* valid data */
 753
 754                 bufq = &bufqueues[whichq];
 755                 binstailfree(bp, bufq, whichq);
 756         }
 757
 758         /* Unlock the buffer. */
 759         CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
 760
 761         /* Allow disk interrupts. */
 762         splx(s);
 763
 764         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
 765                      (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
 766 }
 767
 768 /*
 769  * Determine if a block is in the cache.
 770  * Just look on what would be its hash chain.  If it's there, return
 771  * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 772  * we normally don't return the buffer, unless the caller explicitly
 773  * wants us to.
 774  */
 775 struct buf *
 776 incore(vp, blkno)
 777         struct vnode *vp;
 778         daddr_t blkno;
 779 {
 780         struct buf *bp;
 781         int bufseen = 0;
 782
 783         bp = BUFHASH(vp, blkno)->lh_first;
 784
 785         /* Search hash chain */
 786         for (; bp != NULL; bp = bp->b_hash.le_next, bufseen++) {
 787                 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
 788                     !ISSET(bp->b_flags, B_INVAL))
 789                         return (bp);
 790         if(bufseen >= nbuf)
 791                 panic("walked more than nbuf in incore");
 792
 793         }
 794
 795         return (0);
 796 }
 797
 798
 799 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
 800 /*
 801  * Get a block of requested size that is associated with
 802  * a given vnode and block offset. If it is found in the
 803  * block cache, mark it as having been found, make it busy
 804  * and return it. Otherwise, return an empty block of the
 805  * correct size. It is up to the caller to insure that the
 806  * cached blocks be of the correct size.
 807  */
 808 struct buf *
 809 getblk(vp, blkno, size, slpflag, slptimeo, operation)
 810         register struct vnode *vp;
 811         daddr_t blkno;
 812         int size, slpflag, slptimeo, operation;
 813 {
             /*
              * Look up, or set up, the buffer for logical block `blkno' of
              * vnode `vp' and return it marked B_BUSY.
              *
              *   size      - byte count handed to allocbuf() for the buffer
              *   slpflag   - OR'ed into the tsleep() priority (e.g. PCATCH);
              *               also passed on to getnewbuf()
              *   slptimeo  - tsleep() timeout; also passed on to getnewbuf()
              *   operation - BLK_READ / BLK_WRITE: file data, backed by a UPL
              *               that is mapped into b_data;
              *               BLK_META: meta data held directly in b_data;
              *               BLK_PAGEIN / BLK_PAGEOUT and anything else panic.
              *
              * Returns the buffer.  Returns NULL only when sleeping on a busy
              * buffer fails and the caller either passed PCATCH or the error
              * was EWOULDBLOCK with a non-zero slptimeo.
              */
 814         struct buf *bp;
 815         int s, err;
 816         upl_t upl;
 817         upl_page_info_t *pl;
 818         kern_return_t kret;
 819         int error=0;
 820         int pagedirty = 0;
 821
 822         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
 823                      blkno * PAGE_SIZE, size, operation, 0, 0);
             /* Retry point: the whole lookup is redone after a sleep or a lost race */
 824 start:
 825
 826         s = splbio();
 827         if (bp = incore(vp, blkno)) {
 828                 /* Found in the Buffer Cache */
 829                 if (ISSET(bp->b_flags, B_BUSY)) {
 830                         /* but is busy */
 831                         switch (operation) {
 832                         case BLK_READ:
 833                         case BLK_WRITE:
 834                         case BLK_META:
                                     /* ask to be woken, sleep on the buffer, then look it up again */
 835                                 SET(bp->b_flags, B_WANTED);
 836                                 bufstats.bufs_busyincore++;
 837                                 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
 838                                     slptimeo);
 839                                 splx(s);
 840                                 /*
 841                                  * Callers who call with PCATCH or timeout are
 842                                  * willing to deal with the NULL pointer
 843                                  */
 844                                 if (err && ((slpflag & PCATCH) ||
 845                                                          ((err == EWOULDBLOCK) && slptimeo)))
 846                                         return (NULL);
 847                                 goto start;
 848                                 /*NOTREACHED*/
 849                                 break;
 850
 851                         case BLK_PAGEIN:
 852                                 /* pagein operation must not use getblk */
 853                                 panic("getblk: pagein for incore busy buffer");
                                     /* the splx() below sits after the panic and is marked NOTREACHED */
 854                                 splx(s);
 855                                 /*NOTREACHED*/
 856                                 break;
 857
 858                         case BLK_PAGEOUT:
 859                                 /* pageout operation must not use getblk */
 860                                 panic("getblk: pageout for incore busy buffer");
 861                                 splx(s);
 862                                 /*NOTREACHED*/
 863                                 break;
 864
 865                         default:
 866                                 panic("getblk: %d unknown operation 1", operation);
 867                                 /*NOTREACHED*/
 868                                 break;
 869                         }
 870                 } else {
 871                         /* not busy */
                             /* claim it: mark busy and cached, and take it off its free list */
 872                         SET(bp->b_flags, (B_BUSY | B_CACHE));
 873                         bremfree(bp);
 874                         bufstats.bufs_incore++;
 875                         splx(s);
 876
 877                         allocbuf(bp, size);
                             /* a buffer that was sitting idle must not still have a page list attached */
 878                         if (ISSET(bp->b_flags, B_PAGELIST))
 879                                         panic("pagelist buffer is not busy");
 880
 881                         switch (operation) {
 882                         case BLK_READ:
 883                         case BLK_WRITE:
                                     /*
                                      * File data: when the vnode has valid UBC state and the
                                      * buffer has a non-zero size, create a UPL covering the
                                      * block and map it so that b_data points at the pages.
                                      */
 884                                 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
 885                                         kret = ubc_create_upl(vp,
 886                                                                         ubc_blktooff(vp, bp->b_lblkno),
 887                                                                         bp->b_bufsize,
 888                                                                         &upl,
 889                                                                         &pl,
 890                                                                         UPL_PRECIOUS);
 891                                         if (kret != KERN_SUCCESS)
 892                                                 panic("Failed to get pagelists");
 893
 894                                         SET(bp->b_flags, B_PAGELIST);
 895                                         bp->b_pagelist = upl;
 896
                                             /*
                                              * First page not valid: only tolerated for NFS vnodes,
                                              * where the buffer is then treated as not cached.
                                              */
 897                                         if (!upl_valid_page(pl, 0)) {
 898                                                 if (vp->v_tag != VT_NFS)
 899                                                         panic("getblk: incore buffer without valid page");
 900                                                 CLR(bp->b_flags, B_CACHE);
 901                                         }
 902
                                             /* B_WASDIRTY mirrors the dirty state of the first page */
 903                                         if (upl_dirty_page(pl, 0))
 904                                                 SET(bp->b_flags, B_WASDIRTY);
 905                                         else
 906                                                 CLR(bp->b_flags, B_WASDIRTY);
 907
 908                                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
 909                                         if (kret != KERN_SUCCESS) {
 910                                                 panic("getblk: ubc_upl_map() failed with (%d)",
 911                                                                   kret);
 912                                         }
 913                                         if (bp->b_data == 0) panic("ubc_upl_map mapped 0");
 914                                 }
 915                                 break;
 916
 917                         case BLK_META:
 918                                 /*
 919                                  * VM is not involved in IO for the meta data
 920                                  * buffer already has valid data
 921                                  */
 922                         if(bp->b_data == 0)
 923                                         panic("bp->b_data null incore buf=%x", bp);
 924                                 break;
 925
 926                         case BLK_PAGEIN:
 927                         case BLK_PAGEOUT:
 928                                 panic("getblk: paging operation 1");
 929                                 break;
 930
 931                         default:
 932                                 panic("getblk: %d unknown operation 2", operation);
 933                                 /*NOTREACHED*/
 934                                 break;
 935                         }
 936                 }
 937         } else { /* not incore() */
 938                 int queue = BQ_EMPTY; /* Start with no preference */
 939                 splx(s);
 940
                     /*
                      * A vnode whose UBC info is invalid or missing is handled
                      * as meta data from here on, whatever the caller asked for.
                      */
 941                 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
 942                         !(UBCINFOEXISTS(vp))) {
 943                         operation = BLK_META;
 944                 }
                     /* getnewbuf() returns NULL after it had to sleep; start over */
 945                 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
 946                         goto start;
                     /*
                      * Re-check the cache: if the block has appeared in the
                      * meantime, hand the new buffer back as invalid and retry.
                      */
 947                 if (incore(vp, blkno)) {
 948                         SET(bp->b_flags, B_INVAL);
 949                         binshash(bp, &invalhash);
 950                         brelse(bp);
 951                         goto start;
 952                 }
 953
 954                 /*
 955                  * if it is meta, the queue may be set to other
 956                  * type so reset as well as mark it to be B_META
 957                  * so that when buffer is released it will goto META queue
 958                  * Also, if the vnode is not VREG, then it is META
 959                  */
 960                 if (operation == BLK_META) {
 961                         SET(bp->b_flags, B_META);
 962                         queue = BQ_META;
 963                 }
 964                 /*
 965                  * Insert in the hash so that incore() can find it
 966                  */
 967                 binshash(bp, BUFHASH(vp, blkno));
 968
 969                 allocbuf(bp, size);
 970
 971                 switch (operation) {
 972                 case BLK_META:
 973                         /* buffer data is invalid */
 974
                             /*
                              * Without ZALLOC_METADATA the storage is allocated here;
                              * with it, allocbuf() above has already provided b_data.
                              */
 975 #if !ZALLOC_METADATA
 976                         if (bp->b_data)
 977                                 panic("bp->b_data is not nul; %x",bp);
 978                         kret = kmem_alloc(kernel_map,
 979                                                 &bp->b_data, bp->b_bufsize);
 980                         if (kret != KERN_SUCCESS)
 981                                 panic("getblk: kmem_alloc() returned %d", kret);
 982 #endif /* ZALLOC_METADATA */
 983
 984                         if(bp->b_data == 0)
 985                                 panic("bp->b_data is null %x",bp);
 986
                             /* physical block number starts out equal to the logical one */
 987                         bp->b_blkno = bp->b_lblkno = blkno;
 988                         s = splbio();
 989                         bgetvp(vp, bp);
 990                         bufstats.bufs_miss++;
 991                         splx(s);
 992                         if (bp->b_data == 0)
 993                                 panic("b_data is 0: 2");
 994
 995                         /* wakeup the buffer */
 996                         CLR(bp->b_flags, B_WANTED);
 997                         wakeup(bp);
 998                         break;
 999
1000                 case BLK_READ:
1001                 case BLK_WRITE:
1002
1003                         if (ISSET(bp->b_flags, B_PAGELIST))
1004                                 panic("B_PAGELIST in bp=%x",bp);
1005
1006                         kret = ubc_create_upl(vp,
1007                                                         ubc_blktooff(vp, blkno),
1008                                                         bp->b_bufsize,
1009                                                         &upl,
1010                                                         &pl,
1011                                                         UPL_PRECIOUS);
1012                         if (kret != KERN_SUCCESS)
1013                                 panic("Failed to get pagelists");
1014
1015 #ifdef  UBC_DEBUG
1016                         upl_ubc_alias_set(upl, bp, 4);
1017 #endif /* UBC_DEBUG */
1018                         bp->b_blkno = bp->b_lblkno = blkno;
1019                         bp->b_pagelist = upl;
1020
1021                         SET(bp->b_flags, B_PAGELIST);
1022
                             /*
                              * First page already valid in the UPL: count it as a VM
                              * hit and mark the buffer cached and done.
                              */
1023                         if (upl_valid_page(pl, 0)) {
1024                                 SET(bp->b_flags, B_CACHE | B_DONE);
1025                                 bufstats.bufs_vmhits++;
1026
1027                                 pagedirty = upl_dirty_page(pl, 0);
1028
1029                                 if (pagedirty)
1030                                         SET(bp->b_flags, B_WASDIRTY);
1031
                                     /*
                                      * NFS: the valid/dirty ranges are derived from the
                                      * file size recorded in the vnode's ubc info.
                                      */
1032                                 if (vp->v_tag == VT_NFS) {
1033                                         off_t  f_offset;
1034                                         int    valid_size;
1035
1036                                         bp->b_validoff = 0;
1037                                         bp->b_dirtyoff = 0;
1038
1039                                         f_offset = ubc_blktooff(vp, blkno);
1040
                                             /* block starts beyond ui_size: nothing in it is valid */
1041                                         if (f_offset > vp->v_ubcinfo->ui_size) {
1042                                                 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1043                                                 bp->b_validend = 0;
1044                                                 bp->b_dirtyend = 0;
1045                                         } else {
                                                     /* valid up to ui_size, capped at one page */
1046                                                 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1047                                                 bp->b_validend = valid_size;
1048
1049                                                 if (pagedirty)
1050                                                        bp->b_dirtyend = valid_size;
1051                                                 else
1052                                                        bp->b_dirtyend = 0;
1053
1054                                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1055                                                              bp->b_validend, bp->b_dirtyend,
1056                                                              (int)vp->v_ubcinfo->ui_size, 0, 0);
1057                                         }
1058                                 } else {
                                             /* non-NFS: all b_bcount bytes are valid, dirty only if the page was */
1059                                         bp->b_validoff = 0;
1060                                         bp->b_dirtyoff = 0;
1061
1062                                         if (pagedirty) {
1063                                                 /* page is dirty */
1064                                                 bp->b_validend = bp->b_bcount;
1065                                                 bp->b_dirtyend = bp->b_bcount;
1066                                         } else {
1067                                                 /* page is clean */
1068                                                 bp->b_validend = bp->b_bcount;
1069                                                 bp->b_dirtyend = 0;
1070                                         }
1071                                 }
                                     /*
                                      * Translate the logical block into b_blkno.  A failure
                                      * panics; the statements after the panic are marked
                                      * NOTREACHED.
                                      */
1072                                 if (error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) {
1073                                         panic("VOP_BMAP failed in getblk");
1074                                         /*NOTREACHED*/
1075                                         /*
1076                                          * XXX:  We probably should invalidate the VM Page
1077                                          */
1078                                         bp->b_error = error;
1079                                         SET(bp->b_flags, (B_ERROR | B_INVAL));
1080                                         /* undo B_DONE that was set before upl_commit() */
1081                                         CLR(bp->b_flags, B_DONE);
1082                                         brelse(bp);
1083                                         return (0);
1084                                 }
1085                         } else {
1086                                 bufstats.bufs_miss++;
1087                         }
                             /* map the UPL so that b_data addresses the pages */
1088                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1089                         if (kret != KERN_SUCCESS) {
1090                                 panic("getblk: ubc_upl_map() "
1091                                       "failed with (%d)", kret);
1092                         }
1093                         if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
1094
1095                         s = splbio();
1096                         bgetvp(vp, bp);
1097                         splx(s);
1098
1099                         break;
1100
1101                 case BLK_PAGEIN:
1102                 case BLK_PAGEOUT:
1103                         panic("getblk: paging operation 2");
1104                         break;
1105                 default:
1106                         panic("getblk: %d unknown operation 3", operation);
1107                         /*NOTREACHED*/
1108                         break;
1109                 }
1110         }
1111
1112         if (bp->b_data == NULL)
1113                 panic("getblk: bp->b_addr is null");
1114
             /*
              * Size sanity check.  Without ZALLOC_METADATA any b_bufsize that
              * is not a multiple of 4096 panics.  With ZALLOC_METADATA the
              * panic is guarded by the inner test, so only B_META buffers
              * whose size is not a multiple of 512 panic.
              */
1115         if (bp->b_bufsize & 0xfff) {
1116 #if ZALLOC_METADATA
1117                 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1118 #endif /* ZALLOC_METADATA */
1119                         panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1120         }
1121
1122         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1123                      (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1124
1125         return (bp);
1126 }
1127
1128 /*
1129  * Get an empty, disassociated buffer of given size.
1130  */
1131 struct buf *
1132 geteblk(size)
1133         int size;
1134 {
1135         struct buf *bp;
1136     int queue = BQ_EMPTY;
1137 #if !ZALLOC_METADATA
1138         kern_return_t kret;
1139         vm_size_t desired_size = roundup(size, CLBYTES);
1140
1141         if (desired_size > MAXBSIZE)
1142                 panic("geteblk: buffer larger than MAXBSIZE requested");
1143 #endif /* ZALLOC_METADATA */
1144
1145         while ((bp = getnewbuf(0, 0, &queue)) == 0)
1146                 ;
1147 #if ZALLOC_METADATA
1148         SET(bp->b_flags, (B_META|B_INVAL));
1149 #else
1150         SET(bp->b_flags, B_INVAL);
1151 #endif /* ZALLOC_METADATA */
1152
1153 #if DIAGNOSTIC
1154         assert(queue == BQ_EMPTY);
1155 #endif /* DIAGNOSTIC */
1156         /* XXX need to implement logic to deal with other queues */
1157
1158 #if !ZALLOC_METADATA
1159         /* Empty buffer - allocate pages */
1160         kret = kmem_alloc_aligned(kernel_map, &bp->b_data, desired_size);
1161         if (kret != KERN_SUCCESS)
1162                 panic("geteblk: kmem_alloc_aligned returned %d", kret);
1163 #endif /* ZALLOC_METADATA */
1164
1165         binshash(bp, &invalhash);
1166         allocbuf(bp, size);
1167         bufstats.bufs_eblk++;
1168
1169         return (bp);
1170 }
1171
#if ZALLOC_METADATA
/*
 * Zones for the meta data buffers.
 *
 * There is one zone per multiple of MINMETA, from MINMETA up to
 * MAXMETA.  Entries are kept in increasing size order so that entry i
 * serves buffers of (i + 1) * MINMETA bytes; getbufzone() relies on
 * that layout.
 */

#define MINMETA 512
#define MAXMETA 4096

struct meta_zone_entry {
        zone_t mz_zone;         /* zone handle; filled in by bufzoneinit() */
        vm_size_t mz_size;      /* element size in bytes; 0 ends the table */
        vm_size_t mz_max;       /* "max" argument given to zinit() */
        char *mz_name;          /* zone name given to zinit() */
};

struct meta_zone_entry meta_zones[] = {
        /* zone   element size     max                      name       */
        { NULL,   (MINMETA * 1),   128 * (MINMETA * 1),     "buf.512"  },
        { NULL,   (MINMETA * 2),    64 * (MINMETA * 2),     "buf.1024" },
        { NULL,   (MINMETA * 3),    16 * (MINMETA * 3),     "buf.1536" },
        { NULL,   (MINMETA * 4),    16 * (MINMETA * 4),     "buf.2048" },
        { NULL,   (MINMETA * 5),    16 * (MINMETA * 5),     "buf.2560" },
        { NULL,   (MINMETA * 6),    16 * (MINMETA * 6),     "buf.3072" },
        { NULL,   (MINMETA * 7),    16 * (MINMETA * 7),     "buf.3584" },
        { NULL,   (MINMETA * 8),   512 * (MINMETA * 8),     "buf.4096" },
        { NULL,   0,               0,                       ""         }  /* End */
};
#endif /* ZALLOC_METADATA */
1199
1200 zone_t buf_hdr_zone;        /* zone of struct buf headers; created in bufzoneinit() */
1201 int buf_hdr_count;          /* headers taken from buf_hdr_zone so far; bumped in getnewbuf() */
1202
1203 /*
1204  * Initialize the meta data zones
1205  */
1206 static void
1207 bufzoneinit(void)
1208 {
1209 #if ZALLOC_METADATA
1210         int i;
1211
1212         for (i = 0; meta_zones[i].mz_size != 0; i++) {
1213                 meta_zones[i].mz_zone =
1214                                 zinit(meta_zones[i].mz_size,
1215                                         meta_zones[i].mz_max,
1216                                         PAGE_SIZE,
1217                                         meta_zones[i].mz_name);
1218         }
1219 #endif /* ZALLOC_METADATA */
1220         buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1221 }
1222
#if ZALLOC_METADATA
/*
 * Map a meta-data buffer size to the zone that serves it.
 *
 * `size' must be a multiple of MINMETA in the range MINMETA..MAXMETA.
 * meta_zones[] holds one zone per such size in increasing order, so
 * the zone for `size' lives at index (size / MINMETA) - 1.  Any other
 * value would index before the table (size 0) or at/after its
 * terminator (size > MAXMETA), so it panics instead of returning a
 * bogus zone.
 */
static zone_t
getbufzone(size_t size)
{
        int i;

        if ((size % MINMETA) || (size < MINMETA) || (size > MAXMETA))
                panic("getbufzone: incorect size = %d", (int)size);

        i = (size / MINMETA) - 1;
        return (meta_zones[i].mz_zone);
}
#endif /* ZALLOC_METADATA */
1236
1237 /*
1238  * With UBC, there is no need to expand / shrink the file data
1239  * buffer. The VM uses the same pages, hence no waste.
1240  * All the file data buffers can have one size.
1241  * In fact expand / shrink would be an expensive operation.
1242  *
1243  * Only exception to this is meta-data buffers. Most of the
1244  * meta data operations are smaller than PAGE_SIZE. Having the
1245  * meta-data buffers grow and shrink as needed, optimizes use
1246  * of the kernel wired memory.
1247  */
1248
1249 int
1250 allocbuf(bp, size)
1251         struct buf *bp;
1252         int size;
1253 {
1254         vm_size_t desired_size;
1255
1256         desired_size = roundup(size, CLBYTES);
1257
1258         if(desired_size < PAGE_SIZE)
1259                 desired_size = PAGE_SIZE;
1260         if (desired_size > MAXBSIZE)
1261                 panic("allocbuf: buffer larger than MAXBSIZE requested");
1262
1263 #if ZALLOC_METADATA
1264         if (ISSET(bp->b_flags, B_META)) {
1265                 kern_return_t kret;
1266                 zone_t zprev, z;
1267                 size_t nsize = roundup(size, MINMETA);
1268
1269                 if (bp->b_data) {
1270                         vm_offset_t elem = (vm_offset_t)bp->b_data;
1271
1272                         if (ISSET(bp->b_flags, B_ZALLOC))
1273                                 if (bp->b_bufsize <= MAXMETA) {
1274                                         if (bp->b_bufsize < nsize) {
1275                                                 /* reallocate to a bigger size */
1276                                                 desired_size = nsize;
1277
1278                                                 zprev = getbufzone(bp->b_bufsize);
1279                                                 z = getbufzone(nsize);
1280                                                 bp->b_data = (caddr_t)zalloc(z);
1281                                                 if(bp->b_data == 0)
1282                                                         panic("allocbuf: zalloc() returned NULL");
1283                                                 bcopy(elem, bp->b_data, bp->b_bufsize);
1284                                                 zfree(zprev, elem);
1285                                         } else {
1286                                                 desired_size = bp->b_bufsize;
1287                                         }
1288                                 } else
1289                                         panic("allocbuf: B_ZALLOC set incorrectly");
1290                         else
1291                                 if (bp->b_bufsize < desired_size) {
1292                                         /* reallocate to a bigger size */
1293                                         kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1294                                         if (kret != KERN_SUCCESS)
1295                                                 panic("allocbuf: kmem_alloc() returned %d", kret);
1296                                         if(bp->b_data == 0)
1297                                                 panic("allocbuf: null b_data");
1298                                         bcopy(elem, bp->b_data, bp->b_bufsize);
1299                                         kmem_free(kernel_map, elem, bp->b_bufsize);
1300                                 } else {
1301                                         desired_size = bp->b_bufsize;
1302                                 }
1303                 } else {
1304                         /* new allocation */
1305                         if (nsize <= MAXMETA) {
1306                                 desired_size = nsize;
1307                                 z = getbufzone(nsize);
1308                                 bp->b_data = (caddr_t)zalloc(z);
1309                                 if(bp->b_data == 0)
1310                                         panic("allocbuf: zalloc() returned NULL 2");
1311                                 SET(bp->b_flags, B_ZALLOC);
1312                         } else {
1313                                 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1314                                 if (kret != KERN_SUCCESS)
1315                                         panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1316                                 if(bp->b_data == 0)
1317                                         panic("allocbuf: null b_data 2");
1318                         }
1319                 }
1320         }
1321
1322         if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1323                 panic("allocbuf: bp->b_data is NULL");
1324 #endif /* ZALLOC_METADATA */
1325
1326                 bp->b_bufsize = desired_size;
1327                 bp->b_bcount = size;
1328 }
1329
1330 /*
1331  *      Get a new buffer from one of the free lists.
1332  *
1333  *      The requested queue is passed in through *queue, and the queue the
1334  *      buffer was actually taken from is returned there. Out of range
1335  *      requests (and BQ_LAUNDRY / BQ_LOCKED) are treated as BQ_EMPTY.
1336  *      A request of BQUEUES means no preference; heuristics are used then,
1337  *      and also when the requested queue is empty:
1338  *      If BQ_AGE, BQ_LRU and BQ_META are all empty, try BQ_EMPTY, then try
1339  *      to allocate a fresh buffer header; failing that, sleep on needbuffer
1340  *      and return 0 so that the caller can retry.
1341  *      Otherwise choose between the heads of BQ_AGE and BQ_LRU (and then
1342  *      BQ_META) by comparing their timestamps against the staleness limits.
1343  *      If the chosen buffer was a delayed write, bcleanbuf() queues it for
1344  *      laundering and the search restarts with the original request.
1345  *      Otherwise bcleanbuf() has reset the buffer and it is returned busy.
1346  */
1347
1348 static struct buf *
1349 getnewbuf(slpflag, slptimeo, queue)
1350         int slpflag, slptimeo;
1351         int *queue;
1352 {
1353         register struct buf *bp;
1354         register struct buf *lru_bp;
1355         register struct buf *age_bp;
1356         register struct buf *meta_bp;
1357         register int age_time, lru_time, bp_time, meta_time;
1358         int s;
             /* NOTE(review): cred is declared but never used in this function */
1359         struct ucred *cred;
1360         int req = *queue; /* save it for restarts */
1361
1362 start:
1363         s = splbio();
1364
1365         /* invalid request gets empty queue */
1366         if ((*queue > BQUEUES) || (*queue < 0)
1367                 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1368                 *queue = BQ_EMPTY;
1369
1370         /* (*queue == BQUEUES) means no preference */
1371         if (*queue != BQUEUES) {
1372                 /* Try for the requested queue first */
1373                 bp = bufqueues[*queue].tqh_first;
1374                 if (bp)
1375                         goto found;
1376         }
1377
1378         /* Unable to use requested queue */
1379         age_bp = bufqueues[BQ_AGE].tqh_first;
1380         lru_bp = bufqueues[BQ_LRU].tqh_first;
1381         meta_bp = bufqueues[BQ_META].tqh_first;
1382
1383         if (!age_bp && !lru_bp && !meta_bp) { /* Unavailable on AGE, LRU or META */
1384                 /* Try the empty list first */
1385                 bp = bufqueues[BQ_EMPTY].tqh_first;
1386                 if (bp) {
1387                         *queue = BQ_EMPTY;
1388                         goto found;
1389                 }
1390
1391                 /* Create a new temporary buffer header */
1392                 bp = (struct buf *)zalloc(buf_hdr_zone);
1393
1394                 if (bp) {
                             /*
                              * Initialize the header and place it on the invalid hash
                              * and at the head of BQ_EMPTY, so that the common "found"
                              * path below can treat it like any other free buffer.
                              */
1395                         bufhdrinit(bp);
1396                         BLISTNONE(bp);
1397                         binshash(bp, &invalhash);
1398                         SET(bp->b_flags, B_HDRALLOC);
1399                         *queue = BQ_EMPTY;
1400                         binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1401                         buf_hdr_count++;
1402                         goto found;
1403                 }
1404
1405                 /* Log this error condition */
1406                 printf("getnewbuf: No useful buffers");
1407
1408                 /* wait for a free buffer of any kind */
1409                 needbuffer = 1;
1410                 bufstats.bufs_sleeps++;
1411                 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1412                 splx(s);
                     /* no buffer is returned after the sleep; the caller has to call again */
1413                 return (0);
1414         }
1415
1416         /* Buffer available either on AGE or LRU or META */
1417         bp = NULL;
1418         *queue = -1;
1419
1420         /* Buffer available either on AGE or LRU */
             /* (when both are empty this leaves bp NULL, i.e. lru_bp; META is used below) */
1421         if (!age_bp) {
1422                 bp = lru_bp;
1423                 *queue = BQ_LRU;
1424         } else if (!lru_bp) {
1425                 bp = age_bp;
1426                 *queue = BQ_AGE;
1427         } else { /* buffer available on both AGE and LRU */
1428                 age_time = time.tv_sec - age_bp->b_timestamp;
1429                 lru_time = time.tv_sec - lru_bp->b_timestamp;
1430                 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1431                         bp = age_bp;
1432                         *queue = BQ_AGE;
1433                         /*
1434                          * we should probably re-timestamp everything in the
1435                          * queues at this point with the current time
1436                          */
1437                 } else {
                             /* take LRU only when it is stale and AGE is not; otherwise AGE */
1438                         if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1439                                 bp = lru_bp;
1440                                 *queue = BQ_LRU;
1441                         } else {
1442                                 bp = age_bp;
1443                                 *queue = BQ_AGE;
1444                         }
1445                 }
1446         }
1447
1448         if (!bp) { /* Neither on AGE nor on LRU */
1449                 bp = meta_bp;
1450                 *queue = BQ_META;
1451         }  else if (meta_bp) {
                     /*
                      * A candidate exists and so does a META buffer: switch to
                      * META only when it is stale and the candidate is not.
                      */
1452                 bp_time = time.tv_sec - bp->b_timestamp;
1453                 meta_time = time.tv_sec - meta_bp->b_timestamp;
1454
1455                 if (!(bp_time < 0) && !(meta_time < 0)) {
1456                         /* time not set backwards */
1457                         int bp_is_stale;
1458                         bp_is_stale = (*queue == BQ_LRU) ?
1459                                         lru_is_stale : age_is_stale;
1460
1461                         if ((meta_time >= meta_is_stale) &&
1462                                         (bp_time < bp_is_stale)) {
1463                                 bp = meta_bp;
1464                                 *queue = BQ_META;
1465                         }
1466                 }
1467         }
1468
1469         if (bp == NULL)
1470                 panic("getnewbuf: null bp");
1471
1472 found:
             /* sanity checks on the chosen buffer before it is reused */
1473         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1474                 panic("getnewbuf: le_prev is deadbeef");
1475
1476         if(ISSET(bp->b_flags, B_BUSY))
1477                 panic("getnewbuf reusing BUSY buf");
1478
1479         /* Clean it */
1480         if (bcleanbuf(bp)) {
1481                 /* delayed write handed to the laundry queue, buffer not ready */
1482                 splx(s);
                     /* restore the caller's original request before searching again */
1483                 *queue = req;
1484                 goto start;
1485         }
1486         splx(s);
1487         return (bp);
1488 }
1489 #include <mach/mach_types.h>
1490 #include <mach/memory_object_types.h>
1491
1492 /*
1493  * Clean a buffer.
1494  * Returns 0 if the buffer is ready to use,
1495  * Returns 1 if the buffer was a delayed write and has been
1496  * queued on BQ_LAUNDRY instead, i.e. it is not ready.
1497  */
1498 int
1499 bcleanbuf(struct buf *bp)
1500 {
1501         int s;
1502         struct ucred *cred;
1503
1504         s = splbio();
1505
1506         /* Remove from the queue */
1507         bremfree(bp);
1508
1509         /* Buffer is no longer on free lists. */
1510         SET(bp->b_flags, B_BUSY);
1511
1512         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1513                 panic("bcleanbuf: le_prev is deadbeef");
1514
1515         /*
1516          * If buffer was a delayed write, start the IO by queuing
1517          * it on the LAUNDRY queue, and return 1
1518          */
1519         if (ISSET(bp->b_flags, B_DELWRI)) {
1520                 splx(s);
1521                 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1522                 blaundrycnt++;
                     /* wake whoever sleeps on blaundrycnt */
1523                 wakeup(&blaundrycnt);
1524                 return (1);
1525         }
1526
             /* detach from the vnode (if any) and from the hash chain */
1527         if (bp->b_vp)
1528                 brelvp(bp);
1529         bremhash(bp);
1530         BLISTNONE(bp);
1531
1532         splx(s);
1533
             /* meta-data buffers own their storage: give it back */
1534         if (ISSET(bp->b_flags, B_META)) {
1535 #if ZALLOC_METADATA
1536                 vm_offset_t elem = (vm_offset_t)bp->b_data;
1537                 if (elem == 0)
1538                         panic("bcleanbuf: NULL bp->b_data B_META buffer");
1539
                     /* B_ZALLOC storage goes back to its zone, anything else to kernel_map */
1540                 if (ISSET(bp->b_flags, B_ZALLOC)) {
1541                         if (bp->b_bufsize <= MAXMETA) {
1542                                 zone_t z;
1543
1544                                 z = getbufzone(bp->b_bufsize);
                                     /* marker value; b_data is reset to 0 further down */
1545                                 bp->b_data = (caddr_t)0xdeadbeef;
1546                                 zfree(z, elem);
1547                                 CLR(bp->b_flags, B_ZALLOC);
1548                         } else
1549                                 panic("bcleanbuf: B_ZALLOC set incorrectly");
1550                 } else {
1551                         bp->b_data = (caddr_t)0xdeadbeef;
1552                         kmem_free(kernel_map, elem, bp->b_bufsize);
1553                 }
1554 #else
1555            if (bp->b_data == 0)
1556                    panic("bcleanbuf: bp->b_data == NULL for B_META buffer");
1557
1558            kmem_free(kernel_map, bp->b_data, bp->b_bufsize);
1559 #endif /* ZALLOC_METADATA */
1560         }
1561
1562         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1563
1564         /* reset the header; the vnode was already released above */
1565         s = splbio();
1566
1567         /* clear out various other fields */
1568         bp->b_bufsize = 0;
1569         bp->b_data = 0;
             /* every flag is dropped except B_BUSY: the caller now owns the buffer */
1570         bp->b_flags = B_BUSY;
1571         bp->b_dev = NODEV;
1572         bp->b_blkno = bp->b_lblkno = 0;
1573         bp->b_iodone = 0;
1574         bp->b_error = 0;
1575         bp->b_resid = 0;
1576         bp->b_bcount = 0;
1577         bp->b_dirtyoff = bp->b_dirtyend = 0;
1578         bp->b_validoff = bp->b_validend = 0;
1579
1580         /* nuke any credentials we were holding */
1581         cred = bp->b_rcred;
1582         if (cred != NOCRED) {
1583                 bp->b_rcred = NOCRED;
1584                 crfree(cred);
1585         }
1586         cred = bp->b_wcred;
1587         if (cred != NOCRED) {
1588                 bp->b_wcred = NOCRED;
1589                 crfree(cred);
1590         }
1591         splx(s);
1592         return (0);
1593 }
1594
1595
1596 /*
1597  * Wait for operations on the buffer to complete.
1598  * When they do, extract and return the I/O's error value.
1599  */
1600 int
1601 biowait(bp)
1602         struct buf *bp;
1603 {
1604         upl_t           upl;
1605         upl_page_info_t *pl;
1606         int s;
1607         kern_return_t kret;
1608
1609         s = splbio();
1610         while (!ISSET(bp->b_flags, B_DONE))
1611                 tsleep(bp, PRIBIO + 1, "biowait", 0);
1612         splx(s);
1613
1614         /* check for interruption of I/O (e.g. via NFS), then errors. */
1615         if (ISSET(bp->b_flags, B_EINTR)) {
1616                 CLR(bp->b_flags, B_EINTR);
1617                 return (EINTR);
1618         } else if (ISSET(bp->b_flags, B_ERROR))
1619                 return (bp->b_error ? bp->b_error : EIO);
1620         else
1621                 return (0);
1622 }
1623
1624 /*
1625  * Mark I/O complete on a buffer.
1626  *
1627  * If a callback has been requested, e.g. the pageout
1628  * daemon, do so. Otherwise, awaken waiting processes.
1629  *
1630  * [ Leffler, et al., says on p.247:
1631  *      "This routine wakes up the blocked process, frees the buffer
1632  *      for an asynchronous write, or, for a request by the pagedaemon
1633  *      process, invokes a procedure specified in the buffer structure" ]
1634  *
1635  * In real life, the pagedaemon (or other system processes) wants
1636  * to do async stuff to, and doesn't want the buffer brelse()'d.
1637  * (for swap pager, that puts swap buffers on the free lists (!!!),
1638  * for the vn device, that puts malloc'd buffers on the free lists!)
1639  */
1640 void
1641 biodone(bp)
1642         struct buf *bp;
1643 {
1644         boolean_t       funnel_state;
1645         int s;
1646
1647         funnel_state = thread_funnel_set(kernel_flock, TRUE);
1648
1649         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1650                      (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1651
1652         if (ISSET(bp->b_flags, B_DONE))
1653                 panic("biodone already");
1654         SET(bp->b_flags, B_DONE);               /* note that it's done */
1655         /*
1656          * I/O was done, so don't believe
1657          * the DIRTY state from VM anymore
1658          */
1659         CLR(bp->b_flags, B_WASDIRTY);
1660
1661         if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1662                 vwakeup(bp);     /* wake up reader */
1663
1664         if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
1665                 CLR(bp->b_flags, B_CALL);       /* but note callout done */
1666                 (*bp->b_iodone)(bp);
1667         } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1668                 brelse(bp);
1669         else {                                  /* or just wakeup the buffer */
1670                 CLR(bp->b_flags, B_WANTED);
1671                 wakeup(bp);
1672         }
1673
1674         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1675                      (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1676
1677         thread_funnel_set(kernel_flock, funnel_state);
1678 }
1679
1680 /*
1681  * Return a count of buffers on the "locked" queue.
1682  */
1683 int
1684 count_lock_queue()
1685 {
1686         register struct buf *bp;
1687         register int n = 0;
1688
1689         for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1690             bp = bp->b_freelist.tqe_next)
1691                 n++;
1692         return (n);
1693 }
1694
1695 /*
1696  * Return a count of 'busy' buffers. Used at the time of shutdown.
1697  */
1698 int
1699 count_busy_buffers()
1700 {
1701         register struct buf *bp;
1702         register int nbusy = 0;
1703
1704         for (bp = &buf[nbuf]; --bp >= buf; )
1705                 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1706                         nbusy++;
1707         return (nbusy);
1708 }
1709
1710 #if 1 /*DIAGNOSTIC */
1711 /*
1712  * Print out statistics on the current allocation of the buffer pool.
1713  * Can be enabled to print out on every ``sync'' by setting "syncprt"
1714  * in vfs_syscalls.c using sysctl.
1715  */
1716 void
1717 vfs_bufstats()
1718 {
1719         int s, i, j, count;
1720         register struct buf *bp;
1721         register struct bqueues *dp;
1722         int counts[MAXBSIZE/CLBYTES+1];
1723         static char *bname[BQUEUES] =
1724                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1725
1726         for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1727                 count = 0;
1728                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1729                         counts[j] = 0;
1730                 s = splbio();
1731                 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1732                         counts[bp->b_bufsize/CLBYTES]++;
1733                         count++;
1734                 }
1735                 splx(s);
1736                 printf("%s: total-%d", bname[i], count);
1737                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1738                         if (counts[j] != 0)
1739                                 printf(", %d-%d", j * CLBYTES, counts[j]);
1740                 printf("\n");
1741         }
1742 }
1743 #endif /* DIAGNOSTIC */
1744
1745 #define NRESERVEDIOBUFS 16
1746
1747 struct buf *
1748 alloc_io_buf(vp, priv)
1749         struct vnode *vp;
1750         int priv;
1751 {
1752         register struct buf *bp;
1753         int s;
1754
1755         s = splbio();
1756
1757         while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1758                 need_iobuffer = 1;
1759                 bufstats.bufs_iobufsleeps++;
1760                 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1761         }
1762
1763         while ((bp = iobufqueue.tqh_first) == NULL) {
1764                 need_iobuffer = 1;
1765                 bufstats.bufs_iobufsleeps++;
1766                 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1767         }
1768
1769         TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1770         bp->b_timestamp = 0;
1771
1772         /* clear out various fields */
1773         bp->b_flags = B_BUSY;
1774         bp->b_blkno = bp->b_lblkno = 0;
1775         bp->b_iodone = 0;
1776         bp->b_error = 0;
1777         bp->b_resid = 0;
1778         bp->b_bcount = 0;
1779         bp->b_bufsize = 0;
1780         bp->b_vp = vp;
1781
1782         if (vp->v_type == VBLK || vp->v_type == VCHR)
1783                 bp->b_dev = vp->v_rdev;
1784         else
1785                 bp->b_dev = NODEV;
1786         bufstats.bufs_iobufinuse++;
1787         if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1788                 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1789         splx(s);
1790
1791         return (bp);
1792 }
1793
1794 void
1795 free_io_buf(bp)
1796         struct buf *bp;
1797 {
1798         int s;
1799
1800         s = splbio();
1801         /* put buffer back on the head of the iobufqueue */
1802         bp->b_vp = NULL;
1803         bp->b_flags = B_INVAL;
1804
1805         binsheadfree(bp, &iobufqueue, -1);
1806
1807         /* Wake up any processes waiting for any buffer to become free. */
1808         if (need_iobuffer) {
1809                 need_iobuffer = 0;
1810                 wakeup(&need_iobuffer);
1811         }
1812         bufstats.bufs_iobufinuse--;
1813         splx(s);
1814 }
1815
1816
/* not hooked up yet */
1818
1819 /* XXX move this to a separate file */
1820 /*
1821  * Dynamic Scaling of the Buffer Queues
1822  */
1823
/* Count of buffer headers; see the rationale comment below. */
typedef long long blsize_t;

/* Upper bound on buffer headers; initialize to (mem_size / PAGE_SIZE) */
blsize_t MAXNBUF;

/* Global tunable limits */

/* number of buffer headers */
blsize_t nbufh;
/* minimum number of buffer headers required */
blsize_t nbuflow;
/* maximum number of buffer headers allowed */
blsize_t nbufhigh;
/* preferred number of buffer headers */
blsize_t nbuftarget;
1832
1833 /*
1834  * assertions:
1835  *
1836  * 1.   0 < nbuflow <= nbufh <= nbufhigh
1837  * 2.   nbufhigh <= MAXNBUF
1838  * 3.   0 < nbuflow <= nbuftarget <= nbufhigh
1839  * 4.   nbufh can not be set by sysctl().
1840  */
1841
1842 /* Per queue tunable limits */
1843
1844 struct bufqlim {
1845         blsize_t        bl_nlow;        /* minimum number of buffer headers required */
1846         blsize_t        bl_num;         /* number of buffer headers on the queue */
1847         blsize_t        bl_nlhigh;      /* maximum number of buffer headers allowed */
1848         blsize_t        bl_target;      /* preferred number of buffer headers */
1849         long    bl_stale;       /* Seconds after which a buffer is considered stale */
1850 } bufqlim[BQUEUES];
1851
1852 /*
1853  * assertions:
1854  *
1855  * 1.   0 <= bl_nlow <= bl_num <= bl_nlhigh
1856  * 2.   bl_nlhigh <= MAXNBUF
1857  * 3.  bufqlim[BQ_META].bl_nlow != 0
1858  * 4.  bufqlim[BQ_META].bl_nlow > (number of possible concurrent
1859  *                                                                      file system IO operations)
1860  * 5.   bl_num can not be set by sysctl().
 * 6.   bl_nlhigh <= nbufhigh
1862  */
1863
1864 /*
1865  * Rationale:
1866  * ----------
 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
 *
 * These limits are exported by means of sysctl().
 * It was decided to define blsize_t as a 64 bit quantity.
 * This will make sure that we will not be required to change it
 * as long as we do not exceed 64 bit address space for the kernel.
 *
 * The low and high parameters are initialized at compile time,
 * and boot arguments can be used to override them. sysctl()
 * would not change those values. sysctl() can get all the values
 * but can set only target. num is the current level.
1879  *
1880  * Advantages of having a "bufqscan" thread doing the balancing are,
1881  * Keep enough bufs on BQ_EMPTY.
1882  *      getnewbuf() by default will always select a buffer from the BQ_EMPTY.
 *              getnewbuf() performs best if a buffer was found there.
1884  *              Also this minimizes the possibility of starting IO
1885  *              from getnewbuf(). That's a performance win, too.
1886  *
1887  *      Localize complex logic [balancing as well as time aging]
1888  *              to balancebufq().
1889  *
1890  *      Simplify getnewbuf() logic by elimination of time aging code.
1891  */
1892
1893 /*
1894  * Algorithm:
1895  * -----------
 * The goal of the dynamic scaling of the buffer queues is to keep
 * the size of the LRU close to bl_target. Buffers on a queue would
 * be time aged.
1899  *
1900  * There would be a thread which will be responsible for "balancing"
1901  * the buffer cache queues.
1902  *
1903  * The scan order would be:     AGE, LRU, META, EMPTY.
1904  */
1905
/*
 * Incremented by bufq_balance_thread_init(); its address is the wait
 * channel the balancing thread sleeps on between scans.
 */
long bufqscanwait = 0;

/* Buffer queue balancing routines defined later in this file. */
extern int initbufqscan(void);
extern int nextbufq(int q);
extern int balancebufq(int q);
extern int btrimempty(int n);
extern void buqlimprt(int all);
extern void bufqscan_thread();
1914
1915 void
1916 bufq_balance_thread_init()
1917 {
1918
1919         if (bufqscanwait++ == 0) {
1920                 int i;
1921
1922                 /* Initalize globals */
1923                 MAXNBUF = (mem_size / PAGE_SIZE);
1924                 nbufh = nbuf;
1925                 nbuflow = min(nbufh, 100);
1926                 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
1927                 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
1928                 nbuftarget = max(nbuflow, nbuftarget);
1929                 nbuftarget = min(nbufhigh, nbuftarget);
1930
1931                 /*
1932                  * Initialize the bufqlim
1933                  */
1934
1935                 /* LOCKED queue */
1936                 bufqlim[BQ_LOCKED].bl_nlow = 0;
1937                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
1938                 bufqlim[BQ_LOCKED].bl_target = 0;
1939                 bufqlim[BQ_LOCKED].bl_stale = 30;
1940
1941                 /* LRU queue */
1942                 bufqlim[BQ_LRU].bl_nlow = 0;
1943                 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
1944                 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
1945                 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
1946
1947                 /* AGE queue */
1948                 bufqlim[BQ_AGE].bl_nlow = 0;
1949                 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
1950                 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
1951                 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
1952
1953                 /* EMPTY queue */
1954                 bufqlim[BQ_EMPTY].bl_nlow = 0;
1955                 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
1956                 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
1957                 bufqlim[BQ_EMPTY].bl_stale = 600000;
1958
1959                 /* META queue */
1960                 bufqlim[BQ_META].bl_nlow = 0;
1961                 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
1962                 bufqlim[BQ_META].bl_target = nbuftarget/4;
1963                 bufqlim[BQ_META].bl_stale = META_IS_STALE;
1964
1965                 /* LAUNDRY queue */
1966                 bufqlim[BQ_LOCKED].bl_nlow = 0;
1967                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
1968                 bufqlim[BQ_LOCKED].bl_target = 0;
1969                 bufqlim[BQ_LOCKED].bl_stale = 30;
1970
1971                 buqlimprt(1);
1972         }
1973
1974         /* create worker thread */
1975         kernel_thread(kernel_task, bufqscan_thread);
1976 }
1977
1978 /* The workloop for the buffer balancing thread */
1979 void
1980 bufqscan_thread()
1981 {
1982         boolean_t       funnel_state;
1983         int moretodo = 0;
1984
1985         funnel_state = thread_funnel_set(kernel_flock, TRUE);
1986
1987         for(;;) {
1988                 do {
1989                         int q;  /* buffer queue to process */
1990
1991                         for (q = initbufqscan(); q; ) {
1992                                 moretodo |= balancebufq(q);
1993                                 q = nextbufq(q);
1994                         }
1995                 } while (moretodo);
1996
1997 #if 1 || DIAGNOSTIC
1998                 vfs_bufstats();
1999                 buqlimprt(0);
2000 #endif
2001                 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2002                 moretodo = 0;
2003         }
2004
2005         (void) thread_funnel_set(kernel_flock, FALSE);
2006 }
2007
2008 /* Seed for the buffer queue balancing */
2009 int
2010 initbufqscan()
2011 {
2012         /* Start with AGE queue */
2013         return (BQ_AGE);
2014 }
2015
2016 /* Pick next buffer queue to balance */
2017 int
2018 nextbufq(int q)
2019 {
2020         int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2021
2022         q++;
2023         q %= sizeof(order);
2024         return (order[q]);
2025 }
2026
2027 /* function to balance the buffer queues */
2028 int
2029 balancebufq(int q)
2030 {
2031         int moretodo = 0;
2032         int s = splbio();
2033         int n;
2034
2035         /* reject invalid q */
2036         if ((q < 0) || (q >= BQUEUES))
2037                 goto out;
2038
2039         /* LOCKED or LAUNDRY queue MUST not be balanced */
2040         if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2041                 goto out;
2042
2043         n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2044
2045         /* If queue has less than target nothing more to do */
2046         if (n < 0)
2047                 goto out;
2048
2049         if ( n > 8 ) {
2050                 /* Balance only a small amount (12.5%) at a time */
2051                 n >>= 3;
2052         }
2053
2054         /* EMPTY queue needs special handling */
2055         if (q == BQ_EMPTY) {
2056                 moretodo |= btrimempty(n);
2057                 goto out;
2058         }
2059
2060         for (; n > 0; n--) {
2061                 struct buf *bp = bufqueues[q].tqh_first;
2062                 if (!bp)
2063                         break;
2064
2065                 /* check if it's stale */
2066                 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2067                         if (bcleanbuf(bp)) {
2068                                 /* bawrite() issued, bp not ready */
2069                                 moretodo = 1;
2070                         } else {
2071                                 /* release the cleaned buffer to BQ_EMPTY */
2072                                 SET(bp->b_flags, B_INVAL);
2073                                 brelse(bp);
2074                         }
2075                 } else
2076                         break;
2077         }
2078
2079 out:
2080         splx(s);
2081         return (moretodo);
2082 }
2083
/*
 * Placeholder for trimming the empty queue.
 *
 * When struct buf are allocated dynamically, this would
 * reclaim upto 'n' struct buf from the empty queue.
 * For now nothing is reclaimed and 0 is always returned.
 */
int
btrimempty(int n)
{
        int moretodo = 0;

        (void)n;
        return (moretodo);
}
2094
2095 void
2096 bufqinc(int q)
2097 {
2098         if ((q < 0) || (q >= BQUEUES))
2099                 return;
2100
2101         bufqlim[q].bl_num++;
2102         return;
2103 }
2104
2105 void
2106 bufqdec(int q)
2107 {
2108         if ((q < 0) || (q >= BQUEUES))
2109                 return;
2110
2111         bufqlim[q].bl_num--;
2112         return;
2113 }
2114
2115 void
2116 buqlimprt(int all)
2117 {
2118         int i;
2119     static char *bname[BQUEUES] =
2120                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2121
2122         if (all)
2123                 for (i = 0; i < BQUEUES; i++) {
2124                         printf("%s : ", bname[i]);
2125                         printf("min = %d, ", (long)bufqlim[i].bl_nlow);
2126                         printf("cur = %d, ", (long)bufqlim[i].bl_num);
2127                         printf("max = %d, ", (long)bufqlim[i].bl_nlhigh);
2128                         printf("target = %d, ", (long)bufqlim[i].bl_target);
2129                         printf("stale after %d seconds\n", bufqlim[i].bl_stale);
2130                 }
2131         else
2132                 for (i = 0; i < BQUEUES; i++) {
2133                         printf("%s : ", bname[i]);
2134                         printf("cur = %d, ", (long)bufqlim[i].bl_num);
2135                 }
2136 }
2137
2138 /*
2139  * If the getnewbuf() calls bcleanbuf() on the same thread
2140  * there is a potential for stack overrun and deadlocks.
2141  * So we always handoff the work to worker thread for completion
2142  */
2143
2144 static void
2145 bcleanbuf_thread_init()
2146 {
2147         static void bcleanbuf_thread();
2148
2149         /* create worker thread */
2150         kernel_thread(kernel_task, bcleanbuf_thread);
2151 }
2152
2153 static void
2154 bcleanbuf_thread()
2155 {
2156         boolean_t       funnel_state;
2157         struct buf *bp;
2158
2159         funnel_state = thread_funnel_set(kernel_flock, TRUE);
2160
2161 doit:
2162         while (blaundrycnt == 0)
2163                 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2164         bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2165         /* Remove from the queue */
2166         bremfree(bp);
2167         blaundrycnt--;
2168         /* do the IO */
2169         bawrite(bp);
2170         /* start again */
2171         goto doit;
2172
2173         (void) thread_funnel_set(kernel_flock, funnel_state);
2174 }
2175