[apple/xnu.git] / bsd / vfs / vfs_bio.c
commit 57c206760d1eb5aa2c5cac31e2cea63915a52fdd
1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*-
24 * Copyright (c) 1994 Christopher G. Demetriou
25 * Copyright (c) 1982, 1986, 1989, 1993
26 * The Regents of the University of California. All rights reserved.
27 * (c) UNIX System Laboratories, Inc.
28 * All or some portions of this file are derived from material licensed
29 * to the University of California by American Telephone and Telegraph
30 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
31 * the permission of UNIX System Laboratories, Inc.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * The NEXTSTEP Software License Agreement specifies the terms
62 * and conditions for redistribution.
63 *
64 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
65 */
66
67 /*
68 * Some references:
69 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
70 * Leffler, et al.: The Design and Implementation of the 4.3BSD
71 * UNIX Operating System (Addison-Wesley, 1989)
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/proc.h>
77 #include <sys/buf.h>
78 #include <sys/vnode.h>
79 #include <sys/mount.h>
80 #include <sys/trace.h>
81 #include <sys/malloc.h>
82 #include <sys/resourcevar.h>
83 #include <miscfs/specfs/specdev.h>
84 #include <sys/ubc.h>
85 #include <vm/vm_pageout.h>
86 #if DIAGNOSTIC
87 #include <kern/assert.h>
88 #endif /* DIAGNOSTIC */
89 #include <kern/task.h>
90 #include <kern/zalloc.h>
91
92 #include <sys/kdebug.h>
93 #include <machine/spl.h>
94
95 static __inline__ void bufqinc(int q);
96 static __inline__ void bufqdec(int q);
97
98 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
99 static int bcleanbuf(struct buf *bp);
100 extern void vwakeup();
101
102 extern int niobuf; /* The number of IO buffer headers for cluster IO */
103 int blaundrycnt;
104
105 /* zone allocated buffer headers */
106 static zone_t buf_hdr_zone;
107 static int buf_hdr_count;
108
109 #if TRACE
110 struct proc *traceproc;
111 int tracewhich, tracebuf[TRCSIZ];
112 u_int tracex;
113 char traceflags[TR_NFLAGS];
114 #endif /* TRACE */
115
116 /*
117 * Definitions for the buffer hash lists.
118 */
119 #define BUFHASH(dvp, lbn) \
120 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
121 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
122 u_long bufhash;
123
124 /* Definitions for the buffer stats. */
125 struct bufstats bufstats;
126
127 /* Number of delayed write buffers */
128 int nbdwrite = 0;
129
130 /*
131 * Insq/Remq for the buffer hash lists.
132 */
133 #if 0
134 #define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
135 #define bremhash(bp) LIST_REMOVE(bp, b_hash)
136 #endif /* 0 */
137
138
139 TAILQ_HEAD(ioqueue, buf) iobufqueue;
140 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
141 static int needbuffer;
142 static int need_iobuffer;
143
144 /*
145 * Insq/Remq for the buffer free lists.
146 */
147 #define binsheadfree(bp, dp, whichq) do { \
148 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
149 bufqinc((whichq)); \
150 (bp)->b_whichq = whichq; \
151 (bp)->b_timestamp = time.tv_sec; \
152 } while (0)
153
154 #define binstailfree(bp, dp, whichq) do { \
155 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
156 bufqinc((whichq)); \
157 (bp)->b_whichq = whichq; \
158 (bp)->b_timestamp = time.tv_sec; \
159 } while (0)
160
161 #define BHASHENTCHECK(bp) \
162 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
163 panic("%x: b_hash.le_prev is not deadbeef", (bp));
164
165 #define BLISTNONE(bp) \
166 (bp)->b_hash.le_next = (struct buf *)0; \
167 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
168
169 /*
170 * Insq/Remq for the vnode usage lists.
171 */
172 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
173 #define bufremvn(bp) { \
174 LIST_REMOVE(bp, b_vnbufs); \
175 (bp)->b_vnbufs.le_next = NOLIST; \
176 }
177
178 simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
179
180 /* number of per vnode, "in flight" buffer writes */
181 #define BUFWRITE_THROTTLE 9
182
183
184 /*
185 * Time in seconds before a buffer on a list is
186 * considered as a stale buffer
187 */
188 #define LRU_IS_STALE 120 /* default value for the LRU */
189 #define AGE_IS_STALE 60 /* default value for the AGE */
190 #define META_IS_STALE 180 /* default value for the BQ_META */
191
192 int lru_is_stale = LRU_IS_STALE;
193 int age_is_stale = AGE_IS_STALE;
194 int meta_is_stale = META_IS_STALE;
195
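/*
 * Illustrative note (added comment, not part of the original source):
 * staleness is computed in getnewbuf() as (time.tv_sec - bp->b_timestamp)
 * and compared against the per-queue threshold above.  For example, with
 * the defaults, a buffer that has sat on BQ_LRU for 150 seconds satisfies
 *
 *	(time.tv_sec - bp->b_timestamp) >= lru_is_stale		(150 >= 120)
 *
 * and is therefore eligible to be recycled ahead of a fresher AGE buffer.
 */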
196 /* LIST_INSERT_HEAD() with assertions */
197 static __inline__ void
198 blistenterhead(struct bufhashhdr * head, struct buf * bp)
199 {
200 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
201 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
202 (head)->lh_first = bp;
203 bp->b_hash.le_prev = &(head)->lh_first;
204 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
205 panic("blistenterhead: le_prev is deadbeef");
206 }
207
208 static __inline__ void
209 binshash(struct buf *bp, struct bufhashhdr *dp)
210 {
211 struct buf *nbp;
212
213 simple_lock(&bufhashlist_slock);
214
215 #if 0
216 if((bad = incore(bp->b_vp, bp->b_lblkno)))
217 panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);
218 #endif /* 0 */
219
220 BHASHENTCHECK(bp);
221
222 nbp = dp->lh_first;
223 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
224 if(nbp == bp)
225 panic("buf already in hashlist");
226 }
227
228 blistenterhead(dp, bp);
229 simple_unlock(&bufhashlist_slock);
230 }
231
232 static __inline__ void
233 bremhash(struct buf *bp)
234 {
235 simple_lock(&bufhashlist_slock);
236 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
237 panic("bremhash le_prev is deadbeef");
238 if (bp->b_hash.le_next == bp)
239 panic("bremhash: next points to self");
240
241 if (bp->b_hash.le_next != NULL)
242 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
243 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
244 simple_unlock(&bufhashlist_slock);
245 }
246
247 /*
248 * Remove a buffer from the free list it's on
249 */
250 void
251 bremfree(bp)
252 struct buf *bp;
253 {
254 struct bqueues *dp = NULL;
255 int whichq = -1;
256
257 /*
258 * We only calculate the head of the freelist when removing
259 * the last element of the list as that is the only time that
260 * it is needed (e.g. to reset the tail pointer).
261 *
262 * NB: This makes an assumption about how tailq's are implemented.
263 */
264 if (bp->b_freelist.tqe_next == NULL) {
265 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
266 if (dp->tqh_last == &bp->b_freelist.tqe_next)
267 break;
268 if (dp == &bufqueues[BQUEUES])
269 panic("bremfree: lost tail");
270 }
271 TAILQ_REMOVE(dp, bp, b_freelist);
272 whichq = bp->b_whichq;
273 bufqdec(whichq);
274 bp->b_whichq = -1;
275 bp->b_timestamp = 0;
276 }
277
278 /*
279 * Associate a buffer with a vnode.
280 */
281 static void
282 bgetvp(vp, bp)
283 register struct vnode *vp;
284 register struct buf *bp;
285 {
286
287 if (bp->b_vp != vp)
288 panic("bgetvp: not free");
289 VHOLD(vp);
290 bp->b_vp = vp;
291 if (vp->v_type == VBLK || vp->v_type == VCHR)
292 bp->b_dev = vp->v_rdev;
293 else
294 bp->b_dev = NODEV;
295 /*
296 * Insert onto list for new vnode.
297 */
298 bufinsvn(bp, &vp->v_cleanblkhd);
299 }
300
301 /*
302 * Disassociate a buffer from a vnode.
303 */
304 static void
305 brelvp(bp)
306 register struct buf *bp;
307 {
308 struct vnode *vp;
309
310 if (bp->b_vp == (struct vnode *) 0)
311 panic("brelvp: NULL vp");
312 /*
313 * Delete from old vnode list, if on one.
314 */
315 if (bp->b_vnbufs.le_next != NOLIST)
316 bufremvn(bp);
317 vp = bp->b_vp;
318 bp->b_vp = (struct vnode *) 0;
319 HOLDRELE(vp);
320 }
321
322 /*
323 * Reassign a buffer from one vnode to another.
324 * Used to assign file specific control information
325 * (indirect blocks) to the vnode to which they belong.
326 */
327 void
328 reassignbuf(bp, newvp)
329 register struct buf *bp;
330 register struct vnode *newvp;
331 {
332 register struct buflists *listheadp;
333
334 if (newvp == NULL) {
335 printf("reassignbuf: NULL");
336 return;
337 }
338 /*
339 * Delete from old vnode list, if on one.
340 */
341 if (bp->b_vnbufs.le_next != NOLIST)
342 bufremvn(bp);
343 /*
344 * If dirty, put on list of dirty buffers;
345 * otherwise insert onto list of clean buffers.
346 */
347 if (ISSET(bp->b_flags, B_DELWRI))
348 listheadp = &newvp->v_dirtyblkhd;
349 else
350 listheadp = &newvp->v_cleanblkhd;
351 bufinsvn(bp, listheadp);
352 }
353
354 static __inline__ void
355 bufhdrinit(struct buf *bp)
356 {
357 bzero((char *)bp, sizeof *bp);
358 bp->b_dev = NODEV;
359 bp->b_rcred = NOCRED;
360 bp->b_wcred = NOCRED;
361 bp->b_vnbufs.le_next = NOLIST;
362 bp->b_flags = B_INVAL;
363
364 return;
365 }
366
367 /*
368 * Initialize buffers and hash links for buffers.
369 */
370 __private_extern__ void
371 bufinit()
372 {
373 register struct buf *bp;
374 register struct bqueues *dp;
375 register int i;
376 int metabuf;
377 long whichq;
378 static void bufzoneinit();
379 static void bcleanbuf_thread_init();
380
381 /* Initialize the buffer queues ('freelists') and the hash table */
382 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
383 TAILQ_INIT(dp);
384 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
385
386 simple_lock_init(&bufhashlist_slock );
387
388 metabuf = nbuf/8; /* reserved for meta buf */
389
390 /* Initialize the buffer headers */
391 for (i = 0; i < nbuf; i++) {
392 bp = &buf[i];
393 bufhdrinit(bp);
394
395 /*
396 * metabuf buffer headers on the meta-data list and
397 * rest of the buffer headers on the empty list
398 */
399 if (--metabuf)
400 whichq = BQ_META;
401 else
402 whichq = BQ_EMPTY;
403
404 BLISTNONE(bp);
405 dp = &bufqueues[whichq];
406 binsheadfree(bp, dp, whichq);
407 binshash(bp, &invalhash);
408 }
409
410 for (; i < nbuf + niobuf; i++) {
411 bp = &buf[i];
412 bufhdrinit(bp);
413 binsheadfree(bp, &iobufqueue, -1);
414 }
415
416 printf("using %d buffer headers and %d cluster IO buffer headers\n",
417 nbuf, niobuf);
418
419 /* Set up zones used by the buffer cache */
420 bufzoneinit();
421
422 /* start the bcleanbuf() thread */
423 bcleanbuf_thread_init();
424
425 #if 0 /* notyet */
426 {
427 static void bufq_balance_thread_init();
428 /* create a thread to do dynamic buffer queue balancing */
429 bufq_balance_thread_init();
430 }
431 #endif /* notyet */
432 }
433
434 static struct buf *
435 bio_doread(vp, blkno, size, cred, async, queuetype)
436 struct vnode *vp;
437 daddr_t blkno;
438 int size;
439 struct ucred *cred;
440 int async;
441 int queuetype;
442 {
443 register struct buf *bp;
444 struct proc *p = current_proc();
445
446 bp = getblk(vp, blkno, size, 0, 0, queuetype);
447
448 /*
449 * If buffer does not have data valid, start a read.
450 * Note that if buffer is B_INVAL, getblk() won't return it.
451 * Therefore, it's valid if its I/O has completed or been delayed.
452 */
453 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
454 /* Start I/O for the buffer (keeping credentials). */
455 SET(bp->b_flags, B_READ | async);
456 if (cred != NOCRED && bp->b_rcred == NOCRED) {
457 /*
458 * NFS has embedded ucred.
459 * Can not crhold() here as that causes zone corruption
460 */
461 bp->b_rcred = crdup(cred);
462 }
463
464 VOP_STRATEGY(bp);
465
466 trace(TR_BREADMISS, pack(vp, size), blkno);
467
468 /* Pay for the read. */
469 if (p && p->p_stats)
470 p->p_stats->p_ru.ru_inblock++; /* XXX */
471 } else if (async) {
472 brelse(bp);
473 }
474
475 trace(TR_BREADHIT, pack(vp, size), blkno);
476
477 return (bp);
478 }
479 /*
480 * Read a disk block.
481 * This algorithm is described in Bach (p. 54).
482 */
483 int
484 bread(vp, blkno, size, cred, bpp)
485 struct vnode *vp;
486 daddr_t blkno;
487 int size;
488 struct ucred *cred;
489 struct buf **bpp;
490 {
491 register struct buf *bp;
492
493 /* Get buffer for block. */
494 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
495
496 /* Wait for the read to complete, and return result. */
497 return (biowait(bp));
498 }
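
/*
 * Illustrative caller sketch (added comment, not from the original file;
 * "vp", "blkno" and "size" are hypothetical): the usual bread()/brelse()
 * pattern used by a filesystem to read and examine a block.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if ((error = bread(vp, blkno, size, NOCRED, &bp))) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... inspect bp->b_data ...
 *	brelse(bp);	// give the buffer back to the cache
 */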
499
500 /*
501 * Read a disk block. [bread() for meta-data]
502 * This algorithm is described in Bach (p. 54).
503 */
504 int
505 meta_bread(vp, blkno, size, cred, bpp)
506 struct vnode *vp;
507 daddr_t blkno;
508 int size;
509 struct ucred *cred;
510 struct buf **bpp;
511 {
512 register struct buf *bp;
513
514 /* Get buffer for block. */
515 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
516
517 /* Wait for the read to complete, and return result. */
518 return (biowait(bp));
519 }
520
521 /*
522 * Read-ahead multiple disk blocks. The first is sync, the rest async.
523 * Trivial modification to the breada algorithm presented in Bach (p.55).
524 */
525 int
526 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
527 struct vnode *vp;
528 daddr_t blkno; int size;
529 daddr_t rablks[]; int rasizes[];
530 int nrablks;
531 struct ucred *cred;
532 struct buf **bpp;
533 {
534 register struct buf *bp;
535 int i;
536
537 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
538
539 /*
540 * For each of the read-ahead blocks, start a read, if necessary.
541 */
542 for (i = 0; i < nrablks; i++) {
543 /* If it's in the cache, just go on to next one. */
544 if (incore(vp, rablks[i]))
545 continue;
546
547 /* Get a buffer for the read-ahead block */
548 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
549 }
550
551 /* Otherwise, we had to start a read for it; wait until it's valid. */
552 return (biowait(bp));
553 }
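
/*
 * Illustrative caller sketch (added comment, not from the original file;
 * names are hypothetical): read block "blkno" synchronously while firing
 * off asynchronous read-ahead for the next two logical blocks.
 *
 *	daddr_t rablks[2] = { blkno + 1, blkno + 2 };
 *	int rasizes[2] = { size, size };
 *	struct buf *bp;
 *	int error;
 *
 *	error = breadn(vp, blkno, size, rablks, rasizes, 2, NOCRED, &bp);
 */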
554
555 /*
556 * Read with single-block read-ahead. Defined in Bach (p.55), but
557 * implemented as a call to breadn().
558 * XXX for compatibility with old file systems.
559 */
560 int
561 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
562 struct vnode *vp;
563 daddr_t blkno; int size;
564 daddr_t rablkno; int rabsize;
565 struct ucred *cred;
566 struct buf **bpp;
567 {
568
569 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
570 }
571
572 /*
573 * Block write. Described in Bach (p.56)
574 */
575 int
576 bwrite(bp)
577 struct buf *bp;
578 {
579 int rv, sync, wasdelayed;
580 struct proc *p = current_proc();
581 struct vnode *vp = bp->b_vp;
582
583 /* Remember buffer type, to switch on it later. */
584 sync = !ISSET(bp->b_flags, B_ASYNC);
585 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
586 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
587 if (wasdelayed) {
588 nbdwrite--;
589 wakeup((caddr_t)&nbdwrite);
590 }
591
592 if (!sync) {
593 /*
594 * If not synchronous, pay for the I/O operation and make
595 * sure the buf is on the correct vnode queue. We have
596 * to do this now, because if we don't, the vnode may not
597 * be properly notified that its I/O has completed.
598 */
599 if (wasdelayed)
600 reassignbuf(bp, vp);
601 else
602 if (p && p->p_stats)
603 p->p_stats->p_ru.ru_oublock++; /* XXX */
604 }
605
606 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
607
608 /* Initiate disk write. Make sure the appropriate party is charged. */
609 SET(bp->b_flags, B_WRITEINPROG);
610 vp->v_numoutput++;
611
612 VOP_STRATEGY(bp);
613
614 if (sync) {
615 /*
616 * If I/O was synchronous, wait for it to complete.
617 */
618 rv = biowait(bp);
619
620 /*
621 * Pay for the I/O operation, if it hasn't been paid for already, and
622 * make sure the buffer is on the correct vnode queue. (Async operations
623 * were paid for above.)
624 */
625 if (wasdelayed)
626 reassignbuf(bp, vp);
627 else
628 if (p && p->p_stats)
629 p->p_stats->p_ru.ru_oublock++; /* XXX */
630
631 /* Release the buffer. */
632 // XXXdbg - only if the unused bit is set
633 if (!ISSET(bp->b_flags, B_NORELSE)) {
634 brelse(bp);
635 } else {
636 CLR(bp->b_flags, B_NORELSE);
637 }
638
639 return (rv);
640 } else {
641 return (0);
642 }
643 }
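
/*
 * Illustrative caller sketch (added comment, not from the original file):
 * a synchronous update.  On the synchronous path bwrite() itself waits for
 * the I/O and releases the buffer, so the caller must not brelse() it again.
 *
 *	if ((error = bread(vp, blkno, size, NOCRED, &bp))) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... modify bp->b_data ...
 *	error = bwrite(bp);	// waits for completion, releases bp
 */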
644
645 int
646 vn_bwrite(ap)
647 struct vop_bwrite_args *ap;
648 {
649 return (bwrite(ap->a_bp));
650 }
651
652 /*
653 * Delayed write.
654 *
655 * The buffer is marked dirty, but is not queued for I/O.
656 * This routine should be used when the buffer is expected
657 * to be modified again soon, typically a small write that
658 * partially fills a buffer.
659 *
660 * NB: magnetic tapes cannot be delayed; they must be
661 * written in the order that the writes are requested.
662 *
663 * Described in Leffler, et al. (pp. 208-213).
664 *
665 * Note: With the ability to allocate additional buffer
666 * headers, we can get into a situation where "too" many
667 * bdwrite()s let the kernel create dirty buffers faster
668 * than the disks can service them. Doing a bawrite() in
669 * cases where we have "too many" outstanding bdwrite()s avoids that.
670 */
671 __private_extern__ int
672 bdwrite_internal(bp, return_error)
673 struct buf *bp;
674 int return_error;
675 {
676 struct proc *p = current_proc();
677 struct vnode *vp = bp->b_vp;
678
679 /*
680 * If the block hasn't been seen before:
681 * (1) Mark it as having been seen,
682 * (2) Charge for the write.
683 * (3) Make sure it's on its vnode's correct block list,
684 */
685 if (!ISSET(bp->b_flags, B_DELWRI)) {
686 SET(bp->b_flags, B_DELWRI);
687 if (p && p->p_stats)
688 p->p_stats->p_ru.ru_oublock++; /* XXX */
689 nbdwrite ++;
690 reassignbuf(bp, vp);
691 }
692
693 /* If this is a tape block, write the block now. */
694 if (ISSET(bp->b_flags, B_TAPE)) {
695 /* bwrite(bp); */
696 VOP_BWRITE(bp);
697 return (0);
698 }
699
700 /*
701 * If the vnode has "too many" write operations in progress,
702 * wait for them to finish their I/O.
703 */
704 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
705 vp->v_flag |= VTHROTTLED;
706 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
707 }
708
709 /*
710 * If we have too many delayed write buffers,
711 * more than we can "safely" handle, just fall back to
712 * doing the async write
713 */
714 if (nbdwrite < 0)
715 panic("bdwrite: Negative nbdwrite");
716
717 // can't do a bawrite() if the LOCKED bit is set because the
718 // buffer is part of a transaction and can't go to disk until
719 // the LOCKED bit is cleared.
720 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
721 if (return_error)
722 return (EAGAIN);
723 else
724 bawrite(bp);
725 return (0);
726 }
727
728 /* Otherwise, the "write" is done, so mark and release the buffer. */
729 SET(bp->b_flags, B_DONE);
730 brelse(bp);
731 return (0);
732 }
733
734 void
735 bdwrite(bp)
736 struct buf *bp;
737 {
738 (void) bdwrite_internal(bp, 0);
739 }
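
/*
 * Illustrative caller sketch (added comment, not from the original file):
 * the delayed-write pattern for a block that will likely be modified
 * again soon (e.g. a partially filled directory or bitmap block).
 *
 *	if ((error = bread(vp, blkno, size, NOCRED, &bp))) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... modify part of bp->b_data ...
 *	bdwrite(bp);	// marks B_DELWRI and releases; I/O happens later
 */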
740
741
742 /*
743 * Asynchronous block write; just an asynchronous bwrite().
744 *
745 * Note: With the ability to allocate additional buffer
746 * headers, we can get into a situation where "too" many
747 * bawrite()s let the kernel create dirty buffers faster
748 * than the disks can service them.
749 * We limit the number of "in flight" writes a vnode can have to
750 * avoid this.
751 */
752 static int
753 bawrite_internal(bp, throttle)
754 struct buf *bp;
755 int throttle;
756 {
757 struct vnode *vp = bp->b_vp;
758
759 if (vp) {
760 /*
761 * If the vnode has "too many" write operations in progress,
762 * wait for them to finish their I/O.
763 */
764 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
765 if (throttle) {
766 vp->v_flag |= VTHROTTLED;
767 (void)tsleep((caddr_t)&vp->v_numoutput,
768 PRIBIO + 1, "bawrite", 0);
769 } else
770 return (EWOULDBLOCK);
771 }
772 }
773
774 SET(bp->b_flags, B_ASYNC);
775 VOP_BWRITE(bp);
776 return (0);
777 }
778
779 void
780 bawrite(bp)
781 struct buf *bp;
782 {
783 (void) bawrite_internal(bp, 1);
784 }
785
786 /*
787 * bwillwrite:
788 *
789 * Called prior to the locking of any vnodes when we are expecting to
790 * write. We do not want to starve the buffer cache with too many
791 * dirty buffers so we block here. By blocking prior to the locking
792 * of any vnodes we attempt to avoid the situation where a locked vnode
793 * prevents the various system daemons from flushing related buffers.
794 */
795
796 void
797 bwillwrite(void)
798 {
799 /* XXX To be implemented later */
800 }
801
802 /*
803 * Release a buffer onto the free lists.
804 * Described in Bach (p. 46).
805 */
806 void
807 brelse(bp)
808 struct buf *bp;
809 {
810 struct bqueues *bufq;
811 int s;
812 long whichq;
813
814 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
815 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
816 bp->b_flags, 0);
817
818 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
819
820 // if we're invalidating a buffer that has the B_CALL bit
821 // set then call the b_iodone function so it gets cleaned
822 // up properly.
823 //
824 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
825 if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
826 panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
827 }
828 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
829 void (*iodone_func)(struct buf *) = bp->b_iodone;
830
831 CLR(bp->b_flags, B_CALL); /* but note callout done */
832 bp->b_iodone = NULL;
833
834 if (iodone_func == NULL) {
835 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
836 }
837 (*iodone_func)(bp);
838 }
839 }
840
841 /* IO is done. Cleanup the UPL state */
842 if (!ISSET(bp->b_flags, B_META)
843 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
844 kern_return_t kret;
845 upl_t upl;
846 int upl_flags;
847
848 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
849 if ( !ISSET(bp->b_flags, B_INVAL)) {
850 kret = ubc_create_upl(bp->b_vp,
851 ubc_blktooff(bp->b_vp, bp->b_lblkno),
852 bp->b_bufsize,
853 &upl,
854 NULL,
855 UPL_PRECIOUS);
856 if (kret != KERN_SUCCESS)
857 panic("brelse: Failed to get pagelists");
858 #ifdef UBC_DEBUG
859 upl_ubc_alias_set(upl, bp, 5);
860 #endif /* UBC_DEBUG */
861 } else
862 upl = (upl_t) 0;
863 } else {
864 upl = bp->b_pagelist;
865 kret = ubc_upl_unmap(upl);
866
867 if (kret != KERN_SUCCESS)
868 panic("kernel_upl_unmap failed");
869 bp->b_data = 0;
870 }
871 if (upl) {
872 if (bp->b_flags & (B_ERROR | B_INVAL)) {
873 if (bp->b_flags & (B_READ | B_INVAL))
874 upl_flags = UPL_ABORT_DUMP_PAGES;
875 else
876 upl_flags = 0;
877 ubc_upl_abort(upl, upl_flags);
878 } else {
879 if (ISSET(bp->b_flags, B_NEEDCOMMIT))
880 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
881 else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
882 upl_flags = UPL_COMMIT_SET_DIRTY ;
883 else
884 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
885 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
886 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
887 }
888 s = splbio();
889 CLR(bp->b_flags, B_PAGELIST);
890 bp->b_pagelist = 0;
891 splx(s);
892 }
893 } else {
894 if(ISSET(bp->b_flags, B_PAGELIST))
895 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
896 }
897
898 /* Wake up any processes waiting for any buffer to become free. */
899 if (needbuffer) {
900 needbuffer = 0;
901 wakeup(&needbuffer);
902 }
903
904 /* Wake up any processes waiting for _this_ buffer to become free. */
905 if (ISSET(bp->b_flags, B_WANTED)) {
906 CLR(bp->b_flags, B_WANTED);
907 wakeup(bp);
908 }
909
910 /* Block disk interrupts. */
911 s = splbio();
912
913 /*
914 * Determine which queue the buffer should be on, then put it there.
915 */
916
917 /* If it's locked, don't report an error; try again later. */
918 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
919 CLR(bp->b_flags, B_ERROR);
920
921 /* If it's not cacheable, or an error, mark it invalid. */
922 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
923 SET(bp->b_flags, B_INVAL);
924
925 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
926 /*
927 * If it's invalid or empty, dissociate it from its vnode
928 * and put on the head of the appropriate queue.
929 */
930 if (bp->b_vp)
931 brelvp(bp);
932 if (ISSET(bp->b_flags, B_DELWRI)) {
933 CLR(bp->b_flags, B_DELWRI);
934 nbdwrite--;
935 wakeup((caddr_t)&nbdwrite);
936 }
937 if (bp->b_bufsize <= 0)
938 whichq = BQ_EMPTY; /* no data */
939 else if (ISSET(bp->b_flags, B_META))
940 whichq = BQ_META; /* meta-data */
941 else
942 whichq = BQ_AGE; /* invalid data */
943
944 bufq = &bufqueues[whichq];
945 binsheadfree(bp, bufq, whichq);
946 } else {
947 /*
948 * It has valid data. Put it on the end of the appropriate
949 * queue, so that it'll stick around for as long as possible.
950 */
951 if (ISSET(bp->b_flags, B_LOCKED))
952 whichq = BQ_LOCKED; /* locked in core */
953 else if (ISSET(bp->b_flags, B_META))
954 whichq = BQ_META; /* meta-data */
955 else if (ISSET(bp->b_flags, B_AGE))
956 whichq = BQ_AGE; /* stale but valid data */
957 else
958 whichq = BQ_LRU; /* valid data */
959
960 bufq = &bufqueues[whichq];
961 binstailfree(bp, bufq, whichq);
962 }
963
964 /* Unlock the buffer. */
965 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
966
967 /* Allow disk interrupts. */
968 splx(s);
969
970 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
971 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
972 }
973
974 /*
975 * Determine if a block is in the cache.
976 * Just look on what would be its hash chain. If it's there, return
977 * a pointer to it, unless it's marked invalid. If it's marked invalid,
978 * we normally don't return the buffer, unless the caller explicitly
979 * wants us to.
980 */
981 struct buf *
982 incore(vp, blkno)
983 struct vnode *vp;
984 daddr_t blkno;
985 {
986 struct buf *bp;
987
988 bp = BUFHASH(vp, blkno)->lh_first;
989
990 /* Search hash chain */
991 for (; bp != NULL; bp = bp->b_hash.le_next) {
992 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
993 !ISSET(bp->b_flags, B_INVAL))
994 return (bp);
995 }
996
997 return (0);
998 }
999
1000
1001 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
1002 /*
1003 * Get a block of requested size that is associated with
1004 * a given vnode and block offset. If it is found in the
1005 * block cache, mark it as having been found, make it busy
1006 * and return it. Otherwise, return an empty block of the
1007 * correct size. It is up to the caller to ensure that the
1008 * cached blocks are of the correct size.
1009 */
1010 struct buf *
1011 getblk(vp, blkno, size, slpflag, slptimeo, operation)
1012 register struct vnode *vp;
1013 daddr_t blkno;
1014 int size, slpflag, slptimeo, operation;
1015 {
1016 struct buf *bp;
1017 int s, err;
1018 upl_t upl;
1019 upl_page_info_t *pl;
1020 kern_return_t kret;
1021 int error=0;
1022 int pagedirty = 0;
1023
1024 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
1025 blkno * PAGE_SIZE, size, operation, 0, 0);
1026 start:
1027
1028 s = splbio();
1029 if ((bp = incore(vp, blkno))) {
1030 /* Found in the Buffer Cache */
1031 if (ISSET(bp->b_flags, B_BUSY)) {
1032 /* but is busy */
1033 switch (operation) {
1034 case BLK_READ:
1035 case BLK_WRITE:
1036 case BLK_META:
1037 SET(bp->b_flags, B_WANTED);
1038 bufstats.bufs_busyincore++;
1039 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
1040 slptimeo);
1041 splx(s);
1042 /*
1043 * Callers who call with PCATCH or timeout are
1044 * willing to deal with the NULL pointer
1045 */
1046 if (err && ((slpflag & PCATCH) ||
1047 ((err == EWOULDBLOCK) && slptimeo)))
1048 return (NULL);
1049 goto start;
1050 /*NOTREACHED*/
1051 break;
1052
1053 case BLK_PAGEIN:
1054 /* pagein operation must not use getblk */
1055 panic("getblk: pagein for incore busy buffer");
1056 splx(s);
1057 /*NOTREACHED*/
1058 break;
1059
1060 case BLK_PAGEOUT:
1061 /* pageout operation must not use getblk */
1062 panic("getblk: pageout for incore busy buffer");
1063 splx(s);
1064 /*NOTREACHED*/
1065 break;
1066
1067 default:
1068 panic("getblk: %d unknown operation 1", operation);
1069 /*NOTREACHED*/
1070 break;
1071 }
1072 } else {
1073 /* not busy */
1074 SET(bp->b_flags, (B_BUSY | B_CACHE));
1075 bremfree(bp);
1076 bufstats.bufs_incore++;
1077 splx(s);
1078
1079 allocbuf(bp, size);
1080 if (ISSET(bp->b_flags, B_PAGELIST))
1081 panic("pagelist buffer is not busy");
1082
1083 switch (operation) {
1084 case BLK_READ:
1085 case BLK_WRITE:
1086 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
1087 kret = ubc_create_upl(vp,
1088 ubc_blktooff(vp, bp->b_lblkno),
1089 bp->b_bufsize,
1090 &upl,
1091 &pl,
1092 UPL_PRECIOUS);
1093 if (kret != KERN_SUCCESS)
1094 panic("Failed to get pagelists");
1095
1096 SET(bp->b_flags, B_PAGELIST);
1097 bp->b_pagelist = upl;
1098
1099 if (!upl_valid_page(pl, 0)) {
1100 if (vp->v_tag != VT_NFS)
1101 panic("getblk: incore buffer without valid page");
1102 CLR(bp->b_flags, B_CACHE);
1103 }
1104
1105 if (upl_dirty_page(pl, 0))
1106 SET(bp->b_flags, B_WASDIRTY);
1107 else
1108 CLR(bp->b_flags, B_WASDIRTY);
1109
1110 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1111 if (kret != KERN_SUCCESS)
1112 panic("getblk: ubc_upl_map() failed with (%d)",
1113 kret);
1114 if (bp->b_data == 0)
1115 panic("ubc_upl_map mapped 0");
1116 }
1117 break;
1118
1119 case BLK_META:
1120 /*
1121 * VM is not involved in I/O for the meta-data;
1122 * the buffer already has valid data.
1123 */
1124 if(bp->b_data == 0)
1125 panic("bp->b_data null incore buf=%x", bp);
1126 break;
1127
1128 case BLK_PAGEIN:
1129 case BLK_PAGEOUT:
1130 panic("getblk: paging operation 1");
1131 break;
1132
1133 default:
1134 panic("getblk: %d unknown operation 2", operation);
1135 /*NOTREACHED*/
1136 break;
1137 }
1138 }
1139 } else { /* not incore() */
1140 int queue = BQ_EMPTY; /* Start with no preference */
1141 splx(s);
1142
1143 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
1144 !(UBCINFOEXISTS(vp))) {
1145 operation = BLK_META;
1146 }
1147 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
1148 goto start;
1149 if (incore(vp, blkno)) {
1150 SET(bp->b_flags, B_INVAL);
1151 binshash(bp, &invalhash);
1152 brelse(bp);
1153 goto start;
1154 }
1155 /*
1156 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
1157 * CALLED! BE CAREFUL.
1158 */
1159
1160 /*
1161 * If this is meta-data, the queue may have been set to another
1162 * type, so reset it and mark the buffer B_META so that when the
1163 * buffer is released it will go to the META queue.
1164 * Also, if the vnode is not VREG, then it is META.
1165 */
1166 if (operation == BLK_META) {
1167 SET(bp->b_flags, B_META);
1168 queue = BQ_META;
1169 }
1170
1171 bp->b_blkno = bp->b_lblkno = blkno;
1172 bp->b_vp = vp;
1173
1174 /*
1175 * Insert in the hash so that incore() can find it
1176 */
1177 binshash(bp, BUFHASH(vp, blkno));
1178
1179 s = splbio();
1180 bgetvp(vp, bp);
1181 splx(s);
1182
1183 allocbuf(bp, size);
1184
1185 switch (operation) {
1186 case BLK_META:
1187 /* buffer data is invalid */
1188
1189 if(bp->b_data == 0)
1190 panic("bp->b_data is null %x",bp);
1191
1192 bufstats.bufs_miss++;
1193
1194 /* wakeup the buffer */
1195 CLR(bp->b_flags, B_WANTED);
1196 wakeup(bp);
1197 break;
1198
1199 case BLK_READ:
1200 case BLK_WRITE:
1201
1202 if (ISSET(bp->b_flags, B_PAGELIST))
1203 panic("B_PAGELIST in bp=%x",bp);
1204
1205 kret = ubc_create_upl(vp,
1206 ubc_blktooff(vp, blkno),
1207 bp->b_bufsize,
1208 &upl,
1209 &pl,
1210 UPL_PRECIOUS);
1211 if (kret != KERN_SUCCESS)
1212 panic("Failed to get pagelists");
1213
1214 #ifdef UBC_DEBUG
1215 upl_ubc_alias_set(upl, bp, 4);
1216 #endif /* UBC_DEBUG */
1217 bp->b_pagelist = upl;
1218
1219 SET(bp->b_flags, B_PAGELIST);
1220
1221 if (upl_valid_page(pl, 0)) {
1222 SET(bp->b_flags, B_CACHE | B_DONE);
1223 bufstats.bufs_vmhits++;
1224
1225 pagedirty = upl_dirty_page(pl, 0);
1226
1227 if (pagedirty)
1228 SET(bp->b_flags, B_WASDIRTY);
1229
1230 if (vp->v_tag == VT_NFS) {
1231 off_t f_offset;
1232 int valid_size;
1233
1234 bp->b_validoff = 0;
1235 bp->b_dirtyoff = 0;
1236
1237 f_offset = ubc_blktooff(vp, blkno);
1238
1239 if (f_offset > vp->v_ubcinfo->ui_size) {
1240 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1241 bp->b_validend = 0;
1242 bp->b_dirtyend = 0;
1243 } else {
1244 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1245 bp->b_validend = valid_size;
1246
1247 if (pagedirty)
1248 bp->b_dirtyend = valid_size;
1249 else
1250 bp->b_dirtyend = 0;
1251
1252 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1253 bp->b_validend, bp->b_dirtyend,
1254 (int)vp->v_ubcinfo->ui_size, 0, 0);
1255 }
1256 } else {
1257 bp->b_validoff = 0;
1258 bp->b_dirtyoff = 0;
1259
1260 if (pagedirty) {
1261 /* page is dirty */
1262 bp->b_validend = bp->b_bcount;
1263 bp->b_dirtyend = bp->b_bcount;
1264 } else {
1265 /* page is clean */
1266 bp->b_validend = bp->b_bcount;
1267 bp->b_dirtyend = 0;
1268 }
1269 }
1270 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
1271 if(error) {
1272 panic("getblk: VOP_BMAP failed");
1273 /*NOTREACHED*/
1274 /*
1275 * XXX: We probably should invalidate the VM Page
1276 */
1277 bp->b_error = error;
1278 SET(bp->b_flags, (B_ERROR | B_INVAL));
1279 /* undo B_DONE that was set before upl_commit() */
1280 CLR(bp->b_flags, B_DONE);
1281 brelse(bp);
1282 return (0);
1283 }
1284 } else {
1285 bufstats.bufs_miss++;
1286 }
1287 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1288 if (kret != KERN_SUCCESS) {
1289 panic("getblk: ubc_upl_map() "
1290 "failed with (%d)", kret);
1291 }
1292 if (bp->b_data == 0)
1293 panic("kernel_upl_map mapped 0");
1294
1295 break;
1296
1297 case BLK_PAGEIN:
1298 case BLK_PAGEOUT:
1299 panic("getblk: paging operation 2");
1300 break;
1301 default:
1302 panic("getblk: %d unknown operation 3", operation);
1303 /*NOTREACHED*/
1304 break;
1305 }
1306 }
1307
1308 if (bp->b_data == NULL)
1309 panic("getblk: bp->b_addr is null");
1310
1311 if (bp->b_bufsize & 0xfff) {
1312 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1313 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1314 }
1315
1316 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1317 (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1318
1319 return (bp);
1320 }
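
/*
 * Illustrative caller sketch (added comment, not from the original file;
 * names are hypothetical): getblk() is used when the caller will overwrite
 * the whole block and therefore does not need to read it first.
 *
 *	bp = getblk(vp, blkno, size, 0, 0, BLK_WRITE);
 *	bzero(bp->b_data, size);
 *	... fill in bp->b_data ...
 *	error = bwrite(bp);	// or bdwrite(bp)/bawrite(bp) as appropriate
 */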
1321
1322 /*
1323 * Get an empty, disassociated buffer of given size.
1324 */
1325 struct buf *
1326 geteblk(size)
1327 int size;
1328 {
1329 struct buf *bp;
1330 int queue = BQ_EMPTY;
1331
1332 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1333 ;
1334 SET(bp->b_flags, (B_META|B_INVAL));
1335
1336 #if DIAGNOSTIC
1337 assert(queue == BQ_EMPTY);
1338 #endif /* DIAGNOSTIC */
1339 /* XXX need to implement logic to deal with other queues */
1340
1341 binshash(bp, &invalhash);
1342 allocbuf(bp, size);
1343 bufstats.bufs_eblk++;
1344
1345 return (bp);
1346 }
1347
1348 /*
1349 * Zones for the meta data buffers
1350 */
1351
1352 #define MINMETA 512
1353 #define MAXMETA 4096
1354
1355 struct meta_zone_entry {
1356 zone_t mz_zone;
1357 vm_size_t mz_size;
1358 vm_size_t mz_max;
1359 char *mz_name;
1360 };
1361
1362 struct meta_zone_entry meta_zones[] = {
1363 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1364 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1365 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1366 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1367 {NULL, 0, 0, "" } /* End */
1368 };
1369
1370 /*
1371 * Initialize the meta data zones
1372 */
1373 static void
1374 bufzoneinit(void)
1375 {
1376 int i;
1377
1378 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1379 meta_zones[i].mz_zone =
1380 zinit(meta_zones[i].mz_size,
1381 meta_zones[i].mz_max,
1382 PAGE_SIZE,
1383 meta_zones[i].mz_name);
1384 }
1385 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1386 }
1387
1388 static __inline__ zone_t
1389 getbufzone(size_t size)
1390 {
1391 int i;
1392
1393 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
1394 panic("getbufzone: incorrect size = %d", size);
1395
1396 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1397 if (meta_zones[i].mz_size >= size)
1398 break;
1399 }
1400
1401 return (meta_zones[i].mz_zone);
1402 }
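
/*
 * Illustrative note (added comment, not from the original file): the loop
 * above picks the first zone whose element size is >= the request, so a
 * 1536-byte meta-data buffer (a multiple of MINMETA) is carved out of the
 * "buf.2048" zone, e.g.
 *
 *	zone_t z = getbufzone(1536);	// returns meta_zones[2].mz_zone
 */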
1403
1404 /*
1405 * With UBC, there is no need to expand / shrink the file data
1406 * buffer. The VM uses the same pages, hence no waste.
1407 * All the file data buffers can have one size.
1408 * In fact expand / shrink would be an expensive operation.
1409 *
1410 * Only exception to this is meta-data buffers. Most of the
1411 * meta data operations are smaller than PAGE_SIZE. Having the
1412 * meta-data buffers grow and shrink as needed, optimizes use
1413 * of the kernel wired memory.
1414 */
1415
1416 int
1417 allocbuf(bp, size)
1418 struct buf *bp;
1419 int size;
1420 {
1421 vm_size_t desired_size;
1422
1423 desired_size = roundup(size, CLBYTES);
1424
1425 if(desired_size < PAGE_SIZE)
1426 desired_size = PAGE_SIZE;
1427 if (desired_size > MAXBSIZE)
1428 panic("allocbuf: buffer larger than MAXBSIZE requested");
1429
1430 if (ISSET(bp->b_flags, B_META)) {
1431 kern_return_t kret;
1432 zone_t zprev, z;
1433 size_t nsize = roundup(size, MINMETA);
1434
1435 if (bp->b_data) {
1436 vm_offset_t elem = (vm_offset_t)bp->b_data;
1437
1438 if (ISSET(bp->b_flags, B_ZALLOC))
1439 if (bp->b_bufsize <= MAXMETA) {
1440 if (bp->b_bufsize < nsize) {
1441 /* reallocate to a bigger size */
1442 desired_size = nsize;
1443
1444 zprev = getbufzone(bp->b_bufsize);
1445 z = getbufzone(nsize);
1446 bp->b_data = (caddr_t)zalloc(z);
1447 if(bp->b_data == 0)
1448 panic("allocbuf: zalloc() returned NULL");
1449 bcopy(elem, bp->b_data, bp->b_bufsize);
1450 zfree(zprev, elem);
1451 } else {
1452 desired_size = bp->b_bufsize;
1453 }
1454 } else
1455 panic("allocbuf: B_ZALLOC set incorrectly");
1456 else
1457 if (bp->b_bufsize < desired_size) {
1458 /* reallocate to a bigger size */
1459 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1460 if (kret != KERN_SUCCESS)
1461 panic("allocbuf: kmem_alloc() returned %d", kret);
1462 if(bp->b_data == 0)
1463 panic("allocbuf: null b_data");
1464 bcopy(elem, bp->b_data, bp->b_bufsize);
1465 kmem_free(kernel_map, elem, bp->b_bufsize);
1466 } else {
1467 desired_size = bp->b_bufsize;
1468 }
1469 } else {
1470 /* new allocation */
1471 if (nsize <= MAXMETA) {
1472 desired_size = nsize;
1473 z = getbufzone(nsize);
1474 bp->b_data = (caddr_t)zalloc(z);
1475 if(bp->b_data == 0)
1476 panic("allocbuf: zalloc() returned NULL 2");
1477 SET(bp->b_flags, B_ZALLOC);
1478 } else {
1479 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1480 if (kret != KERN_SUCCESS)
1481 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1482 if(bp->b_data == 0)
1483 panic("allocbuf: null b_data 2");
1484 }
1485 }
1486 }
1487
1488 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1489 panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);
1490
1491 bp->b_bufsize = desired_size;
1492 bp->b_bcount = size;
1493 return (0);
1494 }
1495
1496 /*
1497 * Get a new buffer from one of the free lists.
1498 *
1499 * A request for a queue is passed in. The queue from which the buffer was
1500 * taken is returned. Out-of-range queue requests get BQ_EMPTY. A request for
1501 * BQUEUES means no preference. Use heuristics in that case.
1502 * The heuristics are as follows:
1503 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1504 * If none are available, block until one is made available.
1505 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps
1506 * and pick the most stale buffer.
1507 * If the found buffer was marked delayed write, start the async write
1508 * and restart the search.
1509 * Initialize the fields and disassociate the buffer from the vnode.
1510 * Remove the buffer from the hash. Return the buffer and the queue
1511 * on which it was found.
1512 */
1513
1514 static struct buf *
1515 getnewbuf(slpflag, slptimeo, queue)
1516 int slpflag, slptimeo;
1517 int *queue;
1518 {
1519 register struct buf *bp;
1520 register struct buf *lru_bp;
1521 register struct buf *age_bp;
1522 register struct buf *meta_bp;
1523 register int age_time, lru_time, bp_time, meta_time;
1524 int s;
1525 int req = *queue; /* save it for restarts */
1526
1527 start:
1528 s = splbio();
1529
1530 /* invalid request gets empty queue */
1531 if ((*queue > BQUEUES) || (*queue < 0)
1532 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1533 *queue = BQ_EMPTY;
1534
1535 /* (*queue == BQUEUES) means no preference */
1536 if (*queue != BQUEUES) {
1537 /* Try for the requested queue first */
1538 bp = bufqueues[*queue].tqh_first;
1539 if (bp)
1540 goto found;
1541 }
1542
1543 /* Unable to use requested queue */
1544 age_bp = bufqueues[BQ_AGE].tqh_first;
1545 lru_bp = bufqueues[BQ_LRU].tqh_first;
1546 meta_bp = bufqueues[BQ_META].tqh_first;
1547
1548 if (!age_bp && !lru_bp && !meta_bp) {
1549 /*
1550 * Unavailable on AGE or LRU or META queues
1551 * Try the empty list first
1552 */
1553 bp = bufqueues[BQ_EMPTY].tqh_first;
1554 if (bp) {
1555 *queue = BQ_EMPTY;
1556 goto found;
1557 }
1558
1559 /* Create a new temporary buffer header */
1560 bp = (struct buf *)zalloc(buf_hdr_zone);
1561
1562 if (bp) {
1563 bufhdrinit(bp);
1564 BLISTNONE(bp);
1565 binshash(bp, &invalhash);
1566 SET(bp->b_flags, B_HDRALLOC);
1567 *queue = BQ_EMPTY;
1568 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1569 buf_hdr_count++;
1570 goto found;
1571 }
1572
1573 /* Log this error condition */
1574 printf("getnewbuf: No useful buffers");
1575
1576 /* wait for a free buffer of any kind */
1577 needbuffer = 1;
1578 bufstats.bufs_sleeps++;
1579 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1580 splx(s);
1581 return (0);
1582 }
1583
1584 /* Buffer available either on AGE or LRU or META */
1585 bp = NULL;
1586 *queue = -1;
1587
1588 /* Buffer available either on AGE or LRU */
1589 if (!age_bp) {
1590 bp = lru_bp;
1591 *queue = BQ_LRU;
1592 } else if (!lru_bp) {
1593 bp = age_bp;
1594 *queue = BQ_AGE;
1595 } else { /* buffer available on both AGE and LRU */
1596 age_time = time.tv_sec - age_bp->b_timestamp;
1597 lru_time = time.tv_sec - lru_bp->b_timestamp;
1598 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1599 bp = age_bp;
1600 *queue = BQ_AGE;
1601 /*
1602 * we should probably re-timestamp everything in the
1603 * queues at this point with the current time
1604 */
1605 } else {
1606 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1607 bp = lru_bp;
1608 *queue = BQ_LRU;
1609 } else {
1610 bp = age_bp;
1611 *queue = BQ_AGE;
1612 }
1613 }
1614 }
1615
1616 if (!bp) { /* Neither on AGE nor on LRU */
1617 bp = meta_bp;
1618 *queue = BQ_META;
1619 } else if (meta_bp) {
1620 bp_time = time.tv_sec - bp->b_timestamp;
1621 meta_time = time.tv_sec - meta_bp->b_timestamp;
1622
1623 if (!(bp_time < 0) && !(meta_time < 0)) {
1624 /* time not set backwards */
1625 int bp_is_stale;
1626 bp_is_stale = (*queue == BQ_LRU) ?
1627 lru_is_stale : age_is_stale;
1628
1629 if ((meta_time >= meta_is_stale) &&
1630 (bp_time < bp_is_stale)) {
1631 bp = meta_bp;
1632 *queue = BQ_META;
1633 }
1634 }
1635 }
1636
1637 if (bp == NULL)
1638 panic("getnewbuf: null bp");
1639
1640 found:
1641 if (ISSET(bp->b_flags, B_LOCKED)) {
1642 panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
1643 }
1644
1645 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1646 panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);
1647
1648 if(ISSET(bp->b_flags, B_BUSY))
1649 panic("getnewbuf reusing BUSY buf @ 0x%x", bp);
1650
1651 /* Clean it */
1652 if (bcleanbuf(bp)) {
1653 /* bawrite() issued, buffer not ready */
1654 splx(s);
1655 *queue = req;
1656 goto start;
1657 }
1658 splx(s);
1659 return (bp);
1660 }
1661
1662 #include <mach/mach_types.h>
1663 #include <mach/memory_object_types.h>
1664 #include <kern/sched_prim.h>
1665
1666 /*
1667 * Clean a buffer.
1668 * Returns 0 if the buffer is ready to use;
1669 * returns 1 if the buffer was a delayed write and has been queued
1670 * for cleaning (i.e. the buffer is not ready).
1671 */
1672 static int
1673 bcleanbuf(struct buf *bp)
1674 {
1675 int s;
1676 struct ucred *cred;
1677 int hdralloc = 0;
1678
1679 s = splbio();
1680
1681 /* Remove from the queue */
1682 bremfree(bp);
1683
1684 /* Buffer is no longer on free lists. */
1685 SET(bp->b_flags, B_BUSY);
1686
1687 /* Check whether the buffer header was "allocated" */
1688 if (ISSET(bp->b_flags, B_HDRALLOC))
1689 hdralloc = 1;
1690
1691 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1692 panic("bcleanbuf: le_prev is deadbeef");
1693
1694 /*
1695 * If buffer was a delayed write, start the IO by queuing
1696 * it on the LAUNDRY queue, and return 1
1697 */
1698 if (ISSET(bp->b_flags, B_DELWRI)) {
1699 splx(s);
1700 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1701 blaundrycnt++;
1702 wakeup(&blaundrycnt);
1703 /* and give it a chance to run */
1704 (void)thread_block(THREAD_CONTINUE_NULL);
1705 return (1);
1706 }
1707
1708 if (bp->b_vp)
1709 brelvp(bp);
1710 bremhash(bp);
1711 BLISTNONE(bp);
1712
1713 splx(s);
1714
1715 if (ISSET(bp->b_flags, B_META)) {
1716 vm_offset_t elem = (vm_offset_t)bp->b_data;
1717 if (elem == 0)
1718 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1719
1720 if (ISSET(bp->b_flags, B_ZALLOC)) {
1721 if (bp->b_bufsize <= MAXMETA) {
1722 zone_t z;
1723
1724 z = getbufzone(bp->b_bufsize);
1725 bp->b_data = (caddr_t)0xdeadbeef;
1726 zfree(z, elem);
1727 CLR(bp->b_flags, B_ZALLOC);
1728 } else
1729 panic("bcleanbuf: B_ZALLOC set incorrectly");
1730 } else {
1731 bp->b_data = (caddr_t)0xdeadbeef;
1732 kmem_free(kernel_map, elem, bp->b_bufsize);
1733 }
1734 }
1735
1736 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1737
1738 /* disassociate us from our vnode, if we had one... */
1739 s = splbio();
1740
1741 /* clear out various other fields */
1742 bp->b_bufsize = 0;
1743 bp->b_data = 0;
1744 bp->b_flags = B_BUSY;
1745 if (hdralloc)
1746 SET(bp->b_flags, B_HDRALLOC);
1747 bp->b_dev = NODEV;
1748 bp->b_blkno = bp->b_lblkno = 0;
1749 bp->b_iodone = 0;
1750 bp->b_error = 0;
1751 bp->b_resid = 0;
1752 bp->b_bcount = 0;
1753 bp->b_dirtyoff = bp->b_dirtyend = 0;
1754 bp->b_validoff = bp->b_validend = 0;
1755
1756 /* nuke any credentials we were holding */
1757 cred = bp->b_rcred;
1758 if (cred != NOCRED) {
1759 bp->b_rcred = NOCRED;
1760 crfree(cred);
1761 }
1762 cred = bp->b_wcred;
1763 if (cred != NOCRED) {
1764 bp->b_wcred = NOCRED;
1765 crfree(cred);
1766 }
1767 splx(s);
1768 return (0);
1769 }
1770
1771
1772 /*
1773 * Wait for operations on the buffer to complete.
1774 * When they do, extract and return the I/O's error value.
1775 */
1776 int
1777 biowait(bp)
1778 struct buf *bp;
1779 {
1780 int s;
1781
1782 s = splbio();
1783 while (!ISSET(bp->b_flags, B_DONE))
1784 tsleep(bp, PRIBIO + 1, "biowait", 0);
1785 splx(s);
1786
1787 /* check for interruption of I/O (e.g. via NFS), then errors. */
1788 if (ISSET(bp->b_flags, B_EINTR)) {
1789 CLR(bp->b_flags, B_EINTR);
1790 return (EINTR);
1791 } else if (ISSET(bp->b_flags, B_ERROR))
1792 return (bp->b_error ? bp->b_error : EIO);
1793 else
1794 return (0);
1795 }
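
/*
 * Illustrative caller sketch (added comment, not from the original file):
 * the getblk()/VOP_STRATEGY()/biowait() sequence used by bio_doread()
 * above, shown in caller form (names are hypothetical).
 *
 *	bp = getblk(vp, blkno, size, 0, 0, BLK_READ);
 *	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
 *		SET(bp->b_flags, B_READ);
 *		VOP_STRATEGY(bp);
 *	}
 *	error = biowait(bp);	// sleeps until biodone() sets B_DONE
 */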
1796
1797 /*
1798 * Mark I/O complete on a buffer.
1799 *
1800 * If a callback has been requested, e.g. the pageout
1801 * daemon, do so. Otherwise, awaken waiting processes.
1802 *
1803 * [ Leffler, et al., says on p.247:
1804 * "This routine wakes up the blocked process, frees the buffer
1805 * for an asynchronous write, or, for a request by the pagedaemon
1806 * process, invokes a procedure specified in the buffer structure" ]
1807 *
1808 * In real life, the pagedaemon (or other system processes) wants
1809 * to do async stuff too, and doesn't want the buffer brelse()'d.
1810 * (for swap pager, that puts swap buffers on the free lists (!!!),
1811 * for the vn device, that puts malloc'd buffers on the free lists!)
1812 */
1813 void
1814 biodone(bp)
1815 struct buf *bp;
1816 {
1817 boolean_t funnel_state;
1818 struct vnode *vp;
1819
1820 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1821
1822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1823 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1824
1825 if (ISSET(bp->b_flags, B_DONE))
1826 panic("biodone already");
1827 SET(bp->b_flags, B_DONE); /* note that it's done */
1828 /*
1829 * I/O was done, so don't believe
1830 * the DIRTY state from VM anymore
1831 */
1832 CLR(bp->b_flags, B_WASDIRTY);
1833
1834 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1835 vwakeup(bp); /* wake up reader */
1836
1837 if (kdebug_enable) {
1838 int code = DKIO_DONE;
1839
1840 if (bp->b_flags & B_READ)
1841 code |= DKIO_READ;
1842 if (bp->b_flags & B_ASYNC)
1843 code |= DKIO_ASYNC;
1844
1845 if (bp->b_flags & B_META)
1846 code |= DKIO_META;
1847 else if (bp->b_flags & (B_PGIN | B_PAGEOUT))
1848 code |= DKIO_PAGING;
1849
1850 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1851 bp, bp->b_vp, bp->b_resid, bp->b_error, 0);
1852 }
1853
1854 /* Wakeup the throttled write operations as needed */
1855 vp = bp->b_vp;
1856 if (vp
1857 && (vp->v_flag & VTHROTTLED)
1858 && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
1859 vp->v_flag &= ~VTHROTTLED;
1860 wakeup((caddr_t)&vp->v_numoutput);
1861 }
1862
1863 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1864 void (*iodone_func)(struct buf *) = bp->b_iodone;
1865
1866 CLR(bp->b_flags, B_CALL); /* but note callout done */
1867 bp->b_iodone = NULL;
1868
1869 if (iodone_func == NULL) {
1870 panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
1871 } else {
1872 (*iodone_func)(bp);
1873 }
1874 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1875 brelse(bp);
1876 else { /* or just wakeup the buffer */
1877 CLR(bp->b_flags, B_WANTED);
1878 wakeup(bp);
1879 }
1880
1881 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1882 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1883
1884 thread_funnel_set(kernel_flock, funnel_state);
1885 }
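
/*
 * Illustrative caller sketch (added comment, not from the original file):
 * an async I/O with a completion callback ("my_iodone" is hypothetical).
 * Because B_CALL is set, biodone() above invokes the callback instead of
 * doing the wakeup()/brelse(); the callback is then responsible for the
 * buffer.
 *
 *	bp->b_iodone = my_iodone;
 *	SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
 *	VOP_STRATEGY(bp);
 */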
1886
1887 /*
1888 * Return a count of buffers on the "locked" queue.
1889 */
1890 int
1891 count_lock_queue()
1892 {
1893 register struct buf *bp;
1894 register int n = 0;
1895
1896 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1897 bp = bp->b_freelist.tqe_next)
1898 n++;
1899 return (n);
1900 }
1901
1902 /*
1903 * Return a count of 'busy' buffers. Used at the time of shutdown.
1904 */
1905 int
1906 count_busy_buffers()
1907 {
1908 register struct buf *bp;
1909 register int nbusy = 0;
1910
1911 for (bp = &buf[nbuf]; --bp >= buf; )
1912 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1913 nbusy++;
1914 return (nbusy);
1915 }
1916
1917 #if DIAGNOSTIC
1918 /*
1919 * Print out statistics on the current allocation of the buffer pool.
1920 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1921 * in vfs_syscalls.c using sysctl.
1922 */
1923 void
1924 vfs_bufstats()
1925 {
1926 int s, i, j, count;
1927 register struct buf *bp;
1928 register struct bqueues *dp;
1929 int counts[MAXBSIZE/CLBYTES+1];
1930 static char *bname[BQUEUES] =
1931 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1932
1933 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1934 count = 0;
1935 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1936 counts[j] = 0;
1937 s = splbio();
1938 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1939 counts[bp->b_bufsize/CLBYTES]++;
1940 count++;
1941 }
1942 splx(s);
1943 printf("%s: total-%d", bname[i], count);
1944 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1945 if (counts[j] != 0)
1946 printf(", %d-%d", j * CLBYTES, counts[j]);
1947 printf("\n");
1948 }
1949 }
1950 #endif /* DIAGNOSTIC */
1951
1952 #define NRESERVEDIOBUFS 64
1953
1954 __private_extern__ struct buf *
1955 alloc_io_buf(vp, priv)
1956 struct vnode *vp;
1957 int priv;
1958 {
1959 register struct buf *bp;
1960 int s;
1961
1962 s = splbio();
1963
1964 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1965 need_iobuffer = 1;
1966 bufstats.bufs_iobufsleeps++;
1967 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1968 }
1969
1970 while ((bp = iobufqueue.tqh_first) == NULL) {
1971 need_iobuffer = 1;
1972 bufstats.bufs_iobufsleeps++;
1973 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1974 }
1975
1976 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1977 bp->b_timestamp = 0;
1978
1979 /* clear out various fields */
1980 bp->b_flags = B_BUSY;
1981 bp->b_blkno = bp->b_lblkno = 0;
1982
1983 bp->b_iodone = 0;
1984 bp->b_error = 0;
1985 bp->b_resid = 0;
1986 bp->b_bcount = 0;
1987 bp->b_bufsize = 0;
1988 bp->b_vp = vp;
1989
1990 if (vp->v_type == VBLK || vp->v_type == VCHR)
1991 bp->b_dev = vp->v_rdev;
1992 else
1993 bp->b_dev = NODEV;
1994 bufstats.bufs_iobufinuse++;
1995 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1996 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1997 splx(s);
1998
1999 return (bp);
2000 }
2001
2002 __private_extern__ void
2003 free_io_buf(bp)
2004 struct buf *bp;
2005 {
2006 int s;
2007
2008 s = splbio();
2009 /* put buffer back on the head of the iobufqueue */
2010 bp->b_vp = NULL;
2011 bp->b_flags = B_INVAL;
2012
2013 binsheadfree(bp, &iobufqueue, -1);
2014
2015 /* Wake up any processes waiting for any buffer to become free. */
2016 if (need_iobuffer) {
2017 need_iobuffer = 0;
2018 wakeup(&need_iobuffer);
2019 }
2020 bufstats.bufs_iobufinuse--;
2021 splx(s);
2022 }
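
/*
 * Illustrative caller sketch (added comment, not from the original file):
 * how cluster I/O code typically uses these private buffer headers
 * (field values and the completion routine are hypothetical).
 *
 *	bp = alloc_io_buf(vp, 0);
 *	bp->b_blkno = blkno;
 *	bp->b_bcount = bp->b_bufsize = io_size;
 *	bp->b_data = (caddr_t)io_addr;
 *	bp->b_iodone = my_cluster_iodone;
 *	SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
 *	VOP_STRATEGY(bp);
 *	...
 *	free_io_buf(bp);	// from the completion path, when done
 */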
2023
2024 /* disabled for now */
2025
2026 /* XXX move this to a separate file */
2027 /*
2028 * Dynamic Scaling of the Buffer Queues
2029 */
2030
2031 typedef long long blsize_t;
2032
2033 blsize_t MAXNBUF; /* initialize to (mem_size / PAGE_SIZE) */
2034 /* Global tunable limits */
2035 blsize_t nbufh; /* number of buffer headers */
2036 blsize_t nbuflow; /* minimum number of buffer headers required */
2037 blsize_t nbufhigh; /* maximum number of buffer headers allowed */
2038 blsize_t nbuftarget; /* preferred number of buffer headers */
2039
2040 /*
2041 * assertions:
2042 *
2043 * 1. 0 < nbuflow <= nbufh <= nbufhigh
2044 * 2. nbufhigh <= MAXNBUF
2045 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
2046 * 4. nbufh can not be set by sysctl().
2047 */
2048
2049 /* Per queue tunable limits */
2050
2051 struct bufqlim {
2052 blsize_t bl_nlow; /* minimum number of buffer headers required */
2053 blsize_t bl_num; /* number of buffer headers on the queue */
2054 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
2055 blsize_t bl_target; /* preferred number of buffer headers */
2056 long bl_stale; /* Seconds after which a buffer is considered stale */
2057 } bufqlim[BQUEUES];
2058
2059 /*
2060 * assertions:
2061 *
2062 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
2063 * 2. bl_nlhigh <= MAXNBUF
2064 * 3. bufqlim[BQ_META].bl_nlow != 0
2065 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
2066 * file system IO operations)
2067 * 5. bl_num can not be set by sysctl().
2068 * 6. bl_nlhigh <= nbufhigh
2069 */
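
/*
 * A minimal sketch (hypothetical, kept out of the build) of a
 * DIAGNOSTIC-style check for the per queue assertions above.
 */
#if 0
static void
check_bufqlim(void)
{
	int i;

	for (i = 0; i < BQUEUES; i++) {
		if (!(0 <= bufqlim[i].bl_nlow &&
		    bufqlim[i].bl_nlow <= bufqlim[i].bl_num &&
		    bufqlim[i].bl_num <= bufqlim[i].bl_nlhigh))
			panic("bufqlim: queue %d counts out of order", i);
		if (bufqlim[i].bl_nlhigh > MAXNBUF)
			panic("bufqlim: queue %d bl_nlhigh > MAXNBUF", i);
		if (bufqlim[i].bl_nlhigh > nbufhigh)
			panic("bufqlim: queue %d bl_nlhigh > nbufhigh", i);
	}
	if (bufqlim[BQ_META].bl_nlow == 0)
		panic("bufqlim: BQ_META must reserve buffer headers");
}
#endif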
2070
2071 /*
2072 * Rationale:
2073 * ----------
2074  * Defining blsize_t as long would permit 2^31 buffer headers per queue,
2075  * which is enough to describe (2^31 * PAGE_SIZE) bytes of memory per
2076  * queue.
2077  *
2078  * These limits are exported by means of sysctl().
2079  * blsize_t was nevertheless defined as a 64 bit quantity, so that it
2080  * will not have to change as long as the kernel address space does
2081  * not exceed 64 bits.
2082  *
2083  * The low and high limits are initialized at compile time; boot
2084  * arguments can be used to override them, but sysctl() cannot change
2085  * them.  sysctl() can read all of the values but can set only the
2086  * target.  num is the current level.
2087  *
2088  * The advantages of having a "bufqscan" thread do the balancing are:
2089  * it keeps enough buffers on BQ_EMPTY (getnewbuf() by default selects
2090  * a buffer from BQ_EMPTY and performs best when one is found there),
2091  * and it minimizes the chance that getnewbuf() has to start I/O
2092  * itself, which is also a performance win.
2093  *
2094  * It localizes the complex logic (balancing as well as time aging)
2095  * in balancebufq().
2096  *
2097  * It simplifies getnewbuf() by eliminating the time aging code from it.
2098 */
2099
2100 /*
2101 * Algorithm:
2102 * -----------
2103  * The goal of the dynamic scaling of the buffer queues is to keep
2104  * the size of each queue close to its bl_target.  Buffers on a queue
2105  * are time aged.
2106  *
2107  * A dedicated thread is responsible for "balancing" the buffer
2108  * cache queues.
2109  *
2110  * The scan order is: AGE, LRU, META, EMPTY.
2111 */
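
/*
 * For example, with that scan order one pass of the balancing loop in
 * bufqscan_thread() below evaluates
 *	balancebufq(BQ_AGE), balancebufq(BQ_LRU),
 *	balancebufq(BQ_META), balancebufq(BQ_EMPTY)
 * and stops once nextbufq() returns 0; BQ_LOCKED and BQ_LAUNDRY are
 * never balanced.
 */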
2112
2113 long bufqscanwait = 0;
2114
2115 static void bufqscan_thread();
2116 static int balancebufq(int q);
2117 static int btrimempty(int n);
2118 static __inline__ int initbufqscan(void);
2119 static __inline__ int nextbufq(int q);
2120 static void buqlimprt(int all);
2121
2122 static void
2123 bufq_balance_thread_init()
2124 {
2125
2126 if (bufqscanwait++ == 0) {
2127
2128 /* Initialize globals */
2129 MAXNBUF = (mem_size / PAGE_SIZE);
2130 nbufh = nbuf;
2131 nbuflow = min(nbufh, 100);
2132 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
2133 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
2134 nbuftarget = max(nbuflow, nbuftarget);
2135 nbuftarget = min(nbufhigh, nbuftarget);
2136
2137 /*
2138 * Initialize the bufqlim
2139 */
2140
2141 /* LOCKED queue */
2142 bufqlim[BQ_LOCKED].bl_nlow = 0;
2143 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2144 bufqlim[BQ_LOCKED].bl_target = 0;
2145 bufqlim[BQ_LOCKED].bl_stale = 30;
2146
2147 /* LRU queue */
2148 bufqlim[BQ_LRU].bl_nlow = 0;
2149 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
2150 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
2151 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
2152
2153 /* AGE queue */
2154 bufqlim[BQ_AGE].bl_nlow = 0;
2155 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
2156 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
2157 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
2158
2159 /* EMPTY queue */
2160 bufqlim[BQ_EMPTY].bl_nlow = 0;
2161 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
2162 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
2163 bufqlim[BQ_EMPTY].bl_stale = 600000;
2164
2165 /* META queue */
2166 bufqlim[BQ_META].bl_nlow = 0;
2167 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
2168 bufqlim[BQ_META].bl_target = nbuftarget/4;
2169 bufqlim[BQ_META].bl_stale = META_IS_STALE;
2170
2171 /* LAUNDRY queue */
2172 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
2173 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
2174 bufqlim[BQ_LAUNDRY].bl_target = 0;
2175 bufqlim[BQ_LAUNDRY].bl_stale = 30;
2176
2177 buqlimprt(1);
2178 }
2179
2180 /* create worker thread */
2181 kernel_thread(kernel_task, bufqscan_thread);
2182 }
2183
2184 /* The workloop for the buffer balancing thread */
2185 static void
2186 bufqscan_thread()
2187 {
2188 boolean_t funnel_state;
2189 int moretodo = 0;
2190
2191 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2192
2193 for(;;) {
2194 do {
2195 int q; /* buffer queue to process */
2196
2197 q = initbufqscan();
2198 for (; q; ) {
2199 moretodo |= balancebufq(q);
2200 q = nextbufq(q);
2201 }
2202 } while (moretodo);
2203
2204 #if DIAGNOSTIC
2205 vfs_bufstats();
2206 buqlimprt(0);
2207 #endif
2208 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2209 moretodo = 0;
2210 }
2211
2212 (void) thread_funnel_set(kernel_flock, FALSE);
2213 }
2214
2215 /* Seed for the buffer queue balancing */
2216 static __inline__ int
2217 initbufqscan()
2218 {
2219 /* Start with AGE queue */
2220 return (BQ_AGE);
2221 }
2222
2223 /* Pick next buffer queue to balance */
2224 static __inline__ int
2225 nextbufq(int q)
2226 {
2227 int i, order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2228 
2229 for (i = 0; order[i] != 0 && order[i] != q; i++)
2230 continue;	/* find q in the documented scan order */
2231 return (order[i] ? order[i + 1] : 0);	/* 0 terminates the scan */
2232 }
2233
2234 /* function to balance the buffer queues */
2235 static int
2236 balancebufq(int q)
2237 {
2238 int moretodo = 0;
2239 int s = splbio();
2240 int n;
2241
2242 /* reject invalid q */
2243 if ((q < 0) || (q >= BQUEUES))
2244 goto out;
2245
2246 /* LOCKED or LAUNDRY queue MUST not be balanced */
2247 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2248 goto out;
2249
2250 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2251
2252 /* If the queue has fewer buffers than its target, there is nothing more to do */
2253 if (n < 0)
2254 goto out;
2255
2256 if ( n > 8 ) {
2257 /* Balance only a small amount (12.5%) at a time */
2258 n >>= 3;
2259 }
2260
2261 /* EMPTY queue needs special handling */
2262 if (q == BQ_EMPTY) {
2263 moretodo |= btrimempty(n);
2264 goto out;
2265 }
2266
2267 for (; n > 0; n--) {
2268 struct buf *bp = bufqueues[q].tqh_first;
2269 if (!bp)
2270 break;
2271
2272 /* check if it's stale */
2273 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2274 if (bcleanbuf(bp)) {
2275 /* bawrite() issued, bp not ready */
2276 moretodo = 1;
2277 } else {
2278 /* release the cleaned buffer to BQ_EMPTY */
2279 SET(bp->b_flags, B_INVAL);
2280 brelse(bp);
2281 }
2282 } else
2283 break;
2284 }
2285
2286 out:
2287 splx(s);
2288 return (moretodo);
2289 }
2290
2291 static int
2292 btrimempty(int n)
2293 {
2294 /*
2295  * When struct bufs are allocated dynamically, this would reclaim
2296  * up to 'n' of them from the empty queue.
2297 */
2298
2299 return (0);
2300 }
2301
2302 static __inline__ void
2303 bufqinc(int q)
2304 {
2305 if ((q < 0) || (q >= BQUEUES))
2306 return;
2307
2308 bufqlim[q].bl_num++;
2309 return;
2310 }
2311
2312 static __inline__ void
2313 bufqdec(int q)
2314 {
2315 if ((q < 0) || (q >= BQUEUES))
2316 return;
2317
2318 bufqlim[q].bl_num--;
2319 return;
2320 }
2321
2322 static void
2323 buqlimprt(int all)
2324 {
2325 int i;
2326 static char *bname[BQUEUES] =
2327 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2328
2329 if (all)
2330 for (i = 0; i < BQUEUES; i++) {
2331 printf("%s : ", bname[i]);
2332 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
2333 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2334 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
2335 printf("target = %ld, ", (long)bufqlim[i].bl_target);
2336 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
2337 }
2338 else
2339 for (i = 0; i < BQUEUES; i++) {
2340 printf("%s : ", bname[i]);
2341 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2342 }
2343 }
2344
2345 /*
2346  * When getnewbuf() calls bcleanbuf() on the same thread,
2347  * there is a potential for stack overrun and deadlocks.
2348  * So we always hand the work off to a worker thread for completion.
2349 */
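
/*
 * Sketch of the producer side of this handoff (illustrative only; the
 * actual enqueueing code lives elsewhere in this file).  The cleaning
 * path is assumed to park the dirty buffer on BQ_LAUNDRY and bump
 * blaundrycnt, which bcleanbuf_thread() below consumes; an explicit
 * wakeup is not strictly required since the thread also wakes up on a
 * 60 second timeout:
 *
 *	binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
 *	blaundrycnt++;
 *	wakeup(&blaundrycnt);
 */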
2350
2351 static void
2352 bcleanbuf_thread_init()
2353 {
2354 static void bcleanbuf_thread();
2355
2356 /* create worker thread */
2357 kernel_thread(kernel_task, bcleanbuf_thread);
2358 }
2359
2360 static void
2361 bcleanbuf_thread()
2362 {
2363 boolean_t funnel_state;
2364 struct buf *bp;
2365 int error = 0;
2366 int loopcnt = 0;
2367
2368 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2369
2370 doit:
2371 while (blaundrycnt == 0)
2372 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2373 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2374 /* Remove from the queue */
2375 bremfree(bp);
2376 blaundrycnt--;
2377 /* do the IO */
2378 error = bawrite_internal(bp, 0);
2379 if (error) {
2380 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2381 blaundrycnt++;
2382 if (loopcnt > 10) {
2383 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
2384 loopcnt = 0;
2385 } else {
2386 (void)thread_block(THREAD_CONTINUE_NULL);
2387 loopcnt++;
2388 }
2389 }
2390 /* start again */
2391 goto doit;
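	/* NOTREACHED -- the loop above never exits */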
2392
2393 (void) thread_funnel_set(kernel_flock, funnel_state);
2394 }
2395
2396
2397 static int
2398 bp_cmp(void *a, void *b)
2399 {
2400 struct buf *bp_a = *(struct buf **)a,
2401 *bp_b = *(struct buf **)b;
2402 daddr_t res;
2403
2404 // don't have to worry about negative block
2405 // numbers so this is ok to do.
2406 //
2407 res = (bp_a->b_blkno - bp_b->b_blkno);
2408
2409 return (int)res;
2410 }
2411
2412 #define NFLUSH 32
2413
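/*
 * bflushq() gathers up to NFLUSH dirty, un-busied buffers belonging to
 * the given mount point from the chosen queue, sorts each batch by
 * starting block number with bp_cmp() so the asynchronous writes are
 * issued in ascending block order, and returns the number of writes
 * started.
 */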
2414 int
2415 bflushq(int whichq, struct mount *mp)
2416 {
2417 struct buf *bp, *next;
2418 int i, buf_count, s;
2419 int counter=0, total_writes=0;
2420 static struct buf *flush_table[NFLUSH];
2421
2422 if (whichq < 0 || whichq >= BQUEUES) {
2423 return (0);
2424 }
2425
2426
2427 restart:
2428 bp = TAILQ_FIRST(&bufqueues[whichq]);
2429 for(buf_count=0; bp; bp=next) {
2430 next = bp->b_freelist.tqe_next;
2431
2432 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
2433 continue;
2434 }
2435
2436 if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
2437 if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
2438 panic("bflushq: bp @ 0x%x is locked!\n", bp);
2439 }
2440
2441 bremfree(bp);
2442 bp->b_flags |= B_BUSY;
2443 flush_table[buf_count] = bp;
2444 buf_count++;
2445 total_writes++;
2446
2447 if (buf_count >= NFLUSH) {
2448 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2449
2450 for(i=0; i < buf_count; i++) {
2451 bawrite(flush_table[i]);
2452 }
2453
2454 goto restart;
2455 }
2456 }
2457 }
2458
2459 if (buf_count > 0) {
2460 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2461 for(i=0; i < buf_count; i++) {
2462 bawrite(flush_table[i]);
2463 }
2464 }
2465
2466 return total_writes;
2467 }