1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
26 /*-
27 * Copyright (c) 1994 Christopher G. Demetriou
28 * Copyright (c) 1982, 1986, 1989, 1993
29 * The Regents of the University of California. All rights reserved.
30 * (c) UNIX System Laboratories, Inc.
31 * All or some portions of this file are derived from material licensed
32 * to the University of California by American Telephone and Telegraph
33 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
34 * the permission of UNIX System Laboratories, Inc.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * The NEXTSTEP Software License Agreement specifies the terms
65 * and conditions for redistribution.
66 *
67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
68 */
69
70 /*
71 * Some references:
72 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
73 * Leffler, et al.: The Design and Implementation of the 4.3BSD
74 * UNIX Operating System (Addison-Wesley, 1989)
75 */
76
77 #include <sys/param.h>
78 #include <sys/systm.h>
79 #include <sys/proc.h>
80 #include <sys/buf.h>
81 #include <sys/vnode.h>
82 #include <sys/mount.h>
83 #include <sys/trace.h>
84 #include <sys/malloc.h>
85 #include <sys/resourcevar.h>
86 #include <miscfs/specfs/specdev.h>
87 #include <sys/ubc.h>
88 #include <vm/vm_pageout.h>
89 #if DIAGNOSTIC
90 #include <kern/assert.h>
91 #endif /* DIAGNOSTIC */
92 #include <kern/task.h>
93 #include <kern/zalloc.h>
94
95 #include <sys/kdebug.h>
96 #include <machine/spl.h>
97
98 static __inline__ void bufqinc(int q);
99 static __inline__ void bufqdec(int q);
100
101 static int do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
102 int *rasizes, int nrablks, struct ucred *cred, struct buf **bpp, int queuetype);
103 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
104 static int bcleanbuf(struct buf *bp);
105 static int brecover_data(struct buf *bp);
106 extern void vwakeup();
107
108 extern int niobuf; /* The number of IO buffer headers for cluster IO */
109 int blaundrycnt;
110
111 /* zone allocated buffer headers */
112 static zone_t buf_hdr_zone;
113 static int buf_hdr_count;
114
115 #if TRACE
116 struct proc *traceproc;
117 int tracewhich, tracebuf[TRCSIZ];
118 u_int tracex;
119 char traceflags[TR_NFLAGS];
120 #endif /* TRACE */
121
122 /*
123 * Definitions for the buffer hash lists.
124 */
125 #define BUFHASH(dvp, lbn) \
126 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
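/*
 * Note: the hash mixes the vnode pointer (scaled by the size of a vnode, so
 * consecutively allocated vnodes tend to land in different buckets) with the
 * logical block number, then masks the sum with bufhash, the bucket mask
 * produced by hashinit() in bufinit() below.
 */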
127 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
128 u_long bufhash;
129
130 /* Definitions for the buffer stats. */
131 struct bufstats bufstats;
132
133 /* Number of delayed write buffers */
134 int nbdwrite = 0;
135
136 /*
137 * Insq/Remq for the buffer hash lists.
138 */
139 #if 0
140 #define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
141 #define bremhash(bp) LIST_REMOVE(bp, b_hash)
142 #endif /* 0 */
143
144
145 TAILQ_HEAD(ioqueue, buf) iobufqueue;
146 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
147 static int needbuffer;
148 static int need_iobuffer;
149
150 /*
151 * Insq/Remq for the buffer free lists.
152 */
153 #define binsheadfree(bp, dp, whichq) do { \
154 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
155 bufqinc((whichq)); \
156 (bp)->b_whichq = whichq; \
157 (bp)->b_timestamp = time.tv_sec; \
158 } while (0)
159
160 #define binstailfree(bp, dp, whichq) do { \
161 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
162 bufqinc((whichq)); \
163 (bp)->b_whichq = whichq; \
164 (bp)->b_timestamp = time.tv_sec; \
165 } while (0)
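/*
 * Both insert macros record the target queue in b_whichq and stamp
 * b_timestamp with the current time; getnewbuf() later compares that
 * timestamp against lru_is_stale / age_is_stale / meta_is_stale to pick
 * the most stale free buffer to reuse.
 */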
166
167 #define BHASHENTCHECK(bp) \
168 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
169 panic("%x: b_hash.le_prev is not deadbeef", (bp));
170
171 #define BLISTNONE(bp) \
172 (bp)->b_hash.le_next = (struct buf *)0; \
173 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
174
175 /*
176 * Insq/Remq for the vnode usage lists.
177 */
178 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
179 #define bufremvn(bp) { \
180 LIST_REMOVE(bp, b_vnbufs); \
181 (bp)->b_vnbufs.le_next = NOLIST; \
182 }
183
184 simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
185
186 /* limit on the number of per-vnode "in flight" buffer writes */
187 #define BUFWRITE_THROTTLE 9
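/*
 * Once a vnode has BUFWRITE_THROTTLE writes in flight, bdwrite() and
 * bawrite() mark it VTHROTTLED and sleep on v_numoutput; biodone() clears
 * the flag and wakes the sleepers when v_numoutput falls back to a third
 * of this limit.
 */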
188
189
190 /*
191 * Time in seconds before a buffer on a list is
192 * considered a stale buffer
193 */
194 #define LRU_IS_STALE 120 /* default value for the LRU */
195 #define AGE_IS_STALE 60 /* default value for the AGE */
196 #define META_IS_STALE 180 /* default value for the BQ_META */
197
198 int lru_is_stale = LRU_IS_STALE;
199 int age_is_stale = AGE_IS_STALE;
200 int meta_is_stale = META_IS_STALE;
201
202 /* LIST_INSERT_HEAD() with assertions */
203 static __inline__ void
204 blistenterhead(struct bufhashhdr * head, struct buf * bp)
205 {
206 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
207 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
208 (head)->lh_first = bp;
209 bp->b_hash.le_prev = &(head)->lh_first;
210 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
211 panic("blistenterhead: le_prev is deadbeef");
212 }
213
214 static __inline__ void
215 binshash(struct buf *bp, struct bufhashhdr *dp)
216 {
217 struct buf *nbp;
218
219 simple_lock(&bufhashlist_slock);
220
221 #if 0
222 if((bad = incore(bp->b_vp, bp->b_lblkno)))
223 panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);
224 #endif /* 0 */
225
226 BHASHENTCHECK(bp);
227
228 nbp = dp->lh_first;
229 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
230 if(nbp == bp)
231 panic("buf already in hashlist");
232 }
233
234 blistenterhead(dp, bp);
235 simple_unlock(&bufhashlist_slock);
236 }
237
238 static __inline__ void
239 bremhash(struct buf *bp)
240 {
241 simple_lock(&bufhashlist_slock);
242 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
243 panic("bremhash le_prev is deadbeef");
244 if (bp->b_hash.le_next == bp)
245 panic("bremhash: next points to self");
246
247 if (bp->b_hash.le_next != NULL)
248 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
249 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
250 simple_unlock(&bufhashlist_slock);
251 }
252
253 /*
254 * Remove a buffer from the free list it's on
255 */
256 void
257 bremfree(bp)
258 struct buf *bp;
259 {
260 struct bqueues *dp = NULL;
261 int whichq = -1;
262
263 /*
264 * We only calculate the head of the freelist when removing
265 * the last element of the list as that is the only time that
266 * it is needed (e.g. to reset the tail pointer).
267 *
268 * NB: This makes an assumption about how tailq's are implemented.
269 */
270 if (bp->b_freelist.tqe_next == NULL) {
271 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
272 if (dp->tqh_last == &bp->b_freelist.tqe_next)
273 break;
274 if (dp == &bufqueues[BQUEUES])
275 panic("bremfree: lost tail");
276 }
277 TAILQ_REMOVE(dp, bp, b_freelist);
278 whichq = bp->b_whichq;
279 bufqdec(whichq);
280 bp->b_whichq = -1;
281 bp->b_timestamp = 0;
282 }
283
284 /*
285 * Associate a buffer with a vnode.
286 */
287 static void
288 bgetvp(vp, bp)
289 register struct vnode *vp;
290 register struct buf *bp;
291 {
292
293 if (bp->b_vp != vp)
294 panic("bgetvp: not free");
295 VHOLD(vp);
296 bp->b_vp = vp;
297 if (vp->v_type == VBLK || vp->v_type == VCHR)
298 bp->b_dev = vp->v_rdev;
299 else
300 bp->b_dev = NODEV;
301 /*
302 * Insert onto list for new vnode.
303 */
304 bufinsvn(bp, &vp->v_cleanblkhd);
305 }
306
307 /*
308 * Disassociate a buffer from a vnode.
309 */
310 static void
311 brelvp(bp)
312 register struct buf *bp;
313 {
314 struct vnode *vp;
315
316 if (bp->b_vp == (struct vnode *) 0)
317 panic("brelvp: NULL vp");
318 /*
319 * Delete from old vnode list, if on one.
320 */
321 if (bp->b_vnbufs.le_next != NOLIST)
322 bufremvn(bp);
323 vp = bp->b_vp;
324 bp->b_vp = (struct vnode *) 0;
325 HOLDRELE(vp);
326 }
327
328 /*
329 * Reassign a buffer from one vnode to another.
330 * Used to assign file specific control information
331 * (indirect blocks) to the vnode to which they belong.
332 */
333 void
334 reassignbuf(bp, newvp)
335 register struct buf *bp;
336 register struct vnode *newvp;
337 {
338 register struct buflists *listheadp;
339
340 if (newvp == NULL) {
341 printf("reassignbuf: NULL");
342 return;
343 }
344 /*
345 * Delete from old vnode list, if on one.
346 */
347 if (bp->b_vnbufs.le_next != NOLIST)
348 bufremvn(bp);
349 /*
350 * If dirty, put on list of dirty buffers;
351 * otherwise insert onto list of clean buffers.
352 */
353 if (ISSET(bp->b_flags, B_DELWRI))
354 listheadp = &newvp->v_dirtyblkhd;
355 else
356 listheadp = &newvp->v_cleanblkhd;
357 bufinsvn(bp, listheadp);
358 }
359
360 static __inline__ void
361 bufhdrinit(struct buf *bp)
362 {
363 bzero((char *)bp, sizeof *bp);
364 bp->b_dev = NODEV;
365 bp->b_rcred = NOCRED;
366 bp->b_wcred = NOCRED;
367 bp->b_vnbufs.le_next = NOLIST;
368 bp->b_flags = B_INVAL;
369
370 return;
371 }
372
373 /*
374 * Initialize buffers and hash links for buffers.
375 */
376 __private_extern__ void
377 bufinit()
378 {
379 register struct buf *bp;
380 register struct bqueues *dp;
381 register int i;
382 int metabuf;
383 long whichq;
384 static void bufzoneinit();
385 static void bcleanbuf_thread_init();
386
387 /* Initialize the buffer queues ('freelists') and the hash table */
388 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
389 TAILQ_INIT(dp);
390 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
391
392 simple_lock_init(&bufhashlist_slock );
393
394 metabuf = nbuf/8; /* reserved for meta buf */
395
396 /* Initialize the buffer headers */
397 for (i = 0; i < nbuf; i++) {
398 bp = &buf[i];
399 bufhdrinit(bp);
400
401 /*
402 * metabuf buffer headers on the meta-data list and
403 * rest of the buffer headers on the empty list
404 */
405 if (--metabuf)
406 whichq = BQ_META;
407 else
408 whichq = BQ_EMPTY;
409
410 BLISTNONE(bp);
411 dp = &bufqueues[whichq];
412 binsheadfree(bp, dp, whichq);
413 binshash(bp, &invalhash);
414 }
415
416 for (; i < nbuf + niobuf; i++) {
417 bp = &buf[i];
418 bufhdrinit(bp);
419 binsheadfree(bp, &iobufqueue, -1);
420 }
421
422 printf("using %d buffer headers and %d cluster IO buffer headers\n",
423 nbuf, niobuf);
424
425 /* Set up zones used by the buffer cache */
426 bufzoneinit();
427
428 /* start the bcleanbuf() thread */
429 bcleanbuf_thread_init();
430
431 #if 0 /* notyet */
432 {
433 static void bufq_balance_thread_init();
434 /* create a thread to do dynamic buffer queue balancing */
435 bufq_balance_thread_init();
436 }
437 #endif /* notyet */
438 }
439
440 static struct buf *
441 bio_doread(vp, blkno, size, cred, async, queuetype)
442 struct vnode *vp;
443 daddr_t blkno;
444 int size;
445 struct ucred *cred;
446 int async;
447 int queuetype;
448 {
449 register struct buf *bp;
450 struct proc *p = current_proc();
451
452 bp = getblk(vp, blkno, size, 0, 0, queuetype);
453
454 /*
455 * If the buffer does not have valid data, start a read.
456 * Note that if the buffer is B_INVAL, getblk() won't return it.
457 * Therefore, it's valid if its I/O has completed or been delayed.
458 */
459 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
460 /* Start I/O for the buffer (keeping credentials). */
461 SET(bp->b_flags, B_READ | async);
462 if (cred != NOCRED && bp->b_rcred == NOCRED) {
463 /*
464 * NFS has embedded ucred.
465 * Can not crhold() here as that causes zone corruption
466 */
467 bp->b_rcred = crdup(cred);
468 }
469
470 VOP_STRATEGY(bp);
471
472 trace(TR_BREADMISS, pack(vp, size), blkno);
473
474 /* Pay for the read. */
475 if (p && p->p_stats)
476 p->p_stats->p_ru.ru_inblock++; /* XXX */
477 } else if (async) {
478 brelse(bp);
479 }
480
481 trace(TR_BREADHIT, pack(vp, size), blkno);
482
483 return (bp);
484 }
485 /*
486 * Read a disk block.
487 * This algorithm is described in Bach (p.54).
488 */
489 int
490 bread(vp, blkno, size, cred, bpp)
491 struct vnode *vp;
492 daddr_t blkno;
493 int size;
494 struct ucred *cred;
495 struct buf **bpp;
496 {
497 register struct buf *bp;
498
499 /* Get buffer for block. */
500 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
501
502 /* Wait for the read to complete, and return result. */
503 return (biowait(bp));
504 }
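/*
 * A minimal usage sketch (illustrative only, not code from this file):
 * a caller reads a block, inspects the data, and releases the buffer.
 * "vp", "blkno" and "size" are assumed to be supplied by the caller.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if ((error = bread(vp, blkno, size, NOCRED, &bp))) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine bp->b_data ...
 *	brelse(bp);	(or bdwrite(bp) / bawrite(bp) if the data was modified)
 */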
505
506 /*
507 * Read a disk block. [bread() for meta-data]
508 * This algorithm is described in Bach (p.54).
509 */
510 int
511 meta_bread(vp, blkno, size, cred, bpp)
512 struct vnode *vp;
513 daddr_t blkno;
514 int size;
515 struct ucred *cred;
516 struct buf **bpp;
517 {
518 register struct buf *bp;
519
520 /* Get buffer for block. */
521 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
522
523 /* Wait for the read to complete, and return result. */
524 return (biowait(bp));
525 }
526
527 /*
528 * Read-ahead multiple disk blocks. The first is sync, the rest async.
529 */
530 int
531 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
532 struct vnode *vp;
533 daddr_t blkno; int size;
534 daddr_t rablks[]; int rasizes[];
535 int nrablks;
536 struct ucred *cred;
537 struct buf **bpp;
538 {
539 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
540 }
541
542 /*
543 * Read-ahead multiple disk blocks. The first is sync, the rest async.
544 * [breadn() for meta-data]
545 */
546 int
547 meta_breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
548 struct vnode *vp;
549 daddr_t blkno; int size;
550 daddr_t rablks[]; int rasizes[];
551 int nrablks;
552 struct ucred *cred;
553 struct buf **bpp;
554 {
555 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
556 }
557
558 /*
559 * Perform the reads for breadn() and meta_breadn().
560 * Trivial modification to the breada algorithm presented in Bach (p.55).
561 */
562 static int
563 do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, int *rasizes,
564 int nrablks, struct ucred *cred, struct buf **bpp, int queuetype)
565 {
566 register struct buf *bp;
567 int i;
568
569 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
570
571 /*
572 * For each of the read-ahead blocks, start a read, if necessary.
573 */
574 for (i = 0; i < nrablks; i++) {
575 /* If it's in the cache, just go on to next one. */
576 if (incore(vp, rablks[i]))
577 continue;
578
579 /* Get a buffer for the read-ahead block */
580 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
581 }
582
583 /* Otherwise, we had to start a read for it; wait until it's valid. */
584 return (biowait(bp));
585 }
586
587 /*
588 * Read with single-block read-ahead. Defined in Bach (p.55), but
589 * implemented as a call to breadn().
590 * XXX for compatibility with old file systems.
591 */
592 int
593 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
594 struct vnode *vp;
595 daddr_t blkno; int size;
596 daddr_t rablkno; int rabsize;
597 struct ucred *cred;
598 struct buf **bpp;
599 {
600
601 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
602 }
603
604 /*
605 * Block write. Described in Bach (p.56)
606 */
607 int
608 bwrite(bp)
609 struct buf *bp;
610 {
611 int rv, sync, wasdelayed;
612 struct proc *p = current_proc();
613 struct vnode *vp = bp->b_vp;
614
615 if (bp->b_data == 0) {
616 if (brecover_data(bp) == 0)
617 return (0);
618 }
619 /* Remember buffer type, to switch on it later. */
620 sync = !ISSET(bp->b_flags, B_ASYNC);
621 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
622 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
623 if (wasdelayed) {
624 nbdwrite--;
625 wakeup((caddr_t)&nbdwrite);
626 }
627
628 if (!sync) {
629 /*
630 * If not synchronous, pay for the I/O operation and make
631 * sure the buf is on the correct vnode queue. We have
632 * to do this now, because if we don't, the vnode may not
633 * be properly notified that its I/O has completed.
634 */
635 if (wasdelayed)
636 reassignbuf(bp, vp);
637 else
638 if (p && p->p_stats)
639 p->p_stats->p_ru.ru_oublock++; /* XXX */
640 }
641
642 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
643
644 /* Initiate disk write. Make sure the appropriate party is charged. */
645 SET(bp->b_flags, B_WRITEINPROG);
646 vp->v_numoutput++;
647
648 VOP_STRATEGY(bp);
649
650 if (sync) {
651 /*
652 * If I/O was synchronous, wait for it to complete.
653 */
654 rv = biowait(bp);
655
656 /*
657 * Pay for the I/O operation, if it's not been paid for, and
658 * make sure it's on the correct vnode queue. (async operations
659 * were paid for above.)
660 */
661 if (wasdelayed)
662 reassignbuf(bp, vp);
663 else
664 if (p && p->p_stats)
665 p->p_stats->p_ru.ru_oublock++; /* XXX */
666
667 /* Release the buffer. */
668 // XXXdbg - only if the unused bit is set
669 if (!ISSET(bp->b_flags, B_NORELSE)) {
670 brelse(bp);
671 } else {
672 CLR(bp->b_flags, B_NORELSE);
673 }
674
675 return (rv);
676 } else {
677 return (0);
678 }
679 }
680
681 int
682 vn_bwrite(ap)
683 struct vop_bwrite_args *ap;
684 {
685 return (bwrite(ap->a_bp));
686 }
687
688 /*
689 * Delayed write.
690 *
691 * The buffer is marked dirty, but is not queued for I/O.
692 * This routine should be used when the buffer is expected
693 * to be modified again soon, typically a small write that
694 * partially fills a buffer.
695 *
696 * NB: magnetic tapes cannot be delayed; they must be
697 * written in the order that the writes are requested.
698 *
699 * Described in Leffler, et al. (pp. 208-213).
700 *
701 * Note: With the ability to allocate additional buffer
702 * headers, we can get into the situation where "too" many
703 * bdwrite()s can create a situation where the kernel creates
704 * buffers faster than the disks can service them. Doing a bawrite() in
705 * cases where we have "too many" outstanding bdwrite()s avoids that.
706 */
707 __private_extern__ int
708 bdwrite_internal(bp, return_error)
709 struct buf *bp;
710 int return_error;
711 {
712 struct proc *p = current_proc();
713 struct vnode *vp = bp->b_vp;
714
715 /*
716 * If the block hasn't been seen before:
717 * (1) Mark it as having been seen,
718 * (2) Charge for the write,
719 * (3) Make sure it's on its vnode's correct block list.
720 */
721 if (!ISSET(bp->b_flags, B_DELWRI)) {
722 SET(bp->b_flags, B_DELWRI);
723 if (p && p->p_stats)
724 p->p_stats->p_ru.ru_oublock++; /* XXX */
725 nbdwrite ++;
726 reassignbuf(bp, vp);
727 }
728
729 /* If this is a tape block, write the block now. */
730 if (ISSET(bp->b_flags, B_TAPE)) {
731 /* bwrite(bp); */
732 VOP_BWRITE(bp);
733 return (0);
734 }
735
736 /*
737 * If the vnode has "too many" write operations in progress
738 * wait for them to finish the IO
739 */
740 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
741 vp->v_flag |= VTHROTTLED;
742 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
743 }
744
745 /*
746 * If we have too many delayed write buffers,
747 * more than we can "safely" handle, just fall back to
748 * doing the async write
749 */
750 if (nbdwrite < 0)
751 panic("bdwrite: Negative nbdwrite");
752
753 // can't do a bawrite() if the LOCKED bit is set because the
754 // buffer is part of a transaction and can't go to disk until
755 // the LOCKED bit is cleared.
756 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
757 if (return_error)
758 return (EAGAIN);
759 else
760 bawrite(bp);
761 return (0);
762 }
763
764 /* Otherwise, the "write" is done, so mark and release the buffer. */
765 SET(bp->b_flags, B_DONE);
766 brelse(bp);
767 return (0);
768 }
769
770 void
771 bdwrite(bp)
772 struct buf *bp;
773 {
774 (void) bdwrite_internal(bp, 0);
775 }
776
777
778 /*
779 * Asynchronous block write; just an asynchronous bwrite().
780 *
781 * Note: With the ability to allocate additional buffer
782 * headers, we can get into the situation where "too" many
783 * bawrite()s can create a situation where the kernel creates
784 * buffers faster than the disks can service them.
785 * We limit the number of "in flight" writes a vnode can have to
786 * avoid this.
787 */
788 static int
789 bawrite_internal(bp, throttle)
790 struct buf *bp;
791 int throttle;
792 {
793 struct vnode *vp = bp->b_vp;
794
795 if (vp) {
796 /*
797 * If the vnode has "too many" write operations in progress
798 * wait for them to finish the IO
799 */
800 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
801 if (throttle) {
802 vp->v_flag |= VTHROTTLED;
803 (void)tsleep((caddr_t)&vp->v_numoutput,
804 PRIBIO + 1, "bawrite", 0);
805 } else
806 return (EWOULDBLOCK);
807 }
808 }
809
810 SET(bp->b_flags, B_ASYNC);
811 VOP_BWRITE(bp);
812 return (0);
813 }
814
815 void
816 bawrite(bp)
817 struct buf *bp;
818 {
819 (void) bawrite_internal(bp, 1);
820 }
821
822 /*
823 * bwillwrite:
824 *
825 * Called prior to the locking of any vnodes when we are expecting to
826 * write. We do not want to starve the buffer cache with too many
827 * dirty buffers so we block here. By blocking prior to the locking
828 * of any vnodes we attempt to avoid the situation where a locked vnode
829 * prevents the various system daemons from flushing related buffers.
830 */
831
832 void
833 bwillwrite(void)
834 {
835 /* XXX To be implemented later */
836 }
837
838 /*
839 * Release a buffer on to the free lists.
840 * Described in Bach (p. 46).
841 */
842 void
843 brelse(bp)
844 struct buf *bp;
845 {
846 struct bqueues *bufq;
847 int s;
848 long whichq;
849
850 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
851 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
852 bp->b_flags, 0);
853
854 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
855
856 // if we're invalidating a buffer that has the B_CALL bit
857 // set then call the b_iodone function so it gets cleaned
858 // up properly.
859 //
860 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
861 if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
862 panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
863 }
864 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
865 void (*iodone_func)(struct buf *) = bp->b_iodone;
866
867 CLR(bp->b_flags, B_CALL); /* but note callout done */
868 bp->b_iodone = NULL;
869
870 if (iodone_func == NULL) {
871 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
872 }
873 (*iodone_func)(bp);
874 }
875 }
876
877 /* IO is done. Cleanup the UPL state */
878 if (!ISSET(bp->b_flags, B_META)
879 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
880 kern_return_t kret;
881 upl_t upl;
882 int upl_flags;
883
884 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
885 if ( !ISSET(bp->b_flags, B_INVAL)) {
886 kret = ubc_create_upl(bp->b_vp,
887 ubc_blktooff(bp->b_vp, bp->b_lblkno),
888 bp->b_bufsize,
889 &upl,
890 NULL,
891 UPL_PRECIOUS);
892 if (kret != KERN_SUCCESS)
893 panic("brelse: Failed to get pagelists");
894 #ifdef UBC_DEBUG
895 upl_ubc_alias_set(upl, bp, 5);
896 #endif /* UBC_DEBUG */
897 } else
898 upl = (upl_t) 0;
899 } else {
900 upl = bp->b_pagelist;
901
902 if (bp->b_data) {
903 kret = ubc_upl_unmap(upl);
904
905 if (kret != KERN_SUCCESS)
906 panic("kernel_upl_unmap failed");
907 bp->b_data = 0;
908 }
909 }
910 if (upl) {
911 if (bp->b_flags & (B_ERROR | B_INVAL)) {
912 if (bp->b_flags & (B_READ | B_INVAL))
913 upl_flags = UPL_ABORT_DUMP_PAGES;
914 else
915 upl_flags = 0;
916 ubc_upl_abort(upl, upl_flags);
917 } else {
918 if (ISSET(bp->b_flags, B_NEEDCOMMIT))
919 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
920 else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
921 upl_flags = UPL_COMMIT_SET_DIRTY ;
922 else
923 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
924 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
925 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
926 }
927 s = splbio();
928 CLR(bp->b_flags, B_PAGELIST);
929 bp->b_pagelist = 0;
930 splx(s);
931 }
932 } else {
933 if(ISSET(bp->b_flags, B_PAGELIST))
934 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
935 }
936
937 /* Wake up any processes waiting for any buffer to become free. */
938 if (needbuffer) {
939 needbuffer = 0;
940 wakeup(&needbuffer);
941 }
942
943 /* Wake up any processes waiting for _this_ buffer to become free. */
944 if (ISSET(bp->b_flags, B_WANTED)) {
945 CLR(bp->b_flags, B_WANTED);
946 wakeup(bp);
947 }
948
949 /* Block disk interrupts. */
950 s = splbio();
951
952 /*
953 * Determine which queue the buffer should be on, then put it there.
954 */
955
956 /* If it's locked, don't report an error; try again later. */
957 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
958 CLR(bp->b_flags, B_ERROR);
959
960 /* If it's not cacheable, or an error, mark it invalid. */
961 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
962 SET(bp->b_flags, B_INVAL);
963
964 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
965 /*
966 * If it's invalid or empty, dissociate it from its vnode
967 * and put on the head of the appropriate queue.
968 */
969 if (bp->b_vp)
970 brelvp(bp);
971 if (ISSET(bp->b_flags, B_DELWRI)) {
972 CLR(bp->b_flags, B_DELWRI);
973 nbdwrite--;
974 wakeup((caddr_t)&nbdwrite);
975 }
976 if (bp->b_bufsize <= 0)
977 whichq = BQ_EMPTY; /* no data */
978 else if (ISSET(bp->b_flags, B_META))
979 whichq = BQ_META; /* meta-data */
980 else
981 whichq = BQ_AGE; /* invalid data */
982
983 bufq = &bufqueues[whichq];
984 binsheadfree(bp, bufq, whichq);
985 } else {
986 /*
987 * It has valid data. Put it on the end of the appropriate
988 * queue, so that it'll stick around for as long as possible.
989 */
990 if (ISSET(bp->b_flags, B_LOCKED))
991 whichq = BQ_LOCKED; /* locked in core */
992 else if (ISSET(bp->b_flags, B_META))
993 whichq = BQ_META; /* meta-data */
994 else if (ISSET(bp->b_flags, B_AGE))
995 whichq = BQ_AGE; /* stale but valid data */
996 else
997 whichq = BQ_LRU; /* valid data */
998
999 bufq = &bufqueues[whichq];
1000 binstailfree(bp, bufq, whichq);
1001 }
1002
1003 /* Unlock the buffer. */
1004 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
1005
1006 /* Allow disk interrupts. */
1007 splx(s);
1008
1009 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
1010 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1011 }
1012
1013 /*
1014 * Determine if a block is in the cache.
1015 * Just look on what would be its hash chain. If it's there, return
1016 * a pointer to it, unless it's marked invalid. If it's marked invalid,
1017 * we normally don't return the buffer, unless the caller explicitly
1018 * wants us to.
1019 */
1020 struct buf *
1021 incore(vp, blkno)
1022 struct vnode *vp;
1023 daddr_t blkno;
1024 {
1025 struct buf *bp;
1026
1027 bp = BUFHASH(vp, blkno)->lh_first;
1028
1029 /* Search hash chain */
1030 for (; bp != NULL; bp = bp->b_hash.le_next) {
1031 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
1032 !ISSET(bp->b_flags, B_INVAL))
1033 return (bp);
1034 }
1035
1036 return (0);
1037 }
1038
1039
1040 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
1041 /*
1042 * Get a block of requested size that is associated with
1043 * a given vnode and block offset. If it is found in the
1044 * block cache, mark it as having been found, make it busy
1045 * and return it. Otherwise, return an empty block of the
1046 * correct size. It is up to the caller to ensure that the
1047 * cached blocks are of the correct size.
1048 */
1049 struct buf *
1050 getblk(vp, blkno, size, slpflag, slptimeo, operation)
1051 register struct vnode *vp;
1052 daddr_t blkno;
1053 int size, slpflag, slptimeo, operation;
1054 {
1055 struct buf *bp;
1056 int s, err;
1057 upl_t upl;
1058 upl_page_info_t *pl;
1059 kern_return_t kret;
1060 int error=0;
1061 int pagedirty = 0;
1062
1063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
1064 blkno * PAGE_SIZE, size, operation, 0, 0);
1065 start:
1066
1067 s = splbio();
1068 if ((bp = incore(vp, blkno))) {
1069 /* Found in the Buffer Cache */
1070 if (ISSET(bp->b_flags, B_BUSY)) {
1071 /* but is busy */
1072 switch (operation) {
1073 case BLK_READ:
1074 case BLK_WRITE:
1075 case BLK_META:
1076 SET(bp->b_flags, B_WANTED);
1077 bufstats.bufs_busyincore++;
1078 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
1079 slptimeo);
1080 splx(s);
1081 /*
1082 * Callers who call with PCATCH or timeout are
1083 * willing to deal with the NULL pointer
1084 */
1085 if (err && ((slpflag & PCATCH) ||
1086 ((err == EWOULDBLOCK) && slptimeo)))
1087 return (NULL);
1088 goto start;
1089 /*NOTREACHED*/
1090 break;
1091
1092 case BLK_PAGEIN:
1093 /* pagein operation must not use getblk */
1094 panic("getblk: pagein for incore busy buffer");
1095 splx(s);
1096 /*NOTREACHED*/
1097 break;
1098
1099 case BLK_PAGEOUT:
1100 /* pageout operation must not use getblk */
1101 panic("getblk: pageout for incore busy buffer");
1102 splx(s);
1103 /*NOTREACHED*/
1104 break;
1105
1106 default:
1107 panic("getblk: %d unknown operation 1", operation);
1108 /*NOTREACHED*/
1109 break;
1110 }
1111 } else {
1112 /* not busy */
1113 SET(bp->b_flags, (B_BUSY | B_CACHE));
1114 bremfree(bp);
1115 bufstats.bufs_incore++;
1116 splx(s);
1117
1118 allocbuf(bp, size);
1119 if (ISSET(bp->b_flags, B_PAGELIST))
1120 panic("pagelist buffer is not busy");
1121
1122 switch (operation) {
1123 case BLK_READ:
1124 case BLK_WRITE:
1125 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
1126 kret = ubc_create_upl(vp,
1127 ubc_blktooff(vp, bp->b_lblkno),
1128 bp->b_bufsize,
1129 &upl,
1130 &pl,
1131 UPL_PRECIOUS);
1132 if (kret != KERN_SUCCESS)
1133 panic("Failed to get pagelists");
1134
1135 SET(bp->b_flags, B_PAGELIST);
1136 bp->b_pagelist = upl;
1137
1138 if (!upl_valid_page(pl, 0)) {
1139 if (vp->v_tag != VT_NFS)
1140 panic("getblk: incore buffer without valid page");
1141 CLR(bp->b_flags, B_CACHE);
1142 }
1143
1144 if (upl_dirty_page(pl, 0))
1145 SET(bp->b_flags, B_WASDIRTY);
1146 else
1147 CLR(bp->b_flags, B_WASDIRTY);
1148
1149 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1150 if (kret != KERN_SUCCESS)
1151 panic("getblk: ubc_upl_map() failed with (%d)",
1152 kret);
1153 if (bp->b_data == 0)
1154 panic("ubc_upl_map mapped 0");
1155 }
1156 break;
1157
1158 case BLK_META:
1159 /*
1160 * VM is not involved in IO for the meta data;
1161 * the buffer already has valid data
1162 */
1163 if(bp->b_data == 0)
1164 panic("bp->b_data null incore buf=%x", bp);
1165 break;
1166
1167 case BLK_PAGEIN:
1168 case BLK_PAGEOUT:
1169 panic("getblk: paging operation 1");
1170 break;
1171
1172 default:
1173 panic("getblk: %d unknown operation 2", operation);
1174 /*NOTREACHED*/
1175 break;
1176 }
1177 }
1178 } else { /* not incore() */
1179 int queue = BQ_EMPTY; /* Start with no preference */
1180 splx(s);
1181
1182 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
1183 !(UBCINFOEXISTS(vp))) {
1184 operation = BLK_META;
1185 }
1186 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
1187 goto start;
1188 if (incore(vp, blkno)) {
1189 SET(bp->b_flags, B_INVAL);
1190 binshash(bp, &invalhash);
1191 brelse(bp);
1192 goto start;
1193 }
1194 /*
1195 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
1196 * CALLED! BE CAREFUL.
1197 */
1198
1199 /*
1200 * if it is meta, the queue may be set to another
1201 * type, so reset it as well as mark the buffer B_META
1202 * so that when the buffer is released it will go to the META queue.
1203 * Also, if the vnode is not VREG, then it is META
1204 */
1205 if (operation == BLK_META) {
1206 SET(bp->b_flags, B_META);
1207 queue = BQ_META;
1208 }
1209
1210 bp->b_blkno = bp->b_lblkno = blkno;
1211 bp->b_vp = vp;
1212
1213 /*
1214 * Insert in the hash so that incore() can find it
1215 */
1216 binshash(bp, BUFHASH(vp, blkno));
1217
1218 s = splbio();
1219 bgetvp(vp, bp);
1220 splx(s);
1221
1222 allocbuf(bp, size);
1223
1224 switch (operation) {
1225 case BLK_META:
1226 /* buffer data is invalid */
1227
1228 if(bp->b_data == 0)
1229 panic("bp->b_data is null %x",bp);
1230
1231 bufstats.bufs_miss++;
1232
1233 /* wakeup the buffer */
1234 CLR(bp->b_flags, B_WANTED);
1235 wakeup(bp);
1236 break;
1237
1238 case BLK_READ:
1239 case BLK_WRITE:
1240
1241 if (ISSET(bp->b_flags, B_PAGELIST))
1242 panic("B_PAGELIST in bp=%x",bp);
1243
1244 kret = ubc_create_upl(vp,
1245 ubc_blktooff(vp, blkno),
1246 bp->b_bufsize,
1247 &upl,
1248 &pl,
1249 UPL_PRECIOUS);
1250 if (kret != KERN_SUCCESS)
1251 panic("Failed to get pagelists");
1252
1253 #ifdef UBC_DEBUG
1254 upl_ubc_alias_set(upl, bp, 4);
1255 #endif /* UBC_DEBUG */
1256 bp->b_pagelist = upl;
1257
1258 SET(bp->b_flags, B_PAGELIST);
1259
1260 if (upl_valid_page(pl, 0)) {
1261 SET(bp->b_flags, B_CACHE | B_DONE);
1262 bufstats.bufs_vmhits++;
1263
1264 pagedirty = upl_dirty_page(pl, 0);
1265
1266 if (pagedirty)
1267 SET(bp->b_flags, B_WASDIRTY);
1268
1269 if (vp->v_tag == VT_NFS) {
1270 off_t f_offset;
1271 int valid_size;
1272
1273 bp->b_validoff = 0;
1274 bp->b_dirtyoff = 0;
1275
1276 f_offset = ubc_blktooff(vp, blkno);
1277
1278 if (f_offset > vp->v_ubcinfo->ui_size) {
1279 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1280 bp->b_validend = 0;
1281 bp->b_dirtyend = 0;
1282 } else {
1283 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1284 bp->b_validend = valid_size;
1285
1286 if (pagedirty)
1287 bp->b_dirtyend = valid_size;
1288 else
1289 bp->b_dirtyend = 0;
1290
1291 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1292 bp->b_validend, bp->b_dirtyend,
1293 (int)vp->v_ubcinfo->ui_size, 0, 0);
1294 }
1295 } else {
1296 bp->b_validoff = 0;
1297 bp->b_dirtyoff = 0;
1298
1299 if (pagedirty) {
1300 /* page is dirty */
1301 bp->b_validend = bp->b_bcount;
1302 bp->b_dirtyend = bp->b_bcount;
1303 } else {
1304 /* page is clean */
1305 bp->b_validend = bp->b_bcount;
1306 bp->b_dirtyend = 0;
1307 }
1308 }
1309 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
1310 if(error) {
1311 panic("getblk: VOP_BMAP failed");
1312 /*NOTREACHED*/
1313 /*
1314 * XXX: We probably should invalidate the VM Page
1315 */
1316 bp->b_error = error;
1317 SET(bp->b_flags, (B_ERROR | B_INVAL));
1318 /* undo B_DONE that was set before upl_commit() */
1319 CLR(bp->b_flags, B_DONE);
1320 brelse(bp);
1321 return (0);
1322 }
1323 } else {
1324 bufstats.bufs_miss++;
1325 }
1326 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1327 if (kret != KERN_SUCCESS) {
1328 panic("getblk: ubc_upl_map() "
1329 "failed with (%d)", kret);
1330 }
1331 if (bp->b_data == 0)
1332 panic("kernel_upl_map mapped 0");
1333
1334 break;
1335
1336 case BLK_PAGEIN:
1337 case BLK_PAGEOUT:
1338 panic("getblk: paging operation 2");
1339 break;
1340 default:
1341 panic("getblk: %d unknown operation 3", operation);
1342 /*NOTREACHED*/
1343 break;
1344 }
1345 }
1346
1347 if (bp->b_data == NULL)
1348 panic("getblk: bp->b_addr is null");
1349
1350 if (bp->b_bufsize & 0xfff) {
1351 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1352 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1353 }
1354
1355 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1356 (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1357
1358 return (bp);
1359 }
1360
1361 /*
1362 * Get an empty, disassociated buffer of given size.
1363 */
1364 struct buf *
1365 geteblk(size)
1366 int size;
1367 {
1368 struct buf *bp;
1369 int queue = BQ_EMPTY;
1370
1371 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1372 ;
1373 SET(bp->b_flags, (B_META|B_INVAL));
1374
1375 #if DIAGNOSTIC
1376 assert(queue == BQ_EMPTY);
1377 #endif /* DIAGNOSTIC */
1378 /* XXX need to implement logic to deal with other queues */
1379
1380 binshash(bp, &invalhash);
1381 allocbuf(bp, size);
1382 bufstats.bufs_eblk++;
1383
1384 return (bp);
1385 }
1386
1387 /*
1388 * Zones for the meta data buffers
1389 */
1390
1391 #define MINMETA 512
1392 #define MAXMETA 4096
1393
1394 struct meta_zone_entry {
1395 zone_t mz_zone;
1396 vm_size_t mz_size;
1397 vm_size_t mz_max;
1398 char *mz_name;
1399 };
1400
1401 struct meta_zone_entry meta_zones[] = {
1402 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1403 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1404 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1405 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1406 {NULL, 0, 0, "" } /* End */
1407 };
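/*
 * getbufzone() below returns the first zone whose mz_size is >= the
 * requested size, so meta-data buffers are always carved from one of the
 * 512/1024/2048/4096 byte zones listed above.
 */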
1408
1409 /*
1410 * Initialize the meta data zones
1411 */
1412 static void
1413 bufzoneinit(void)
1414 {
1415 int i;
1416
1417 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1418 meta_zones[i].mz_zone =
1419 zinit(meta_zones[i].mz_size,
1420 meta_zones[i].mz_max,
1421 PAGE_SIZE,
1422 meta_zones[i].mz_name);
1423 }
1424 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1425 }
1426
1427 static __inline__ zone_t
1428 getbufzone(size_t size)
1429 {
1430 int i;
1431
1432 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
1433 panic("getbufzone: incorrect size = %d", size);
1434
1435 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1436 if (meta_zones[i].mz_size >= size)
1437 break;
1438 }
1439
1440 return (meta_zones[i].mz_zone);
1441 }
1442
1443 /*
1444 * With UBC, there is no need to expand / shrink the file data
1445 * buffer. The VM uses the same pages, hence no waste.
1446 * All the file data buffers can have one size.
1447 * In fact expand / shrink would be an expensive operation.
1448 *
1449 * Only exception to this is meta-data buffers. Most of the
1450 * meta data operations are smaller than PAGE_SIZE. Having the
1451 * meta-data buffers grow and shrink as needed optimizes use
1452 * of the kernel wired memory.
1453 */
1454
1455 int
1456 allocbuf(bp, size)
1457 struct buf *bp;
1458 int size;
1459 {
1460 vm_size_t desired_size;
1461
1462 desired_size = roundup(size, CLBYTES);
1463
1464 if(desired_size < PAGE_SIZE)
1465 desired_size = PAGE_SIZE;
1466 if (desired_size > MAXBSIZE)
1467 panic("allocbuf: buffer larger than MAXBSIZE requested");
1468
1469 if (ISSET(bp->b_flags, B_META)) {
1470 kern_return_t kret;
1471 zone_t zprev, z;
1472 size_t nsize = roundup(size, MINMETA);
1473
1474 if (bp->b_data) {
1475 vm_offset_t elem = (vm_offset_t)bp->b_data;
1476
1477 if (ISSET(bp->b_flags, B_ZALLOC))
1478 if (bp->b_bufsize <= MAXMETA) {
1479 if (bp->b_bufsize < nsize) {
1480 /* reallocate to a bigger size */
1481
1482 zprev = getbufzone(bp->b_bufsize);
1483 if (nsize <= MAXMETA) {
1484 desired_size = nsize;
1485 z = getbufzone(nsize);
1486 bp->b_data = (caddr_t)zalloc(z);
1487 if(bp->b_data == 0)
1488 panic("allocbuf: zalloc() returned NULL");
1489 } else {
1490 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1491 if (kret != KERN_SUCCESS)
1492 panic("allocbuf: kmem_alloc() 0 returned %d", kret);
1493 if(bp->b_data == 0)
1494 panic("allocbuf: null b_data 0");
1495 CLR(bp->b_flags, B_ZALLOC);
1496 }
1497 bcopy((const void *)elem, bp->b_data, bp->b_bufsize);
1498 zfree(zprev, elem);
1499 } else {
1500 desired_size = bp->b_bufsize;
1501 }
1502 } else
1503 panic("allocbuf: B_ZALLOC set incorrectly");
1504 else
1505 if (bp->b_bufsize < desired_size) {
1506 /* reallocate to a bigger size */
1507 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1508 if (kret != KERN_SUCCESS)
1509 panic("allocbuf: kmem_alloc() returned %d", kret);
1510 if(bp->b_data == 0)
1511 panic("allocbuf: null b_data");
1512 bcopy((const void *)elem, bp->b_data, bp->b_bufsize);
1513 kmem_free(kernel_map, elem, bp->b_bufsize);
1514 } else {
1515 desired_size = bp->b_bufsize;
1516 }
1517 } else {
1518 /* new allocation */
1519 if (nsize <= MAXMETA) {
1520 desired_size = nsize;
1521 z = getbufzone(nsize);
1522 bp->b_data = (caddr_t)zalloc(z);
1523 if(bp->b_data == 0)
1524 panic("allocbuf: zalloc() returned NULL 2");
1525 SET(bp->b_flags, B_ZALLOC);
1526 } else {
1527 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1528 if (kret != KERN_SUCCESS)
1529 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1530 if(bp->b_data == 0)
1531 panic("allocbuf: null b_data 2");
1532 }
1533 }
1534 }
1535
1536 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1537 panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);
1538
1539 bp->b_bufsize = desired_size;
1540 bp->b_bcount = size;
1541 return (0);
1542 }
1543
1544 /*
1545 * Get a new buffer from one of the free lists.
1546 *
1547 * A request for a queue is passed in. The queue from which the buffer was
1548 * taken is returned. Out of range queue requests get BQ_EMPTY. A request for
1549 * BQUEUES means no preference. Use heuristics in that case.
1550 * The heuristics are as follows:
1551 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1552 * If none is available, block till one is made available.
1553 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
1554 * Pick the most stale buffer.
1555 * If the found buffer was marked delayed write, start the async write
1556 * and restart the search.
1557 * Initialize the fields and disassociate the buffer from the vnode.
1558 * Remove the buffer from the hash. Return the buffer and the queue
1559 * on which it was found.
1560 */
1561
1562 static struct buf *
1563 getnewbuf(slpflag, slptimeo, queue)
1564 int slpflag, slptimeo;
1565 int *queue;
1566 {
1567 register struct buf *bp;
1568 register struct buf *lru_bp;
1569 register struct buf *age_bp;
1570 register struct buf *meta_bp;
1571 register int age_time, lru_time, bp_time, meta_time;
1572 int s;
1573 int req = *queue; /* save it for restarts */
1574
1575 start:
1576 s = splbio();
1577
1578 /* invalid request gets empty queue */
1579 if ((*queue > BQUEUES) || (*queue < 0)
1580 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1581 *queue = BQ_EMPTY;
1582
1583 /* (*queue == BQUEUES) means no preference */
1584 if (*queue != BQUEUES) {
1585 /* Try for the requested queue first */
1586 bp = bufqueues[*queue].tqh_first;
1587 if (bp)
1588 goto found;
1589 }
1590
1591 /* Unable to use requested queue */
1592 age_bp = bufqueues[BQ_AGE].tqh_first;
1593 lru_bp = bufqueues[BQ_LRU].tqh_first;
1594 meta_bp = bufqueues[BQ_META].tqh_first;
1595
1596 if (!age_bp && !lru_bp && !meta_bp) {
1597 /*
1598 * Unavailable on AGE or LRU or META queues
1599 * Try the empty list first
1600 */
1601 bp = bufqueues[BQ_EMPTY].tqh_first;
1602 if (bp) {
1603 *queue = BQ_EMPTY;
1604 goto found;
1605 }
1606
1607 /* Create a new temporary buffer header */
1608 bp = (struct buf *)zalloc(buf_hdr_zone);
1609
1610 if (bp) {
1611 bufhdrinit(bp);
1612 BLISTNONE(bp);
1613 binshash(bp, &invalhash);
1614 SET(bp->b_flags, B_HDRALLOC);
1615 *queue = BQ_EMPTY;
1616 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1617 buf_hdr_count++;
1618 goto found;
1619 }
1620
1621 /* Log this error condition */
1622 printf("getnewbuf: No useful buffers");
1623
1624 /* wait for a free buffer of any kind */
1625 needbuffer = 1;
1626 bufstats.bufs_sleeps++;
1627 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1628 splx(s);
1629 return (0);
1630 }
1631
1632 /* Buffer available either on AGE or LRU or META */
1633 bp = NULL;
1634 *queue = -1;
1635
1636 /* Buffer available either on AGE or LRU */
1637 if (!age_bp) {
1638 bp = lru_bp;
1639 *queue = BQ_LRU;
1640 } else if (!lru_bp) {
1641 bp = age_bp;
1642 *queue = BQ_AGE;
1643 } else { /* buffer available on both AGE and LRU */
1644 age_time = time.tv_sec - age_bp->b_timestamp;
1645 lru_time = time.tv_sec - lru_bp->b_timestamp;
1646 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1647 bp = age_bp;
1648 *queue = BQ_AGE;
1649 /*
1650 * we should probably re-timestamp everything in the
1651 * queues at this point with the current time
1652 */
1653 } else {
1654 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1655 bp = lru_bp;
1656 *queue = BQ_LRU;
1657 } else {
1658 bp = age_bp;
1659 *queue = BQ_AGE;
1660 }
1661 }
1662 }
1663
1664 if (!bp) { /* Neither on AGE nor on LRU */
1665 bp = meta_bp;
1666 *queue = BQ_META;
1667 } else if (meta_bp) {
1668 bp_time = time.tv_sec - bp->b_timestamp;
1669 meta_time = time.tv_sec - meta_bp->b_timestamp;
1670
1671 if (!(bp_time < 0) && !(meta_time < 0)) {
1672 /* time not set backwards */
1673 int bp_is_stale;
1674 bp_is_stale = (*queue == BQ_LRU) ?
1675 lru_is_stale : age_is_stale;
1676
1677 if ((meta_time >= meta_is_stale) &&
1678 (bp_time < bp_is_stale)) {
1679 bp = meta_bp;
1680 *queue = BQ_META;
1681 }
1682 }
1683 }
1684
1685 if (bp == NULL)
1686 panic("getnewbuf: null bp");
1687
1688 found:
1689 if (ISSET(bp->b_flags, B_LOCKED)) {
1690 panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
1691 }
1692
1693 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1694 panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);
1695
1696 if(ISSET(bp->b_flags, B_BUSY))
1697 panic("getnewbuf reusing BUSY buf @ 0x%x", bp);
1698
1699 /* Clean it */
1700 if (bcleanbuf(bp)) {
1701 /* bawrite() issued, buffer not ready */
1702 splx(s);
1703 *queue = req;
1704 goto start;
1705 }
1706 splx(s);
1707 return (bp);
1708 }
1709
1710 #include <mach/mach_types.h>
1711 #include <mach/memory_object_types.h>
1712 #include <kern/sched_prim.h>
1713
1714 /*
1715 * Clean a buffer.
1716 * Returns 0 if the buffer is ready to use,
1717 * returns 1 if it issued a bawrite() to indicate
1718 * that the buffer is not ready.
1719 */
1720 static int
1721 bcleanbuf(struct buf *bp)
1722 {
1723 int s;
1724 struct ucred *cred;
1725 int hdralloc = 0;
1726
1727 s = splbio();
1728
1729 /* Remove from the queue */
1730 bremfree(bp);
1731
1732 /* Buffer is no longer on free lists. */
1733 SET(bp->b_flags, B_BUSY);
1734
1735 /* Check whether the buffer header was "allocated" */
1736 if (ISSET(bp->b_flags, B_HDRALLOC))
1737 hdralloc = 1;
1738
1739 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1740 panic("bcleanbuf: le_prev is deadbeef");
1741
1742 /*
1743 * If buffer was a delayed write, start the IO by queuing
1744 * it on the LAUNDRY queue, and return 1
1745 */
1746 if (ISSET(bp->b_flags, B_DELWRI)) {
1747 splx(s);
1748 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1749 blaundrycnt++;
1750 wakeup(&blaundrycnt);
1751 /* and give it a chance to run */
1752 (void)thread_block(THREAD_CONTINUE_NULL);
1753 return (1);
1754 }
1755
1756 if (bp->b_vp)
1757 brelvp(bp);
1758 bremhash(bp);
1759 BLISTNONE(bp);
1760
1761 splx(s);
1762
1763 if (ISSET(bp->b_flags, B_META)) {
1764 vm_offset_t elem = (vm_offset_t)bp->b_data;
1765 if (elem == 0)
1766 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1767
1768 if (ISSET(bp->b_flags, B_ZALLOC)) {
1769 if (bp->b_bufsize <= MAXMETA) {
1770 zone_t z;
1771
1772 z = getbufzone(bp->b_bufsize);
1773 bp->b_data = (caddr_t)0xdeadbeef;
1774 zfree(z, elem);
1775 CLR(bp->b_flags, B_ZALLOC);
1776 } else
1777 panic("bcleanbuf: B_ZALLOC set incorrectly");
1778 } else {
1779 bp->b_data = (caddr_t)0xdeadbeef;
1780 kmem_free(kernel_map, elem, bp->b_bufsize);
1781 }
1782 }
1783
1784 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1785
1786 /* disassociate us from our vnode, if we had one... */
1787 s = splbio();
1788
1789 /* clear out various other fields */
1790 bp->b_bufsize = 0;
1791 bp->b_data = 0;
1792 bp->b_flags = B_BUSY;
1793 if (hdralloc)
1794 SET(bp->b_flags, B_HDRALLOC);
1795 bp->b_dev = NODEV;
1796 bp->b_blkno = bp->b_lblkno = 0;
1797 bp->b_iodone = 0;
1798 bp->b_error = 0;
1799 bp->b_resid = 0;
1800 bp->b_bcount = 0;
1801 bp->b_dirtyoff = bp->b_dirtyend = 0;
1802 bp->b_validoff = bp->b_validend = 0;
1803
1804 /* nuke any credentials we were holding */
1805 cred = bp->b_rcred;
1806 if (cred != NOCRED) {
1807 bp->b_rcred = NOCRED;
1808 crfree(cred);
1809 }
1810 cred = bp->b_wcred;
1811 if (cred != NOCRED) {
1812 bp->b_wcred = NOCRED;
1813 crfree(cred);
1814 }
1815 splx(s);
1816 return (0);
1817 }
1818
1819
1820 /*
1821 * Wait for operations on the buffer to complete.
1822 * When they do, extract and return the I/O's error value.
1823 */
1824 int
1825 biowait(bp)
1826 struct buf *bp;
1827 {
1828 int s;
1829
1830 s = splbio();
1831 while (!ISSET(bp->b_flags, B_DONE))
1832 tsleep(bp, PRIBIO + 1, "biowait", 0);
1833 splx(s);
1834
1835 /* check for interruption of I/O (e.g. via NFS), then errors. */
1836 if (ISSET(bp->b_flags, B_EINTR)) {
1837 CLR(bp->b_flags, B_EINTR);
1838 return (EINTR);
1839 } else if (ISSET(bp->b_flags, B_ERROR))
1840 return (bp->b_error ? bp->b_error : EIO);
1841 else
1842 return (0);
1843 }
1844
1845 /*
1846 * Mark I/O complete on a buffer.
1847 *
1848 * If a callback has been requested, e.g. the pageout
1849 * daemon, do so. Otherwise, awaken waiting processes.
1850 *
1851 * [ Leffler, et al., says on p.247:
1852 * "This routine wakes up the blocked process, frees the buffer
1853 * for an asynchronous write, or, for a request by the pagedaemon
1854 * process, invokes a procedure specified in the buffer structure" ]
1855 *
1856 * In real life, the pagedaemon (or other system processes) wants
1857 * to do async stuff too, and doesn't want the buffer brelse()'d.
1858 * (for swap pager, that puts swap buffers on the free lists (!!!),
1859 * for the vn device, that puts malloc'd buffers on the free lists!)
1860 */
1861 void
1862 biodone(bp)
1863 struct buf *bp;
1864 {
1865 boolean_t funnel_state;
1866 struct vnode *vp;
1867 extern struct timeval priority_IO_timestamp_for_root;
1868 extern int hard_throttle_on_root;
1869
1870 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1871
1872 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1873 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1874
1875 if (ISSET(bp->b_flags, B_DONE))
1876 panic("biodone already");
1877 SET(bp->b_flags, B_DONE); /* note that it's done */
1878 /*
1879 * I/O was done, so don't believe
1880 * the DIRTY state from VM anymore
1881 */
1882 CLR(bp->b_flags, B_WASDIRTY);
1883
1884 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1885 vwakeup(bp); /* wake up reader */
1886
1887 if (kdebug_enable) {
1888 int code = DKIO_DONE;
1889
1890 if (bp->b_flags & B_READ)
1891 code |= DKIO_READ;
1892 if (bp->b_flags & B_ASYNC)
1893 code |= DKIO_ASYNC;
1894
1895 if (bp->b_flags & B_META)
1896 code |= DKIO_META;
1897 else if (bp->b_flags & (B_PGIN | B_PAGEOUT))
1898 code |= DKIO_PAGING;
1899
1900 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1901 (unsigned int)bp, (unsigned int)bp->b_vp,
1902 bp->b_resid, bp->b_error, 0);
1903 }
1904
1905 /* Wakeup the throttled write operations as needed */
1906 vp = bp->b_vp;
1907 if (vp
1908 && (vp->v_flag & VTHROTTLED)
1909 && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
1910 vp->v_flag &= ~VTHROTTLED;
1911 wakeup((caddr_t)&vp->v_numoutput);
1912 }
1913 if ((bp->b_flags & B_PGIN) && (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
1914 priority_IO_timestamp_for_root = time;
1915 hard_throttle_on_root = 0;
1916 }
1917 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1918 void (*iodone_func)(struct buf *) = bp->b_iodone;
1919
1920 CLR(bp->b_flags, B_CALL); /* but note callout done */
1921 bp->b_iodone = NULL;
1922
1923 if (iodone_func == NULL) {
1924 panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
1925 } else {
1926 (*iodone_func)(bp);
1927 }
1928 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1929 brelse(bp);
1930 else { /* or just wakeup the buffer */
1931 CLR(bp->b_flags, B_WANTED);
1932 wakeup(bp);
1933 }
1934
1935 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1936 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1937
1938 thread_funnel_set(kernel_flock, funnel_state);
1939 }
1940
1941 /*
1942 * Return a count of buffers on the "locked" queue.
1943 */
1944 int
1945 count_lock_queue()
1946 {
1947 register struct buf *bp;
1948 register int n = 0;
1949
1950 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1951 bp = bp->b_freelist.tqe_next)
1952 n++;
1953 return (n);
1954 }
1955
1956 /*
1957 * Return a count of 'busy' buffers. Used at the time of shutdown.
1958 */
1959 int
1960 count_busy_buffers()
1961 {
1962 register struct buf *bp;
1963 register int nbusy = 0;
1964
1965 for (bp = &buf[nbuf]; --bp >= buf; )
1966 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1967 nbusy++;
1968 return (nbusy);
1969 }
1970
1971 #if DIAGNOSTIC
1972 /*
1973 * Print out statistics on the current allocation of the buffer pool.
1974 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1975 * in vfs_syscalls.c using sysctl.
1976 */
1977 void
1978 vfs_bufstats()
1979 {
1980 int s, i, j, count;
1981 register struct buf *bp;
1982 register struct bqueues *dp;
1983 int counts[MAXBSIZE/CLBYTES+1];
1984 static char *bname[BQUEUES] =
1985 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1986
1987 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1988 count = 0;
1989 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1990 counts[j] = 0;
1991 s = splbio();
1992 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1993 counts[bp->b_bufsize/CLBYTES]++;
1994 count++;
1995 }
1996 splx(s);
1997 printf("%s: total-%d", bname[i], count);
1998 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1999 if (counts[j] != 0)
2000 printf(", %d-%d", j * CLBYTES, counts[j]);
2001 printf("\n");
2002 }
2003 }
2004 #endif /* DIAGNOSTIC */
2005
2006 #define NRESERVEDIOBUFS 64
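/*
 * The last NRESERVEDIOBUFS IO buffer headers are held in reserve for
 * privileged callers (priv != 0); ordinary callers sleep in alloc_io_buf()
 * once more than niobuf - NRESERVEDIOBUFS headers are in use.
 */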
2007
2008 __private_extern__ struct buf *
2009 alloc_io_buf(vp, priv)
2010 struct vnode *vp;
2011 int priv;
2012 {
2013 register struct buf *bp;
2014 int s;
2015
2016 s = splbio();
2017
2018 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
2019 need_iobuffer = 1;
2020 bufstats.bufs_iobufsleeps++;
2021 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
2022 }
2023
2024 while ((bp = iobufqueue.tqh_first) == NULL) {
2025 need_iobuffer = 1;
2026 bufstats.bufs_iobufsleeps++;
2027 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
2028 }
2029
2030 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
2031 bp->b_timestamp = 0;
2032
2033 /* clear out various fields */
2034 bp->b_flags = B_BUSY;
2035 bp->b_blkno = bp->b_lblkno = 0;
2036
2037 bp->b_iodone = 0;
2038 bp->b_error = 0;
2039 bp->b_resid = 0;
2040 bp->b_bcount = 0;
2041 bp->b_bufsize = 0;
2042 bp->b_vp = vp;
2043
2044 if (vp->v_type == VBLK || vp->v_type == VCHR)
2045 bp->b_dev = vp->v_rdev;
2046 else
2047 bp->b_dev = NODEV;
2048 bufstats.bufs_iobufinuse++;
2049 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
2050 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
2051 splx(s);
2052
2053 return (bp);
2054 }
2055
2056 __private_extern__ void
2057 free_io_buf(bp)
2058 struct buf *bp;
2059 {
2060 int s;
2061
2062 s = splbio();
2063 /* put buffer back on the head of the iobufqueue */
2064 bp->b_vp = NULL;
2065 bp->b_flags = B_INVAL;
2066
2067 binsheadfree(bp, &iobufqueue, -1);
2068
2069 /* Wake up any processes waiting for any buffer to become free. */
2070 if (need_iobuffer) {
2071 need_iobuffer = 0;
2072 wakeup(&need_iobuffer);
2073 }
2074 bufstats.bufs_iobufinuse--;
2075 splx(s);
2076 }
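/*
 * Editor's illustrative sketch (not part of the original source): a typical
 * caller pattern for the private I/O buffer pool above.  The helper name
 * my_issue_read() and its arguments are hypothetical; VOP_STRATEGY() and
 * biowait() are assumed to be available to the caller, as elsewhere in the
 * buffer cache code.
 */
static int
my_issue_read(struct vnode *vp, daddr_t blkno, caddr_t data, int size)
{
	struct buf *bp;
	int error;

	bp = alloc_io_buf(vp, 0);	/* may sleep until an iobuf is free */

	SET(bp->b_flags, B_READ);
	bp->b_blkno = bp->b_lblkno = blkno;
	bp->b_data = data;
	bp->b_bcount = size;

	VOP_STRATEGY(bp);		/* hand the buffer to the driver */
	error = biowait(bp);		/* sleep until biodone() fires */

	free_io_buf(bp);		/* return the header to iobufqueue */
	return (error);
}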
2077
2078 /* disabled for now */
2079
2080 /* XXX move this to a separate file */
2081 /*
2082 * Dynamic Scaling of the Buffer Queues
2083 */
2084
2085 typedef long long blsize_t;
2086
2087 blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
2088 /* Global tunable limits */
2089 blsize_t nbufh; /* number of buffer headers */
2090 blsize_t nbuflow; /* minimum number of buffer headers required */
2091 blsize_t nbufhigh; /* maximum number of buffer headers allowed */
2092 blsize_t nbuftarget; /* preferred number of buffer headers */
2093
2094 /*
2095 * assertions:
2096 *
2097 * 1. 0 < nbuflow <= nbufh <= nbufhigh
2098 * 2. nbufhigh <= MAXNBUF
2099 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
2100 * 4. nbufh can not be set by sysctl().
2101 */
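/*
 * Editor's illustrative sketch (not part of the original source): the global
 * assertions above written out as a DIAGNOSTIC-style sanity check.  The
 * helper name checkbufglobals() is hypothetical.
 */
#if DIAGNOSTIC
static void
checkbufglobals(void)
{
	if (!(0 < nbuflow && nbuflow <= nbufh && nbufh <= nbufhigh))
		panic("buffer globals: nbuflow/nbufh/nbufhigh out of order");
	if (nbufhigh > MAXNBUF)
		panic("buffer globals: nbufhigh exceeds MAXNBUF");
	if (!(nbuflow <= nbuftarget && nbuftarget <= nbufhigh))
		panic("buffer globals: nbuftarget out of range");
}
#endif /* DIAGNOSTIC */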
2102
2103 /* Per queue tunable limits */
2104
2105 struct bufqlim {
2106 blsize_t bl_nlow; /* minimum number of buffer headers required */
2107 blsize_t bl_num; /* number of buffer headers on the queue */
2108 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
2109 blsize_t bl_target; /* preferred number of buffer headers */
2110 long bl_stale; /* Seconds after which a buffer is considered stale */
2111 } bufqlim[BQUEUES];
2112
2113 /*
2114 * assertions:
2115 *
2116 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
2117 * 2. bl_nlhigh <= MAXNBUF
2118 * 3. bufqlim[BQ_META].bl_nlow != 0
2119 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
2120 * file system IO operations)
2121 * 5. bl_num can not be set by sysctl().
2122 * 6. bl_nlhigh <= nbufhigh
2123 */
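/*
 * Editor's illustrative sketch (not part of the original source): the
 * per-queue assertions above as a DIAGNOSTIC-style check.  The helper
 * name checkbufqlim() is hypothetical.
 */
#if DIAGNOSTIC
static void
checkbufqlim(int q)
{
	struct bufqlim *bl = &bufqlim[q];

	if (!(0 <= bl->bl_nlow && bl->bl_nlow <= bl->bl_num &&
	    bl->bl_num <= bl->bl_nlhigh))
		panic("bufqlim[%d]: nlow/num/nlhigh out of order", q);
	if (bl->bl_nlhigh > MAXNBUF || bl->bl_nlhigh > nbufhigh)
		panic("bufqlim[%d]: bl_nlhigh too large", q);
	if (q == BQ_META && bl->bl_nlow == 0)
		panic("bufqlim[BQ_META]: bl_nlow must be non-zero");
}
#endif /* DIAGNOSTIC */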
2124
2125 /*
2126 * Rationale:
2127 * ----------
2128 * Defining blsize_t as a long would permit 2^31 buffer headers per queue,
2129 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
2130 *
2131 * These limits are exported by means of sysctl().
2132 * It was decided to define blsize_t as a 64 bit quantity,
2133 * which ensures that it will not need to change as long as the
2134 * kernel does not exceed a 64 bit address space.
2135 *
2136 * The low and high limits are initialized at compile time,
2137 * and boot arguments can be used to override them; sysctl()
2138 * does not change them. sysctl() can get all of the values
2139 * but can set only the target. num is the current level.
2140 *
2141 * Advantages of having a "bufqscan" thread do the balancing:
2142 * It keeps enough bufs on BQ_EMPTY.
2143 * getnewbuf() by default will always select a buffer from BQ_EMPTY;
2144 * getnewbuf() performs best if a buffer is found there.
2145 * This also minimizes the possibility of starting IO
2146 * from getnewbuf(). That is a performance win, too.
2147 *
2148 * It localizes the complex logic [balancing as well as time aging]
2149 * in balancebufq().
2150 *
2151 * It simplifies getnewbuf() by eliminating the time aging code.
2152 */
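/*
 * Editor's illustrative sketch (not part of the original source): the kind
 * of validation a sysctl handler would apply when user space proposes a new
 * nbuftarget -- only the target is settable, and it must respect the global
 * assertions above.  The helper name set_nbuftarget() is hypothetical, and
 * EINVAL is assumed to be available via <sys/errno.h>.
 */
static int
set_nbuftarget(blsize_t newtarget)
{
	if (newtarget < nbuflow || newtarget > nbufhigh)
		return (EINVAL);	/* would violate nbuflow <= nbuftarget <= nbufhigh */
	nbuftarget = newtarget;
	return (0);
}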
2153
2154 /*
2155 * Algorithm:
2156 * -----------
2157 * The goal of the dynamic scaling of the buffer queues is to keep
2158 * the size of the LRU close to bl_target. Buffers on a queue would
2159 * be time aged.
2160 *
2161 * There would be a thread which will be responsible for "balancing"
2162 * the buffer cache queues.
2163 *
2164 * The scan order would be: AGE, LRU, META, EMPTY.
2165 */
2166
2167 long bufqscanwait = 0;
2168
2169 static void bufqscan_thread();
2170 static int balancebufq(int q);
2171 static int btrimempty(int n);
2172 static __inline__ int initbufqscan(void);
2173 static __inline__ int nextbufq(int q);
2174 static void buqlimprt(int all);
2175
2176 static void
2177 bufq_balance_thread_init()
2178 {
2179
2180 if (bufqscanwait++ == 0) {
2181
2182 /* Initialize globals */
2183 MAXNBUF = (sane_size / PAGE_SIZE);
2184 nbufh = nbuf;
2185 nbuflow = min(nbufh, 100);
2186 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
2187 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
2188 nbuftarget = max(nbuflow, nbuftarget);
2189 nbuftarget = min(nbufhigh, nbuftarget);
2190
2191 /*
2192 * Initialize the bufqlim
2193 */
2194
2195 /* LOCKED queue */
2196 bufqlim[BQ_LOCKED].bl_nlow = 0;
2197 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2198 bufqlim[BQ_LOCKED].bl_target = 0;
2199 bufqlim[BQ_LOCKED].bl_stale = 30;
2200
2201 /* LRU queue */
2202 bufqlim[BQ_LRU].bl_nlow = 0;
2203 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
2204 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
2205 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
2206
2207 /* AGE queue */
2208 bufqlim[BQ_AGE].bl_nlow = 0;
2209 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
2210 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
2211 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
2212
2213 /* EMPTY queue */
2214 bufqlim[BQ_EMPTY].bl_nlow = 0;
2215 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
2216 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
2217 bufqlim[BQ_EMPTY].bl_stale = 600000;
2218
2219 /* META queue */
2220 bufqlim[BQ_META].bl_nlow = 0;
2221 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
2222 bufqlim[BQ_META].bl_target = nbuftarget/4;
2223 bufqlim[BQ_META].bl_stale = META_IS_STALE;
2224
2225 /* LAUNDRY queue */
2226 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
2227 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
2228 bufqlim[BQ_LAUNDRY].bl_target = 0;
2229 bufqlim[BQ_LAUNDRY].bl_stale = 30;
2230
2231 buqlimprt(1);
2232 }
2233
2234 /* create worker thread */
2235 kernel_thread(kernel_task, bufqscan_thread);
2236 }
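/*
 * Editor's note: a worked example of the initialization above, assuming
 * sane_size = 128 MB, PAGE_SIZE = 4 KB and nbuf = 512 (illustrative
 * numbers only, not taken from the source):
 *
 *	MAXNBUF    = 128 MB / 4 KB              = 32768
 *	nbufh      = nbuf                       = 512
 *	nbuflow    = min(512, 100)              = 100
 *	nbufhigh   = min(32768, max(512, 2048)) = 2048
 *	nbuftarget = (128 MB >> 5) / 4 KB       = 1024 (already within [100, 2048])
 *
 * LRU, AGE, EMPTY and META each then get bl_nlhigh = 2048/4 = 512 and
 * bl_target = 1024/4 = 256.
 */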
2237
2238 /* The workloop for the buffer balancing thread */
2239 static void
2240 bufqscan_thread()
2241 {
2242 boolean_t funnel_state;
2243 int moretodo = 0;
2244
2245 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2246
2247 for(;;) {
2248 do {
2249 int q; /* buffer queue to process */
2250
2251 q = initbufqscan();
2252 for (; q; ) {
2253 moretodo |= balancebufq(q);
2254 q = nextbufq(q);
2255 }
2256 } while (moretodo);
2257
2258 #if DIAGNOSTIC
2259 vfs_bufstats();
2260 buqlimprt(0);
2261 #endif
2262 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2263 moretodo = 0;
2264 }
2265
2266 (void) thread_funnel_set(kernel_flock, FALSE);
2267 }
2268
2269 /* Seed for the buffer queue balancing */
2270 static __inline__ int
2271 initbufqscan()
2272 {
2273 /* Start with AGE queue */
2274 return (BQ_AGE);
2275 }
2276
2277 /* Pick next buffer queue to balance */
2278 static __inline__ int
2279 nextbufq(int q)
2280 {
2281 int i, order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2282
2283 /* find q in the scan order and return its successor; the trailing 0 ends the scan */
2284 for (i = 0; order[i] && order[i] != q; i++);
2285 return (order[i] ? order[i + 1] : 0);
2286 }
2287
2288 /* function to balance the buffer queues */
2289 static int
2290 balancebufq(int q)
2291 {
2292 int moretodo = 0;
2293 int s = splbio();
2294 int n;
2295
2296 /* reject invalid q */
2297 if ((q < 0) || (q >= BQUEUES))
2298 goto out;
2299
2300 /* LOCKED or LAUNDRY queue MUST not be balanced */
2301 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2302 goto out;
2303
2304 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2305
2306 /* If queue has less than target nothing more to do */
2307 if (n < 0)
2308 goto out;
2309
2310 if ( n > 8 ) {
2311 /* Balance only a small amount (12.5%) at a time */
2312 n >>= 3;
2313 }
2314
2315 /* EMPTY queue needs special handling */
2316 if (q == BQ_EMPTY) {
2317 moretodo |= btrimempty(n);
2318 goto out;
2319 }
2320
2321 for (; n > 0; n--) {
2322 struct buf *bp = bufqueues[q].tqh_first;
2323 if (!bp)
2324 break;
2325
2326 /* check if it's stale */
2327 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2328 if (bcleanbuf(bp)) {
2329 /* bawrite() issued, bp not ready */
2330 moretodo = 1;
2331 } else {
2332 /* release the cleaned buffer to BQ_EMPTY */
2333 SET(bp->b_flags, B_INVAL);
2334 brelse(bp);
2335 }
2336 } else
2337 break;
2338 }
2339
2340 out:
2341 splx(s);
2342 return (moretodo);
2343 }
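/*
 * Editor's note: a worked example of the throttling in balancebufq() above.
 * If a queue is 200 buffers over its target, then n = 200 > 8, so n >>= 3
 * leaves n = 25: at most 12.5% of the excess is trimmed per call, and the
 * queue converges toward its target over successive passes of the scan
 * thread rather than in one burst.
 */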
2344
2345 static int
2346 btrimempty(int n)
2347 {
2348 /*
2349 * When struct bufs are allocated dynamically, this would
2350 * reclaim up to 'n' struct bufs from the empty queue.
2351 */
2352
2353 return (0);
2354 }
2355
2356 static __inline__ void
2357 bufqinc(int q)
2358 {
2359 if ((q < 0) || (q >= BQUEUES))
2360 return;
2361
2362 bufqlim[q].bl_num++;
2363 return;
2364 }
2365
2366 static __inline__ void
2367 bufqdec(int q)
2368 {
2369 if ((q < 0) || (q >= BQUEUES))
2370 return;
2371
2372 bufqlim[q].bl_num--;
2373 return;
2374 }
2375
2376 static void
2377 buqlimprt(int all)
2378 {
2379 int i;
2380 static char *bname[BQUEUES] =
2381 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2382
2383 if (all)
2384 for (i = 0; i < BQUEUES; i++) {
2385 printf("%s : ", bname[i]);
2386 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
2387 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2388 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
2389 printf("target = %ld, ", (long)bufqlim[i].bl_target);
2390 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
2391 }
2392 else
2393 for (i = 0; i < BQUEUES; i++) {
2394 printf("%s : ", bname[i]);
2395 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2396 }
2397 }
2398
2399 /*
2400 * If getnewbuf() calls bcleanbuf() on the same thread,
2401 * there is a potential for stack overrun and deadlock,
2402 * so we always hand off the work to a worker thread for completion.
2403 */
2404
2405 static void
2406 bcleanbuf_thread_init()
2407 {
2408 static void bcleanbuf_thread();
2409
2410 /* create worker thread */
2411 kernel_thread(kernel_task, bcleanbuf_thread);
2412 }
2413
2414 static void
2415 bcleanbuf_thread()
2416 {
2417 boolean_t funnel_state;
2418 struct buf *bp;
2419 int error = 0;
2420 int loopcnt = 0;
2421
2422 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2423
2424 doit:
2425 while (blaundrycnt == 0)
2426 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2427 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2428 /* Remove from the queue */
2429 bremfree(bp);
2430 blaundrycnt--;
2431
2432 /* do the IO */
2433 error = bawrite_internal(bp, 0);
2434 if (error) {
2435 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2436 blaundrycnt++;
2437 if (loopcnt > 10) {
2438 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
2439 loopcnt = 0;
2440 } else {
2441 (void)thread_block(THREAD_CONTINUE_NULL);
2442 loopcnt++;
2443 }
2444 }
2445 /* start again */
2446 goto doit;
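/* NOTREACHED -- the goto above always restarts the loop, so the funnel restore below is never executed */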
2447
2448 (void) thread_funnel_set(kernel_flock, funnel_state);
2449 }
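/*
 * Editor's illustrative sketch (not part of the original source): the
 * producer side of the handoff described above.  This is not the original
 * bcleanbuf() code; it only shows the expected pairing -- queue the buffer
 * on BQ_LAUNDRY, bump the count the worker sleeps on, and wake the worker
 * on the same channel it tsleep()s on.  The helper name is hypothetical,
 * and the caller is assumed to hold the funnel and be at splbio().
 */
static void
hand_off_to_laundry(struct buf *bp)
{
	binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
	blaundrycnt++;
	wakeup(&blaundrycnt);	/* unblocks bcleanbuf_thread() */
}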
2450
2451
2452 static int
2453 brecover_data(struct buf *bp)
2454 {
2455 upl_t upl;
2456 upl_page_info_t *pl;
2457 int upl_offset;
2458 kern_return_t kret;
2459 struct vnode *vp = bp->b_vp;
2460
2461 if (vp->v_tag == VT_NFS)
2462 /*
2463 * NFS currently deals with this case
2464 * in a slightly different manner...
2465 * continue to let it do so
2466 */
2467 return(1);
2468
2469 if (!UBCISVALID(vp) || bp->b_bufsize == 0)
2470 goto dump_buffer;
2471
2472 kret = ubc_create_upl(vp,
2473 ubc_blktooff(vp, bp->b_lblkno),
2474 bp->b_bufsize,
2475 &upl,
2476 &pl,
2477 UPL_PRECIOUS);
2478 if (kret != KERN_SUCCESS)
2479 panic("Failed to get pagelists");
2480
2481 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
2482
2483 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
2484 ubc_upl_abort(upl, 0);
2485 goto dump_buffer;
2486 }
2487 }
2488 SET(bp->b_flags, B_PAGELIST);
2489 bp->b_pagelist = upl;
2490
2491 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
2492 if (kret != KERN_SUCCESS)
2493 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2494 if (bp->b_data == 0)
2495 panic("ubc_upl_map mapped 0");
2496
2497 return (1);
2498
2499 dump_buffer:
2500 bp->b_bufsize = 0;
2501 SET(bp->b_flags, B_INVAL);
2502 brelse(bp);
2503
2504 return(0);
2505 }
2506
2507
2508 static int
2509 bp_cmp(void *a, void *b)
2510 {
2511 struct buf *bp_a = *(struct buf **)a,
2512 *bp_b = *(struct buf **)b;
2513 daddr_t res;
2514
2515 // don't have to worry about negative block
2516 // numbers so this is ok to do.
2517 //
2518 res = (bp_a->b_blkno - bp_b->b_blkno);
2519
2520 return (int)res;
2521 }
2522
2523 #define NFLUSH 32
2524
2525 int
2526 bflushq(int whichq, struct mount *mp)
2527 {
2528 struct buf *bp, *next;
2529 int i, buf_count, s;
2530 int counter=0, total_writes=0;
2531 static struct buf *flush_table[NFLUSH];
2532
2533 if (whichq < 0 || whichq >= BQUEUES) {
2534 return (0);
2535 }
2536
2537
2538 restart:
2539 bp = TAILQ_FIRST(&bufqueues[whichq]);
2540 for(buf_count=0; bp; bp=next) {
2541 next = bp->b_freelist.tqe_next;
2542
2543 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
2544 continue;
2545 }
2546
2547 if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
2548 if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
2549 panic("bflushq: bp @ 0x%x is locked!\n", bp);
2550 }
2551
2552 bremfree(bp);
2553 bp->b_flags |= B_BUSY;
2554 flush_table[buf_count] = bp;
2555 buf_count++;
2556 total_writes++;
2557
2558 if (buf_count >= NFLUSH) {
2559 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2560
2561 for(i=0; i < buf_count; i++) {
2562 bawrite(flush_table[i]);
2563 }
2564
2565 goto restart;
2566 }
2567 }
2568 }
2569
2570 if (buf_count > 0) {
2571 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2572 for(i=0; i < buf_count; i++) {
2573 bawrite(flush_table[i]);
2574 }
2575 }
2576
2577 return total_writes;
2578 }
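/*
 * Editor's illustrative sketch (not part of the original source): one way a
 * caller might use bflushq() to push out the delayed writes of a single
 * mount point, for example before a sync or unmount of that file system.
 * The helper name and the choice of queues are hypothetical.
 */
static int
flush_mount_buffers(struct mount *mp)
{
	int writes = 0;

	/* push out delayed writes queued for this mount point */
	writes += bflushq(BQ_LRU, mp);
	writes += bflushq(BQ_AGE, mp);
	return (writes);
}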