bsd/vfs/vfs_bio.c

   1 /*
   2  * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*-
  30  * Copyright (c) 1994 Christopher G. Demetriou
  31  * Copyright (c) 1982, 1986, 1989, 1993
  32  *      The Regents of the University of California.  All rights reserved.
  33  * (c) UNIX System Laboratories, Inc.
  34  * All or some portions of this file are derived from material licensed
  35  * to the University of California by American Telephone and Telegraph
  36  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  37  * the permission of UNIX System Laboratories, Inc.
  38  *
  39  * Redistribution and use in source and binary forms, with or without
  40  * modification, are permitted provided that the following conditions
  41  * are met:
  42  * 1. Redistributions of source code must retain the above copyright
  43  *    notice, this list of conditions and the following disclaimer.
  44  * 2. Redistributions in binary form must reproduce the above copyright
  45  *    notice, this list of conditions and the following disclaimer in the
  46  *    documentation and/or other materials provided with the distribution.
  47  * 3. All advertising materials mentioning features or use of this software
  48  *    must display the following acknowledgement:
  49  *      This product includes software developed by the University of
  50  *      California, Berkeley and its contributors.
  51  * 4. Neither the name of the University nor the names of its contributors
  52  *    may be used to endorse or promote products derived from this software
  53  *    without specific prior written permission.
  54  *
  55  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  56  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  57  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  58  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  59  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  60  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  61  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  62  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  63  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  65  * SUCH DAMAGE.
  66  *
  67  *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
  68  */
  69
  70 /*
  71  * Some references:
  72  *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
  73  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
  74  *              UNIX Operating System (Addison Welley, 1989)
  75  */
  76
  77 #include <sys/param.h>
  78 #include <sys/systm.h>
  79 #include <sys/proc_internal.h>
  80 #include <sys/buf_internal.h>
  81 #include <sys/vnode_internal.h>
  82 #include <sys/mount_internal.h>
  83 #include <sys/trace.h>
  84 #include <sys/malloc.h>
  85 #include <sys/resourcevar.h>
  86 #include <miscfs/specfs/specdev.h>
  87 #include <sys/ubc.h>
  88 #include <sys/kauth.h>
  89 #if DIAGNOSTIC
  90 #include <kern/assert.h>
  91 #endif /* DIAGNOSTIC */
  92 #include <kern/task.h>
  93 #include <kern/zalloc.h>
  94 #include <kern/lock.h>
  95
  96 #include <sys/fslog.h>          /* fslog_io_error() */
  97
  98 #include <mach/mach_types.h>
  99 #include <mach/memory_object_types.h>
 100 #include <kern/sched_prim.h>    /* thread_block() */
 101
 102 #include <vm/vm_kern.h>
 103 #include <vm/vm_pageout.h>
 104
 105 #include <sys/kdebug.h>
 106
 107 #include <libkern/OSAtomic.h>
 108 #include <libkern/OSDebug.h>
 109 #include <sys/ubc_internal.h>
 110
 111 #include <sys/sdt.h>
 112
 113 #if BALANCE_QUEUES
 114 static __inline__ void bufqinc(int q);
 115 static __inline__ void bufqdec(int q);
 116 #endif
 117
 118 static int      bcleanbuf(buf_t bp, boolean_t discard);
 119 static int      brecover_data(buf_t bp);
 120 static boolean_t incore(vnode_t vp, daddr64_t blkno);
 121 /* timeout is in msecs */
 122 static buf_t    getnewbuf(int slpflag, int slptimeo, int *queue);
 123 static void     bremfree_locked(buf_t bp);
 124 static void     buf_reassign(buf_t bp, vnode_t newvp);
 125 static errno_t  buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
 126 static int      buf_iterprepare(vnode_t vp, struct buflists *, int flags);
 127 static void     buf_itercomplete(vnode_t vp, struct buflists *, int flags);
 128 static boolean_t buffer_cache_gc(void);
 129
 130 __private_extern__ int  bdwrite_internal(buf_t, int);
 131
 132 /* zone allocated buffer headers */
 133 static void     bufzoneinit(void) __attribute__((section("__TEXT, initcode")));
 134 static void     bcleanbuf_thread_init(void) __attribute__((section("__TEXT, initcode")));
 135 static void     bcleanbuf_thread(void);
 136
 137 static zone_t   buf_hdr_zone;
 138 static int      buf_hdr_count;
 139
 140
 141 /*
 142  * Definitions for the buffer hash lists.
 143  */
 144 #define BUFHASH(dvp, lbn)       \
 145         (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
 146 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
 147 u_long  bufhash;
 148
 149 static buf_t    incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);
 150
 151 /* Definitions for the buffer stats. */
 152 struct bufstats bufstats;
 153
 154 /* Number of delayed write buffers */
 155 long nbdwrite = 0;
 156 int blaundrycnt = 0;
 157 static int boot_nbuf_headers = 0;
 158
 159
 160 static TAILQ_HEAD(ioqueue, buf) iobufqueue;
 161 static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
 162 static int needbuffer;
 163 static int need_iobuffer;
 164
 165 static lck_grp_t        *buf_mtx_grp;
 166 static lck_attr_t       *buf_mtx_attr;
 167 static lck_grp_attr_t   *buf_mtx_grp_attr;
 168 static lck_mtx_t        *iobuffer_mtxp;
 169 static lck_mtx_t        *buf_mtxp;
 170
 171 static int buf_busycount;
 172
 173 static __inline__ int
 174 buf_timestamp(void)
 175 {
 176         struct  timeval         t;
 177         microuptime(&t);
 178         return (t.tv_sec);
 179 }
 180
 181 /*
 182  * Insq/Remq for the buffer free lists.
 183  */
 184 #if BALANCE_QUEUES
 185 #define binsheadfree(bp, dp, whichq)    do { \
 186                                     TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
 187                                         bufqinc((whichq));      \
 188                                 } while (0)
 189
 190 #define binstailfree(bp, dp, whichq)    do { \
 191                                     TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
 192                                         bufqinc((whichq));      \
 193                                 } while (0)
 194 #else
 195 #define binsheadfree(bp, dp, whichq)    do { \
 196                                     TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
 197                                 } while (0)
 198
 199 #define binstailfree(bp, dp, whichq)    do { \
 200                                     TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
 201                                 } while (0)
 202 #endif
 203
 204
 205 #define BHASHENTCHECK(bp)       \
 206         if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)  \
 207                 panic("%p: b_hash.le_prev is not deadbeef", (bp));
 208
 209 #define BLISTNONE(bp)   \
 210         (bp)->b_hash.le_next = (struct buf *)0; \
 211         (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
 212
 213 /*
 214  * Insq/Remq for the vnode usage lists.
 215  */
 216 #define bufinsvn(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_vnbufs)
 217 #define bufremvn(bp) {                                                  \
 218         LIST_REMOVE(bp, b_vnbufs);                                      \
 219         (bp)->b_vnbufs.le_next = NOLIST;                                \
 220 }
 221
 222 /*
 223  * Time in seconds before a buffer on a list is
 224  * considered as a stale buffer
 225  */
 226 #define LRU_IS_STALE 120 /* default value for the LRU */
 227 #define AGE_IS_STALE 60  /* default value for the AGE */
 228 #define META_IS_STALE 180 /* default value for the BQ_META */
 229
 230 int lru_is_stale = LRU_IS_STALE;
 231 int age_is_stale = AGE_IS_STALE;
 232 int meta_is_stale = META_IS_STALE;
 233
 234
 235
 236 /* LIST_INSERT_HEAD() with assertions */
 237 static __inline__ void
 238 blistenterhead(struct bufhashhdr * head, buf_t bp)
 239 {
 240         if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
 241                 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
 242         (head)->lh_first = bp;
 243         bp->b_hash.le_prev = &(head)->lh_first;
 244         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 245                 panic("blistenterhead: le_prev is deadbeef");
 246 }
 247
 248 static __inline__ void
 249 binshash(buf_t bp, struct bufhashhdr *dp)
 250 {
 251 #if DIAGNOSTIC
 252         buf_t   nbp;
 253 #endif /* DIAGNOSTIC */
 254
 255         BHASHENTCHECK(bp);
 256
 257 #if DIAGNOSTIC
 258         nbp = dp->lh_first;
 259         for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
 260                 if(nbp == bp)
 261                         panic("buf already in hashlist");
 262         }
 263 #endif /* DIAGNOSTIC */
 264
 265         blistenterhead(dp, bp);
 266 }
 267
 268 static __inline__ void
 269 bremhash(buf_t  bp)
 270 {
 271         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 272                 panic("bremhash le_prev is deadbeef");
 273         if (bp->b_hash.le_next == bp)
 274                 panic("bremhash: next points to self");
 275
 276         if (bp->b_hash.le_next != NULL)
 277                 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
 278         *bp->b_hash.le_prev = (bp)->b_hash.le_next;
 279 }
 280
 281
 282
 283
 284 int
 285 buf_valid(buf_t bp) {
 286
 287         if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
 288                 return 1;
 289         return 0;
 290 }
 291
 292 int
 293 buf_fromcache(buf_t bp) {
 294
 295         if ( (bp->b_flags & B_CACHE) )
 296                 return 1;
 297         return 0;
 298 }
 299
 300 void
 301 buf_markinvalid(buf_t bp) {
 302
 303         SET(bp->b_flags, B_INVAL);
 304 }
 305
 306 void
 307 buf_markdelayed(buf_t bp) {
 308
 309         if (!ISSET(bp->b_flags, B_DELWRI)) {
 310                 SET(bp->b_flags, B_DELWRI);
 311
 312                 OSAddAtomicLong(1, &nbdwrite);
 313                 buf_reassign(bp, bp->b_vp);
 314         }
 315         SET(bp->b_flags, B_DONE);
 316 }
 317
 318 void
 319 buf_markeintr(buf_t bp) {
 320
 321         SET(bp->b_flags, B_EINTR);
 322 }
 323
 324
 325 void
 326 buf_markaged(buf_t bp) {
 327
 328         SET(bp->b_flags, B_AGE);
 329 }
 330
 331 int
 332 buf_fua(buf_t bp) {
 333
 334         if ((bp->b_flags & B_FUA) == B_FUA)
 335                 return 1;
 336         return 0;
 337 }
 338
 339 void
 340 buf_markfua(buf_t bp) {
 341
 342         SET(bp->b_flags, B_FUA);
 343 }
 344
 345 errno_t
 346 buf_error(buf_t bp) {
 347
 348         return (bp->b_error);
 349 }
 350
 351 void
 352 buf_seterror(buf_t bp, errno_t error) {
 353
 354         if ((bp->b_error = error))
 355                 SET(bp->b_flags, B_ERROR);
 356         else
 357                 CLR(bp->b_flags, B_ERROR);
 358 }
 359
 360 void
 361 buf_setflags(buf_t bp, int32_t flags) {
 362
 363         SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
 364 }
 365
 366 void
 367 buf_clearflags(buf_t bp, int32_t flags) {
 368
 369         CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
 370 }
 371
 372 int32_t
 373 buf_flags(buf_t bp) {
 374
 375         return ((bp->b_flags & BUF_X_RDFLAGS));
 376 }
 377
 378 void
 379 buf_reset(buf_t bp, int32_t io_flags) {
 380
 381         CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
 382         SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
 383
 384         bp->b_error = 0;
 385 }
 386
 387 uint32_t
 388 buf_count(buf_t bp) {
 389
 390         return (bp->b_bcount);
 391 }
 392
 393 void
 394 buf_setcount(buf_t bp, uint32_t bcount) {
 395
 396         bp->b_bcount = bcount;
 397 }
 398
 399 uint32_t
 400 buf_size(buf_t bp) {
 401
 402         return (bp->b_bufsize);
 403 }
 404
 405 void
 406 buf_setsize(buf_t bp, uint32_t bufsize) {
 407
 408         bp->b_bufsize = bufsize;
 409 }
 410
 411 uint32_t
 412 buf_resid(buf_t bp) {
 413
 414         return (bp->b_resid);
 415 }
 416
 417 void
 418 buf_setresid(buf_t bp, uint32_t resid) {
 419
 420         bp->b_resid = resid;
 421 }
 422
 423 uint32_t
 424 buf_dirtyoff(buf_t bp) {
 425
 426         return (bp->b_dirtyoff);
 427 }
 428
 429 uint32_t
 430 buf_dirtyend(buf_t bp) {
 431
 432         return (bp->b_dirtyend);
 433 }
 434
 435 void
 436 buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
 437
 438         bp->b_dirtyoff = dirtyoff;
 439 }
 440
 441 void
 442 buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
 443
 444         bp->b_dirtyend = dirtyend;
 445 }
 446
 447 uintptr_t
 448 buf_dataptr(buf_t bp) {
 449
 450         return (bp->b_datap);
 451 }
 452
 453 void
 454 buf_setdataptr(buf_t bp, uintptr_t data) {
 455
 456         bp->b_datap = data;
 457 }
 458
 459 vnode_t
 460 buf_vnode(buf_t bp) {
 461
 462         return (bp->b_vp);
 463 }
 464
 465 void
 466 buf_setvnode(buf_t bp, vnode_t vp) {
 467
 468         bp->b_vp = vp;
 469 }
 470
 471
 472 void *
 473 buf_callback(buf_t bp)
 474 {
 475         if ( !(bp->b_flags & B_CALL) )
 476                 return ((void *) NULL);
 477
 478         return ((void *)bp->b_iodone);
 479 }
 480
 481
 482 errno_t
 483 buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
 484 {
 485         if (callback)
 486                 bp->b_flags |= (B_CALL | B_ASYNC);
 487         else
 488                 bp->b_flags &= ~B_CALL;
 489         bp->b_transaction = transaction;
 490         bp->b_iodone = callback;
 491
 492         return (0);
 493 }
 494
 495 errno_t
 496 buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
 497 {
 498
 499         if ( !(bp->b_lflags & BL_IOBUF) )
 500                 return (EINVAL);
 501
 502         if (upl)
 503                 bp->b_flags |= B_CLUSTER;
 504         else
 505                 bp->b_flags &= ~B_CLUSTER;
 506         bp->b_upl = upl;
 507         bp->b_uploffset = offset;
 508
 509         return (0);
 510 }
 511
 512 buf_t
 513 buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
 514 {
 515         buf_t   io_bp;
 516
 517         if (io_offset < 0 || io_size < 0)
 518                 return (NULL);
 519
 520         if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
 521                 return (NULL);
 522
 523         if (bp->b_flags & B_CLUSTER) {
 524                 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
 525                         return (NULL);
 526
 527                 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
 528                         return (NULL);
 529         }
 530         io_bp = alloc_io_buf(bp->b_vp, 0);
 531
 532         io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);
 533
 534         if (iodone) {
 535                 io_bp->b_transaction = arg;
 536                 io_bp->b_iodone = iodone;
 537                 io_bp->b_flags |= B_CALL;
 538         }
 539         if (bp->b_flags & B_CLUSTER) {
 540                 io_bp->b_upl = bp->b_upl;
 541                 io_bp->b_uploffset = bp->b_uploffset + io_offset;
 542         } else {
 543                 io_bp->b_datap  = (uintptr_t)(((char *)bp->b_datap) + io_offset);
 544         }
 545         io_bp->b_bcount = io_size;
 546
 547         return (io_bp);
 548 }
 549
 550
 551
 552 void
 553 buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
 554               void **old_iodone, void **old_transaction)
 555 {
 556         if (old_iodone)
 557                 *old_iodone = (void *)(bp->b_iodone);
 558         if (old_transaction)
 559                 *old_transaction = (void *)(bp->b_transaction);
 560
 561         bp->b_transaction = transaction;
 562         bp->b_iodone = filter;
 563         if (filter)
 564                 bp->b_flags |= B_FILTER;
 565         else
 566                 bp->b_flags &= ~B_FILTER;
 567 }
 568
 569
 570 daddr64_t
 571 buf_blkno(buf_t bp) {
 572
 573         return (bp->b_blkno);
 574 }
 575
 576 daddr64_t
 577 buf_lblkno(buf_t bp) {
 578
 579         return (bp->b_lblkno);
 580 }
 581
 582 void
 583 buf_setblkno(buf_t bp, daddr64_t blkno) {
 584
 585         bp->b_blkno = blkno;
 586 }
 587
 588 void
 589 buf_setlblkno(buf_t bp, daddr64_t lblkno) {
 590
 591         bp->b_lblkno = lblkno;
 592 }
 593
 594 dev_t
 595 buf_device(buf_t bp) {
 596
 597         return (bp->b_dev);
 598 }
 599
 600 errno_t
 601 buf_setdevice(buf_t bp, vnode_t vp) {
 602
 603         if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
 604                 return EINVAL;
 605         bp->b_dev = vp->v_rdev;
 606
 607         return 0;
 608 }
 609
 610
 611 void *
 612 buf_drvdata(buf_t bp) {
 613
 614         return (bp->b_drvdata);
 615 }
 616
 617 void
 618 buf_setdrvdata(buf_t bp, void *drvdata) {
 619
 620         bp->b_drvdata = drvdata;
 621 }
 622
 623 void *
 624 buf_fsprivate(buf_t bp) {
 625
 626         return (bp->b_fsprivate);
 627 }
 628
 629 void
 630 buf_setfsprivate(buf_t bp, void *fsprivate) {
 631
 632         bp->b_fsprivate = fsprivate;
 633 }
 634
 635 kauth_cred_t
 636 buf_rcred(buf_t bp) {
 637
 638         return (bp->b_rcred);
 639 }
 640
 641 kauth_cred_t
 642 buf_wcred(buf_t bp) {
 643
 644         return (bp->b_wcred);
 645 }
 646
 647 void *
 648 buf_upl(buf_t bp) {
 649
 650         return (bp->b_upl);
 651 }
 652
 653 uint32_t
 654 buf_uploffset(buf_t bp) {
 655
 656         return ((uint32_t)(bp->b_uploffset));
 657 }
 658
 659 proc_t
 660 buf_proc(buf_t bp) {
 661
 662         return (bp->b_proc);
 663 }
 664
 665
 666 errno_t
 667 buf_map(buf_t bp, caddr_t *io_addr)
 668 {
 669         buf_t           real_bp;
 670         vm_offset_t     vaddr;
 671         kern_return_t   kret;
 672
 673         if ( !(bp->b_flags & B_CLUSTER)) {
 674                 *io_addr = (caddr_t)bp->b_datap;
 675                 return (0);
 676         }
 677         real_bp = (buf_t)(bp->b_real_bp);
 678
 679         if (real_bp && real_bp->b_datap) {
 680                 /*
 681                  * b_real_bp is only valid if B_CLUSTER is SET
 682                  * if it's non-zero, than someone did a cluster_bp call
 683                  * if the backing physical pages were already mapped
 684                  * in before the call to cluster_bp (non-zero b_datap),
 685                  * than we just use that mapping
 686                  */
 687                 *io_addr = (caddr_t)real_bp->b_datap;
 688                 return (0);
 689         }
 690         kret = ubc_upl_map(bp->b_upl, &vaddr);    /* Map it in */
 691
 692         if (kret != KERN_SUCCESS) {
 693                 *io_addr = NULL;
 694
 695                 return(ENOMEM);
 696         }
 697         vaddr += bp->b_uploffset;
 698
 699         *io_addr = (caddr_t)vaddr;
 700
 701         return (0);
 702 }
 703
 704 errno_t
 705 buf_unmap(buf_t bp)
 706 {
 707         buf_t           real_bp;
 708         kern_return_t   kret;
 709
 710         if ( !(bp->b_flags & B_CLUSTER))
 711                 return (0);
 712         /*
 713          * see buf_map for the explanation
 714          */
 715         real_bp = (buf_t)(bp->b_real_bp);
 716
 717         if (real_bp && real_bp->b_datap)
 718                 return (0);
 719
 720         if ((bp->b_lflags & BL_IOBUF) &&
 721             ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
 722                 /*
 723                  * ignore pageins... the 'right' thing will
 724                  * happen due to the way we handle speculative
 725                  * clusters...
 726                  *
 727                  * when we commit these pages, we'll hit
 728                  * it with UPL_COMMIT_INACTIVE which
 729                  * will clear the reference bit that got
 730                  * turned on when we touched the mapping
 731                  */
 732                 bp->b_flags |= B_AGE;
 733         }
 734         kret = ubc_upl_unmap(bp->b_upl);
 735
 736         if (kret != KERN_SUCCESS)
 737                 return (EINVAL);
 738         return (0);
 739 }
 740
 741
 742 void
 743 buf_clear(buf_t bp) {
 744         caddr_t baddr;
 745
 746         if (buf_map(bp, &baddr) == 0) {
 747                 bzero(baddr, bp->b_bcount);
 748                 buf_unmap(bp);
 749         }
 750         bp->b_resid = 0;
 751 }
 752
 753
 754
 755 /*
 756  * Read or write a buffer that is not contiguous on disk.
 757  * buffer is marked done/error at the conclusion
 758  */
 759 static int
 760 buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
 761 {
 762         vnode_t vp = buf_vnode(bp);
 763         buf_t   io_bp;                   /* For reading or writing a single block */
 764         int     io_direction;
 765         int     io_resid;
 766         size_t  io_contig_bytes;
 767         daddr64_t io_blkno;
 768         int     error = 0;
 769         int     bmap_flags;
 770
 771         /*
 772          * save our starting point... the bp was already mapped
 773          * in buf_strategy before we got called
 774          * no sense doing it again.
 775          */
 776         io_blkno = bp->b_blkno;
 777         /*
 778          * Make sure we redo this mapping for the next I/O
 779          * i.e. this can never be a 'permanent' mapping
 780          */
 781         bp->b_blkno = bp->b_lblkno;
 782
 783         /*
 784          * Get an io buffer to do the deblocking
 785          */
 786         io_bp = alloc_io_buf(devvp, 0);
 787
 788         io_bp->b_lblkno = bp->b_lblkno;
 789         io_bp->b_datap  = bp->b_datap;
 790         io_resid        = bp->b_bcount;
 791         io_direction    = bp->b_flags & B_READ;
 792         io_contig_bytes = contig_bytes;
 793
 794         if (bp->b_flags & B_READ)
 795                 bmap_flags = VNODE_READ;
 796         else
 797                 bmap_flags = VNODE_WRITE;
 798
 799         for (;;) {
 800                 if (io_blkno == -1)
 801                         /*
 802                          * this is unexepected, but we'll allow for it
 803                          */
 804                         bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
 805                 else {
 806                         io_bp->b_bcount  = io_contig_bytes;
 807                         io_bp->b_bufsize = io_contig_bytes;
 808                         io_bp->b_resid   = io_contig_bytes;
 809                         io_bp->b_blkno   = io_blkno;
 810
 811                         buf_reset(io_bp, io_direction);
 812
 813                         /*
 814                          * Call the device to do the I/O and wait for it.  Make sure the appropriate party is charged for write
 815                          */
 816
 817                         if (!ISSET(bp->b_flags, B_READ))
 818                                 OSAddAtomic(1, &devvp->v_numoutput);
 819
 820                         if ((error = VNOP_STRATEGY(io_bp)))
 821                                 break;
 822                         if ((error = (int)buf_biowait(io_bp)))
 823                                 break;
 824                         if (io_bp->b_resid) {
 825                                 io_resid -= (io_contig_bytes - io_bp->b_resid);
 826                                 break;
 827                         }
 828                 }
 829                 if ((io_resid -= io_contig_bytes) == 0)
 830                         break;
 831                 f_offset       += io_contig_bytes;
 832                 io_bp->b_datap += io_contig_bytes;
 833
 834                 /*
 835                  * Map the current position to a physical block number
 836                  */
 837                 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
 838                         break;
 839         }
 840         buf_free(io_bp);
 841
 842         if (error)
 843                 buf_seterror(bp, error);
 844         bp->b_resid = io_resid;
 845         /*
 846          * This I/O is now complete
 847          */
 848         buf_biodone(bp);
 849
 850         return error;
 851 }
 852
 853
 854 /*
 855  * struct vnop_strategy_args {
 856  *      struct buf *a_bp;
 857  * } *ap;
 858  */
 859 errno_t
 860 buf_strategy(vnode_t devvp, void *ap)
 861 {
 862         buf_t   bp = ((struct vnop_strategy_args *)ap)->a_bp;
 863         vnode_t vp = bp->b_vp;
 864         int     bmap_flags;
 865         errno_t error;
 866
 867         if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
 868                 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
 869         /*
 870          * associate the physical device with
 871          * with this buf_t even if we don't
 872          * end up issuing the I/O...
 873          */
 874         bp->b_dev = devvp->v_rdev;
 875         DTRACE_IO1(start, buf_t, bp);
 876
 877         if (bp->b_flags & B_READ)
 878                 bmap_flags = VNODE_READ;
 879         else
 880                 bmap_flags = VNODE_WRITE;
 881
 882         if ( !(bp->b_flags & B_CLUSTER)) {
 883
 884                 if ( (bp->b_upl) ) {
 885                         /*
 886                          * we have a UPL associated with this bp
 887                          * go through cluster_bp which knows how
 888                          * to deal with filesystem block sizes
 889                          * that aren't equal to the page size
 890                          */
 891                         return (cluster_bp(bp));
 892                 }
 893                 if (bp->b_blkno == bp->b_lblkno) {
 894                         off_t   f_offset;
 895                         size_t  contig_bytes;
 896
 897                         if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
 898                                 buf_seterror(bp, error);
 899                                 buf_biodone(bp);
 900
 901                                 return (error);
 902                         }
 903                         if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
 904                                 buf_seterror(bp, error);
 905                                 buf_biodone(bp);
 906
 907                                 return (error);
 908                         }
 909                         if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
 910                                 /* Set block number to force biodone later */
 911                                 bp->b_blkno = -1;
 912                                 buf_clear(bp);
 913                         }
 914                         else if ((long)contig_bytes < bp->b_bcount)
 915                                 return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
 916                 }
 917                 if (bp->b_blkno == -1) {
 918                         buf_biodone(bp);
 919                         return (0);
 920                 }
 921         }
 922         /*
 923          * we can issue the I/O because...
 924          * either B_CLUSTER is set which
 925          * means that the I/O is properly set
 926          * up to be a multiple of the page size, or
 927          * we were able to successfully set up the
 928          * phsyical block mapping
 929          */
 930         return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap));
 931 }
 932
 933
 934
 935 buf_t
 936 buf_alloc(vnode_t vp)
 937 {
 938         return(alloc_io_buf(vp, 0));
 939 }
 940
 941 void
 942 buf_free(buf_t bp) {
 943
 944         free_io_buf(bp);
 945 }
 946
 947
 948 /*
 949  * iterate buffers for the specified vp.
 950  *   if BUF_SCAN_DIRTY is set, do the dirty list
 951  *   if BUF_SCAN_CLEAN is set, do the clean list
 952  *   if neither flag is set, default to BUF_SCAN_DIRTY
 953  *   if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
 954  */
 955
 956 struct buf_iterate_info_t {
 957         int flag;
 958         struct buflists *listhead;
 959 };
 960
 961 void
 962 buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
 963 {
 964         buf_t   bp;
 965         int     retval;
 966         struct  buflists local_iterblkhd;
 967         int     lock_flags = BAC_NOWAIT | BAC_REMOVE;
 968         int     notify_busy = flags & BUF_NOTIFY_BUSY;
 969         struct buf_iterate_info_t list[2];
 970         int     num_lists, i;
 971
 972         if (flags & BUF_SKIP_LOCKED)
 973                 lock_flags |= BAC_SKIP_LOCKED;
 974         if (flags & BUF_SKIP_NONLOCKED)
 975                 lock_flags |= BAC_SKIP_NONLOCKED;
 976
 977         if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN)))
 978                 flags |= BUF_SCAN_DIRTY;
 979
 980         num_lists = 0;
 981
 982         if (flags & BUF_SCAN_DIRTY) {
 983                 list[num_lists].flag = VBI_DIRTY;
 984                 list[num_lists].listhead = &vp->v_dirtyblkhd;
 985                 num_lists++;
 986         }
 987         if (flags & BUF_SCAN_CLEAN) {
 988                 list[num_lists].flag = VBI_CLEAN;
 989                 list[num_lists].listhead = &vp->v_cleanblkhd;
 990                 num_lists++;
 991         }
 992
 993         for (i = 0; i < num_lists; i++) {
 994                 lck_mtx_lock(buf_mtxp);
 995
 996                 if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag))  {
 997                         lck_mtx_unlock(buf_mtxp);
 998                         continue;
 999                 }
1000                 while (!LIST_EMPTY(&local_iterblkhd)) {
1001                         bp = LIST_FIRST(&local_iterblkhd);
1002                         LIST_REMOVE(bp, b_vnbufs);
1003                         LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
1004
1005                         if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
1006                                 if (notify_busy) {
1007                                         bp = NULL;
1008                                 } else {
1009                                         continue;
1010                                 }
1011                         }
1012
1013                         lck_mtx_unlock(buf_mtxp);
1014
1015                         retval = callout(bp, arg);
1016
1017                         switch (retval) {
1018                         case BUF_RETURNED:
1019                                 if (bp)
1020                                         buf_brelse(bp);
1021                                 break;
1022                         case BUF_CLAIMED:
1023                                 break;
1024                         case BUF_RETURNED_DONE:
1025                                 if (bp)
1026                                         buf_brelse(bp);
1027                                 lck_mtx_lock(buf_mtxp);
1028                                 goto out;
1029                         case BUF_CLAIMED_DONE:
1030                                 lck_mtx_lock(buf_mtxp);
1031                                 goto out;
1032                         }
1033                         lck_mtx_lock(buf_mtxp);
1034                 } /* while list has more nodes */
1035           out:
1036                 buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
1037                 lck_mtx_unlock(buf_mtxp);
1038         } /* for each list */
1039 } /* buf_iterate */
1040
1041
1042 /*
1043  * Flush out and invalidate all buffers associated with a vnode.
1044  */
1045 int
1046 buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
1047 {
1048         buf_t   bp;
1049         int     error = 0;
1050         int     must_rescan = 1;
1051         struct  buflists local_iterblkhd;
1052
1053
1054         if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1055                 return (0);
1056
1057         lck_mtx_lock(buf_mtxp);
1058
1059         for (;;) {
1060                 if (must_rescan == 0)
1061                         /*
1062                          * the lists may not be empty, but all that's left at this
1063                          * point are metadata or B_LOCKED buffers which are being
1064                          * skipped... we know this because we made it through both
1065                          * the clean and dirty lists without dropping buf_mtxp...
1066                          * each time we drop buf_mtxp we bump "must_rescan"
1067                          */
1068                         break;
1069                 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1070                         break;
1071                 must_rescan = 0;
1072                 /*
1073                  * iterate the clean list
1074                  */
1075                 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
1076                         goto try_dirty_list;
1077                 }
1078                 while (!LIST_EMPTY(&local_iterblkhd)) {
1079                         bp = LIST_FIRST(&local_iterblkhd);
1080
1081                         LIST_REMOVE(bp, b_vnbufs);
1082                         LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
1083
1084                         /*
1085                          * some filesystems distinguish meta data blocks with a negative logical block #
1086                          */
1087                         if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1088                                 continue;
1089
1090                         if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
1091                                 if (error == EDEADLK)
1092                                         /*
1093                                          * this buffer was marked B_LOCKED...
1094                                          * we didn't drop buf_mtxp, so we
1095                                          * we don't need to rescan
1096                                          */
1097                                         continue;
1098                                 if (error == EAGAIN) {
1099                                         /*
1100                                          * found a busy buffer... we blocked and
1101                                          * dropped buf_mtxp, so we're going to
1102                                          * need to rescan after this pass is completed
1103                                          */
1104                                         must_rescan++;
1105                                         continue;
1106                                 }
1107                                 /*
1108                                  * got some kind of 'real' error out of the msleep
1109                                  * in buf_acquire_locked, terminate the scan and return the error
1110                                  */
1111                                 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1112
1113                                 lck_mtx_unlock(buf_mtxp);
1114                                 return (error);
1115                         }
1116                         lck_mtx_unlock(buf_mtxp);
1117
1118                         SET(bp->b_flags, B_INVAL);
1119                         buf_brelse(bp);
1120
1121                         lck_mtx_lock(buf_mtxp);
1122
1123                         /*
1124                          * by dropping buf_mtxp, we allow new
1125                          * buffers to be added to the vnode list(s)
1126                          * we'll have to rescan at least once more
1127                          * if the queues aren't empty
1128                          */
1129                         must_rescan++;
1130                 }
1131                 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1132
1133 try_dirty_list:
1134                 /*
1135                  * Now iterate on dirty blks
1136                  */
1137                 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1138                         continue;
1139                 }
1140                 while (!LIST_EMPTY(&local_iterblkhd)) {
1141                         bp = LIST_FIRST(&local_iterblkhd);
1142
1143                         LIST_REMOVE(bp, b_vnbufs);
1144                         LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1145
1146                         /*
1147                          * some filesystems distinguish meta data blocks with a negative logical block #
1148                          */
1149                         if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1150                                 continue;
1151
1152                         if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
1153                                 if (error == EDEADLK)
1154                                         /*
1155                                          * this buffer was marked B_LOCKED...
1156                                          * we didn't drop buf_mtxp, so we
1157                                          * we don't need to rescan
1158                                          */
1159                                         continue;
1160                                 if (error == EAGAIN) {
1161                                         /*
1162                                          * found a busy buffer... we blocked and
1163                                          * dropped buf_mtxp, so we're going to
1164                                          * need to rescan after this pass is completed
1165                                          */
1166                                         must_rescan++;
1167                                         continue;
1168                                 }
1169                                 /*
1170                                  * got some kind of 'real' error out of the msleep
1171                                  * in buf_acquire_locked, terminate the scan and return the error
1172                                  */
1173                                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1174
1175                                 lck_mtx_unlock(buf_mtxp);
1176                                 return (error);
1177                         }
1178                         lck_mtx_unlock(buf_mtxp);
1179
1180                         SET(bp->b_flags, B_INVAL);
1181
1182                         if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1183                                 (void) VNOP_BWRITE(bp);
1184                         else
1185                                 buf_brelse(bp);
1186
1187                         lck_mtx_lock(buf_mtxp);
1188                         /*
1189                          * by dropping buf_mtxp, we allow new
1190                          * buffers to be added to the vnode list(s)
1191                          * we'll have to rescan at least once more
1192                          * if the queues aren't empty
1193                          */
1194                         must_rescan++;
1195                 }
1196                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1197         }
1198         lck_mtx_unlock(buf_mtxp);
1199
1200         return (0);
1201 }
1202
1203 void
1204 buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {
1205         buf_t   bp;
1206         int     writes_issued = 0;
1207         errno_t error;
1208         int     busy = 0;
1209         struct  buflists local_iterblkhd;
1210         int     lock_flags = BAC_NOWAIT | BAC_REMOVE;
1211
1212         if (flags & BUF_SKIP_LOCKED)
1213                 lock_flags |= BAC_SKIP_LOCKED;
1214         if (flags & BUF_SKIP_NONLOCKED)
1215                 lock_flags |= BAC_SKIP_NONLOCKED;
1216 loop:
1217         lck_mtx_lock(buf_mtxp);
1218
1219         if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0)  {
1220                 while (!LIST_EMPTY(&local_iterblkhd)) {
1221                         bp = LIST_FIRST(&local_iterblkhd);
1222                         LIST_REMOVE(bp, b_vnbufs);
1223                         LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1224
1225                         if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY)
1226                                 busy++;
1227                         if (error)
1228                                 continue;
1229                         lck_mtx_unlock(buf_mtxp);
1230
1231                         bp->b_flags &= ~B_LOCKED;
1232
1233                         /*
1234                          * Wait for I/O associated with indirect blocks to complete,
1235                          * since there is no way to quickly wait for them below.
1236                          */
1237                         if ((bp->b_vp == vp) || (wait == 0))
1238                                 (void) buf_bawrite(bp);
1239                         else
1240                                 (void) VNOP_BWRITE(bp);
1241                         writes_issued++;
1242
1243                         lck_mtx_lock(buf_mtxp);
1244                 }
1245                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1246         }
1247         lck_mtx_unlock(buf_mtxp);
1248
1249         if (wait) {
1250                 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1251
1252                 if (vp->v_dirtyblkhd.lh_first && busy) {
1253                         /*
1254                          * we had one or more BUSY buffers on
1255                          * the dirtyblock list... most likely
1256                          * these are due to delayed writes that
1257                          * were moved to the bclean queue but
1258                          * have not yet been 'written'.
1259                          * if we issued some writes on the
1260                          * previous pass, we try again immediately
1261                          * if we didn't, we'll sleep for some time
1262                          * to allow the state to change...
1263                          */
1264                         if (writes_issued == 0) {
1265                                 (void)tsleep((caddr_t)&vp->v_numoutput,
1266                                              PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1267                         }
1268                         writes_issued = 0;
1269                         busy = 0;
1270
1271                         goto loop;
1272                 }
1273         }
1274 }
1275
1276
1277 /*
1278  * called with buf_mtxp held...
1279  * this lock protects the queue manipulation
1280  */
1281 static int
1282 buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1283 {
1284         struct buflists * listheadp;
1285
1286         if (flags & VBI_DIRTY)
1287                 listheadp = &vp->v_dirtyblkhd;
1288         else
1289                 listheadp = &vp->v_cleanblkhd;
1290
1291         while (vp->v_iterblkflags & VBI_ITER)   {
1292                 vp->v_iterblkflags |= VBI_ITERWANT;
1293                 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
1294         }
1295         if (LIST_EMPTY(listheadp)) {
1296                 LIST_INIT(iterheadp);
1297                 return(EINVAL);
1298         }
1299         vp->v_iterblkflags |= VBI_ITER;
1300
1301         iterheadp->lh_first = listheadp->lh_first;
1302         listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1303         LIST_INIT(listheadp);
1304
1305         return(0);
1306 }
1307
1308 /*
1309  * called with buf_mtxp held...
1310  * this lock protects the queue manipulation
1311  */
1312 static void
1313 buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1314 {
1315         struct buflists * listheadp;
1316         buf_t bp;
1317
1318         if (flags & VBI_DIRTY)
1319                 listheadp = &vp->v_dirtyblkhd;
1320         else
1321                 listheadp = &vp->v_cleanblkhd;
1322
1323         while (!LIST_EMPTY(iterheadp)) {
1324                 bp = LIST_FIRST(iterheadp);
1325                 LIST_REMOVE(bp, b_vnbufs);
1326                 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1327         }
1328         vp->v_iterblkflags &= ~VBI_ITER;
1329
1330         if  (vp->v_iterblkflags & VBI_ITERWANT)         {
1331                 vp->v_iterblkflags &= ~VBI_ITERWANT;
1332                 wakeup(&vp->v_iterblkflags);
1333         }
1334 }
1335
1336
1337 static void
1338 bremfree_locked(buf_t bp)
1339 {
1340         struct bqueues *dp = NULL;
1341         int whichq;
1342         /*
1343          * We only calculate the head of the freelist when removing
1344          * the last element of the list as that is the only time that
1345          * it is needed (e.g. to reset the tail pointer).
1346          *
1347          * NB: This makes an assumption about how tailq's are implemented.
1348          */
1349         whichq = bp->b_whichq;
1350
1351         if (bp->b_freelist.tqe_next == NULL) {
1352                 dp = &bufqueues[whichq];
1353
1354                 if (dp->tqh_last != &bp->b_freelist.tqe_next)
1355                         panic("bremfree: lost tail");
1356         }
1357         TAILQ_REMOVE(dp, bp, b_freelist);
1358
1359 #if BALANCE_QUEUES
1360         bufqdec(whichq);
1361 #endif
1362         if (whichq == BQ_LAUNDRY)
1363                 blaundrycnt--;
1364
1365         bp->b_whichq = -1;
1366         bp->b_timestamp = 0;
1367 }
1368
1369 /*
1370  * Associate a buffer with a vnode.
1371  * buf_mtxp must be locked on entry
1372  */
1373 static void
1374 bgetvp_locked(vnode_t vp, buf_t bp)
1375 {
1376
1377         if (bp->b_vp != vp)
1378                 panic("bgetvp_locked: not free");
1379
1380         if (vp->v_type == VBLK || vp->v_type == VCHR)
1381                 bp->b_dev = vp->v_rdev;
1382         else
1383                 bp->b_dev = NODEV;
1384         /*
1385          * Insert onto list for new vnode.
1386          */
1387         bufinsvn(bp, &vp->v_cleanblkhd);
1388 }
1389
1390 /*
1391  * Disassociate a buffer from a vnode.
1392  * buf_mtxp must be locked on entry
1393  */
1394 static void
1395 brelvp_locked(buf_t bp)
1396 {
1397         /*
1398          * Delete from old vnode list, if on one.
1399          */
1400         if (bp->b_vnbufs.le_next != NOLIST)
1401                 bufremvn(bp);
1402
1403         bp->b_vp = (vnode_t)NULL;
1404 }
1405
1406 /*
1407  * Reassign a buffer from one vnode to another.
1408  * Used to assign file specific control information
1409  * (indirect blocks) to the vnode to which they belong.
1410  */
1411 static void
1412 buf_reassign(buf_t bp, vnode_t newvp)
1413 {
1414         register struct buflists *listheadp;
1415
1416         if (newvp == NULL) {
1417                 printf("buf_reassign: NULL");
1418                 return;
1419         }
1420         lck_mtx_lock_spin(buf_mtxp);
1421
1422         /*
1423          * Delete from old vnode list, if on one.
1424          */
1425         if (bp->b_vnbufs.le_next != NOLIST)
1426                 bufremvn(bp);
1427         /*
1428          * If dirty, put on list of dirty buffers;
1429          * otherwise insert onto list of clean buffers.
1430          */
1431         if (ISSET(bp->b_flags, B_DELWRI))
1432                 listheadp = &newvp->v_dirtyblkhd;
1433         else
1434                 listheadp = &newvp->v_cleanblkhd;
1435         bufinsvn(bp, listheadp);
1436
1437         lck_mtx_unlock(buf_mtxp);
1438 }
1439
1440 static __inline__ void
1441 bufhdrinit(buf_t bp)
1442 {
1443         bzero((char *)bp, sizeof *bp);
1444         bp->b_dev = NODEV;
1445         bp->b_rcred = NOCRED;
1446         bp->b_wcred = NOCRED;
1447         bp->b_vnbufs.le_next = NOLIST;
1448         bp->b_flags = B_INVAL;
1449
1450         return;
1451 }
1452
1453 /*
1454  * Initialize buffers and hash links for buffers.
1455  */
1456 __private_extern__ void
1457 bufinit(void)
1458 {
1459         buf_t   bp;
1460         struct bqueues *dp;
1461         int     i;
1462
1463         nbuf_headers = 0;
1464         /* Initialize the buffer queues ('freelists') and the hash table */
1465         for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1466                 TAILQ_INIT(dp);
1467         bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
1468
1469         buf_busycount = 0;
1470
1471         /* Initialize the buffer headers */
1472         for (i = 0; i < max_nbuf_headers; i++) {
1473                 nbuf_headers++;
1474                 bp = &buf_headers[i];
1475                 bufhdrinit(bp);
1476
1477                 BLISTNONE(bp);
1478                 dp = &bufqueues[BQ_EMPTY];
1479                 bp->b_whichq = BQ_EMPTY;
1480                 bp->b_timestamp = buf_timestamp();
1481                 binsheadfree(bp, dp, BQ_EMPTY);
1482                 binshash(bp, &invalhash);
1483         }
1484
1485         boot_nbuf_headers = nbuf_headers;
1486         for (; i < nbuf_headers + niobuf_headers; i++) {
1487                 bp = &buf_headers[i];
1488                 bufhdrinit(bp);
1489                 bp->b_whichq = -1;
1490                 binsheadfree(bp, &iobufqueue, -1);
1491         }
1492
1493         /*
1494          * allocate lock group attribute and group
1495          */
1496         buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1497         buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1498
1499         /*
1500          * allocate the lock attribute
1501          */
1502         buf_mtx_attr = lck_attr_alloc_init();
1503
1504         /*
1505          * allocate and initialize mutex's for the buffer and iobuffer pools
1506          */
1507         buf_mtxp        = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1508         iobuffer_mtxp   = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1509
1510         if (iobuffer_mtxp == NULL)
1511                 panic("couldn't create iobuffer mutex");
1512
1513         if (buf_mtxp == NULL)
1514                 panic("couldn't create buf mutex");
1515
1516         /*
1517          * allocate and initialize cluster specific global locks...
1518          */
1519         cluster_init();
1520
1521         printf("using %d buffer headers and %d cluster IO buffer headers\n",
1522                 nbuf_headers, niobuf_headers);
1523
1524         /* Set up zones used by the buffer cache */
1525         bufzoneinit();
1526
1527         /* start the bcleanbuf() thread */
1528         bcleanbuf_thread_init();
1529
1530         /* Register a callout for relieving vm pressure */
1531         if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
1532                 panic("Couldn't register buffer cache callout for vm pressure!\n");
1533         }
1534
1535 #if BALANCE_QUEUES
1536         {
1537         static void bufq_balance_thread_init(void) __attribute__((section("__TEXT, initcode")));
1538         /* create a thread to do dynamic buffer queue balancing */
1539         bufq_balance_thread_init();
1540         }
1541 #endif /* notyet */
1542 }
1543
1544
1545
1546 /*
1547  * Zones for the meta data buffers
1548  */
1549
1550 #define MINMETA 512
1551 #define MAXMETA 8192
1552
1553 struct meta_zone_entry {
1554         zone_t mz_zone;
1555         vm_size_t mz_size;
1556         vm_size_t mz_max;
1557         const char *mz_name;
1558 };
1559
1560 struct meta_zone_entry meta_zones[] = {
1561         {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1562         {NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
1563         {NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
1564         {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1565         {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
1566         {NULL, 0, 0, "" } /* End */
1567 };
1568
1569 /*
1570  * Initialize the meta data zones
1571  */
1572 static void
1573 bufzoneinit(void)
1574 {
1575         int i;
1576
1577         for (i = 0; meta_zones[i].mz_size != 0; i++) {
1578                 meta_zones[i].mz_zone =
1579                                 zinit(meta_zones[i].mz_size,
1580                                         meta_zones[i].mz_max,
1581                                         PAGE_SIZE,
1582                                         meta_zones[i].mz_name);
1583         }
1584         buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1585 }
1586
1587 static __inline__ zone_t
1588 getbufzone(size_t size)
1589 {
1590         int i;
1591
1592         if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
1593                 panic("getbufzone: incorect size = %lu", size);
1594
1595         for (i = 0; meta_zones[i].mz_size != 0; i++) {
1596                 if (meta_zones[i].mz_size >= size)
1597                         break;
1598         }
1599
1600         return (meta_zones[i].mz_zone);
1601 }
1602
1603
1604
1605 static struct buf *
1606 bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
1607 {
1608         buf_t   bp;
1609
1610         bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
1611
1612         /*
1613          * If buffer does not have data valid, start a read.
1614          * Note that if buffer is B_INVAL, buf_getblk() won't return it.
1615          * Therefore, it's valid if it's I/O has completed or been delayed.
1616          */
1617         if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
1618                 struct proc *p;
1619
1620                 p = current_proc();
1621
1622                 /* Start I/O for the buffer (keeping credentials). */
1623                 SET(bp->b_flags, B_READ | async);
1624                 if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
1625                         kauth_cred_ref(cred);
1626                         bp->b_rcred = cred;
1627                 }
1628
1629                 VNOP_STRATEGY(bp);
1630
1631                 trace(TR_BREADMISS, pack(vp, size), blkno);
1632
1633                 /* Pay for the read. */
1634                 if (p && p->p_stats)
1635                         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock);            /* XXX */
1636
1637                 if (async) {
1638                         /*
1639                          * since we asked for an ASYNC I/O
1640                          * the biodone will do the brelse
1641                          * we don't want to pass back a bp
1642                          * that we don't 'own'
1643                          */
1644                         bp = NULL;
1645                 }
1646         } else if (async) {
1647                 buf_brelse(bp);
1648                 bp = NULL;
1649         }
1650
1651         trace(TR_BREADHIT, pack(vp, size), blkno);
1652
1653         return (bp);
1654 }
1655
1656 /*
1657  * Perform the reads for buf_breadn() and buf_meta_breadn().
1658  * Trivial modification to the breada algorithm presented in Bach (p.55).
1659  */
1660 static errno_t
1661 do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
1662                    int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
1663 {
1664         buf_t   bp;
1665         int     i;
1666
1667         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
1668
1669         /*
1670          * For each of the read-ahead blocks, start a read, if necessary.
1671          */
1672         for (i = 0; i < nrablks; i++) {
1673                 /* If it's in the cache, just go on to next one. */
1674                 if (incore(vp, rablks[i]))
1675                         continue;
1676
1677                 /* Get a buffer for the read-ahead block */
1678                 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
1679         }
1680
1681         /* Otherwise, we had to start a read for it; wait until it's valid. */
1682         return (buf_biowait(bp));
1683 }
1684
1685
1686 /*
1687  * Read a disk block.
1688  * This algorithm described in Bach (p.54).
1689  */
1690 errno_t
1691 buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
1692 {
1693         buf_t   bp;
1694
1695         /* Get buffer for block. */
1696         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
1697
1698         /* Wait for the read to complete, and return result. */
1699         return (buf_biowait(bp));
1700 }
1701
1702 /*
1703  * Read a disk block. [bread() for meta-data]
1704  * This algorithm described in Bach (p.54).
1705  */
1706 errno_t
1707 buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
1708 {
1709         buf_t   bp;
1710
1711         /* Get buffer for block. */
1712         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
1713
1714         /* Wait for the read to complete, and return result. */
1715         return (buf_biowait(bp));
1716 }
1717
1718 /*
1719  * Read-ahead multiple disk blocks. The first is sync, the rest async.
1720  */
1721 errno_t
1722 buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
1723 {
1724         return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
1725 }
1726
1727 /*
1728  * Read-ahead multiple disk blocks. The first is sync, the rest async.
1729  * [buf_breadn() for meta-data]
1730  */
1731 errno_t
1732 buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
1733 {
1734         return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
1735 }
1736
1737 /*
1738  * Block write.  Described in Bach (p.56)
1739  */
1740 errno_t
1741 buf_bwrite(buf_t bp)
1742 {
1743         int     sync, wasdelayed;
1744         errno_t rv;
1745         proc_t  p = current_proc();
1746         vnode_t vp = bp->b_vp;
1747
1748         if (bp->b_datap == 0) {
1749                 if (brecover_data(bp) == 0)
1750                         return (0);
1751         }
1752         /* Remember buffer type, to switch on it later. */
1753         sync = !ISSET(bp->b_flags, B_ASYNC);
1754         wasdelayed = ISSET(bp->b_flags, B_DELWRI);
1755         CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
1756
1757         if (wasdelayed)
1758                 OSAddAtomicLong(-1, &nbdwrite);
1759
1760         if (!sync) {
1761                 /*
1762                  * If not synchronous, pay for the I/O operation and make
1763                  * sure the buf is on the correct vnode queue.  We have
1764                  * to do this now, because if we don't, the vnode may not
1765                  * be properly notified that its I/O has completed.
1766                  */
1767                 if (wasdelayed)
1768                         buf_reassign(bp, vp);
1769                 else
1770                 if (p && p->p_stats)
1771                         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);    /* XXX */
1772         }
1773         trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
1774
1775         /* Initiate disk write.  Make sure the appropriate party is charged. */
1776
1777         OSAddAtomic(1, &vp->v_numoutput);
1778
1779         VNOP_STRATEGY(bp);
1780
1781         if (sync) {
1782                 /*
1783                  * If I/O was synchronous, wait for it to complete.
1784                  */
1785                 rv = buf_biowait(bp);
1786
1787                 /*
1788                  * Pay for the I/O operation, if it's not been paid for, and
1789                  * make sure it's on the correct vnode queue. (async operatings
1790                  * were payed for above.)
1791                  */
1792                 if (wasdelayed)
1793                         buf_reassign(bp, vp);
1794                 else
1795                 if (p && p->p_stats)
1796                         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);    /* XXX */
1797
1798                 /* Release the buffer. */
1799                 // XXXdbg - only if the unused bit is set
1800                 if (!ISSET(bp->b_flags, B_NORELSE)) {
1801                     buf_brelse(bp);
1802                 } else {
1803                     CLR(bp->b_flags, B_NORELSE);
1804                 }
1805
1806                 return (rv);
1807         } else {
1808                 return (0);
1809         }
1810 }
1811
1812 int
1813 vn_bwrite(struct vnop_bwrite_args *ap)
1814 {
1815         return (buf_bwrite(ap->a_bp));
1816 }
1817
1818 /*
1819  * Delayed write.
1820  *
1821  * The buffer is marked dirty, but is not queued for I/O.
1822  * This routine should be used when the buffer is expected
1823  * to be modified again soon, typically a small write that
1824  * partially fills a buffer.
1825  *
1826  * NB: magnetic tapes cannot be delayed; they must be
1827  * written in the order that the writes are requested.
1828  *
1829  * Described in Leffler, et al. (pp. 208-213).
1830  *
1831  * Note: With the ability to allocate additional buffer
1832  * headers, we can get in to the situation where "too" many
1833  * buf_bdwrite()s can create situation where the kernel can create
1834  * buffers faster than the disks can service. Doing a buf_bawrite() in
1835  * cases were we have "too many" outstanding buf_bdwrite()s avoids that.
1836  */
1837 __private_extern__ int
1838 bdwrite_internal(buf_t bp, int return_error)
1839 {
1840         proc_t  p  = current_proc();
1841         vnode_t vp = bp->b_vp;
1842
1843         /*
1844          * If the block hasn't been seen before:
1845          *      (1) Mark it as having been seen,
1846          *      (2) Charge for the write.
1847          *      (3) Make sure it's on its vnode's correct block list,
1848          */
1849         if (!ISSET(bp->b_flags, B_DELWRI)) {
1850                 SET(bp->b_flags, B_DELWRI);
1851                 if (p && p->p_stats)
1852                         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);    /* XXX */
1853                 OSAddAtomicLong(1, &nbdwrite);
1854                 buf_reassign(bp, vp);
1855         }
1856
1857         /*
1858          * if we're not LOCKED, but the total number of delayed writes
1859          * has climbed above 75% of the total buffers in the system
1860          * return an error if the caller has indicated that it can
1861          * handle one in this case, otherwise schedule the I/O now
1862          * this is done to prevent us from allocating tons of extra
1863          * buffers when dealing with virtual disks (i.e. DiskImages),
1864          * because additional buffers are dynamically allocated to prevent
1865          * deadlocks from occurring
1866          *
1867          * however, can't do a buf_bawrite() if the LOCKED bit is set because the
1868          * buffer is part of a transaction and can't go to disk until
1869          * the LOCKED bit is cleared.
1870          */
1871         if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers/4)*3)) {
1872                 if (return_error)
1873                         return (EAGAIN);
1874                 /*
1875                  * If the vnode has "too many" write operations in progress
1876                  * wait for them to finish the IO
1877                  */
1878                 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");
1879
1880                 return (buf_bawrite(bp));
1881         }
1882
1883         /* Otherwise, the "write" is done, so mark and release the buffer. */
1884         SET(bp->b_flags, B_DONE);
1885         buf_brelse(bp);
1886         return (0);
1887 }
1888
1889 errno_t
1890 buf_bdwrite(buf_t bp)
1891 {
1892         return (bdwrite_internal(bp, 0));
1893 }
1894
1895
1896 /*
1897  * Asynchronous block write; just an asynchronous buf_bwrite().
1898  *
1899  * Note: With the abilitty to allocate additional buffer
1900  * headers, we can get in to the situation where "too" many
1901  * buf_bawrite()s can create situation where the kernel can create
1902  * buffers faster than the disks can service.
1903  * We limit the number of "in flight" writes a vnode can have to
1904  * avoid this.
1905  */
1906 static int
1907 bawrite_internal(buf_t bp, int throttle)
1908 {
1909         vnode_t vp = bp->b_vp;
1910
1911         if (vp) {
1912                 if (throttle)
1913                         /*
1914                          * If the vnode has "too many" write operations in progress
1915                          * wait for them to finish the IO
1916                          */
1917                         (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
1918                 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
1919                         /*
1920                          * return to the caller and
1921                          * let him decide what to do
1922                          */
1923                         return (EWOULDBLOCK);
1924         }
1925         SET(bp->b_flags, B_ASYNC);
1926
1927         return (VNOP_BWRITE(bp));
1928 }
1929
1930 errno_t
1931 buf_bawrite(buf_t bp)
1932 {
1933         return (bawrite_internal(bp, 1));
1934 }
1935
1936
1937 /*
1938  * Release a buffer on to the free lists.
1939  * Described in Bach (p. 46).
1940  */
1941 void
1942 buf_brelse(buf_t bp)
1943 {
1944         struct bqueues *bufq;
1945         long    whichq;
1946         upl_t   upl;
1947         int need_wakeup = 0;
1948         int need_bp_wakeup = 0;
1949
1950
1951         if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
1952                 panic("buf_brelse: bad buffer = %p\n", bp);
1953
1954 #ifdef JOE_DEBUG
1955         (void) OSBacktrace(&bp->b_stackbrelse[0], 6);
1956
1957         bp->b_lastbrelse = current_thread();
1958         bp->b_tag = 0;
1959 #endif
1960         if (bp->b_lflags & BL_IOBUF) {
1961                 free_io_buf(bp);
1962                 return;
1963         }
1964
1965         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
1966                      bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap,
1967                      bp->b_flags, 0);
1968
1969         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1970
1971         /*
1972          * if we're invalidating a buffer that has the B_FILTER bit
1973          * set then call the b_iodone function so it gets cleaned
1974          * up properly.
1975          *
1976          * the HFS journal code depends on this
1977          */
1978         if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
1979                 if (ISSET(bp->b_flags, B_FILTER)) {     /* if necessary, call out */
1980                         void    (*iodone_func)(struct buf *, void *) = bp->b_iodone;
1981                         void    *arg = (void *)bp->b_transaction;
1982
1983                         CLR(bp->b_flags, B_FILTER);     /* but note callout done */
1984                         bp->b_iodone = NULL;
1985                         bp->b_transaction = NULL;
1986
1987                         if (iodone_func == NULL) {
1988                                 panic("brelse: bp @ %p has NULL b_iodone!\n", bp);
1989                         }
1990                         (*iodone_func)(bp, arg);
1991                 }
1992         }
1993         /*
1994          * I/O is done. Cleanup the UPL state
1995          */
1996         upl = bp->b_upl;
1997
1998         if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
1999                 kern_return_t kret;
2000                 int           upl_flags;
2001
2002                 if ( (upl == NULL) ) {
2003                         if ( !ISSET(bp->b_flags, B_INVAL)) {
2004                                 kret = ubc_create_upl(bp->b_vp,
2005                                                       ubc_blktooff(bp->b_vp, bp->b_lblkno),
2006                                                       bp->b_bufsize,
2007                                                       &upl,
2008                                                       NULL,
2009                                                       UPL_PRECIOUS);
2010
2011                                 if (kret != KERN_SUCCESS)
2012                                         panic("brelse: Failed to create UPL");
2013 #if  UPL_DEBUG
2014                                 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5);
2015 #endif /* UPL_DEBUG */
2016                         }
2017                 } else {
2018                         if (bp->b_datap) {
2019                                 kret = ubc_upl_unmap(upl);
2020
2021                                 if (kret != KERN_SUCCESS)
2022                                         panic("ubc_upl_unmap failed");
2023                                 bp->b_datap = (uintptr_t)NULL;
2024                         }
2025                 }
2026                 if (upl) {
2027                         if (bp->b_flags & (B_ERROR | B_INVAL)) {
2028                                 if (bp->b_flags & (B_READ | B_INVAL))
2029                                         upl_flags = UPL_ABORT_DUMP_PAGES;
2030                                 else
2031                                         upl_flags = 0;
2032
2033                                 ubc_upl_abort(upl, upl_flags);
2034                         } else {
2035                                 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
2036                                         upl_flags = UPL_COMMIT_SET_DIRTY ;
2037                                 else
2038                                         upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
2039
2040                                 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
2041                                                      UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2042                         }
2043                         bp->b_upl = NULL;
2044                 }
2045         } else {
2046                 if ( (upl) )
2047                         panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp);
2048         }
2049
2050         /*
2051          * If it's locked, don't report an error; try again later.
2052          */
2053         if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
2054                 CLR(bp->b_flags, B_ERROR);
2055         /*
2056          * If it's not cacheable, or an error, mark it invalid.
2057          */
2058         if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
2059                 SET(bp->b_flags, B_INVAL);
2060
2061         if ((bp->b_bufsize <= 0) ||
2062                         ISSET(bp->b_flags, B_INVAL) ||
2063                         (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) {
2064                 /*
2065                  * If it's invalid or empty, dissociate it from its vnode,
2066                  * release its storage if B_META, and
2067                  * clean it up a bit and put it on the EMPTY queue
2068                  */
2069                 if (ISSET(bp->b_flags, B_DELWRI))
2070                         OSAddAtomicLong(-1, &nbdwrite);
2071
2072                 if (ISSET(bp->b_flags, B_META)) {
2073                         if (bp->b_bufsize) {
2074                                 if (ISSET(bp->b_flags, B_ZALLOC)) {
2075                                         zone_t z;
2076
2077                                         z = getbufzone(bp->b_bufsize);
2078                                         zfree(z, (void *)bp->b_datap);
2079                                 } else
2080                                         kmem_free(kernel_map, bp->b_datap, bp->b_bufsize);
2081
2082                                  bp->b_datap = (uintptr_t)NULL;
2083                                  bp->b_bufsize = 0;
2084                         }
2085                 }
2086                 /*
2087                  * nuke any credentials we were holding
2088                  */
2089                 if (IS_VALID_CRED(bp->b_rcred)) {
2090                         kauth_cred_unref(&bp->b_rcred);
2091                 }
2092                 if (IS_VALID_CRED(bp->b_wcred)) {
2093                         kauth_cred_unref(&bp->b_wcred);
2094                 }
2095                 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
2096
2097                 bufq = &bufqueues[BQ_EMPTY];
2098                 bp->b_whichq = BQ_EMPTY;
2099
2100                 lck_mtx_lock_spin(buf_mtxp);
2101
2102                 if (bp->b_vp)
2103                         brelvp_locked(bp);
2104
2105                 bremhash(bp);
2106                 BLISTNONE(bp);
2107                 binshash(bp, &invalhash);
2108
2109                 binsheadfree(bp, bufq, BQ_EMPTY);
2110         } else {
2111                 /*
2112                  * It has valid data.  Put it on the end of the appropriate
2113                  * queue, so that it'll stick around for as long as possible.
2114                  */
2115                 if (ISSET(bp->b_flags, B_LOCKED))
2116                         whichq = BQ_LOCKED;             /* locked in core */
2117                 else if (ISSET(bp->b_flags, B_META))
2118                         whichq = BQ_META;               /* meta-data */
2119                 else if (ISSET(bp->b_flags, B_AGE))
2120                         whichq = BQ_AGE;                /* stale but valid data */
2121                 else
2122                         whichq = BQ_LRU;                /* valid data */
2123                 bufq = &bufqueues[whichq];
2124
2125                 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
2126                 bp->b_whichq = whichq;
2127                 bp->b_timestamp = buf_timestamp();
2128
2129                 lck_mtx_lock_spin(buf_mtxp);
2130
2131                 binstailfree(bp, bufq, whichq);
2132         }
2133         if (needbuffer) {
2134                 /*
2135                  * needbuffer is a global
2136                  * we're currently using buf_mtxp to protect it
2137                  * delay doing the actual wakeup until after
2138                  * we drop buf_mtxp
2139                  */
2140                 needbuffer = 0;
2141                 need_wakeup = 1;
2142         }
2143         if (ISSET(bp->b_lflags, BL_WANTED)) {
2144                 /*
2145                  * delay the actual wakeup until after we
2146                  * clear BL_BUSY and we've dropped buf_mtxp
2147                  */
2148                 need_bp_wakeup = 1;
2149         }
2150         /*
2151          * Unlock the buffer.
2152          */
2153         CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2154         buf_busycount--;
2155
2156         lck_mtx_unlock(buf_mtxp);
2157
2158         if (need_wakeup) {
2159                 /*
2160                  * Wake up any processes waiting for any buffer to become free.
2161                  */
2162                 wakeup(&needbuffer);
2163         }
2164         if (need_bp_wakeup) {
2165                 /*
2166                  * Wake up any proceeses waiting for _this_ buffer to become free.
2167                  */
2168                 wakeup(bp);
2169         }
2170         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2171                      bp, bp->b_datap, bp->b_flags, 0, 0);
2172 }
2173
2174 /*
2175  * Determine if a block is in the cache.
2176  * Just look on what would be its hash chain.  If it's there, return
2177  * a pointer to it, unless it's marked invalid.  If it's marked invalid,
2178  * we normally don't return the buffer, unless the caller explicitly
2179  * wants us to.
2180  */
2181 static boolean_t
2182 incore(vnode_t vp, daddr64_t blkno)
2183 {
2184         boolean_t retval;
2185         struct  bufhashhdr *dp;
2186
2187         dp = BUFHASH(vp, blkno);
2188
2189         lck_mtx_lock_spin(buf_mtxp);
2190
2191         if (incore_locked(vp, blkno, dp))
2192                 retval = TRUE;
2193         else
2194                 retval = FALSE;
2195         lck_mtx_unlock(buf_mtxp);
2196
2197         return (retval);
2198 }
2199
2200
2201 static buf_t
2202 incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp)
2203 {
2204         struct buf *bp;
2205
2206         /* Search hash chain */
2207         for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
2208                 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2209                     !ISSET(bp->b_flags, B_INVAL)) {
2210                         return (bp);
2211                 }
2212         }
2213         return (NULL);
2214 }
2215
2216
2217 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2218 /*
2219  * Get a block of requested size that is associated with
2220  * a given vnode and block offset. If it is found in the
2221  * block cache, mark it as having been found, make it busy
2222  * and return it. Otherwise, return an empty block of the
2223  * correct size. It is up to the caller to insure that the
2224  * cached blocks be of the correct size.
2225  */
2226 buf_t
2227 buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2228 {
2229         buf_t bp;
2230         int   err;
2231         upl_t upl;
2232         upl_page_info_t *pl;
2233         kern_return_t kret;
2234         int ret_only_valid;
2235         struct timespec ts;
2236         int upl_flags;
2237         struct  bufhashhdr *dp;
2238
2239         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2240                      (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0);
2241
2242         ret_only_valid = operation & BLK_ONLYVALID;
2243         operation &= ~BLK_ONLYVALID;
2244         dp = BUFHASH(vp, blkno);
2245 start:
2246         lck_mtx_lock_spin(buf_mtxp);
2247
2248         if ((bp = incore_locked(vp, blkno, dp))) {
2249                 /*
2250                  * Found in the Buffer Cache
2251                  */
2252                 if (ISSET(bp->b_lflags, BL_BUSY)) {
2253                         /*
2254                          * but is busy
2255                          */
2256                         switch (operation) {
2257                         case BLK_READ:
2258                         case BLK_WRITE:
2259                         case BLK_META:
2260                                 SET(bp->b_lflags, BL_WANTED);
2261                                 bufstats.bufs_busyincore++;
2262
2263                                 /*
2264                                  * don't retake the mutex after being awakened...
2265                                  * the time out is in msecs
2266                                  */
2267                                 ts.tv_sec = (slptimeo/1000);
2268                                 ts.tv_nsec = (slptimeo % 1000) * 10  * NSEC_PER_USEC * 1000;
2269
2270                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE,
2271                                              (uintptr_t)blkno, size, operation, 0, 0);
2272
2273                                 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2274
2275                                 /*
2276                                  * Callers who call with PCATCH or timeout are
2277                                  * willing to deal with the NULL pointer
2278                                  */
2279                                 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2280                                         return (NULL);
2281                                 goto start;
2282                                 /*NOTREACHED*/
2283                                 break;
2284
2285                         default:
2286                                 /*
2287                                  * unknown operation requested
2288                                  */
2289                                 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2290                                 /*NOTREACHED*/
2291                                 break;
2292                         }
2293                 } else {
2294                         /*
2295                          * buffer in core and not busy
2296                          */
2297                         SET(bp->b_lflags, BL_BUSY);
2298                         SET(bp->b_flags, B_CACHE);
2299                         buf_busycount++;
2300
2301                         bremfree_locked(bp);
2302                         bufstats.bufs_incore++;
2303
2304                         lck_mtx_unlock(buf_mtxp);
2305 #ifdef JOE_DEBUG
2306                         bp->b_owner = current_thread();
2307                         bp->b_tag   = 1;
2308 #endif
2309                         if ( (bp->b_upl) )
2310                                 panic("buffer has UPL, but not marked BUSY: %p", bp);
2311
2312                         if ( !ret_only_valid && bp->b_bufsize != size)
2313                                 allocbuf(bp, size);
2314
2315                         upl_flags = 0;
2316                         switch (operation) {
2317                         case BLK_WRITE:
2318                                 /*
2319                                  * "write" operation:  let the UPL subsystem
2320                                  * know that we intend to modify the buffer
2321                                  * cache pages we're gathering.
2322                                  */
2323                                 upl_flags |= UPL_WILL_MODIFY;
2324                         case BLK_READ:
2325                                 upl_flags |= UPL_PRECIOUS;
2326                                 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2327                                         kret = ubc_create_upl(vp,
2328                                                               ubc_blktooff(vp, bp->b_lblkno),
2329                                                               bp->b_bufsize,
2330                                                               &upl,
2331                                                               &pl,
2332                                                               upl_flags);
2333                                         if (kret != KERN_SUCCESS)
2334                                                 panic("Failed to create UPL");
2335
2336                                         bp->b_upl = upl;
2337
2338                                         if (upl_valid_page(pl, 0)) {
2339                                                 if (upl_dirty_page(pl, 0))
2340                                                         SET(bp->b_flags, B_WASDIRTY);
2341                                                 else
2342                                                         CLR(bp->b_flags, B_WASDIRTY);
2343                                         } else
2344                                                 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
2345
2346                                         kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap));
2347
2348                                         if (kret != KERN_SUCCESS)
2349                                                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2350                                 }
2351                                 break;
2352
2353                         case BLK_META:
2354                                 /*
2355                                  * VM is not involved in IO for the meta data
2356                                  * buffer already has valid data
2357                                  */
2358                                 break;
2359
2360                         default:
2361                                 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
2362                                 /*NOTREACHED*/
2363                                 break;
2364                         }
2365                 }
2366         } else { /* not incore() */
2367                 int queue = BQ_EMPTY; /* Start with no preference */
2368
2369                 if (ret_only_valid) {
2370                         lck_mtx_unlock(buf_mtxp);
2371                         return (NULL);
2372                 }
2373                 if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/)
2374                         operation = BLK_META;
2375
2376                 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
2377                         goto start;
2378
2379                 /*
2380                  * getnewbuf may block for a number of different reasons...
2381                  * if it does, it's then possible for someone else to
2382                  * create a buffer for the same block and insert it into
2383                  * the hash... if we see it incore at this point we dump
2384                  * the buffer we were working on and start over
2385                  */
2386                 if (incore_locked(vp, blkno, dp)) {
2387                         SET(bp->b_flags, B_INVAL);
2388                         binshash(bp, &invalhash);
2389
2390                         lck_mtx_unlock(buf_mtxp);
2391
2392                         buf_brelse(bp);
2393                         goto start;
2394                 }
2395                 /*
2396                  * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
2397                  *       CALLED!  BE CAREFUL.
2398                  */
2399
2400                 /*
2401                  * mark the buffer as B_META if indicated
2402                  * so that when buffer is released it will goto META queue
2403                  */
2404                 if (operation == BLK_META)
2405                         SET(bp->b_flags, B_META);
2406
2407                 bp->b_blkno = bp->b_lblkno = blkno;
2408                 bp->b_vp = vp;
2409
2410                 /*
2411                  * Insert in the hash so that incore() can find it
2412                  */
2413                 binshash(bp, BUFHASH(vp, blkno));
2414
2415                 bgetvp_locked(vp, bp);
2416
2417                 lck_mtx_unlock(buf_mtxp);
2418
2419                 allocbuf(bp, size);
2420
2421                 upl_flags = 0;
2422                 switch (operation) {
2423                 case BLK_META:
2424                         /*
2425                          * buffer data is invalid...
2426                          *
2427                          * I don't want to have to retake buf_mtxp,
2428                          * so the miss and vmhits counters are done
2429                          * with Atomic updates... all other counters
2430                          * in bufstats are protected with either
2431                          * buf_mtxp or iobuffer_mtxp
2432                          */
2433                         OSAddAtomicLong(1, &bufstats.bufs_miss);
2434                         break;
2435
2436                 case BLK_WRITE:
2437                         /*
2438                          * "write" operation:  let the UPL subsystem know
2439                          * that we intend to modify the buffer cache pages
2440                          * we're gathering.
2441                          */
2442                         upl_flags |= UPL_WILL_MODIFY;
2443                 case BLK_READ:
2444                   {     off_t   f_offset;
2445                         size_t  contig_bytes;
2446                         int     bmap_flags;
2447
2448                         if ( (bp->b_upl) )
2449                                 panic("bp already has UPL: %p",bp);
2450
2451                         f_offset = ubc_blktooff(vp, blkno);
2452
2453                         upl_flags |= UPL_PRECIOUS;
2454                         kret = ubc_create_upl(vp,
2455                                               f_offset,
2456                                               bp->b_bufsize,
2457                                               &upl,
2458                                               &pl,
2459                                               upl_flags);
2460
2461                         if (kret != KERN_SUCCESS)
2462                                 panic("Failed to create UPL");
2463 #if  UPL_DEBUG
2464                         upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4);
2465 #endif /* UPL_DEBUG */
2466                         bp->b_upl = upl;
2467
2468                         if (upl_valid_page(pl, 0)) {
2469
2470                                 if (operation == BLK_READ)
2471                                         bmap_flags = VNODE_READ;
2472                                 else
2473                                         bmap_flags = VNODE_WRITE;
2474
2475                                 SET(bp->b_flags, B_CACHE | B_DONE);
2476
2477                                 OSAddAtomicLong(1, &bufstats.bufs_vmhits);
2478
2479                                 bp->b_validoff = 0;
2480                                 bp->b_dirtyoff = 0;
2481
2482                                 if (upl_dirty_page(pl, 0)) {
2483                                         /* page is dirty */
2484                                         SET(bp->b_flags, B_WASDIRTY);
2485
2486                                         bp->b_validend = bp->b_bcount;
2487                                         bp->b_dirtyend = bp->b_bcount;
2488                                 } else {
2489                                         /* page is clean */
2490                                         bp->b_validend = bp->b_bcount;
2491                                         bp->b_dirtyend = 0;
2492                                 }
2493                                 /*
2494                                  * try to recreate the physical block number associated with
2495                                  * this buffer...
2496                                  */
2497                                 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
2498                                         panic("getblk: VNOP_BLOCKMAP failed");
2499                                 /*
2500                                  * if the extent represented by this buffer
2501                                  * is not completely physically contiguous on
2502                                  * disk, than we can't cache the physical mapping
2503                                  * in the buffer header
2504                                  */
2505                                 if ((long)contig_bytes < bp->b_bcount)
2506                                         bp->b_blkno = bp->b_lblkno;
2507                         } else {
2508                                 OSAddAtomicLong(1, &bufstats.bufs_miss);
2509                         }
2510                         kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
2511
2512                         if (kret != KERN_SUCCESS)
2513                                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2514                         break;
2515                   }
2516                 default:
2517                         panic("getblk: paging or unknown operation - %x", operation);
2518                         /*NOTREACHED*/
2519                         break;
2520                 }
2521         }
2522         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
2523                      bp, bp->b_datap, bp->b_flags, 3, 0);
2524
2525 #ifdef JOE_DEBUG
2526         (void) OSBacktrace(&bp->b_stackgetblk[0], 6);
2527 #endif
2528         return (bp);
2529 }
2530
2531 /*
2532  * Get an empty, disassociated buffer of given size.
2533  */
2534 buf_t
2535 buf_geteblk(int size)
2536 {
2537         buf_t   bp = NULL;
2538         int queue = BQ_EMPTY;
2539
2540         do {
2541                 lck_mtx_lock_spin(buf_mtxp);
2542
2543                 bp = getnewbuf(0, 0, &queue);
2544         } while (bp == NULL);
2545
2546         SET(bp->b_flags, (B_META|B_INVAL));
2547
2548 #if DIAGNOSTIC
2549         assert(queue == BQ_EMPTY);
2550 #endif /* DIAGNOSTIC */
2551         /* XXX need to implement logic to deal with other queues */
2552
2553         binshash(bp, &invalhash);
2554         bufstats.bufs_eblk++;
2555
2556         lck_mtx_unlock(buf_mtxp);
2557
2558         allocbuf(bp, size);
2559
2560         return (bp);
2561 }
2562
2563
2564 /*
2565  * With UBC, there is no need to expand / shrink the file data
2566  * buffer. The VM uses the same pages, hence no waste.
2567  * All the file data buffers can have one size.
2568  * In fact expand / shrink would be an expensive operation.
2569  *
2570  * Only exception to this is meta-data buffers. Most of the
2571  * meta data operations are smaller than PAGE_SIZE. Having the
2572  * meta-data buffers grow and shrink as needed, optimizes use
2573  * of the kernel wired memory.
2574  */
2575
2576 int
2577 allocbuf(buf_t bp, int size)
2578 {
2579         vm_size_t desired_size;
2580
2581         desired_size = roundup(size, CLBYTES);
2582
2583         if (desired_size < PAGE_SIZE)
2584                 desired_size = PAGE_SIZE;
2585         if (desired_size > MAXBSIZE)
2586                 panic("allocbuf: buffer larger than MAXBSIZE requested");
2587
2588         if (ISSET(bp->b_flags, B_META)) {
2589                 zone_t zprev, z;
2590                 int    nsize = roundup(size, MINMETA);
2591
2592                 if (bp->b_datap) {
2593                         vm_offset_t elem = (vm_offset_t)bp->b_datap;
2594
2595                         if (ISSET(bp->b_flags, B_ZALLOC)) {
2596                                 if (bp->b_bufsize < nsize) {
2597                                         /* reallocate to a bigger size */
2598
2599                                         zprev = getbufzone(bp->b_bufsize);
2600                                         if (nsize <= MAXMETA) {
2601                                                 desired_size = nsize;
2602                                                 z = getbufzone(nsize);
2603                                                 /* b_datap not really a ptr */
2604                                                 *(void **)(&bp->b_datap) = zalloc(z);
2605                                         } else {
2606                                                 bp->b_datap = (uintptr_t)NULL;
2607                                                 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2608                                                 CLR(bp->b_flags, B_ZALLOC);
2609                                         }
2610                                         bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2611                                         zfree(zprev, (void *)elem);
2612                                 } else {
2613                                         desired_size = bp->b_bufsize;
2614                                 }
2615
2616                         } else {
2617                                 if ((vm_size_t)bp->b_bufsize < desired_size) {
2618                                         /* reallocate to a bigger size */
2619                                         bp->b_datap = (uintptr_t)NULL;
2620                                         kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2621                                         bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2622                                         kmem_free(kernel_map, elem, bp->b_bufsize);
2623                                 } else {
2624                                         desired_size = bp->b_bufsize;
2625                                 }
2626                         }
2627                 } else {
2628                         /* new allocation */
2629                         if (nsize <= MAXMETA) {
2630                                 desired_size = nsize;
2631                                 z = getbufzone(nsize);
2632                                 /* b_datap not really a ptr */
2633                                 *(void **)(&bp->b_datap) = zalloc(z);
2634                                 SET(bp->b_flags, B_ZALLOC);
2635                         } else
2636                                 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2637                 }
2638
2639                 if (bp->b_datap == 0)
2640                         panic("allocbuf: NULL b_datap");
2641         }
2642         bp->b_bufsize = desired_size;
2643         bp->b_bcount = size;
2644
2645         return (0);
2646 }
2647
2648 /*
2649  *      Get a new buffer from one of the free lists.
2650  *
2651  *      Request for a queue is passes in. The queue from which the buffer was taken
2652  *      from is returned. Out of range queue requests get BQ_EMPTY. Request for
2653  *      BQUEUE means no preference. Use heuristics in that case.
2654  *      Heuristics is as follows:
2655  *      Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
2656  *      If none available block till one is made available.
2657  *      If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
2658  *      Pick the most stale buffer.
2659  *      If found buffer was marked delayed write, start the async. write
2660  *      and restart the search.
2661  *      Initialize the fields and disassociate the buffer from the vnode.
2662  *      Remove the buffer from the hash. Return the buffer and the queue
2663  *      on which it was found.
2664  *
2665  *      buf_mtxp is held upon entry
2666  *      returns with buf_mtxp locked if new buf available
2667  *      returns with buf_mtxp UNlocked if new buf NOT available
2668  */
2669
2670 static buf_t
2671 getnewbuf(int slpflag, int slptimeo, int * queue)
2672 {
2673         buf_t   bp;
2674         buf_t   lru_bp;
2675         buf_t   age_bp;
2676         buf_t   meta_bp;
2677         int     age_time, lru_time, bp_time, meta_time;
2678         int     req = *queue;   /* save it for restarts */
2679         struct timespec ts;
2680
2681 start:
2682         /*
2683          * invalid request gets empty queue
2684          */
2685         if ((*queue >= BQUEUES) || (*queue < 0)
2686                 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
2687                 *queue = BQ_EMPTY;
2688
2689
2690         if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first))
2691                 goto found;
2692
2693         /*
2694          * need to grow number of bufs, add another one rather than recycling
2695          */
2696         if (nbuf_headers < max_nbuf_headers) {
2697                 /*
2698                  * Increment  count now as lock
2699                  * is dropped for allocation.
2700                  * That avoids over commits
2701                  */
2702                 nbuf_headers++;
2703                 goto add_newbufs;
2704         }
2705         /* Try for the requested queue first */
2706         bp = bufqueues[*queue].tqh_first;
2707         if (bp)
2708                 goto found;
2709
2710         /* Unable to use requested queue */
2711         age_bp = bufqueues[BQ_AGE].tqh_first;
2712         lru_bp = bufqueues[BQ_LRU].tqh_first;
2713         meta_bp = bufqueues[BQ_META].tqh_first;
2714
2715         if (!age_bp && !lru_bp && !meta_bp) {
2716                 /*
2717                  * Unavailble on AGE or LRU or META queues
2718                  * Try the empty list first
2719                  */
2720                 bp = bufqueues[BQ_EMPTY].tqh_first;
2721                 if (bp) {
2722                         *queue = BQ_EMPTY;
2723                         goto found;
2724                 }
2725                 /*
2726                  * We have seen is this is hard to trigger.
2727                  * This is an overcommit of nbufs but needed
2728                  * in some scenarios with diskiamges
2729                  */
2730
2731 add_newbufs:
2732                 lck_mtx_unlock(buf_mtxp);
2733
2734                 /* Create a new temporary buffer header */
2735                 bp = (struct buf *)zalloc(buf_hdr_zone);
2736
2737                 if (bp) {
2738                         bufhdrinit(bp);
2739                         bp->b_whichq = BQ_EMPTY;
2740                         bp->b_timestamp = buf_timestamp();
2741                         BLISTNONE(bp);
2742                         SET(bp->b_flags, B_HDRALLOC);
2743                         *queue = BQ_EMPTY;
2744                 }
2745                 lck_mtx_lock_spin(buf_mtxp);
2746
2747                 if (bp) {
2748                         binshash(bp, &invalhash);
2749                         binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2750                         buf_hdr_count++;
2751                         goto found;
2752                 }
2753                 /* subtract already accounted bufcount */
2754                 nbuf_headers--;
2755
2756                 bufstats.bufs_sleeps++;
2757
2758                 /* wait for a free buffer of any kind */
2759                 needbuffer = 1;
2760                 /* hz value is 100 */
2761                 ts.tv_sec = (slptimeo/1000);
2762                 /* the hz value is 100; which leads to 10ms */
2763                 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
2764
2765                 msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO+1), "getnewbuf", &ts);
2766                 return (NULL);
2767         }
2768
2769         /* Buffer available either on AGE or LRU or META */
2770         bp = NULL;
2771         *queue = -1;
2772
2773         /* Buffer available either on AGE or LRU */
2774         if (!age_bp) {
2775                 bp = lru_bp;
2776                 *queue = BQ_LRU;
2777         } else if (!lru_bp) {
2778                 bp = age_bp;
2779                 *queue = BQ_AGE;
2780         } else { /* buffer available on both AGE and LRU */
2781                 int             t = buf_timestamp();
2782
2783                 age_time = t - age_bp->b_timestamp;
2784                 lru_time = t - lru_bp->b_timestamp;
2785                 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
2786                         bp = age_bp;
2787                         *queue = BQ_AGE;
2788                         /*
2789                          * we should probably re-timestamp eveything in the
2790                          * queues at this point with the current time
2791                          */
2792                 } else {
2793                         if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
2794                                 bp = lru_bp;
2795                                 *queue = BQ_LRU;
2796                         } else {
2797                                 bp = age_bp;
2798                                 *queue = BQ_AGE;
2799                         }
2800                 }
2801         }
2802
2803         if (!bp) { /* Neither on AGE nor on LRU */
2804                 bp = meta_bp;
2805                 *queue = BQ_META;
2806         }  else if (meta_bp) {
2807                 int             t = buf_timestamp();
2808
2809                 bp_time = t - bp->b_timestamp;
2810                 meta_time = t - meta_bp->b_timestamp;
2811
2812                 if (!(bp_time < 0) && !(meta_time < 0)) {
2813                         /* time not set backwards */
2814                         int bp_is_stale;
2815                         bp_is_stale = (*queue == BQ_LRU) ?
2816                                         lru_is_stale : age_is_stale;
2817
2818                         if ((meta_time >= meta_is_stale) &&
2819                                         (bp_time < bp_is_stale)) {
2820                                 bp = meta_bp;
2821                                 *queue = BQ_META;
2822                         }
2823                 }
2824         }
2825 found:
2826         if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
2827                 panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
2828
2829         /* Clean it */
2830         if (bcleanbuf(bp, FALSE)) {
2831                 /*
2832                  * moved to the laundry thread, buffer not ready
2833                  */
2834                 *queue = req;
2835                 goto start;
2836         }
2837         return (bp);
2838 }
2839
2840
2841 /*
2842  * Clean a buffer.
2843  * Returns 0 is buffer is ready to use,
2844  * Returns 1 if issued a buf_bawrite() to indicate
2845  * that the buffer is not ready.
2846  *
2847  * buf_mtxp is held upon entry
2848  * returns with buf_mtxp locked
2849  */
2850 static int
2851 bcleanbuf(buf_t bp, boolean_t discard)
2852 {
2853         /* Remove from the queue */
2854         bremfree_locked(bp);
2855
2856 #ifdef JOE_DEBUG
2857         bp->b_owner = current_thread();
2858         bp->b_tag   = 2;
2859 #endif
2860         /*
2861          * If buffer was a delayed write, start the IO by queuing
2862          * it on the LAUNDRY queue, and return 1
2863          */
2864         if (ISSET(bp->b_flags, B_DELWRI)) {
2865                 if (discard) {
2866                         SET(bp->b_lflags, BL_WANTDEALLOC);
2867                 }
2868
2869                 bp->b_whichq = BQ_LAUNDRY;
2870                 bp->b_timestamp = buf_timestamp();
2871                 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2872                 blaundrycnt++;
2873
2874                 lck_mtx_unlock(buf_mtxp);
2875
2876                 wakeup(&bufqueues[BQ_LAUNDRY]);
2877                 /*
2878                  * and give it a chance to run
2879                  */
2880                 (void)thread_block(THREAD_CONTINUE_NULL);
2881
2882                 lck_mtx_lock_spin(buf_mtxp);
2883
2884                 return (1);
2885         }
2886 #ifdef JOE_DEBUG
2887         bp->b_owner = current_thread();
2888         bp->b_tag   = 8;
2889 #endif
2890         /*
2891          * Buffer is no longer on any free list... we own it
2892          */
2893         SET(bp->b_lflags, BL_BUSY);
2894         buf_busycount++;
2895
2896         bremhash(bp);
2897
2898         /*
2899          * disassociate us from our vnode, if we had one...
2900          */
2901         if (bp->b_vp)
2902                 brelvp_locked(bp);
2903
2904         lck_mtx_unlock(buf_mtxp);
2905
2906         BLISTNONE(bp);
2907
2908         if (ISSET(bp->b_flags, B_META)) {
2909                 vm_offset_t elem;
2910
2911                 elem = (vm_offset_t)bp->b_datap;
2912                 bp->b_datap = (uintptr_t)0xdeadbeef;
2913
2914                 if (ISSET(bp->b_flags, B_ZALLOC)) {
2915                         zone_t z;
2916
2917                         z = getbufzone(bp->b_bufsize);
2918                         zfree(z, (void *)elem);
2919                 } else
2920                         kmem_free(kernel_map, elem, bp->b_bufsize);
2921         }
2922
2923         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2924
2925         /* nuke any credentials we were holding */
2926         if (IS_VALID_CRED(bp->b_rcred)) {
2927                 kauth_cred_unref(&bp->b_rcred);
2928         }
2929         if (IS_VALID_CRED(bp->b_wcred)) {
2930                 kauth_cred_unref(&bp->b_wcred);
2931         }
2932
2933         /* If discarding, just move to the empty queue */
2934         if (discard) {
2935                 lck_mtx_lock_spin(buf_mtxp);
2936                 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
2937                 bp->b_whichq = BQ_EMPTY;
2938                 binshash(bp, &invalhash);
2939                 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2940                 CLR(bp->b_lflags, BL_BUSY);
2941                 buf_busycount--;
2942         } else {
2943                 /* Not discarding: clean up and prepare for reuse */
2944                 bp->b_bufsize = 0;
2945                 bp->b_datap = (uintptr_t)NULL;
2946                 bp->b_upl = (void *)NULL;
2947                 /*
2948                  * preserve the state of whether this buffer
2949                  * was allocated on the fly or not...
2950                  * the only other flag that should be set at
2951                  * this point is BL_BUSY...
2952                  */
2953 #ifdef JOE_DEBUG
2954                 bp->b_owner = current_thread();
2955                 bp->b_tag   = 3;
2956 #endif
2957                 bp->b_lflags = BL_BUSY;
2958                 bp->b_flags = (bp->b_flags & B_HDRALLOC);
2959                 bp->b_dev = NODEV;
2960                 bp->b_blkno = bp->b_lblkno = 0;
2961                 bp->b_iodone = NULL;
2962                 bp->b_error = 0;
2963                 bp->b_resid = 0;
2964                 bp->b_bcount = 0;
2965                 bp->b_dirtyoff = bp->b_dirtyend = 0;
2966                 bp->b_validoff = bp->b_validend = 0;
2967
2968                 lck_mtx_lock_spin(buf_mtxp);
2969         }
2970         return (0);
2971 }
2972
2973
2974
2975 errno_t
2976 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
2977 {
2978         buf_t   bp;
2979         errno_t error;
2980         struct bufhashhdr *dp;
2981
2982         dp = BUFHASH(vp, lblkno);
2983
2984 relook:
2985         lck_mtx_lock_spin(buf_mtxp);
2986
2987         if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
2988                 lck_mtx_unlock(buf_mtxp);
2989                 return (0);
2990         }
2991         if (ISSET(bp->b_lflags, BL_BUSY)) {
2992                 if ( !ISSET(flags, BUF_WAIT)) {
2993                         lck_mtx_unlock(buf_mtxp);
2994                         return (EBUSY);
2995                 }
2996                 SET(bp->b_lflags, BL_WANTED);
2997
2998                 error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
2999
3000                 if (error) {
3001                         return (error);
3002                 }
3003                 goto relook;
3004         }
3005         bremfree_locked(bp);
3006         SET(bp->b_lflags, BL_BUSY);
3007         SET(bp->b_flags, B_INVAL);
3008         buf_busycount++;
3009 #ifdef JOE_DEBUG
3010         bp->b_owner = current_thread();
3011         bp->b_tag   = 4;
3012 #endif
3013         lck_mtx_unlock(buf_mtxp);
3014         buf_brelse(bp);
3015
3016         return (0);
3017 }
3018
3019
3020 void
3021 buf_drop(buf_t bp)
3022 {
3023         int need_wakeup = 0;
3024
3025         lck_mtx_lock_spin(buf_mtxp);
3026
3027         if (ISSET(bp->b_lflags, BL_WANTED)) {
3028                 /*
3029                  * delay the actual wakeup until after we
3030                  * clear BL_BUSY and we've dropped buf_mtxp
3031                  */
3032                 need_wakeup = 1;
3033         }
3034 #ifdef JOE_DEBUG
3035         bp->b_owner = current_thread();
3036         bp->b_tag   = 9;
3037 #endif
3038         /*
3039          * Unlock the buffer.
3040          */
3041         CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
3042         buf_busycount--;
3043
3044         lck_mtx_unlock(buf_mtxp);
3045
3046         if (need_wakeup) {
3047                 /*
3048                  * Wake up any proceeses waiting for _this_ buffer to become free.
3049                  */
3050                 wakeup(bp);
3051         }
3052 }
3053
3054
3055 errno_t
3056 buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
3057         errno_t error;
3058
3059         lck_mtx_lock_spin(buf_mtxp);
3060
3061         error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
3062
3063         lck_mtx_unlock(buf_mtxp);
3064
3065         return (error);
3066 }
3067
3068
3069 static errno_t
3070 buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
3071 {
3072         errno_t error;
3073         struct timespec ts;
3074
3075         if (ISSET(bp->b_flags, B_LOCKED)) {
3076                 if ((flags & BAC_SKIP_LOCKED))
3077                         return (EDEADLK);
3078         } else {
3079                 if ((flags & BAC_SKIP_NONLOCKED))
3080                         return (EDEADLK);
3081         }
3082         if (ISSET(bp->b_lflags, BL_BUSY)) {
3083                 /*
3084                  * since the lck_mtx_lock may block, the buffer
3085                  * may become BUSY, so we need to
3086                  * recheck for a NOWAIT request
3087                  */
3088                 if (flags & BAC_NOWAIT)
3089                         return (EBUSY);
3090                 SET(bp->b_lflags, BL_WANTED);
3091
3092                 /* the hz value is 100; which leads to 10ms */
3093                 ts.tv_sec = (slptimeo/100);
3094                 ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;
3095                 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
3096
3097                 if (error)
3098                         return (error);
3099                 return (EAGAIN);
3100         }
3101         if (flags & BAC_REMOVE)
3102                 bremfree_locked(bp);
3103         SET(bp->b_lflags, BL_BUSY);
3104         buf_busycount++;
3105
3106 #ifdef JOE_DEBUG
3107         bp->b_owner = current_thread();
3108         bp->b_tag   = 5;
3109 #endif
3110         return (0);
3111 }
3112
3113
3114 /*
3115  * Wait for operations on the buffer to complete.
3116  * When they do, extract and return the I/O's error value.
3117  */
3118 errno_t
3119 buf_biowait(buf_t bp)
3120 {
3121         while (!ISSET(bp->b_flags, B_DONE)) {
3122
3123                 lck_mtx_lock_spin(buf_mtxp);
3124
3125                 if (!ISSET(bp->b_flags, B_DONE)) {
3126                         DTRACE_IO1(wait__start, buf_t, bp);
3127                         (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL);
3128                         DTRACE_IO1(wait__done, buf_t, bp);
3129                 } else
3130                         lck_mtx_unlock(buf_mtxp);
3131         }
3132         /* check for interruption of I/O (e.g. via NFS), then errors. */
3133         if (ISSET(bp->b_flags, B_EINTR)) {
3134                 CLR(bp->b_flags, B_EINTR);
3135                 return (EINTR);
3136         } else if (ISSET(bp->b_flags, B_ERROR))
3137                 return (bp->b_error ? bp->b_error : EIO);
3138         else
3139                 return (0);
3140 }
3141
3142 /*
3143  * Wait for the callback operation on a B_CALL buffer to complete.
3144  */
3145 void
3146 buf_biowait_callback(buf_t bp)
3147 {
3148         while (!ISSET(bp->b_lflags, BL_CALLDONE)) {
3149
3150                 lck_mtx_lock_spin(buf_mtxp);
3151
3152                 if (!ISSET(bp->b_lflags, BL_CALLDONE)) {
3153                         DTRACE_IO1(wait__start, buf_t, bp);
3154                         (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL);
3155                         DTRACE_IO1(wait__done, buf_t, bp);
3156                 } else
3157                         lck_mtx_unlock(buf_mtxp);
3158         }
3159 }
3160
3161 /*
3162  * Mark I/O complete on a buffer.
3163  *
3164  * If a callback has been requested, e.g. the pageout
3165  * daemon, do so. Otherwise, awaken waiting processes.
3166  *
3167  * [ Leffler, et al., says on p.247:
3168  *      "This routine wakes up the blocked process, frees the buffer
3169  *      for an asynchronous write, or, for a request by the pagedaemon
3170  *      process, invokes a procedure specified in the buffer structure" ]
3171  *
3172  * In real life, the pagedaemon (or other system processes) wants
3173  * to do async stuff to, and doesn't want the buffer buf_brelse()'d.
3174  * (for swap pager, that puts swap buffers on the free lists (!!!),
3175  * for the vn device, that puts malloc'd buffers on the free lists!)
3176  */
3177 extern struct timeval priority_IO_timestamp_for_root;
3178 extern int hard_throttle_on_root;
3179
3180 void
3181 buf_biodone(buf_t bp)
3182 {
3183         mount_t mp;
3184
3185         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
3186                      bp, bp->b_datap, bp->b_flags, 0, 0);
3187
3188         if (ISSET(bp->b_flags, B_DONE))
3189                 panic("biodone already");
3190
3191         if (ISSET(bp->b_flags, B_ERROR)) {
3192                 fslog_io_error(bp);
3193         }
3194
3195         if (bp->b_vp && bp->b_vp->v_mount) {
3196                 mp = bp->b_vp->v_mount;
3197         } else {
3198                 mp = NULL;
3199         }
3200
3201         if (mp && (bp->b_flags & B_READ) == 0) {
3202                 update_last_io_time(mp);
3203                 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
3204         } else if (mp) {
3205                 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
3206         }
3207
3208         if (kdebug_enable) {
3209                 int    code = DKIO_DONE;
3210
3211                 if (bp->b_flags & B_READ)
3212                         code |= DKIO_READ;
3213                 if (bp->b_flags & B_ASYNC)
3214                         code |= DKIO_ASYNC;
3215
3216                 if (bp->b_flags & B_META)
3217                         code |= DKIO_META;
3218                 else if (bp->b_flags & B_PAGEIO)
3219                         code |= DKIO_PAGING;
3220
3221                 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3222                               bp, (uintptr_t)bp->b_vp,
3223                                       bp->b_resid, bp->b_error, 0);
3224         }
3225         if ((bp->b_vp != NULLVP) &&
3226             ((bp->b_flags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
3227             (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
3228                 microuptime(&priority_IO_timestamp_for_root);
3229                 hard_throttle_on_root = 0;
3230         }
3231         /*
3232          * I/O was done, so don't believe
3233          * the DIRTY state from VM anymore
3234          */
3235         CLR(bp->b_flags, B_WASDIRTY);
3236         DTRACE_IO1(done, buf_t, bp);
3237
3238         if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3239                 /*
3240                  * wake up any writer's blocked
3241                  * on throttle or waiting for I/O
3242                  * to drain
3243                  */
3244                 vnode_writedone(bp->b_vp);
3245
3246         if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) {  /* if necessary, call out */
3247                 void    (*iodone_func)(struct buf *, void *) = bp->b_iodone;
3248                 void    *arg = (void *)bp->b_transaction;
3249                 int     callout = ISSET(bp->b_flags, B_CALL);
3250
3251                 CLR(bp->b_flags, (B_CALL | B_FILTER));  /* filters and callouts are one-shot */
3252                 bp->b_iodone = NULL;
3253                 bp->b_transaction = NULL;
3254
3255                 if (iodone_func == NULL) {
3256                         panic("biodone: bp @ %p has NULL b_iodone!\n", bp);
3257                 } else {
3258                         if (callout)
3259                                 SET(bp->b_flags, B_DONE);       /* note that it's done */
3260                         (*iodone_func)(bp, arg);
3261                 }
3262                 if (callout) {
3263                         int need_wakeup = 0;
3264
3265                         /*
3266                          * assumes that the callback function takes
3267                          * ownership of the bp and deals with releasing it if necessary
3268                          * BL_WANTED indicates that we've decided to wait on the
3269                          * completion of this I/O in a synchronous manner... we
3270                          * still call the callback function, but in addition we
3271                          * will do a wakeup... BL_CALLDONE indicates that the callback
3272                          * routine has completed and its ok for the waiter to take
3273                          * 'ownership' of this bp back
3274                          */
3275                         lck_mtx_lock_spin(buf_mtxp);
3276
3277                         if (bp->b_lflags & BL_WANTED) {
3278                                 CLR(bp->b_lflags, BL_WANTED);
3279                                 need_wakeup = 1;
3280                         }
3281                         SET(bp->b_lflags, BL_CALLDONE);
3282
3283                         lck_mtx_unlock(buf_mtxp);
3284
3285                         if (need_wakeup)
3286                                 wakeup(bp);
3287
3288                         goto biodone_done;
3289                 }
3290                 /*
3291                  * in this case the call back function is acting
3292                  * strictly as a filter... it does not take
3293                  * ownership of the bp and is expecting us
3294                  * to finish cleaning up... this is currently used
3295                  * by the HFS journaling code
3296                  */
3297         }
3298         if (ISSET(bp->b_flags, B_ASYNC)) {      /* if async, release it */
3299                 SET(bp->b_flags, B_DONE);       /* note that it's done */
3300
3301                 buf_brelse(bp);
3302         } else {                                /* or just wakeup the buffer */
3303                 /*
3304                  * by taking the mutex, we serialize
3305                  * the buf owner calling buf_biowait so that we'll
3306                  * only see him in one of 2 states...
3307                  * state 1: B_DONE wasn't set and he's
3308                  * blocked in msleep
3309                  * state 2: he's blocked trying to take the
3310                  * mutex before looking at B_DONE
3311                  * BL_WANTED is cleared in case anyone else
3312                  * is blocked waiting for the buffer... note
3313                  * that we haven't cleared B_BUSY yet, so if
3314                  * they do get to run, their going to re-set
3315                  * BL_WANTED and go back to sleep
3316                  */
3317                 lck_mtx_lock_spin(buf_mtxp);
3318
3319                 CLR(bp->b_lflags, BL_WANTED);
3320                 SET(bp->b_flags, B_DONE);               /* note that it's done */
3321
3322                 lck_mtx_unlock(buf_mtxp);
3323
3324                 wakeup(bp);
3325         }
3326 biodone_done:
3327         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3328                  (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0);
3329 }
3330
3331 /*
3332  * Return a count of buffers on the "locked" queue.
3333  */
3334 int
3335 count_lock_queue(void)
3336 {
3337         buf_t   bp;
3338         int     n = 0;
3339
3340         lck_mtx_lock_spin(buf_mtxp);
3341
3342         for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3343             bp = bp->b_freelist.tqe_next)
3344                 n++;
3345         lck_mtx_unlock(buf_mtxp);
3346
3347         return (n);
3348 }
3349
3350 /*
3351  * Return a count of 'busy' buffers. Used at the time of shutdown.
3352  */
3353 int
3354 count_busy_buffers(void)
3355 {
3356         return buf_busycount + bufstats.bufs_iobufinuse;
3357 }
3358
3359 #if DIAGNOSTIC
3360 /*
3361  * Print out statistics on the current allocation of the buffer pool.
3362  * Can be enabled to print out on every ``sync'' by setting "syncprt"
3363  * in vfs_syscalls.c using sysctl.
3364  */
3365 void
3366 vfs_bufstats()
3367 {
3368         int i, j, count;
3369         register struct buf *bp;
3370         register struct bqueues *dp;
3371         int counts[MAXBSIZE/CLBYTES+1];
3372         static char *bname[BQUEUES] =
3373                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3374
3375         for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
3376                 count = 0;
3377                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3378                         counts[j] = 0;
3379
3380                 lck_mtx_lock(buf_mtxp);
3381
3382                 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
3383                         counts[bp->b_bufsize/CLBYTES]++;
3384                         count++;
3385                 }
3386                 lck_mtx_unlock(buf_mtxp);
3387
3388                 printf("%s: total-%d", bname[i], count);
3389                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3390                         if (counts[j] != 0)
3391                                 printf(", %d-%d", j * CLBYTES, counts[j]);
3392                 printf("\n");
3393         }
3394 }
3395 #endif /* DIAGNOSTIC */
3396
3397 #define NRESERVEDIOBUFS 64
3398
3399
3400 buf_t
3401 alloc_io_buf(vnode_t vp, int priv)
3402 {
3403         buf_t   bp;
3404
3405         lck_mtx_lock_spin(iobuffer_mtxp);
3406
3407         while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
3408                (bp = iobufqueue.tqh_first) == NULL) {
3409                 bufstats.bufs_iobufsleeps++;
3410
3411                 need_iobuffer = 1;
3412                 (void) msleep(&need_iobuffer, iobuffer_mtxp, PDROP | (PRIBIO+1), (const char *)"alloc_io_buf", NULL);
3413
3414                 lck_mtx_lock_spin(iobuffer_mtxp);
3415         }
3416         TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
3417
3418         bufstats.bufs_iobufinuse++;
3419         if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
3420                 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
3421
3422         lck_mtx_unlock(iobuffer_mtxp);
3423
3424         /*
3425          * initialize various fields
3426          * we don't need to hold the mutex since the buffer
3427          * is now private... the vp should have a reference
3428          * on it and is not protected by this mutex in any event
3429          */
3430         bp->b_timestamp = 0;
3431         bp->b_proc = NULL;
3432
3433         bp->b_datap = 0;
3434         bp->b_flags = 0;
3435         bp->b_lflags = BL_BUSY | BL_IOBUF;
3436         bp->b_blkno = bp->b_lblkno = 0;
3437 #ifdef JOE_DEBUG
3438         bp->b_owner = current_thread();
3439         bp->b_tag   = 6;
3440 #endif
3441         bp->b_iodone = NULL;
3442         bp->b_error = 0;
3443         bp->b_resid = 0;
3444         bp->b_bcount = 0;
3445         bp->b_bufsize = 0;
3446         bp->b_upl = NULL;
3447         bp->b_vp = vp;
3448
3449         if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
3450                 bp->b_dev = vp->v_rdev;
3451         else
3452                 bp->b_dev = NODEV;
3453
3454         return (bp);
3455 }
3456
3457
3458 void
3459 free_io_buf(buf_t bp)
3460 {
3461         int need_wakeup = 0;
3462
3463         /*
3464          * put buffer back on the head of the iobufqueue
3465          */
3466         bp->b_vp = NULL;
3467         bp->b_flags = B_INVAL;
3468
3469         lck_mtx_lock_spin(iobuffer_mtxp);
3470
3471         binsheadfree(bp, &iobufqueue, -1);
3472
3473         if (need_iobuffer) {
3474                 /*
3475                  * Wake up any processes waiting because they need an io buffer
3476                  *
3477                  * do the wakeup after we drop the mutex... it's possible that the
3478                  * wakeup will be superfluous if need_iobuffer gets set again and
3479                  * another thread runs this path, but it's highly unlikely, doesn't
3480                  * hurt, and it means we don't hold up I/O progress if the wakeup blocks
3481                  * trying to grab a task related lock...
3482                  */
3483                 need_iobuffer = 0;
3484                 need_wakeup = 1;
3485         }
3486         if (bufstats.bufs_iobufinuse <= 0)
3487                 panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse < 0", bp);
3488
3489         bufstats.bufs_iobufinuse--;
3490
3491         lck_mtx_unlock(iobuffer_mtxp);
3492
3493         if (need_wakeup)
3494                 wakeup(&need_iobuffer);
3495 }
3496
3497
3498 void
3499 buf_list_lock(void)
3500 {
3501         lck_mtx_lock_spin(buf_mtxp);
3502 }
3503
3504 void
3505 buf_list_unlock(void)
3506 {
3507         lck_mtx_unlock(buf_mtxp);
3508 }
3509
3510 /*
3511  * If getnewbuf() calls bcleanbuf() on the same thread
3512  * there is a potential for stack overrun and deadlocks.
3513  * So we always handoff the work to a worker thread for completion
3514  */
3515
3516
3517 static void
3518 bcleanbuf_thread_init(void)
3519 {
3520         thread_t        thread = THREAD_NULL;
3521
3522         /* create worker thread */
3523         kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread);
3524         thread_deallocate(thread);
3525 }
3526
3527 static void
3528 bcleanbuf_thread(void)
3529 {
3530         struct buf *bp;
3531         int error = 0;
3532         int loopcnt = 0;
3533
3534         for (;;) {
3535                 lck_mtx_lock_spin(buf_mtxp);
3536
3537                 while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) {
3538                         (void)msleep((void *)&bufqueues[BQ_LAUNDRY], buf_mtxp, PDROP | PRIBIO, "blaundry", NULL);
3539
3540                         lck_mtx_lock_spin(buf_mtxp);
3541                 }
3542                 /*
3543                  * Remove from the queue
3544                  */
3545                 bremfree_locked(bp);
3546
3547                 /*
3548                  * Buffer is no longer on any free list
3549                  */
3550                 SET(bp->b_lflags, BL_BUSY);
3551                 buf_busycount++;
3552
3553 #ifdef JOE_DEBUG
3554                 bp->b_owner = current_thread();
3555                 bp->b_tag   = 10;
3556 #endif
3557
3558                 lck_mtx_unlock(buf_mtxp);
3559                 /*
3560                  * do the IO
3561                  */
3562                 error = bawrite_internal(bp, 0);
3563
3564                 if (error) {
3565                         bp->b_whichq = BQ_LAUNDRY;
3566                         bp->b_timestamp = buf_timestamp();
3567
3568                         lck_mtx_lock_spin(buf_mtxp);
3569
3570                         binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
3571                         blaundrycnt++;
3572
3573                         /* we never leave a busy page on the laundary queue */
3574                         CLR(bp->b_lflags, BL_BUSY);
3575                         buf_busycount--;
3576 #ifdef JOE_DEBUG
3577                         bp->b_owner = current_thread();
3578                         bp->b_tag   = 11;
3579 #endif
3580
3581                         lck_mtx_unlock(buf_mtxp);
3582
3583                         if (loopcnt > 10) {
3584                                 (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1);
3585                                 loopcnt = 0;
3586                         } else {
3587                                 (void)thread_block(THREAD_CONTINUE_NULL);
3588                                 loopcnt++;
3589                         }
3590                 }
3591         }
3592 }
3593
3594
3595 static int
3596 brecover_data(buf_t bp)
3597 {
3598         int     upl_offset;
3599         upl_t   upl;
3600         upl_page_info_t *pl;
3601         kern_return_t kret;
3602         vnode_t vp = bp->b_vp;
3603         int upl_flags;
3604
3605
3606         if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
3607                 goto dump_buffer;
3608
3609         upl_flags = UPL_PRECIOUS;
3610         if (! (buf_flags(bp) & B_READ)) {
3611                 /*
3612                  * "write" operation:  let the UPL subsystem know
3613                  * that we intend to modify the buffer cache pages we're
3614                  * gathering.
3615                  */
3616                 upl_flags |= UPL_WILL_MODIFY;
3617         }
3618
3619         kret = ubc_create_upl(vp,
3620                               ubc_blktooff(vp, bp->b_lblkno),
3621                               bp->b_bufsize,
3622                               &upl,
3623                               &pl,
3624                               upl_flags);
3625         if (kret != KERN_SUCCESS)
3626                 panic("Failed to create UPL");
3627
3628         for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
3629
3630                 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
3631                         ubc_upl_abort(upl, 0);
3632                         goto dump_buffer;
3633                 }
3634         }
3635         bp->b_upl = upl;
3636
3637         kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
3638
3639         if (kret != KERN_SUCCESS)
3640                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3641         return (1);
3642
3643 dump_buffer:
3644         bp->b_bufsize = 0;
3645         SET(bp->b_flags, B_INVAL);
3646         buf_brelse(bp);
3647
3648         return(0);
3649 }
3650
3651 static boolean_t
3652 buffer_cache_gc(void)
3653 {
3654         buf_t bp;
3655         boolean_t did_large_zfree = FALSE;
3656         int now = buf_timestamp();
3657
3658         lck_mtx_lock_spin(buf_mtxp);
3659
3660         /* We only care about metadata (incore storage comes from zalloc()) */
3661         bp = TAILQ_FIRST(&bufqueues[BQ_META]);
3662
3663         /* Only collect buffers unused in the last N seconds. Note: ordered by timestamp. */
3664         while ((bp != NULL) && ((now - bp->b_timestamp) > BUF_STALE_THRESHHOLD)) {
3665                 int result, size;
3666                 boolean_t is_zalloc;
3667
3668                 size = buf_size(bp);
3669                 is_zalloc = ISSET(bp->b_flags, B_ZALLOC);
3670
3671                 result = bcleanbuf(bp, TRUE);
3672                 if ((result == 0) && is_zalloc && (size >= PAGE_SIZE)) {
3673                         /* We've definitely freed at least a page to a zone */
3674                         did_large_zfree = TRUE;
3675                 }
3676                 bp = TAILQ_FIRST(&bufqueues[BQ_META]);
3677         }
3678
3679         lck_mtx_unlock(buf_mtxp);
3680
3681         return did_large_zfree;
3682 }
3683
3684
3685 /*
3686  * disabled for now
3687  */
3688
3689 #if FLUSH_QUEUES
3690
3691 #define NFLUSH 32
3692
3693 static int
3694 bp_cmp(void *a, void *b)
3695 {
3696     buf_t *bp_a = *(buf_t **)a,
3697           *bp_b = *(buf_t **)b;
3698     daddr64_t res;
3699
3700     // don't have to worry about negative block
3701     // numbers so this is ok to do.
3702     //
3703     res = (bp_a->b_blkno - bp_b->b_blkno);
3704
3705     return (int)res;
3706 }
3707
3708
3709 int
3710 bflushq(int whichq, mount_t mp)
3711 {
3712         buf_t   bp, next;
3713         int     i, buf_count;
3714         int     total_writes = 0;
3715         static buf_t flush_table[NFLUSH];
3716
3717         if (whichq < 0 || whichq >= BQUEUES) {
3718             return (0);
3719         }
3720
3721   restart:
3722         lck_mtx_lock(buf_mtxp);
3723
3724         bp = TAILQ_FIRST(&bufqueues[whichq]);
3725
3726         for (buf_count = 0; bp; bp = next) {
3727             next = bp->b_freelist.tqe_next;
3728
3729             if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
3730                 continue;
3731             }
3732
3733             if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
3734
3735                 bremfree_locked(bp);
3736 #ifdef JOE_DEBUG
3737                 bp->b_owner = current_thread();
3738                 bp->b_tag   = 7;
3739 #endif
3740                 SET(bp->b_lflags, BL_BUSY);
3741                 buf_busycount++;
3742
3743                 flush_table[buf_count] = bp;
3744                 buf_count++;
3745                 total_writes++;
3746
3747                 if (buf_count >= NFLUSH) {
3748                     lck_mtx_unlock(buf_mtxp);
3749
3750                     qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3751
3752                     for (i = 0; i < buf_count; i++) {
3753                         buf_bawrite(flush_table[i]);
3754                     }
3755                     goto restart;
3756                 }
3757             }
3758         }
3759         lck_mtx_unlock(buf_mtxp);
3760
3761         if (buf_count > 0) {
3762             qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3763
3764             for (i = 0; i < buf_count; i++) {
3765                 buf_bawrite(flush_table[i]);
3766             }
3767         }
3768
3769         return (total_writes);
3770 }
3771 #endif
3772
3773
3774 #if BALANCE_QUEUES
3775
3776 /* XXX move this to a separate file */
3777
3778 /*
3779  * NOTE: THIS CODE HAS NOT BEEN UPDATED
3780  * WITH RESPECT TO THE NEW LOCKING MODEL
3781  */
3782
3783
3784 /*
3785  * Dynamic Scaling of the Buffer Queues
3786  */
3787
3788 typedef long long blsize_t;
3789
3790 blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
3791 /* Global tunable limits */
3792 blsize_t nbufh;                 /* number of buffer headers */
3793 blsize_t nbuflow;               /* minimum number of buffer headers required */
3794 blsize_t nbufhigh;              /* maximum number of buffer headers allowed */
3795 blsize_t nbuftarget;    /* preferred number of buffer headers */
3796
3797 /*
3798  * assertions:
3799  *
3800  * 1.   0 < nbuflow <= nbufh <= nbufhigh
3801  * 2.   nbufhigh <= MAXNBUF
3802  * 3.   0 < nbuflow <= nbuftarget <= nbufhigh
3803  * 4.   nbufh can not be set by sysctl().
3804  */
3805
3806 /* Per queue tunable limits */
3807
3808 struct bufqlim {
3809         blsize_t        bl_nlow;        /* minimum number of buffer headers required */
3810         blsize_t        bl_num;         /* number of buffer headers on the queue */
3811         blsize_t        bl_nlhigh;      /* maximum number of buffer headers allowed */
3812         blsize_t        bl_target;      /* preferred number of buffer headers */
3813         long    bl_stale;       /* Seconds after which a buffer is considered stale */
3814 } bufqlim[BQUEUES];
3815
3816 /*
3817  * assertions:
3818  *
3819  * 1.   0 <= bl_nlow <= bl_num <= bl_nlhigh
3820  * 2.   bl_nlhigh <= MAXNBUF
3821  * 3.  bufqlim[BQ_META].bl_nlow != 0
3822  * 4.  bufqlim[BQ_META].bl_nlow > (number of possible concurrent
3823  *                                                                      file system IO operations)
3824  * 5.   bl_num can not be set by sysctl().
3825  * 6.   bl_nhigh <= nbufhigh
3826  */
3827
3828 /*
3829  * Rationale:
3830  * ----------
3831  * Defining it blsize_t as long permits 2^31 buffer headers per queue.
3832  * Which can describe (2^31 * PAGE_SIZE) memory per queue.
3833  *
3834  * These limits are exported to by means of sysctl().
3835  * It was decided to define blsize_t as a 64 bit quantity.
3836  * This will make sure that we will not be required to change it
3837  * as long as we do not exceed 64 bit address space for the kernel.
3838  *
3839  * low and high numbers parameters initialized at compile time
3840  * and boot arguments can be used to override them. sysctl()
3841  * would not change the value. sysctl() can get all the values
3842  * but can set only target. num is the current level.
3843  *
3844  * Advantages of having a "bufqscan" thread doing the balancing are,
3845  * Keep enough bufs on BQ_EMPTY.
3846  *      getnewbuf() by default will always select a buffer from the BQ_EMPTY.
3847  *              getnewbuf() perfoms best if a buffer was found there.
3848  *              Also this minimizes the possibility of starting IO
3849  *              from getnewbuf(). That's a performance win, too.
3850  *
3851  *      Localize complex logic [balancing as well as time aging]
3852  *              to balancebufq().
3853  *
3854  *      Simplify getnewbuf() logic by elimination of time aging code.
3855  */
3856
3857 /*
3858  * Algorithm:
3859  * -----------
3860  * The goal of the dynamic scaling of the buffer queues to to keep
3861  * the size of the LRU close to bl_target. Buffers on a queue would
3862  * be time aged.
3863  *
3864  * There would be a thread which will be responsible for "balancing"
3865  * the buffer cache queues.
3866  *
3867  * The scan order would be:     AGE, LRU, META, EMPTY.
3868  */
3869
3870 long bufqscanwait = 0;
3871
3872 static void bufqscan_thread();
3873 static int balancebufq(int q);
3874 static int btrimempty(int n);
3875 static __inline__ int initbufqscan(void);
3876 static __inline__ int nextbufq(int q);
3877 static void buqlimprt(int all);
3878
3879
3880 static __inline__ void
3881 bufqinc(int q)
3882 {
3883         if ((q < 0) || (q >= BQUEUES))
3884                 return;
3885
3886         bufqlim[q].bl_num++;
3887         return;
3888 }
3889
3890 static __inline__ void
3891 bufqdec(int q)
3892 {
3893         if ((q < 0) || (q >= BQUEUES))
3894                 return;
3895
3896         bufqlim[q].bl_num--;
3897         return;
3898 }
3899
3900 static void
3901 bufq_balance_thread_init(void)
3902 {
3903         thread_t        thread = THREAD_NULL;
3904
3905         if (bufqscanwait++ == 0) {
3906
3907                 /* Initalize globals */
3908                 MAXNBUF = (sane_size / PAGE_SIZE);
3909                 nbufh = nbuf_headers;
3910                 nbuflow = min(nbufh, 100);
3911                 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
3912                 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
3913                 nbuftarget = max(nbuflow, nbuftarget);
3914                 nbuftarget = min(nbufhigh, nbuftarget);
3915
3916                 /*
3917                  * Initialize the bufqlim
3918                  */
3919
3920                 /* LOCKED queue */
3921                 bufqlim[BQ_LOCKED].bl_nlow = 0;
3922                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3923                 bufqlim[BQ_LOCKED].bl_target = 0;
3924                 bufqlim[BQ_LOCKED].bl_stale = 30;
3925
3926                 /* LRU queue */
3927                 bufqlim[BQ_LRU].bl_nlow = 0;
3928                 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
3929                 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
3930                 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
3931
3932                 /* AGE queue */
3933                 bufqlim[BQ_AGE].bl_nlow = 0;
3934                 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
3935                 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
3936                 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
3937
3938                 /* EMPTY queue */
3939                 bufqlim[BQ_EMPTY].bl_nlow = 0;
3940                 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
3941                 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
3942                 bufqlim[BQ_EMPTY].bl_stale = 600000;
3943
3944                 /* META queue */
3945                 bufqlim[BQ_META].bl_nlow = 0;
3946                 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
3947                 bufqlim[BQ_META].bl_target = nbuftarget/4;
3948                 bufqlim[BQ_META].bl_stale = META_IS_STALE;
3949
3950                 /* LAUNDRY queue */
3951                 bufqlim[BQ_LOCKED].bl_nlow = 0;
3952                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3953                 bufqlim[BQ_LOCKED].bl_target = 0;
3954                 bufqlim[BQ_LOCKED].bl_stale = 30;
3955
3956                 buqlimprt(1);
3957         }
3958
3959         /* create worker thread */
3960         kernel_thread_start((thread_continue_t)bufqscan_thread, NULL, &thread);
3961         thread_deallocate(thread);
3962 }
3963
3964 /* The workloop for the buffer balancing thread */
3965 static void
3966 bufqscan_thread()
3967 {
3968         int moretodo = 0;
3969
3970         for(;;) {
3971                 do {
3972                         int q;  /* buffer queue to process */
3973
3974                         q = initbufqscan();
3975                         for (; q; ) {
3976                                 moretodo |= balancebufq(q);
3977                                 q = nextbufq(q);
3978                         }
3979                 } while (moretodo);
3980
3981 #if DIAGNOSTIC
3982                 vfs_bufstats();
3983                 buqlimprt(0);
3984 #endif
3985                 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
3986                 moretodo = 0;
3987         }
3988 }
3989
3990 /* Seed for the buffer queue balancing */
3991 static __inline__ int
3992 initbufqscan()
3993 {
3994         /* Start with AGE queue */
3995         return (BQ_AGE);
3996 }
3997
3998 /* Pick next buffer queue to balance */
3999 static __inline__ int
4000 nextbufq(int q)
4001 {
4002         int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
4003
4004         q++;
4005         q %= sizeof(order);
4006         return (order[q]);
4007 }
4008
4009 /* function to balance the buffer queues */
4010 static int
4011 balancebufq(int q)
4012 {
4013         int moretodo = 0;
4014         int n, t;
4015
4016         /* reject invalid q */
4017         if ((q < 0) || (q >= BQUEUES))
4018                 goto out;
4019
4020         /* LOCKED or LAUNDRY queue MUST not be balanced */
4021         if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
4022                 goto out;
4023
4024         n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
4025
4026         /* If queue has less than target nothing more to do */
4027         if (n < 0)
4028                 goto out;
4029
4030         if ( n > 8 ) {
4031                 /* Balance only a small amount (12.5%) at a time */
4032                 n >>= 3;
4033         }
4034
4035         /* EMPTY queue needs special handling */
4036         if (q == BQ_EMPTY) {
4037                 moretodo |= btrimempty(n);
4038                 goto out;
4039         }
4040
4041         t = buf_timestamp():
4042
4043         for (; n > 0; n--) {
4044                 struct buf *bp = bufqueues[q].tqh_first;
4045                 if (!bp)
4046                         break;
4047
4048                 /* check if it's stale */
4049                 if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
4050                         if (bcleanbuf(bp, FALSE)) {
4051                                 /* buf_bawrite() issued, bp not ready */
4052                                 moretodo = 1;
4053                         } else {
4054                                 /* release the cleaned buffer to BQ_EMPTY */
4055                                 SET(bp->b_flags, B_INVAL);
4056                                 buf_brelse(bp);
4057                         }
4058                 } else
4059                         break;
4060         }
4061
4062 out:
4063         return (moretodo);
4064 }
4065
4066 static int
4067 btrimempty(int n)
4068 {
4069         /*
4070          * When struct buf are allocated dynamically, this would
4071          * reclaim upto 'n' struct buf from the empty queue.
4072          */
4073
4074          return (0);
4075 }
4076
4077 static void
4078 buqlimprt(int all)
4079 {
4080         int i;
4081     static char *bname[BQUEUES] =
4082                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4083
4084         if (all)
4085                 for (i = 0; i < BQUEUES; i++) {
4086                         printf("%s : ", bname[i]);
4087                         printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
4088                         printf("cur = %ld, ", (long)bufqlim[i].bl_num);
4089                         printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
4090                         printf("target = %ld, ", (long)bufqlim[i].bl_target);
4091                         printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
4092                 }
4093         else
4094                 for (i = 0; i < BQUEUES; i++) {
4095                         printf("%s : ", bname[i]);
4096                         printf("cur = %ld, ", (long)bufqlim[i].bl_num);
4097                 }
4098 }
4099
4100 #endif
4101
4102