bsd/vfs/vfs_bio.c

   1 /*
   2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License.  The rights granted to you under the
  10  * License may not be used to create, or enable the creation or
  11  * redistribution of, unlawful or unlicensed copies of an Apple operating
  12  * system, or to circumvent, violate, or enable the circumvention or
  13  * violation of, any terms of an Apple operating system software license
  14  * agreement.
  15  *
  16  * Please obtain a copy of the License at
  17  * http://www.opensource.apple.com/apsl/ and read it before using this
  18  * file.
  19  *
  20  * The Original Code and all software distributed under the License are
  21  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  22  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  23  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  24  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  25  * Please see the License for the specific language governing rights and
  26  * limitations under the License.
  27  *
  28  * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
  29  */
  30 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  31 /*-
  32  * Copyright (c) 1994 Christopher G. Demetriou
  33  * Copyright (c) 1982, 1986, 1989, 1993
  34  *      The Regents of the University of California.  All rights reserved.
  35  * (c) UNIX System Laboratories, Inc.
  36  * All or some portions of this file are derived from material licensed
  37  * to the University of California by American Telephone and Telegraph
  38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  39  * the permission of UNIX System Laboratories, Inc.
  40  *
  41  * Redistribution and use in source and binary forms, with or without
  42  * modification, are permitted provided that the following conditions
  43  * are met:
  44  * 1. Redistributions of source code must retain the above copyright
  45  *    notice, this list of conditions and the following disclaimer.
  46  * 2. Redistributions in binary form must reproduce the above copyright
  47  *    notice, this list of conditions and the following disclaimer in the
  48  *    documentation and/or other materials provided with the distribution.
  49  * 3. All advertising materials mentioning features or use of this software
  50  *    must display the following acknowledgement:
  51  *      This product includes software developed by the University of
  52  *      California, Berkeley and its contributors.
  53  * 4. Neither the name of the University nor the names of its contributors
  54  *    may be used to endorse or promote products derived from this software
  55  *    without specific prior written permission.
  56  *
  57  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  58  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  59  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  60  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  61  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  62  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  63  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  64  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  65  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  66  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  67  * SUCH DAMAGE.
  68  *
  69  *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
  70  */
  71
  72 /*
  73  * Some references:
  74  *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
  75  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
  76  *              UNIX Operating System (Addison Wesley, 1989)
  77  */
  78
  79 #include <sys/param.h>
  80 #include <sys/systm.h>
  81 #include <sys/proc_internal.h>
  82 #include <sys/buf_internal.h>
  83 #include <sys/vnode_internal.h>
  84 #include <sys/mount_internal.h>
  85 #include <sys/trace.h>
  86 #include <sys/malloc.h>
  87 #include <sys/resourcevar.h>
  88 #include <miscfs/specfs/specdev.h>
  89 #include <sys/ubc.h>
  90 #include <sys/kauth.h>
  91 #if DIAGNOSTIC
  92 #include <kern/assert.h>
  93 #endif /* DIAGNOSTIC */
  94 #include <kern/task.h>
  95 #include <kern/zalloc.h>
  96 #include <kern/lock.h>
  97
  98 #include <vm/vm_kern.h>
  99
 100 #include <sys/kdebug.h>
 101 #include <machine/spl.h>
 102
/*
 * Forward declarations for the helpers that are private to this file.
 * bufqinc/bufqdec are only declared when BALANCE_QUEUES is configured.
 */
#if BALANCE_QUEUES
static __inline__ void bufqinc(int q);
static __inline__ void bufqdec(int q);
#endif

static int      bcleanbuf(buf_t bp);
static int      brecover_data(buf_t bp);
static boolean_t incore(vnode_t vp, daddr64_t blkno);
static buf_t    incore_locked(vnode_t vp, daddr64_t blkno);
/* timeout is in msecs */
static buf_t    getnewbuf(int slpflag, int slptimeo, int *queue);
static void     bremfree_locked(buf_t bp);
static void     buf_reassign(buf_t bp, vnode_t newvp);
static errno_t  buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
static int      buf_iterprepare(vnode_t vp, struct buflists *, int flags);
static void     buf_itercomplete(vnode_t vp, struct buflists *, int flags);

/* not static: declared __private_extern__ so it is visible outside this file */
__private_extern__ int  bdwrite_internal(buf_t, int);

/* zone allocated buffer headers */
static void     bufzoneinit(void);
static void     bcleanbuf_thread_init(void);
static void     bcleanbuf_thread(void);

/* zone handle and counter for the zone allocated buffer headers */
static zone_t   buf_hdr_zone;
static int      buf_hdr_count;
 129
 130
/*
 * Definitions for the buffer hash lists.
 *
 * BUFHASH yields the address of a bucket in bufhashtbl for a
 * (vnode pointer, logical block number) pair: the pointer value is
 * divided by the size of the object it points to, the block number
 * (truncated to int) is added, and the sum is masked with bufhash.
 *
 * bufhashtbl points at the array of bucket heads; invalhash is a
 * separate list head of the same type.
 * NOTE(review): bufhash is used as a mask, so it is presumably
 * (table size - 1) with a power-of-two table size -- confirm where
 * the table is allocated.
 */
#define BUFHASH(dvp, lbn)       \
        (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long  bufhash;
 138
/* Definitions for the buffer stats. */
struct bufstats bufstats;

/* Number of delayed write buffers */
int nbdwrite = 0;
/* NOTE(review): presumably the number of buffers handed to the cleaning thread -- confirm */
int blaundrycnt = 0;


/* tail queue of buf headers named iobufqueue, and one tail queue per BQUEUES index */
static TAILQ_HEAD(ioqueue, buf) iobufqueue;
static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
/* NOTE(review): look like "someone is waiting for a buffer / an I/O buffer" indicators -- confirm */
static int needbuffer;
static int need_iobuffer;

/*
 * Lock group, attributes and mutexes used by this file.
 * buf_mtxp is held while the per-vnode buffer lists are walked
 * (see buf_iterate).
 */
static lck_grp_t        *buf_mtx_grp;
static lck_attr_t       *buf_mtx_attr;
static lck_grp_attr_t   *buf_mtx_grp_attr;
static lck_mtx_t        *iobuffer_mtxp;
static lck_mtx_t        *buf_mtxp;
 157
 158 static __inline__ int
 159 buf_timestamp(void)
 160 {
 161         struct  timeval         t;
 162         microuptime(&t);
 163         return (t.tv_sec);
 164 }
 165
/*
 * Insq/Remq for the buffer free lists.
 *
 * binsheadfree(bp, dp, whichq) links bp at the head of the tail queue dp,
 * binstailfree(bp, dp, whichq) links it at the tail; both use the
 * b_freelist linkage, record whichq in bp->b_whichq and stamp
 * bp->b_timestamp with buf_timestamp().
 *
 * When BALANCE_QUEUES is configured the macros additionally call
 * bufqinc(whichq).
 */
#if BALANCE_QUEUES
#define binsheadfree(bp, dp, whichq)    do { \
                                    TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
                                        bufqinc((whichq));      \
                                        (bp)->b_whichq = whichq; \
                                    (bp)->b_timestamp = buf_timestamp(); \
                                } while (0)

#define binstailfree(bp, dp, whichq)    do { \
                                    TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
                                        bufqinc((whichq));      \
                                        (bp)->b_whichq = whichq; \
                                    (bp)->b_timestamp = buf_timestamp(); \
                                } while (0)
#else
#define binsheadfree(bp, dp, whichq)    do { \
                                    TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
                                        (bp)->b_whichq = whichq; \
                                    (bp)->b_timestamp = buf_timestamp(); \
                                } while (0)

#define binstailfree(bp, dp, whichq)    do { \
                                    TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
                                        (bp)->b_whichq = whichq; \
                                    (bp)->b_timestamp = buf_timestamp(); \
                                } while (0)
#endif
 196
 197
 198 #define BHASHENTCHECK(bp)       \
 199         if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)  \
 200                 panic("%x: b_hash.le_prev is not deadbeef", (bp));
 201
/*
 * Mark bp as being on no hash list: le_next is cleared and le_prev is
 * set to the 0xdeadbeef marker that BHASHENTCHECK, blistenterhead and
 * bremhash test for.
 *
 * NOTE(review): this expands to two separate statements, so it must not
 * be used as the body of an unbraced if/else/loop.
 */
#define BLISTNONE(bp)   \
        (bp)->b_hash.le_next = (struct buf *)0; \
        (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
 205
/*
 * Insq/Remq for the vnode usage lists.
 *
 * bufinsvn(bp, dp) puts bp at the head of list dp using the b_vnbufs
 * linkage.  bufremvn(bp) unlinks bp from whatever b_vnbufs list it is
 * on and then sets its le_next to NOLIST.
 *
 * bufremvn expands to a brace-enclosed block rather than a
 * do { } while (0) statement.
 */
#define bufinsvn(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define bufremvn(bp) {                                                  \
        LIST_REMOVE(bp, b_vnbufs);                                      \
        (bp)->b_vnbufs.le_next = NOLIST;                                \
}
 214
/*
 * Time in seconds before a buffer on a list is
 * considered as a stale buffer
 */
#define LRU_IS_STALE 120 /* default value for the LRU */
#define AGE_IS_STALE 60  /* default value for the AGE */
#define META_IS_STALE 180 /* default value for the BQ_META */

/* run-time variables, initialized from the defaults above */
int lru_is_stale = LRU_IS_STALE;
int age_is_stale = AGE_IS_STALE;
int meta_is_stale = META_IS_STALE;
/* NOTE(review): presumably the buffer count chosen at boot -- confirm against its users */
static int boot_nbuf = 0;
 227
 228
 229 /* LIST_INSERT_HEAD() with assertions */
 230 static __inline__ void
 231 blistenterhead(struct bufhashhdr * head, buf_t bp)
 232 {
 233         if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
 234                 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
 235         (head)->lh_first = bp;
 236         bp->b_hash.le_prev = &(head)->lh_first;
 237         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 238                 panic("blistenterhead: le_prev is deadbeef");
 239 }
 240
 241 static __inline__ void
 242 binshash(buf_t bp, struct bufhashhdr *dp)
 243 {
 244 #if DIAGNOSTIC
 245         buf_t   nbp;
 246 #endif /* DIAGNOSTIC */
 247
 248         BHASHENTCHECK(bp);
 249
 250 #if DIAGNOSTIC
 251         nbp = dp->lh_first;
 252         for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
 253                 if(nbp == bp)
 254                         panic("buf already in hashlist");
 255         }
 256 #endif /* DIAGNOSTIC */
 257
 258         blistenterhead(dp, bp);
 259 }
 260
 261 static __inline__ void
 262 bremhash(buf_t  bp)
 263 {
 264         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 265                 panic("bremhash le_prev is deadbeef");
 266         if (bp->b_hash.le_next == bp)
 267                 panic("bremhash: next points to self");
 268
 269         if (bp->b_hash.le_next != NULL)
 270                 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
 271         *bp->b_hash.le_prev = (bp)->b_hash.le_next;
 272 }
 273
 274
 275
 276
 277 int
 278 buf_valid(buf_t bp) {
 279
 280         if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
 281                 return 1;
 282         return 0;
 283 }
 284
 285 int
 286 buf_fromcache(buf_t bp) {
 287
 288         if ( (bp->b_flags & B_CACHE) )
 289                 return 1;
 290         return 0;
 291 }
 292
 293 void
 294 buf_markinvalid(buf_t bp) {
 295
 296         SET(bp->b_flags, B_INVAL);
 297 }
 298
 299 void
 300 buf_markdelayed(buf_t bp) {
 301
 302         SET(bp->b_flags, B_DELWRI);
 303         buf_reassign(bp, bp->b_vp);
 304 }
 305
 306 void
 307 buf_markeintr(buf_t bp) {
 308
 309         SET(bp->b_flags, B_EINTR);
 310 }
 311
 312 void
 313 buf_markaged(buf_t bp) {
 314
 315         SET(bp->b_flags, B_AGE);
 316 }
 317
 318 errno_t
 319 buf_error(buf_t bp) {
 320
 321         return (bp->b_error);
 322 }
 323
 324 void
 325 buf_seterror(buf_t bp, errno_t error) {
 326
 327         if ((bp->b_error = error))
 328                 SET(bp->b_flags, B_ERROR);
 329         else
 330                 CLR(bp->b_flags, B_ERROR);
 331 }
 332
 333 void
 334 buf_setflags(buf_t bp, int32_t flags) {
 335
 336         SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
 337 }
 338
 339 void
 340 buf_clearflags(buf_t bp, int32_t flags) {
 341
 342         CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
 343 }
 344
 345 int32_t
 346 buf_flags(buf_t bp) {
 347
 348         return ((bp->b_flags & BUF_X_RDFLAGS));
 349 }
 350
 351 void
 352 buf_reset(buf_t bp, int32_t io_flags) {
 353
 354         CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE));
 355         SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
 356
 357         bp->b_error = 0;
 358 }
 359
 360 uint32_t
 361 buf_count(buf_t bp) {
 362
 363         return (bp->b_bcount);
 364 }
 365
 366 void
 367 buf_setcount(buf_t bp, uint32_t bcount) {
 368
 369         bp->b_bcount = bcount;
 370 }
 371
 372 uint32_t
 373 buf_size(buf_t bp) {
 374
 375         return (bp->b_bufsize);
 376 }
 377
 378 void
 379 buf_setsize(buf_t bp, uint32_t bufsize) {
 380
 381         bp->b_bufsize = bufsize;
 382 }
 383
 384 uint32_t
 385 buf_resid(buf_t bp) {
 386
 387         return (bp->b_resid);
 388 }
 389
 390 void
 391 buf_setresid(buf_t bp, uint32_t resid) {
 392
 393         bp->b_resid = resid;
 394 }
 395
 396 uint32_t
 397 buf_dirtyoff(buf_t bp) {
 398
 399         return (bp->b_dirtyoff);
 400 }
 401
 402 uint32_t
 403 buf_dirtyend(buf_t bp) {
 404
 405         return (bp->b_dirtyend);
 406 }
 407
 408 void
 409 buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
 410
 411         bp->b_dirtyoff = dirtyoff;
 412 }
 413
 414 void
 415 buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
 416
 417         bp->b_dirtyend = dirtyend;
 418 }
 419
 420 uintptr_t
 421 buf_dataptr(buf_t bp) {
 422
 423         return (bp->b_datap);
 424 }
 425
 426 void
 427 buf_setdataptr(buf_t bp, uintptr_t data) {
 428
 429         bp->b_datap = data;
 430 }
 431
 432 vnode_t
 433 buf_vnode(buf_t bp) {
 434
 435         return (bp->b_vp);
 436 }
 437
 438 void
 439 buf_setvnode(buf_t bp, vnode_t vp) {
 440
 441         bp->b_vp = vp;
 442 }
 443
 444
 445 void *
 446 buf_callback(buf_t bp)
 447 {
 448         if ( !(bp->b_lflags & BL_IOBUF) )
 449                 return ((void *) NULL);
 450         if ( !(bp->b_flags & B_CALL) )
 451                 return ((void *) NULL);
 452
 453         return ((void *)bp->b_iodone);
 454 }
 455
 456
 457 errno_t
 458 buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
 459 {
 460
 461         if ( !(bp->b_lflags & BL_IOBUF) )
 462                 return (EINVAL);
 463
 464         if (callback)
 465                 bp->b_flags |= (B_CALL | B_ASYNC);
 466         else
 467                 bp->b_flags &= ~B_CALL;
 468         bp->b_transaction = transaction;
 469         bp->b_iodone = callback;
 470
 471         return (0);
 472 }
 473
 474 errno_t
 475 buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
 476 {
 477
 478         if ( !(bp->b_lflags & BL_IOBUF) )
 479                 return (EINVAL);
 480
 481         if (upl)
 482                 bp->b_flags |= B_CLUSTER;
 483         else
 484                 bp->b_flags &= ~B_CLUSTER;
 485         bp->b_upl = upl;
 486         bp->b_uploffset = offset;
 487
 488         return (0);
 489 }
 490
 491 buf_t
 492 buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
 493 {
 494         buf_t   io_bp;
 495
 496         if (io_offset < 0 || io_size < 0)
 497                 return (NULL);
 498
 499         if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
 500                 return (NULL);
 501
 502         if (bp->b_flags & B_CLUSTER) {
 503                 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
 504                         return (NULL);
 505
 506                 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
 507                         return (NULL);
 508         }
 509         io_bp = alloc_io_buf(bp->b_vp, 0);
 510
 511         io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_ASYNC | B_READ);
 512
 513         if (iodone) {
 514                 io_bp->b_transaction = arg;
 515                 io_bp->b_iodone = iodone;
 516                 io_bp->b_flags |= B_CALL;
 517         }
 518         if (bp->b_flags & B_CLUSTER) {
 519                 io_bp->b_upl = bp->b_upl;
 520                 io_bp->b_uploffset = bp->b_uploffset + io_offset;
 521         } else {
 522                 io_bp->b_datap  = (uintptr_t)(((char *)bp->b_datap) + io_offset);
 523         }
 524         io_bp->b_bcount = io_size;
 525
 526         return (io_bp);
 527 }
 528
 529
 530
 531 void
 532 buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
 533               void **old_iodone, void **old_transaction)
 534 {
 535         if (old_iodone)
 536                 *old_iodone = (void *)(bp->b_iodone);
 537         if (old_transaction)
 538                 *old_transaction = (void *)(bp->b_transaction);
 539
 540         bp->b_transaction = transaction;
 541         bp->b_iodone = filter;
 542         bp->b_flags |= B_FILTER;
 543 }
 544
 545
 546 daddr64_t
 547 buf_blkno(buf_t bp) {
 548
 549         return (bp->b_blkno);
 550 }
 551
 552 daddr64_t
 553 buf_lblkno(buf_t bp) {
 554
 555         return (bp->b_lblkno);
 556 }
 557
 558 void
 559 buf_setblkno(buf_t bp, daddr64_t blkno) {
 560
 561         bp->b_blkno = blkno;
 562 }
 563
 564 void
 565 buf_setlblkno(buf_t bp, daddr64_t lblkno) {
 566
 567         bp->b_lblkno = lblkno;
 568 }
 569
 570 dev_t
 571 buf_device(buf_t bp) {
 572
 573         return (bp->b_dev);
 574 }
 575
 576 errno_t
 577 buf_setdevice(buf_t bp, vnode_t vp) {
 578
 579         if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
 580                 return EINVAL;
 581         bp->b_dev = vp->v_rdev;
 582
 583         return 0;
 584 }
 585
 586
 587 void *
 588 buf_drvdata(buf_t bp) {
 589
 590         return (bp->b_drvdata);
 591 }
 592
 593 void
 594 buf_setdrvdata(buf_t bp, void *drvdata) {
 595
 596         bp->b_drvdata = drvdata;
 597 }
 598
 599 void *
 600 buf_fsprivate(buf_t bp) {
 601
 602         return (bp->b_fsprivate);
 603 }
 604
 605 void
 606 buf_setfsprivate(buf_t bp, void *fsprivate) {
 607
 608         bp->b_fsprivate = fsprivate;
 609 }
 610
 611 ucred_t
 612 buf_rcred(buf_t bp) {
 613
 614         return (bp->b_rcred);
 615 }
 616
 617 ucred_t
 618 buf_wcred(buf_t bp) {
 619
 620         return (bp->b_wcred);
 621 }
 622
 623 void *
 624 buf_upl(buf_t bp) {
 625
 626         return (bp->b_upl);
 627 }
 628
 629 uint32_t
 630 buf_uploffset(buf_t bp) {
 631
 632         return ((uint32_t)(bp->b_uploffset));
 633 }
 634
 635 proc_t
 636 buf_proc(buf_t bp) {
 637
 638         return (bp->b_proc);
 639 }
 640
 641
 642 errno_t
 643 buf_map(buf_t bp, caddr_t *io_addr)
 644 {
 645         buf_t           real_bp;
 646         vm_offset_t     vaddr;
 647         kern_return_t   kret;
 648
 649         if ( !(bp->b_flags & B_CLUSTER)) {
 650                 *io_addr = (caddr_t)bp->b_datap;
 651                 return (0);
 652         }
 653         real_bp = (buf_t)(bp->b_real_bp);
 654
 655         if (real_bp && real_bp->b_datap) {
 656                 /*
 657                  * b_real_bp is only valid if B_CLUSTER is SET
 658                  * if it's non-zero, than someone did a cluster_bp call
 659                  * if the backing physical pages were already mapped
 660                  * in before the call to cluster_bp (non-zero b_datap),
 661                  * than we just use that mapping
 662                  */
 663                 *io_addr = (caddr_t)real_bp->b_datap;
 664                 return (0);
 665         }
 666         kret = ubc_upl_map(bp->b_upl, &vaddr);    /* Map it in */
 667
 668         if (kret != KERN_SUCCESS) {
 669                 *io_addr = 0;
 670
 671                 return(ENOMEM);
 672         }
 673         vaddr += bp->b_uploffset;
 674
 675         *io_addr = (caddr_t)vaddr;
 676
 677         return (0);
 678 }
 679
 680 errno_t
 681 buf_unmap(buf_t bp)
 682 {
 683         buf_t           real_bp;
 684         kern_return_t   kret;
 685
 686         if ( !(bp->b_flags & B_CLUSTER))
 687                 return (0);
 688         /*
 689          * see buf_map for the explanation
 690          */
 691         real_bp = (buf_t)(bp->b_real_bp);
 692
 693         if (real_bp && real_bp->b_datap)
 694                 return (0);
 695
 696         if (bp->b_lflags & BL_IOBUF) {
 697                 /*
 698                  * when we commit these pages, we'll hit
 699                  * it with UPL_COMMIT_INACTIVE which
 700                  * will clear the reference bit that got
 701                  * turned on when we touched the mapping
 702                  */
 703                 bp->b_flags |= B_AGE;
 704         }
 705         kret = ubc_upl_unmap(bp->b_upl);
 706
 707         if (kret != KERN_SUCCESS)
 708                 return (EINVAL);
 709         return (0);
 710 }
 711
 712
 713 void
 714 buf_clear(buf_t bp) {
 715         caddr_t baddr;
 716
 717         if (buf_map(bp, &baddr) == 0) {
 718                 bzero(baddr, bp->b_bcount);
 719                 buf_unmap(bp);
 720         }
 721         bp->b_resid = 0;
 722 }
 723
 724
 725
 726 /*
 727  * Read or write a buffer that is not contiguous on disk.
 728  * buffer is marked done/error at the conclusion
 729  */
 730 static int
 731 buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
 732 {
 733         vnode_t vp = buf_vnode(bp);
 734         buf_t   io_bp;                   /* For reading or writing a single block */
 735         int     io_direction;
 736         int     io_resid;
 737         size_t  io_contig_bytes;
 738         daddr64_t io_blkno;
 739         int     error = 0;
 740         int     bmap_flags;
 741
 742         /*
 743          * save our starting point... the bp was already mapped
 744          * in buf_strategy before we got called
 745          * no sense doing it again.
 746          */
 747         io_blkno = bp->b_blkno;
 748         /*
 749          * Make sure we redo this mapping for the next I/O
 750          * i.e. this can never be a 'permanent' mapping
 751          */
 752         bp->b_blkno = bp->b_lblkno;
 753
 754         /*
 755          * Get an io buffer to do the deblocking
 756          */
 757         io_bp = alloc_io_buf(devvp, 0);
 758
 759         io_bp->b_lblkno = bp->b_lblkno;
 760         io_bp->b_datap  = bp->b_datap;
 761         io_resid        = bp->b_bcount;
 762         io_direction    = bp->b_flags & B_READ;
 763         io_contig_bytes = contig_bytes;
 764
 765         if (bp->b_flags & B_READ)
 766                 bmap_flags = VNODE_READ;
 767         else
 768                 bmap_flags = VNODE_WRITE;
 769
 770         for (;;) {
 771                 if (io_blkno == -1)
 772                         /*
 773                          * this is unexepected, but we'll allow for it
 774                          */
 775                         bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
 776                 else {
 777                         io_bp->b_bcount  = io_contig_bytes;
 778                         io_bp->b_bufsize = io_contig_bytes;
 779                         io_bp->b_resid   = io_contig_bytes;
 780                         io_bp->b_blkno   = io_blkno;
 781
 782                         buf_reset(io_bp, io_direction);
 783                         /*
 784                          * Call the device to do the I/O and wait for it
 785                          */
 786                         if ((error = VNOP_STRATEGY(io_bp)))
 787                                 break;
 788                         if ((error = (int)buf_biowait(io_bp)))
 789                                 break;
 790                         if (io_bp->b_resid) {
 791                                 io_resid -= (io_contig_bytes - io_bp->b_resid);
 792                                 break;
 793                         }
 794                 }
 795                 if ((io_resid -= io_contig_bytes) == 0)
 796                         break;
 797                 f_offset       += io_contig_bytes;
 798                 io_bp->b_datap += io_contig_bytes;
 799
 800                 /*
 801                  * Map the current position to a physical block number
 802                  */
 803                 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
 804                         break;
 805         }
 806         buf_free(io_bp);
 807
 808         if (error)
 809                 buf_seterror(bp, error);
 810         bp->b_resid = io_resid;
 811         /*
 812          * This I/O is now complete
 813          */
 814         buf_biodone(bp);
 815
 816         return error;
 817 }
 818
 819
/*
 * struct vnop_strategy_args {
 *      struct buf *a_bp;
 * } *ap;
 *
 * Strategy helper: translate the buffer's logical block number to a
 * physical one (when that has not been done yet) and pass the I/O on to
 * the strategy routine of the device vnode devvp.
 *
 * Panics if the buffer has no vnode or if that vnode is a VCHR/VBLK
 * device.
 *
 * For buffers without B_CLUSTER:
 *   - a buffer with a UPL attached is handed to cluster_bp();
 *   - if b_blkno still equals b_lblkno the block is mapped with
 *     VNOP_BLKTOOFF + VNOP_BLOCKMAP; a mapping error is recorded on the
 *     buffer, the buffer is completed with buf_biodone() and the error
 *     is returned;
 *   - a mapping of -1 causes the buffer to be zero-filled (buf_clear)
 *     and completed without issuing any I/O;
 *   - a mapping whose contiguous run is shorter than b_bcount is
 *     handed to buf_strategy_fragmented().
 *
 * Returns 0 or an errno value.
 */
errno_t
buf_strategy(vnode_t devvp, void *ap)
{
        buf_t   bp = ((struct vnop_strategy_args *)ap)->a_bp;
        vnode_t vp = bp->b_vp;
        int     bmap_flags;
        errno_t error;

        if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
                panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
        /*
         * associate the physical device
         * with this buf_t even if we don't
         * end up issuing the I/O...
         */
        bp->b_dev = devvp->v_rdev;

        if (bp->b_flags & B_READ)
                bmap_flags = VNODE_READ;
        else
                bmap_flags = VNODE_WRITE;

        if ( !(bp->b_flags & B_CLUSTER)) {

                if ( (bp->b_upl) ) {
                        /*
                         * we have a UPL associated with this bp
                         * go through cluster_bp which knows how
                         * to deal with filesystem block sizes
                         * that aren't equal to the page size
                         */
                        return (cluster_bp(bp));
                }
                /*
                 * b_blkno == b_lblkno is how an as yet
                 * unmapped buffer is recognized here
                 */
                if (bp->b_blkno == bp->b_lblkno) {
                        off_t   f_offset;
                        size_t  contig_bytes;

                        if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
                                buf_seterror(bp, error);
                                buf_biodone(bp);

                                return (error);
                        }
                        if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
                                buf_seterror(bp, error);
                                buf_biodone(bp);

                                return (error);
                        }
                        if (bp->b_blkno == -1)
                                buf_clear(bp);
                        else if ((long)contig_bytes < bp->b_bcount)
                                return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
                }
                /*
                 * no physical block behind this buffer...
                 * complete it without going to the device
                 */
                if (bp->b_blkno == -1) {
                        buf_biodone(bp);
                        return (0);
                }
        }
        /*
         * we can issue the I/O because...
         * either B_CLUSTER is set which
         * means that the I/O is properly set
         * up to be a multiple of the page size, or
         * we were able to successfully set up the
         * physical block mapping
         */
        return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap));
}
 894
 895
 896
 897 buf_t
 898 buf_alloc(vnode_t vp)
 899 {
 900         return(alloc_io_buf(vp, 0));
 901 }
 902
 903 void
 904 buf_free(buf_t bp) {
 905
 906         free_io_buf(bp);
 907 }
 908
 909
 910
/*
 * Invoke 'callout' on the buffers of vp's dirty list.
 *
 * vp       vnode whose dirty buffers are visited
 * callout  called as callout(bp, arg) for each buffer that could be
 *          acquired; its return value decides what happens next:
 *            BUF_RETURNED       buffer is released with buf_brelse(),
 *                               iteration continues
 *            BUF_CLAIMED        buffer is left alone, iteration continues
 *            BUF_RETURNED_DONE  buffer is released, iteration stops
 *            BUF_CLAIMED_DONE   buffer is left alone, iteration stops
 *          any other value is treated like BUF_CLAIMED
 * flags    BUF_SKIP_LOCKED / BUF_SKIP_NONLOCKED are translated into the
 *          matching BAC_SKIP_* acquire flags
 * arg      passed through to callout
 *
 * Buffers are acquired with BAC_NOWAIT | BAC_REMOVE; a buffer that
 * cannot be acquired is skipped.  buf_mtxp is held while the lists are
 * manipulated and dropped around the call to callout.
 */
void
buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) {
        buf_t   bp;
        int     retval;
        struct  buflists local_iterblkhd;
        int     lock_flags = BAC_NOWAIT | BAC_REMOVE;

        if (flags & BUF_SKIP_LOCKED)
                lock_flags |= BAC_SKIP_LOCKED;
        if (flags & BUF_SKIP_NONLOCKED)
                lock_flags |= BAC_SKIP_NONLOCKED;

        lck_mtx_lock(buf_mtxp);

        /* a non-zero return from buf_iterprepare means there is nothing to walk */
        if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY))  {
                lck_mtx_unlock(buf_mtxp);
                return;
        }
        while (!LIST_EMPTY(&local_iterblkhd)) {
                /*
                 * put the buffer back on the vnode's dirty list
                 * before trying to acquire it
                 */
                bp = LIST_FIRST(&local_iterblkhd);
                LIST_REMOVE(bp, b_vnbufs);
                LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);

                if (buf_acquire_locked(bp, lock_flags, 0, 0))
                        continue;

                /* the callout runs without buf_mtxp held */
                lck_mtx_unlock(buf_mtxp);

                retval = callout(bp, arg);

                switch (retval) {
                case BUF_RETURNED:
                        buf_brelse(bp);
                        break;
                case BUF_CLAIMED:
                        break;
                case BUF_RETURNED_DONE:
                        buf_brelse(bp);
                        lck_mtx_lock(buf_mtxp);
                        goto out;
                case BUF_CLAIMED_DONE:
                        lck_mtx_lock(buf_mtxp);
                        goto out;
                }
                lck_mtx_lock(buf_mtxp);
        }
out:
        /* reached with buf_mtxp held on every path */
        buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);

        lck_mtx_unlock(buf_mtxp);
}
 962
 963
 964 /*
 965  * Flush out and invalidate all buffers associated with a vnode.
 966  */
 967 int
 968 buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
 969 {
 970         buf_t   bp;
 971         int     error = 0;
 972         int     must_rescan = 1;
 973         struct  buflists local_iterblkhd;
 974
 975         lck_mtx_lock(buf_mtxp);
 976
 977         for (;;) {
 978                 if (must_rescan == 0)
 979                         /*
 980                          * the lists may not be empty, but all that's left at this
 981                          * point are metadata or B_LOCKED buffers which are being
 982                          * skipped... we know this because we made it through both
 983                          * the clean and dirty lists without dropping buf_mtxp...
 984                          * each time we drop buf_mtxp we bump "must_rescan"
 985                          */
 986                         break;
 987                 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
 988                         break;
 989                 must_rescan = 0;
 990                 /*
 991                  * iterate the clean list
 992                  */
 993                 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
 994                         goto try_dirty_list;
 995                 }
 996                 while (!LIST_EMPTY(&local_iterblkhd)) {
 997                         bp = LIST_FIRST(&local_iterblkhd);
 998
 999                         LIST_REMOVE(bp, b_vnbufs);
1000                         LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
1001
1002                         /*
1003                          * some filesystems distinguish meta data blocks with a negative logical block #
1004                          */
1005                         if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1006                                 continue;
1007
1008                         if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
1009                                 if (error == EDEADLK)
1010                                         /*
1011                                          * this buffer was marked B_LOCKED...
1012                                          * we didn't drop buf_mtxp, so we
1013                                          * we don't need to rescan
1014                                          */
1015                                         continue;
1016                                 if (error == EAGAIN) {
1017                                         /*
1018                                          * found a busy buffer... we blocked and
1019                                          * dropped buf_mtxp, so we're going to
1020                                          * need to rescan after this pass is completed
1021                                          */
1022                                         must_rescan++;
1023                                         continue;
1024                                 }
1025                                 /*
1026                                  * got some kind of 'real' error out of the msleep
1027                                  * in buf_acquire_locked, terminate the scan and return the error
1028                                  */
1029                                 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1030
1031                                 lck_mtx_unlock(buf_mtxp);
1032                                 return (error);
1033                         }
1034                         lck_mtx_unlock(buf_mtxp);
1035
1036                         SET(bp->b_flags, B_INVAL);
1037                         buf_brelse(bp);
1038
1039                         lck_mtx_lock(buf_mtxp);
1040
1041                         /*
1042                          * by dropping buf_mtxp, we allow new
1043                          * buffers to be added to the vnode list(s)
1044                          * we'll have to rescan at least once more
1045                          * if the queues aren't empty
1046                          */
1047                         must_rescan++;
1048                 }
1049                 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1050
1051 try_dirty_list:
1052                 /*
1053                  * Now iterate on dirty blks
1054                  */
1055                 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1056                         continue;
1057                 }
1058                 while (!LIST_EMPTY(&local_iterblkhd)) {
1059                         bp = LIST_FIRST(&local_iterblkhd);
1060
1061                         LIST_REMOVE(bp, b_vnbufs);
1062                         LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1063
1064                         /*
1065                          * some filesystems distinguish meta data blocks with a negative logical block #
1066                          */
1067                         if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1068                                 continue;
1069
1070                         if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
1071                                 if (error == EDEADLK)
1072                                         /*
1073                                          * this buffer was marked B_LOCKED...
1074                                          * we didn't drop buf_mtxp, so we
1075                                          * we don't need to rescan
1076                                          */
1077                                         continue;
1078                                 if (error == EAGAIN) {
1079                                         /*
1080                                          * found a busy buffer... we blocked and
1081                                          * dropped buf_mtxp, so we're going to
1082                                          * need to rescan after this pass is completed
1083                                          */
1084                                         must_rescan++;
1085                                         continue;
1086                                 }
1087                                 /*
1088                                  * got some kind of 'real' error out of the msleep
1089                                  * in buf_acquire_locked, terminate the scan and return the error
1090                                  */
1091                                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1092
1093                                 lck_mtx_unlock(buf_mtxp);
1094                                 return (error);
1095                         }
1096                         lck_mtx_unlock(buf_mtxp);
1097
1098                         SET(bp->b_flags, B_INVAL);
1099
1100                         if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1101                                 (void) VNOP_BWRITE(bp);
1102                         else
1103                                 buf_brelse(bp);
1104
1105                         lck_mtx_lock(buf_mtxp);
1106                         /*
1107                          * by dropping buf_mtxp, we allow new
1108                          * buffers to be added to the vnode list(s)
1109                          * we'll have to rescan at least once more
1110                          * if the queues aren't empty
1111                          */
1112                         must_rescan++;
1113                 }
1114                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1115         }
1116         lck_mtx_unlock(buf_mtxp);
1117
1118         return (0);
1119 }
1120
1121 void
1122 buf_flushdirtyblks(vnode_t vp, int wait, int flags, char *msg) {
1123         buf_t   bp;
1124         int     writes_issued = 0;
1125         errno_t error;
1126         int     busy = 0;
1127         struct  buflists local_iterblkhd;
1128         int     lock_flags = BAC_NOWAIT | BAC_REMOVE;
1129
1130         if (flags & BUF_SKIP_LOCKED)
1131                 lock_flags |= BAC_SKIP_LOCKED;
1132         if (flags & BUF_SKIP_NONLOCKED)
1133                 lock_flags |= BAC_SKIP_NONLOCKED;
1134 loop:
1135         lck_mtx_lock(buf_mtxp);
1136
1137         if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0)  {
1138                 while (!LIST_EMPTY(&local_iterblkhd)) {
1139                         bp = LIST_FIRST(&local_iterblkhd);
1140                         LIST_REMOVE(bp, b_vnbufs);
1141                         LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1142
1143                         if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY)
1144                                 busy++;
1145                         if (error)
1146                                 continue;
1147                         lck_mtx_unlock(buf_mtxp);
1148
1149                         bp->b_flags &= ~B_LOCKED;
1150
1151                         /*
1152                          * Wait for I/O associated with indirect blocks to complete,
1153                          * since there is no way to quickly wait for them below.
1154                          */
1155                         if ((bp->b_vp == vp) || (wait == 0))
1156                                 (void) buf_bawrite(bp);
1157                         else
1158                                 (void) VNOP_BWRITE(bp);
1159                         writes_issued++;
1160
1161                         lck_mtx_lock(buf_mtxp);
1162                 }
1163                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1164         }
1165         lck_mtx_unlock(buf_mtxp);
1166
1167         if (wait) {
1168                 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1169
1170                 if (vp->v_dirtyblkhd.lh_first && busy) {
1171                         /*
1172                          * we had one or more BUSY buffers on
1173                          * the dirtyblock list... most likely
1174                          * these are due to delayed writes that
1175                          * were moved to the bclean queue but
1176                          * have not yet been 'written'.
1177                          * if we issued some writes on the
1178                          * previous pass, we try again immediately
1179                          * if we didn't, we'll sleep for some time
1180                          * to allow the state to change...
1181                          */
1182                         if (writes_issued == 0) {
1183                                 (void)tsleep((caddr_t)&vp->v_numoutput,
1184                                              PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1185                         }
1186                         writes_issued = 0;
1187                         busy = 0;
1188
1189                         goto loop;
1190                 }
1191         }
1192 }
1193
1194
1195 /*
1196  * called with buf_mtxp held...
1197  * this lock protects the queue manipulation
1198  */
1199 static int
1200 buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1201 {
1202         struct buflists * listheadp;
1203
1204         if (flags & VBI_DIRTY)
1205                 listheadp = &vp->v_dirtyblkhd;
1206         else
1207                 listheadp = &vp->v_cleanblkhd;
1208
1209         while (vp->v_iterblkflags & VBI_ITER)   {
1210                 vp->v_iterblkflags |= VBI_ITERWANT;
1211                 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", 0);
1212         }
1213         if (LIST_EMPTY(listheadp)) {
1214                 LIST_INIT(iterheadp);
1215                 return(EINVAL);
1216         }
1217         vp->v_iterblkflags |= VBI_ITER;
1218
1219         iterheadp->lh_first = listheadp->lh_first;
1220         listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1221         LIST_INIT(listheadp);
1222
1223         return(0);
1224 }
1225
1226 /*
1227  * called with buf_mtxp held...
1228  * this lock protects the queue manipulation
1229  */
1230 static void
1231 buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1232 {
1233         struct buflists * listheadp;
1234         buf_t bp;
1235
1236         if (flags & VBI_DIRTY)
1237                 listheadp = &vp->v_dirtyblkhd;
1238         else
1239                 listheadp = &vp->v_cleanblkhd;
1240
1241         while (!LIST_EMPTY(iterheadp)) {
1242                 bp = LIST_FIRST(iterheadp);
1243                 LIST_REMOVE(bp, b_vnbufs);
1244                 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1245         }
1246         vp->v_iterblkflags &= ~VBI_ITER;
1247
1248         if  (vp->v_iterblkflags & VBI_ITERWANT)         {
1249                 vp->v_iterblkflags &= ~VBI_ITERWANT;
1250                 wakeup(&vp->v_iterblkflags);
1251         }
1252 }
1253
1254
1255 static void
1256 bremfree_locked(buf_t bp)
1257 {
1258         struct bqueues *dp = NULL;
1259         int whichq = -1;
1260
1261         /*
1262          * We only calculate the head of the freelist when removing
1263          * the last element of the list as that is the only time that
1264          * it is needed (e.g. to reset the tail pointer).
1265          *
1266          * NB: This makes an assumption about how tailq's are implemented.
1267          */
1268         if (bp->b_freelist.tqe_next == NULL) {
1269                 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1270                         if (dp->tqh_last == &bp->b_freelist.tqe_next)
1271                                 break;
1272                 if (dp == &bufqueues[BQUEUES])
1273                         panic("bremfree: lost tail");
1274         }
1275         TAILQ_REMOVE(dp, bp, b_freelist);
1276         whichq = bp->b_whichq;
1277 #if BALANCE_QUEUES
1278         bufqdec(whichq);
1279 #endif
1280         bp->b_whichq = -1;
1281         bp->b_timestamp = 0;
1282 }
1283
1284 /*
1285  * Associate a buffer with a vnode.
1286  */
1287 static void
1288 bgetvp(vnode_t vp, buf_t bp)
1289 {
1290
1291         if (bp->b_vp != vp)
1292                 panic("bgetvp: not free");
1293
1294         if (vp->v_type == VBLK || vp->v_type == VCHR)
1295                 bp->b_dev = vp->v_rdev;
1296         else
1297                 bp->b_dev = NODEV;
1298         /*
1299          * Insert onto list for new vnode.
1300          */
1301         lck_mtx_lock(buf_mtxp);
1302         bufinsvn(bp, &vp->v_cleanblkhd);
1303         lck_mtx_unlock(buf_mtxp);
1304 }
1305
1306 /*
1307  * Disassociate a buffer from a vnode.
1308  */
1309 static void
1310 brelvp(buf_t bp)
1311 {
1312         vnode_t vp;
1313
1314         if ((vp = bp->b_vp) == (vnode_t)NULL)
1315                 panic("brelvp: NULL vp");
1316         /*
1317          * Delete from old vnode list, if on one.
1318          */
1319         lck_mtx_lock(buf_mtxp);
1320         if (bp->b_vnbufs.le_next != NOLIST)
1321                 bufremvn(bp);
1322         lck_mtx_unlock(buf_mtxp);
1323
1324         bp->b_vp = (vnode_t)NULL;
1325 }
1326
1327 /*
1328  * Reassign a buffer from one vnode to another.
1329  * Used to assign file specific control information
1330  * (indirect blocks) to the vnode to which they belong.
1331  */
1332 static void
1333 buf_reassign(buf_t bp, vnode_t newvp)
1334 {
1335         register struct buflists *listheadp;
1336
1337         if (newvp == NULL) {
1338                 printf("buf_reassign: NULL");
1339                 return;
1340         }
1341         lck_mtx_lock(buf_mtxp);
1342
1343         /*
1344          * Delete from old vnode list, if on one.
1345          */
1346         if (bp->b_vnbufs.le_next != NOLIST)
1347                 bufremvn(bp);
1348         /*
1349          * If dirty, put on list of dirty buffers;
1350          * otherwise insert onto list of clean buffers.
1351          */
1352         if (ISSET(bp->b_flags, B_DELWRI))
1353                 listheadp = &newvp->v_dirtyblkhd;
1354         else
1355                 listheadp = &newvp->v_cleanblkhd;
1356         bufinsvn(bp, listheadp);
1357
1358         lck_mtx_unlock(buf_mtxp);
1359 }
1360
1361 static __inline__ void
1362 bufhdrinit(buf_t bp)
1363 {
1364         bzero((char *)bp, sizeof *bp);
1365         bp->b_dev = NODEV;
1366         bp->b_rcred = NOCRED;
1367         bp->b_wcred = NOCRED;
1368         bp->b_vnbufs.le_next = NOLIST;
1369         bp->b_flags = B_INVAL;
1370
1371         return;
1372 }
1373
1374 /*
1375  * Initialize buffers and hash links for buffers.
1376  */
1377 __private_extern__ void
1378 bufinit()
1379 {
1380         buf_t   bp;
1381         struct bqueues *dp;
1382         int     i;
1383         int     metabuf;
1384         long    whichq;
1385
1386         nbuf = 0;
1387         /* Initialize the buffer queues ('freelists') and the hash table */
1388         for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1389                 TAILQ_INIT(dp);
1390         bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
1391
1392         metabuf = max_nbuf_headers/8; /* reserved for meta buf */
1393
1394         /* Initialize the buffer headers */
1395         for (i = 0; i < max_nbuf_headers; i++) {
1396                 nbuf++;
1397                 bp = &buf[i];
1398                 bufhdrinit(bp);
1399
1400                 /*
1401                  * metabuf buffer headers on the meta-data list and
1402                  * rest of the buffer headers on the empty list
1403                  */
1404                 if (--metabuf)
1405                         whichq = BQ_META;
1406                 else
1407                         whichq = BQ_EMPTY;
1408
1409                 BLISTNONE(bp);
1410                 dp = &bufqueues[whichq];
1411                 binsheadfree(bp, dp, whichq);
1412                 binshash(bp, &invalhash);
1413         }
1414
1415         boot_nbuf = nbuf;
1416
1417         for (; i < nbuf + niobuf; i++) {
1418                 bp = &buf[i];
1419                 bufhdrinit(bp);
1420                 binsheadfree(bp, &iobufqueue, -1);
1421         }
1422
1423     /*
1424          * allocate lock group attribute and group
1425          */
1426     buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1427         buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1428
1429         /*
1430          * allocate the lock attribute
1431          */
1432         buf_mtx_attr = lck_attr_alloc_init();
1433
1434         /*
1435          * allocate and initialize mutex's for the buffer and iobuffer pools
1436          */
1437         buf_mtxp        = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1438         iobuffer_mtxp   = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1439
1440         if (iobuffer_mtxp == NULL)
1441                 panic("couldn't create iobuffer mutex");
1442
1443         if (buf_mtxp == NULL)
1444                 panic("couldn't create buf mutex");
1445
1446         /*
1447          * allocate and initialize cluster specific global locks...
1448          */
1449         cluster_init();
1450
1451         printf("using %d buffer headers and %d cluster IO buffer headers\n",
1452                 nbuf, niobuf);
1453
1454         /* Set up zones used by the buffer cache */
1455         bufzoneinit();
1456
1457         /* start the bcleanbuf() thread */
1458         bcleanbuf_thread_init();
1459
1460 #if BALANCE_QUEUES
1461         {
1462         static void bufq_balance_thread_init();
1463         /* create a thread to do dynamic buffer queue balancing */
1464         bufq_balance_thread_init();
1465         }
1466 #endif /* notyet */
1467 }
1468
1469 static struct buf *
1470 bio_doread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, int async, int queuetype)
1471 {
1472         buf_t   bp;
1473
1474         bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
1475
1476         /*
1477          * If buffer does not have data valid, start a read.
1478          * Note that if buffer is B_INVAL, buf_getblk() won't return it.
1479          * Therefore, it's valid if it's I/O has completed or been delayed.
1480          */
1481         if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
1482                 struct proc *p;
1483
1484                 p = current_proc();
1485
1486                 /* Start I/O for the buffer (keeping credentials). */
1487                 SET(bp->b_flags, B_READ | async);
1488                 if (cred != NOCRED && bp->b_rcred == NOCRED) {
1489                         kauth_cred_ref(cred);
1490                         bp->b_rcred = cred;
1491                 }
1492
1493                 VNOP_STRATEGY(bp);
1494
1495                 trace(TR_BREADMISS, pack(vp, size), blkno);
1496
1497                 /* Pay for the read. */
1498                 if (p && p->p_stats)
1499                         p->p_stats->p_ru.ru_inblock++;          /* XXX */
1500
1501                 if (async) {
1502                         /*
1503                          * since we asked for an ASYNC I/O
1504                          * the biodone will do the brelse
1505                          * we don't want to pass back a bp
1506                          * that we don't 'own'
1507                          */
1508                         bp = NULL;
1509                 }
1510         } else if (async) {
1511                 buf_brelse(bp);
1512                 bp = NULL;
1513         }
1514
1515         trace(TR_BREADHIT, pack(vp, size), blkno);
1516
1517         return (bp);
1518 }
1519
1520 /*
1521  * Perform the reads for buf_breadn() and buf_meta_breadn().
1522  * Trivial modification to the breada algorithm presented in Bach (p.55).
1523  */
1524 static errno_t
1525 do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
1526                    int nrablks, ucred_t cred, buf_t *bpp, int queuetype)
1527 {
1528         buf_t   bp;
1529         int     i;
1530
1531         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
1532
1533         /*
1534          * For each of the read-ahead blocks, start a read, if necessary.
1535          */
1536         for (i = 0; i < nrablks; i++) {
1537                 /* If it's in the cache, just go on to next one. */
1538                 if (incore(vp, rablks[i]))
1539                         continue;
1540
1541                 /* Get a buffer for the read-ahead block */
1542                 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
1543         }
1544
1545         /* Otherwise, we had to start a read for it; wait until it's valid. */
1546         return (buf_biowait(bp));
1547 }
1548
1549
1550 /*
1551  * Read a disk block.
1552  * This algorithm described in Bach (p.54).
1553  */
1554 errno_t
1555 buf_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1556 {
1557         buf_t   bp;
1558
1559         /* Get buffer for block. */
1560         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
1561
1562         /* Wait for the read to complete, and return result. */
1563         return (buf_biowait(bp));
1564 }
1565
1566 /*
1567  * Read a disk block. [bread() for meta-data]
1568  * This algorithm described in Bach (p.54).
1569  */
1570 errno_t
1571 buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1572 {
1573         buf_t   bp;
1574
1575         /* Get buffer for block. */
1576         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
1577
1578         /* Wait for the read to complete, and return result. */
1579         return (buf_biowait(bp));
1580 }
1581
1582 /*
1583  * Read-ahead multiple disk blocks. The first is sync, the rest async.
1584  */
1585 errno_t
1586 buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1587 {
1588         return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
1589 }
1590
1591 /*
1592  * Read-ahead multiple disk blocks. The first is sync, the rest async.
1593  * [buf_breadn() for meta-data]
1594  */
1595 errno_t
1596 buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1597 {
1598         return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
1599 }
1600
1601 /*
1602  * Block write.  Described in Bach (p.56)
1603  */
1604 errno_t
1605 buf_bwrite(buf_t bp)
1606 {
1607         int     sync, wasdelayed;
1608         errno_t rv;
1609         proc_t  p = current_proc();
1610         vnode_t vp = bp->b_vp;
1611
1612         if (bp->b_datap == 0) {
1613                 if (brecover_data(bp) == 0)
1614                         return (0);
1615         }
1616         /* Remember buffer type, to switch on it later. */
1617         sync = !ISSET(bp->b_flags, B_ASYNC);
1618         wasdelayed = ISSET(bp->b_flags, B_DELWRI);
1619         CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
1620
1621         if (wasdelayed)
1622                 OSAddAtomic(-1, &nbdwrite);
1623
1624         if (!sync) {
1625                 /*
1626                  * If not synchronous, pay for the I/O operation and make
1627                  * sure the buf is on the correct vnode queue.  We have
1628                  * to do this now, because if we don't, the vnode may not
1629                  * be properly notified that its I/O has completed.
1630                  */
1631                 if (wasdelayed)
1632                         buf_reassign(bp, vp);
1633                 else
1634                 if (p && p->p_stats)
1635                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
1636         }
1637         trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
1638
1639         /* Initiate disk write.  Make sure the appropriate party is charged. */
1640
1641         OSAddAtomic(1, &vp->v_numoutput);
1642
1643         VNOP_STRATEGY(bp);
1644
1645         if (sync) {
1646                 /*
1647                  * If I/O was synchronous, wait for it to complete.
1648                  */
1649                 rv = buf_biowait(bp);
1650
1651                 /*
1652                  * Pay for the I/O operation, if it's not been paid for, and
1653                  * make sure it's on the correct vnode queue. (async operatings
1654                  * were payed for above.)
1655                  */
1656                 if (wasdelayed)
1657                         buf_reassign(bp, vp);
1658                 else
1659                 if (p && p->p_stats)
1660                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
1661
1662                 /* Release the buffer. */
1663                 // XXXdbg - only if the unused bit is set
1664                 if (!ISSET(bp->b_flags, B_NORELSE)) {
1665                     buf_brelse(bp);
1666                 } else {
1667                     CLR(bp->b_flags, B_NORELSE);
1668                 }
1669
1670                 return (rv);
1671         } else {
1672                 return (0);
1673         }
1674 }
1675
1676 int
1677 vn_bwrite(ap)
1678         struct vnop_bwrite_args *ap;
1679 {
1680         return (buf_bwrite(ap->a_bp));
1681 }
1682
1683 /*
1684  * Delayed write.
1685  *
1686  * The buffer is marked dirty, but is not queued for I/O.
1687  * This routine should be used when the buffer is expected
1688  * to be modified again soon, typically a small write that
1689  * partially fills a buffer.
1690  *
1691  * NB: magnetic tapes cannot be delayed; they must be
1692  * written in the order that the writes are requested.
1693  *
1694  * Described in Leffler, et al. (pp. 208-213).
1695  *
1696  * Note: With the abilitty to allocate additional buffer
1697  * headers, we can get in to the situation where "too" many
1698  * buf_bdwrite()s can create situation where the kernel can create
1699  * buffers faster than the disks can service. Doing a buf_bawrite() in
1700  * cases were we have "too many" outstanding buf_bdwrite()s avoids that.
1701  */
1702 __private_extern__ int
1703 bdwrite_internal(buf_t bp, int return_error)
1704 {
1705         proc_t  p  = current_proc();
1706         vnode_t vp = bp->b_vp;
1707
1708         /*
1709          * If the block hasn't been seen before:
1710          *      (1) Mark it as having been seen,
1711          *      (2) Charge for the write.
1712          *      (3) Make sure it's on its vnode's correct block list,
1713          */
1714         if (!ISSET(bp->b_flags, B_DELWRI)) {
1715                 SET(bp->b_flags, B_DELWRI);
1716                 if (p && p->p_stats)
1717                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
1718                 OSAddAtomic(1, &nbdwrite);
1719                 buf_reassign(bp, vp);
1720         }
1721
1722         /* If this is a tape block, write it the block now. */
1723         if (ISSET(bp->b_flags, B_TAPE)) {
1724                 VNOP_BWRITE(bp);
1725                 return (0);
1726         }
1727
1728         /*
1729          * if we're not LOCKED, but the total number of delayed writes
1730          * has climbed above 75% of the total buffers in the system
1731          * return an error if the caller has indicated that it can
1732          * handle one in this case, otherwise schedule the I/O now
1733          * this is done to prevent us from allocating tons of extra
1734          * buffers when dealing with virtual disks (i.e. DiskImages),
1735          * because additional buffers are dynamically allocated to prevent
1736          * deadlocks from occurring
1737          *
1738          * however, can't do a buf_bawrite() if the LOCKED bit is set because the
1739          * buffer is part of a transaction and can't go to disk until
1740          * the LOCKED bit is cleared.
1741          */
1742         if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
1743                 if (return_error)
1744                         return (EAGAIN);
1745                 /*
1746                  * If the vnode has "too many" write operations in progress
1747                  * wait for them to finish the IO
1748                  */
1749                 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (char *)"buf_bdwrite");
1750
1751                 return (buf_bawrite(bp));
1752         }
1753
1754         /* Otherwise, the "write" is done, so mark and release the buffer. */
1755         SET(bp->b_flags, B_DONE);
1756         buf_brelse(bp);
1757         return (0);
1758 }
1759
1760 errno_t
1761 buf_bdwrite(buf_t bp)
1762 {
1763         return (bdwrite_internal(bp, 0));
1764 }
1765
1766
1767 /*
1768  * Asynchronous block write; just an asynchronous buf_bwrite().
1769  *
1770  * Note: With the abilitty to allocate additional buffer
1771  * headers, we can get in to the situation where "too" many
1772  * buf_bawrite()s can create situation where the kernel can create
1773  * buffers faster than the disks can service.
1774  * We limit the number of "in flight" writes a vnode can have to
1775  * avoid this.
1776  */
1777 static int
1778 bawrite_internal(buf_t bp, int throttle)
1779 {
1780         vnode_t vp = bp->b_vp;
1781
1782         if (vp) {
1783                 if (throttle)
1784                         /*
1785                          * If the vnode has "too many" write operations in progress
1786                          * wait for them to finish the IO
1787                          */
1788                         (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
1789                 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
1790                         /*
1791                          * return to the caller and
1792                          * let him decide what to do
1793                          */
1794                         return (EWOULDBLOCK);
1795         }
1796         SET(bp->b_flags, B_ASYNC);
1797
1798         return (VNOP_BWRITE(bp));
1799 }
1800
1801 errno_t
1802 buf_bawrite(buf_t bp)
1803 {
1804         return (bawrite_internal(bp, 1));
1805 }
1806
1807
1808 /*
1809  * Release a buffer on to the free lists.
1810  * Described in Bach (p. 46).
1811  */
1812 void
1813 buf_brelse(buf_t bp)
1814 {
1815         struct bqueues *bufq;
1816         long    whichq;
1817         upl_t   upl;
1818         int need_wakeup = 0;
1819         int need_bp_wakeup = 0;
1820
1821
1822         if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
1823                 panic("buf_brelse: bad buffer = %x\n", bp);
1824
1825 #ifdef JOE_DEBUG
1826         bp->b_stackbrelse[0] = __builtin_return_address(0);
1827         bp->b_stackbrelse[1] = __builtin_return_address(1);
1828         bp->b_stackbrelse[2] = __builtin_return_address(2);
1829         bp->b_stackbrelse[3] = __builtin_return_address(3);
1830         bp->b_stackbrelse[4] = __builtin_return_address(4);
1831         bp->b_stackbrelse[5] = __builtin_return_address(5);
1832
1833         bp->b_lastbrelse = current_thread();
1834         bp->b_tag = 0;
1835 #endif
1836         if (bp->b_lflags & BL_IOBUF) {
1837                 free_io_buf(bp);
1838                 return;
1839         }
1840
1841         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
1842                      bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_datap,
1843                      bp->b_flags, 0);
1844
1845         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1846
1847         /*
1848          * if we're invalidating a buffer that has the B_FILTER bit
1849          * set then call the b_iodone function so it gets cleaned
1850          * up properly.
1851          *
1852          * the HFS journal code depends on this
1853          */
1854         if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
1855                 if (ISSET(bp->b_flags, B_FILTER)) {     /* if necessary, call out */
1856                         void    (*iodone_func)(struct buf *, void *) = bp->b_iodone;
1857                         void    *arg = (void *)bp->b_transaction;
1858
1859                         CLR(bp->b_flags, B_FILTER);     /* but note callout done */
1860                         bp->b_iodone = NULL;
1861                         bp->b_transaction = NULL;
1862
1863                         if (iodone_func == NULL) {
1864                                 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
1865                         }
1866                         (*iodone_func)(bp, arg);
1867                 }
1868         }
1869         /*
1870          * I/O is done. Cleanup the UPL state
1871          */
1872         upl = bp->b_upl;
1873
1874         if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
1875                 kern_return_t kret;
1876                 int           upl_flags;
1877
1878                 if ( (upl == NULL) ) {
1879                         if ( !ISSET(bp->b_flags, B_INVAL)) {
1880                                 kret = ubc_create_upl(bp->b_vp,
1881                                                       ubc_blktooff(bp->b_vp, bp->b_lblkno),
1882                                                       bp->b_bufsize,
1883                                                       &upl,
1884                                                       NULL,
1885                                                       UPL_PRECIOUS);
1886
1887                                 if (kret != KERN_SUCCESS)
1888                                         panic("brelse: Failed to create UPL");
1889 #ifdef  UPL_DEBUG
1890                                 upl_ubc_alias_set(upl, bp, 5);
1891 #endif /* UPL_DEBUG */
1892                         }
1893                 } else {
1894                         if (bp->b_datap) {
1895                                 kret = ubc_upl_unmap(upl);
1896
1897                                 if (kret != KERN_SUCCESS)
1898                                         panic("ubc_upl_unmap failed");
1899                                 bp->b_datap = (uintptr_t)NULL;
1900                         }
1901                 }
1902                 if (upl) {
1903                         if (bp->b_flags & (B_ERROR | B_INVAL)) {
1904                                 if (bp->b_flags & (B_READ | B_INVAL))
1905                                         upl_flags = UPL_ABORT_DUMP_PAGES;
1906                                 else
1907                                         upl_flags = 0;
1908
1909                                 ubc_upl_abort(upl, upl_flags);
1910                         } else {
1911                                 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
1912                                         upl_flags = UPL_COMMIT_SET_DIRTY ;
1913                                 else
1914                                         upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
1915
1916                                 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
1917                                                      UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1918                         }
1919                         bp->b_upl = NULL;
1920                 }
1921         } else {
1922                 if ( (upl) )
1923                         panic("brelse: UPL set for non VREG; vp=%x", bp->b_vp);
1924         }
1925
1926         /*
1927          * If it's locked, don't report an error; try again later.
1928          */
1929         if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
1930                 CLR(bp->b_flags, B_ERROR);
1931         /*
1932          * If it's not cacheable, or an error, mark it invalid.
1933          */
1934         if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
1935                 SET(bp->b_flags, B_INVAL);
1936
1937         if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
1938                 /*
1939                  * If it's invalid or empty, dissociate it from its vnode
1940                  * and put on the head of the appropriate queue.
1941                  */
1942                 if (bp->b_vp)
1943                         brelvp(bp);
1944
1945                 if (ISSET(bp->b_flags, B_DELWRI))
1946                         OSAddAtomic(-1, &nbdwrite);
1947
1948                 CLR(bp->b_flags, (B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE));
1949                 /*
1950                  * Determine which queue the buffer should be on, then put it there.
1951                  */
1952                 if (bp->b_bufsize <= 0)
1953                         whichq = BQ_EMPTY;      /* no data */
1954                 else if (ISSET(bp->b_flags, B_META))
1955                         whichq = BQ_META;               /* meta-data */
1956                 else
1957                         whichq = BQ_AGE;        /* invalid data */
1958                 bufq = &bufqueues[whichq];
1959
1960                 lck_mtx_lock(buf_mtxp);
1961
1962                 binsheadfree(bp, bufq, whichq);
1963         } else {
1964                 /*
1965                  * It has valid data.  Put it on the end of the appropriate
1966                  * queue, so that it'll stick around for as long as possible.
1967                  */
1968                 if (ISSET(bp->b_flags, B_LOCKED))
1969                         whichq = BQ_LOCKED;             /* locked in core */
1970                 else if (ISSET(bp->b_flags, B_META))
1971                         whichq = BQ_META;               /* meta-data */
1972                 else if (ISSET(bp->b_flags, B_AGE))
1973                         whichq = BQ_AGE;                /* stale but valid data */
1974                 else
1975                         whichq = BQ_LRU;                /* valid data */
1976                 bufq = &bufqueues[whichq];
1977
1978                 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
1979
1980                 lck_mtx_lock(buf_mtxp);
1981
1982                 binstailfree(bp, bufq, whichq);
1983         }
1984         if (needbuffer) {
1985                 /*
1986                  * needbuffer is a global
1987                  * we're currently using buf_mtxp to protect it
1988                  * delay doing the actual wakeup until after
1989                  * we drop buf_mtxp
1990                  */
1991                 needbuffer = 0;
1992                 need_wakeup = 1;
1993         }
1994         if (ISSET(bp->b_lflags, BL_WANTED)) {
1995                 /*
1996                  * delay the actual wakeup until after we
1997                  * clear BL_BUSY and we've dropped buf_mtxp
1998                  */
1999                 need_bp_wakeup = 1;
2000         }
2001         /*
2002          * Unlock the buffer.
2003          */
2004         CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2005
2006         lck_mtx_unlock(buf_mtxp);
2007
2008         if (need_wakeup) {
2009                 /*
2010                  * Wake up any processes waiting for any buffer to become free.
2011                  */
2012                 wakeup(&needbuffer);
2013         }
2014         if (need_bp_wakeup) {
2015                 /*
2016                  * Wake up any proceeses waiting for _this_ buffer to become free.
2017                  */
2018                 wakeup(bp);
2019         }
2020         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2021                      (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
2022 }
2023
2024 /*
2025  * Determine if a block is in the cache.
2026  * Just look on what would be its hash chain.  If it's there, return
2027  * a pointer to it, unless it's marked invalid.  If it's marked invalid,
2028  * we normally don't return the buffer, unless the caller explicitly
2029  * wants us to.
2030  */
2031 static boolean_t
2032 incore(vnode_t vp, daddr64_t blkno)
2033 {
2034         boolean_t retval;
2035
2036         lck_mtx_lock(buf_mtxp);
2037
2038         if (incore_locked(vp, blkno))
2039                 retval = TRUE;
2040         else
2041                 retval = FALSE;
2042         lck_mtx_unlock(buf_mtxp);
2043
2044         return (retval);
2045 }
2046
2047
2048 static buf_t
2049 incore_locked(vnode_t vp, daddr64_t blkno)
2050 {
2051         struct buf *bp;
2052
2053         bp = BUFHASH(vp, blkno)->lh_first;
2054
2055         /* Search hash chain */
2056         for (; bp != NULL; bp = bp->b_hash.le_next) {
2057                 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2058                     !ISSET(bp->b_flags, B_INVAL)) {
2059                         return (bp);
2060                 }
2061         }
2062         return (0);
2063 }
2064
2065
2066 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2067 /*
2068  * Get a block of requested size that is associated with
2069  * a given vnode and block offset. If it is found in the
2070  * block cache, mark it as having been found, make it busy
2071  * and return it. Otherwise, return an empty block of the
2072  * correct size. It is up to the caller to insure that the
2073  * cached blocks be of the correct size.
2074  */
2075 buf_t
2076 buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2077 {
2078         buf_t bp;
2079         int   err;
2080         upl_t upl;
2081         upl_page_info_t *pl;
2082         kern_return_t kret;
2083         int ret_only_valid;
2084         struct timespec ts;
2085         int upl_flags;
2086
2087         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2088                      (int)(blkno * PAGE_SIZE), size, operation, 0, 0);
2089
2090         ret_only_valid = operation & BLK_ONLYVALID;
2091         operation &= ~BLK_ONLYVALID;
2092 start:
2093         lck_mtx_lock(buf_mtxp);
2094 start_locked:
2095         if ((bp = incore_locked(vp, blkno))) {
2096                 /*
2097                  * Found in the Buffer Cache
2098                  */
2099                 if (ISSET(bp->b_lflags, BL_BUSY)) {
2100                         /*
2101                          * but is busy
2102                          */
2103                         switch (operation) {
2104                         case BLK_READ:
2105                         case BLK_WRITE:
2106                         case BLK_META:
2107                                 SET(bp->b_lflags, BL_WANTED);
2108                                 bufstats.bufs_busyincore++;
2109
2110                                 /*
2111                                  * don't retake the mutex after being awakened...
2112                                  * the time out is in msecs
2113                                  */
2114                                 ts.tv_sec = (slptimeo/1000);
2115                                 ts.tv_nsec = (slptimeo % 1000) * 10  * NSEC_PER_USEC * 1000;
2116
2117                                 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2118
2119                                 /*
2120                                  * Callers who call with PCATCH or timeout are
2121                                  * willing to deal with the NULL pointer
2122                                  */
2123                                 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2124                                         return (NULL);
2125                                 goto start;
2126                                 /*NOTREACHED*/
2127                                 break;
2128
2129                         default:
2130                                 /*
2131                                  * unknown operation requested
2132                                  */
2133                                 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2134                                 /*NOTREACHED*/
2135                                 break;
2136                         }
2137                 } else {
2138                         /*
2139                          * buffer in core and not busy
2140                          */
2141                         if ( (bp->b_upl) )
2142                                 panic("buffer has UPL, but not marked BUSY: %x", bp);
2143                         SET(bp->b_lflags, BL_BUSY);
2144                         SET(bp->b_flags, B_CACHE);
2145 #ifdef JOE_DEBUG
2146                         bp->b_owner = current_thread();
2147                         bp->b_tag   = 1;
2148 #endif
2149                         bremfree_locked(bp);
2150                         bufstats.bufs_incore++;
2151
2152                         lck_mtx_unlock(buf_mtxp);
2153
2154                         if ( !ret_only_valid)
2155                                 allocbuf(bp, size);
2156
2157                         upl_flags = 0;
2158                         switch (operation) {
2159                         case BLK_WRITE:
2160                                 /*
2161                                  * "write" operation:  let the UPL subsystem
2162                                  * know that we intend to modify the buffer
2163                                  * cache pages we're gathering.
2164                                  */
2165                                 upl_flags |= UPL_WILL_MODIFY;
2166                         case BLK_READ:
2167                                 upl_flags |= UPL_PRECIOUS;
2168                                 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2169                                         kret = ubc_create_upl(vp,
2170                                                               ubc_blktooff(vp, bp->b_lblkno),
2171                                                               bp->b_bufsize,
2172                                                               &upl,
2173                                                               &pl,
2174                                                               upl_flags);
2175                                         if (kret != KERN_SUCCESS)
2176                                                 panic("Failed to create UPL");
2177
2178                                         bp->b_upl = upl;
2179
2180                                         if (upl_valid_page(pl, 0)) {
2181                                                 if (upl_dirty_page(pl, 0))
2182                                                         SET(bp->b_flags, B_WASDIRTY);
2183                                                 else
2184                                                         CLR(bp->b_flags, B_WASDIRTY);
2185                                         } else
2186                                                 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
2187
2188                                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2189
2190                                         if (kret != KERN_SUCCESS)
2191                                                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2192                                 }
2193                                 break;
2194
2195                         case BLK_META:
2196                                 /*
2197                                  * VM is not involved in IO for the meta data
2198                                  * buffer already has valid data
2199                                  */
2200                                 break;
2201
2202                         default:
2203                                 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
2204                                 /*NOTREACHED*/
2205                                 break;
2206                         }
2207                 }
2208         } else { /* not incore() */
2209                 int queue = BQ_EMPTY; /* Start with no preference */
2210
2211                 if (ret_only_valid) {
2212                         lck_mtx_unlock(buf_mtxp);
2213                         return (NULL);
2214                 }
2215
2216                 if ((UBCINVALID(vp)) || !(UBCINFOEXISTS(vp)))
2217                         operation = BLK_META;
2218
2219                 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
2220                         goto start_locked;
2221
2222                 /*
2223                  * getnewbuf may block for a number of different reasons...
2224                  * if it does, it's then possible for someone else to
2225                  * create a buffer for the same block and insert it into
2226                  * the hash... if we see it incore at this point we dump
2227                  * the buffer we were working on and start over
2228                  */
2229                 if (incore_locked(vp, blkno)) {
2230                         SET(bp->b_flags, B_INVAL);
2231                         binshash(bp, &invalhash);
2232
2233                         lck_mtx_unlock(buf_mtxp);
2234
2235                         buf_brelse(bp);
2236                         goto start;
2237                 }
2238                 /*
2239                  * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
2240                  *       CALLED!  BE CAREFUL.
2241                  */
2242
2243                 /*
2244                  * mark the buffer as B_META if indicated
2245                  * so that when buffer is released it will goto META queue
2246                  */
2247                 if (operation == BLK_META)
2248                         SET(bp->b_flags, B_META);
2249
2250                 bp->b_blkno = bp->b_lblkno = blkno;
2251                 bp->b_vp = vp;
2252
2253                 /*
2254                  * Insert in the hash so that incore() can find it
2255                  */
2256                 binshash(bp, BUFHASH(vp, blkno));
2257
2258                 lck_mtx_unlock(buf_mtxp);
2259
2260                 bgetvp(vp, bp);
2261
2262                 allocbuf(bp, size);
2263
2264                 upl_flags = 0;
2265                 switch (operation) {
2266                 case BLK_META:
2267                         /*
2268                          * buffer data is invalid...
2269                          *
2270                          * I don't want to have to retake buf_mtxp,
2271                          * so the miss and vmhits counters are done
2272                          * with Atomic updates... all other counters
2273                          * in bufstats are protected with either
2274                          * buf_mtxp or iobuffer_mtxp
2275                          */
2276                         OSAddAtomic(1, &bufstats.bufs_miss);
2277                         break;
2278
2279                 case BLK_WRITE:
2280                         /*
2281                          * "write" operation:  let the UPL subsystem know
2282                          * that we intend to modify the buffer cache pages
2283                          * we're gathering.
2284                          */
2285                         upl_flags |= UPL_WILL_MODIFY;
2286                 case BLK_READ:
2287                   {     off_t   f_offset;
2288                         size_t  contig_bytes;
2289                         int     bmap_flags;
2290
2291                         if ( (bp->b_upl) )
2292                                 panic("bp already has UPL: %x",bp);
2293
2294                         f_offset = ubc_blktooff(vp, blkno);
2295
2296                         upl_flags |= UPL_PRECIOUS;
2297                         kret = ubc_create_upl(vp,
2298                                               f_offset,
2299                                               bp->b_bufsize,
2300                                               &upl,
2301                                               &pl,
2302                                               upl_flags);
2303
2304                         if (kret != KERN_SUCCESS)
2305                                 panic("Failed to create UPL");
2306 #ifdef  UPL_DEBUG
2307                         upl_ubc_alias_set(upl, bp, 4);
2308 #endif /* UPL_DEBUG */
2309                         bp->b_upl = upl;
2310
2311                         if (upl_valid_page(pl, 0)) {
2312
2313                                 if (operation == BLK_READ)
2314                                         bmap_flags = VNODE_READ;
2315                                 else
2316                                         bmap_flags = VNODE_WRITE;
2317
2318                                 SET(bp->b_flags, B_CACHE | B_DONE);
2319
2320                                 OSAddAtomic(1, &bufstats.bufs_vmhits);
2321
2322                                 bp->b_validoff = 0;
2323                                 bp->b_dirtyoff = 0;
2324
2325                                 if (upl_dirty_page(pl, 0)) {
2326                                         /* page is dirty */
2327                                         SET(bp->b_flags, B_WASDIRTY);
2328
2329                                         bp->b_validend = bp->b_bcount;
2330                                         bp->b_dirtyend = bp->b_bcount;
2331                                 } else {
2332                                         /* page is clean */
2333                                         bp->b_validend = bp->b_bcount;
2334                                         bp->b_dirtyend = 0;
2335                                 }
2336                                 /*
2337                                  * try to recreate the physical block number associated with
2338                                  * this buffer...
2339                                  */
2340                                 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
2341                                         panic("getblk: VNOP_BLOCKMAP failed");
2342                                 /*
2343                                  * if the extent represented by this buffer
2344                                  * is not completely physically contiguous on
2345                                  * disk, than we can't cache the physical mapping
2346                                  * in the buffer header
2347                                  */
2348                                 if ((long)contig_bytes < bp->b_bcount)
2349                                         bp->b_blkno = bp->b_lblkno;
2350                         } else {
2351                                 OSAddAtomic(1, &bufstats.bufs_miss);
2352                         }
2353                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2354
2355                         if (kret != KERN_SUCCESS)
2356                                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2357                         break;
2358                   }
2359                 default:
2360                         panic("getblk: paging or unknown operation - %x", operation);
2361                         /*NOTREACHED*/
2362                         break;
2363                 }
2364         }
2365         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
2366                      (int)bp, (int)bp->b_datap, bp->b_flags, 3, 0);
2367
2368 #ifdef JOE_DEBUG
2369         bp->b_stackgetblk[0] = __builtin_return_address(0);
2370         bp->b_stackgetblk[1] = __builtin_return_address(1);
2371         bp->b_stackgetblk[2] = __builtin_return_address(2);
2372         bp->b_stackgetblk[3] = __builtin_return_address(3);
2373         bp->b_stackgetblk[4] = __builtin_return_address(4);
2374         bp->b_stackgetblk[5] = __builtin_return_address(5);
2375 #endif
2376         return (bp);
2377 }
2378
2379 /*
2380  * Get an empty, disassociated buffer of given size.
2381  */
2382 buf_t
2383 buf_geteblk(size)
2384         int size;
2385 {
2386         buf_t   bp;
2387         int queue = BQ_EMPTY;
2388
2389         lck_mtx_lock(buf_mtxp);
2390
2391         while ((bp = getnewbuf(0, 0, &queue)) == 0)
2392                 ;
2393         SET(bp->b_flags, (B_META|B_INVAL));
2394
2395 #if DIAGNOSTIC
2396         assert(queue == BQ_EMPTY);
2397 #endif /* DIAGNOSTIC */
2398         /* XXX need to implement logic to deal with other queues */
2399
2400         binshash(bp, &invalhash);
2401         bufstats.bufs_eblk++;
2402
2403         lck_mtx_unlock(buf_mtxp);
2404
2405         allocbuf(bp, size);
2406
2407         return (bp);
2408 }
2409
2410 /*
2411  * Zones for the meta data buffers
2412  */
2413
2414 #define MINMETA 512
2415 #define MAXMETA 4096
2416
2417 struct meta_zone_entry {
2418         zone_t mz_zone;
2419         vm_size_t mz_size;
2420         vm_size_t mz_max;
2421         char *mz_name;
2422 };
2423
2424 struct meta_zone_entry meta_zones[] = {
2425         {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2426         {NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
2427         {NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
2428         {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2429         {NULL, 0, 0, "" } /* End */
2430 };
2431
2432 /*
2433  * Initialize the meta data zones
2434  */
2435 static void
2436 bufzoneinit(void)
2437 {
2438         int i;
2439
2440         for (i = 0; meta_zones[i].mz_size != 0; i++) {
2441                 meta_zones[i].mz_zone =
2442                                 zinit(meta_zones[i].mz_size,
2443                                         meta_zones[i].mz_max,
2444                                         PAGE_SIZE,
2445                                         meta_zones[i].mz_name);
2446         }
2447         buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2448 }
2449
2450 static __inline__ zone_t
2451 getbufzone(size_t size)
2452 {
2453         int i;
2454
2455         if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2456                 panic("getbufzone: incorect size = %d", size);
2457
2458         for (i = 0; meta_zones[i].mz_size != 0; i++) {
2459                 if (meta_zones[i].mz_size >= size)
2460                         break;
2461         }
2462
2463         return (meta_zones[i].mz_zone);
2464 }
2465
2466 /*
2467  * With UBC, there is no need to expand / shrink the file data
2468  * buffer. The VM uses the same pages, hence no waste.
2469  * All the file data buffers can have one size.
2470  * In fact expand / shrink would be an expensive operation.
2471  *
2472  * Only exception to this is meta-data buffers. Most of the
2473  * meta data operations are smaller than PAGE_SIZE. Having the
2474  * meta-data buffers grow and shrink as needed, optimizes use
2475  * of the kernel wired memory.
2476  */
2477
2478 int
2479 allocbuf(buf_t bp, int size)
2480 {
2481         vm_size_t desired_size;
2482
2483         desired_size = roundup(size, CLBYTES);
2484
2485         if (desired_size < PAGE_SIZE)
2486                 desired_size = PAGE_SIZE;
2487         if (desired_size > MAXBSIZE)
2488                 panic("allocbuf: buffer larger than MAXBSIZE requested");
2489
2490         if (ISSET(bp->b_flags, B_META)) {
2491                 zone_t zprev, z;
2492                 int    nsize = roundup(size, MINMETA);
2493
2494                 if (bp->b_datap) {
2495                         vm_offset_t elem = (vm_offset_t)bp->b_datap;
2496
2497                         if (ISSET(bp->b_flags, B_ZALLOC)) {
2498                                 if (bp->b_bufsize < nsize) {
2499                                         /* reallocate to a bigger size */
2500
2501                                         zprev = getbufzone(bp->b_bufsize);
2502                                         if (nsize <= MAXMETA) {
2503                                                 desired_size = nsize;
2504                                                 z = getbufzone(nsize);
2505                                                 bp->b_datap = (uintptr_t)zalloc(z);
2506                                         } else {
2507                                                 bp->b_datap = (uintptr_t)NULL;
2508                                                 kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2509                                                 CLR(bp->b_flags, B_ZALLOC);
2510                                         }
2511                                         bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2512                                         zfree(zprev, (void *)elem);
2513                                 } else {
2514                                         desired_size = bp->b_bufsize;
2515                                 }
2516
2517                         } else {
2518                                 if ((vm_size_t)bp->b_bufsize < desired_size) {
2519                                         /* reallocate to a bigger size */
2520                                         bp->b_datap = (uintptr_t)NULL;
2521                                         kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2522                                         bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2523                                         kmem_free(kernel_map, elem, bp->b_bufsize);
2524                                 } else {
2525                                         desired_size = bp->b_bufsize;
2526                                 }
2527                         }
2528                 } else {
2529                         /* new allocation */
2530                         if (nsize <= MAXMETA) {
2531                                 desired_size = nsize;
2532                                 z = getbufzone(nsize);
2533                                 bp->b_datap = (uintptr_t)zalloc(z);
2534                                 SET(bp->b_flags, B_ZALLOC);
2535                         } else
2536                                 kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2537                 }
2538         }
2539         bp->b_bufsize = desired_size;
2540         bp->b_bcount = size;
2541
2542         return (0);
2543 }
2544
2545 /*
2546  *      Get a new buffer from one of the free lists.
2547  *
 *      The requested queue is passed in; the queue from which the buffer was
 *      actually taken is returned. Out of range queue requests get BQ_EMPTY.
 *      A request for BQUEUES means no preference. Use heuristics in that case.
2551  *      Heuristics is as follows:
2552  *      Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
2553  *      If none available block till one is made available.
2554  *      If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
2555  *      Pick the most stale buffer.
2556  *      If found buffer was marked delayed write, start the async. write
2557  *      and restart the search.
2558  *      Initialize the fields and disassociate the buffer from the vnode.
2559  *      Remove the buffer from the hash. Return the buffer and the queue
2560  *      on which it was found.
2561  *
2562  *      buf_mtxp is held upon entry
2563  *      returns with buf_mtxp locked
2564  */
2565
2566 static buf_t
2567 getnewbuf(int slpflag, int slptimeo, int * queue)
2568 {
2569         buf_t   bp;
2570         buf_t   lru_bp;
2571         buf_t   age_bp;
2572         buf_t   meta_bp;
2573         int     age_time, lru_time, bp_time, meta_time;
2574         int     req = *queue;   /* save it for restarts */
2575         struct timespec ts;
2576
2577 start:
2578         /*
2579          * invalid request gets empty queue
2580          */
2581         if ((*queue > BQUEUES) || (*queue < 0)
2582                 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
2583                 *queue = BQ_EMPTY;
2584         /* need to grow number of bufs, add another one rather than recycling */
2585         if (nbuf < max_nbuf_headers) {
2586                 /*
2587                  * Increment  count now as lock
2588                  * is dropped for allocation.
2589                  * That avoids over commits
2590                  */
2591                 nbuf++;
2592                 goto add_newbufs;
2593         }
2594
2595         /*
2596          * (*queue == BQUEUES) means no preference
2597          */
2598         if (*queue != BQUEUES) {
2599                 /* Try for the requested queue first */
2600                 bp = bufqueues[*queue].tqh_first;
2601                 if (bp)
2602                         goto found;
2603         }
2604
2605         /* Unable to use requested queue */
2606         age_bp = bufqueues[BQ_AGE].tqh_first;
2607         lru_bp = bufqueues[BQ_LRU].tqh_first;
2608         meta_bp = bufqueues[BQ_META].tqh_first;
2609
2610         if (!age_bp && !lru_bp && !meta_bp) {
2611                 /*
2612                  * Unavailble on AGE or LRU or META queues
2613                  * Try the empty list first
2614                  */
2615                 bp = bufqueues[BQ_EMPTY].tqh_first;
2616                 if (bp) {
2617                         *queue = BQ_EMPTY;
2618                         goto found;
2619                 }
2620                 /*
2621                  * We have seen is this is hard to trigger.
2622                  * This is an overcommit of nbufs but needed
2623                  * in some scenarios with diskiamges
2624                  */
2625
2626 add_newbufs:
2627                 lck_mtx_unlock(buf_mtxp);
2628
2629                 /* Create a new temporary buffer header */
2630                 bp = (struct buf *)zalloc(buf_hdr_zone);
2631
2632                 lck_mtx_lock(buf_mtxp);
2633
2634                 if (bp) {
2635                         bufhdrinit(bp);
2636                         BLISTNONE(bp);
2637                         binshash(bp, &invalhash);
2638                         SET(bp->b_flags, B_HDRALLOC);
2639                         *queue = BQ_EMPTY;
2640                         binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2641                         buf_hdr_count++;
2642                         goto found;
2643                 }
2644                 /* subtract already accounted bufcount */
2645                 nbuf--;
2646
2647                 bufstats.bufs_sleeps++;
2648
2649                 /* wait for a free buffer of any kind */
2650                 needbuffer = 1;
2651                 /* hz value is 100 */
2652                 ts.tv_sec = (slptimeo/1000);
2653                 /* the hz value is 100; which leads to 10ms */
2654                 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
2655                 msleep(&needbuffer, buf_mtxp, slpflag|(PRIBIO+1), (char *)"getnewbuf", &ts);
2656                 return (0);
2657         }
2658
2659         /* Buffer available either on AGE or LRU or META */
2660         bp = NULL;
2661         *queue = -1;
2662
2663         /* Buffer available either on AGE or LRU */
2664         if (!age_bp) {
2665                 bp = lru_bp;
2666                 *queue = BQ_LRU;
2667         } else if (!lru_bp) {
2668                 bp = age_bp;
2669                 *queue = BQ_AGE;
2670         } else { /* buffer available on both AGE and LRU */
2671                 int             t = buf_timestamp();
2672
2673                 age_time = t - age_bp->b_timestamp;
2674                 lru_time = t - lru_bp->b_timestamp;
2675                 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
2676                         bp = age_bp;
2677                         *queue = BQ_AGE;
2678                         /*
2679                          * we should probably re-timestamp eveything in the
2680                          * queues at this point with the current time
2681                          */
2682                 } else {
2683                         if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
2684                                 bp = lru_bp;
2685                                 *queue = BQ_LRU;
2686                         } else {
2687                                 bp = age_bp;
2688                                 *queue = BQ_AGE;
2689                         }
2690                 }
2691         }
2692
2693         if (!bp) { /* Neither on AGE nor on LRU */
2694                 bp = meta_bp;
2695                 *queue = BQ_META;
2696         }  else if (meta_bp) {
2697                 int             t = buf_timestamp();
2698
2699                 bp_time = t - bp->b_timestamp;
2700                 meta_time = t - meta_bp->b_timestamp;
2701
2702                 if (!(bp_time < 0) && !(meta_time < 0)) {
2703                         /* time not set backwards */
2704                         int bp_is_stale;
2705                         bp_is_stale = (*queue == BQ_LRU) ?
2706                                         lru_is_stale : age_is_stale;
2707
2708                         if ((meta_time >= meta_is_stale) &&
2709                                         (bp_time < bp_is_stale)) {
2710                                 bp = meta_bp;
2711                                 *queue = BQ_META;
2712                         }
2713                 }
2714         }
2715 found:
2716         if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
2717                 panic("getnewbuf: bp @ 0x%x is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
2718
2719         /* Clean it */
2720         if (bcleanbuf(bp)) {
2721                 /*
2722                  * moved to the laundry thread, buffer not ready
2723                  */
2724                 *queue = req;
2725                 goto start;
2726         }
2727         return (bp);
2728 }
2729
2730
2731 /*
2732  * Clean a buffer.
2733  * Returns 0 is buffer is ready to use,
2734  * Returns 1 if issued a buf_bawrite() to indicate
2735  * that the buffer is not ready.
2736  *
2737  * buf_mtxp is held upon entry
2738  * returns with buf_mtxp locked
2739  */
2740 static int
2741 bcleanbuf(buf_t bp)
2742 {
2743         ucred_t cred;
2744
2745
2746         /* Remove from the queue */
2747         bremfree_locked(bp);
2748
2749         /* Buffer is no longer on free lists. */
2750         SET(bp->b_lflags, BL_BUSY);
2751 #ifdef JOE_DEBUG
2752         bp->b_owner = current_thread();
2753         bp->b_tag   = 2;
2754 #endif
2755         /*
2756          * If buffer was a delayed write, start the IO by queuing
2757          * it on the LAUNDRY queue, and return 1
2758          */
2759         if (ISSET(bp->b_flags, B_DELWRI)) {
2760                 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2761                 blaundrycnt++;
2762
2763                 lck_mtx_unlock(buf_mtxp);
2764
2765                 wakeup(&blaundrycnt);
2766                 /* and give it a chance to run */
2767                 (void)thread_block(THREAD_CONTINUE_NULL);
2768
2769                 lck_mtx_lock(buf_mtxp);
2770                 return (1);
2771         }
2772         bremhash(bp);
2773
2774         lck_mtx_unlock(buf_mtxp);
2775
2776         BLISTNONE(bp);
2777         /*
2778          * disassociate us from our vnode, if we had one...
2779          */
2780         if (bp->b_vp)
2781                 brelvp(bp);
2782
2783         if (ISSET(bp->b_flags, B_META)) {
2784                 vm_offset_t elem;
2785
2786                 elem = (vm_offset_t)bp->b_datap;
2787                 bp->b_datap = (uintptr_t)0xdeadbeef;
2788
2789                 if (ISSET(bp->b_flags, B_ZALLOC)) {
2790                         zone_t z;
2791
2792                         z = getbufzone(bp->b_bufsize);
2793                         zfree(z, (void *)elem);
2794                 } else
2795                         kmem_free(kernel_map, elem, bp->b_bufsize);
2796         }
2797
2798         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2799
2800         /* clear out various other fields */
2801         bp->b_bufsize = 0;
2802         bp->b_datap = (uintptr_t)NULL;
2803         bp->b_upl = (void *)NULL;
2804         /*
2805          * preserve the state of whether this buffer
2806          * was allocated on the fly or not...
2807          * the only other flag that should be set at
2808          * this point is BL_BUSY...
2809          */
2810 #ifdef JOE_DEBUG
2811         bp->b_owner = current_thread();
2812         bp->b_tag   = 3;
2813 #endif
2814         bp->b_lflags = BL_BUSY;
2815         bp->b_flags = (bp->b_flags & B_HDRALLOC);
2816         bp->b_dev = NODEV;
2817         bp->b_blkno = bp->b_lblkno = 0;
2818         bp->b_iodone = NULL;
2819         bp->b_error = 0;
2820         bp->b_resid = 0;
2821         bp->b_bcount = 0;
2822         bp->b_dirtyoff = bp->b_dirtyend = 0;
2823         bp->b_validoff = bp->b_validend = 0;
2824
2825         /* nuke any credentials we were holding */
2826         cred = bp->b_rcred;
2827         if (cred != NOCRED) {
2828                 bp->b_rcred = NOCRED;
2829                 kauth_cred_rele(cred);
2830         }
2831         cred = bp->b_wcred;
2832         if (cred != NOCRED) {
2833                 bp->b_wcred = NOCRED;
2834                 kauth_cred_rele(cred);
2835         }
2836         lck_mtx_lock(buf_mtxp);
2837
2838         return (0);
2839 }
2840
2841
2842
2843 errno_t
2844 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
2845 {
2846         buf_t   bp;
2847         errno_t error;
2848
2849         lck_mtx_lock(buf_mtxp);
2850 relook:
2851         if ((bp = incore_locked(vp, lblkno)) == (struct buf *)0) {
2852                 lck_mtx_unlock(buf_mtxp);
2853                 return (0);
2854         }
2855         if (ISSET(bp->b_lflags, BL_BUSY)) {
2856                 if ( !ISSET(flags, BUF_WAIT)) {
2857                         lck_mtx_unlock(buf_mtxp);
2858                         return (EBUSY);
2859                 }
2860                 SET(bp->b_lflags, BL_WANTED);
2861
2862                 error = msleep((caddr_t)bp, buf_mtxp, (PRIBIO + 1), (char *)"buf_invalblkno", 0);
2863
2864                 if (error)
2865                         return (error);
2866                 goto relook;
2867         }
2868         bremfree_locked(bp);
2869         SET(bp->b_lflags, BL_BUSY);
2870         SET(bp->b_flags, B_INVAL);
2871 #ifdef JOE_DEBUG
2872         bp->b_owner = current_thread();
2873         bp->b_tag   = 4;
2874 #endif
2875         lck_mtx_unlock(buf_mtxp);
2876         buf_brelse(bp);
2877
2878         return (0);
2879 }
2880
2881
2882 void
2883 buf_drop(buf_t bp)
2884 {
2885         int need_wakeup = 0;
2886
2887         lck_mtx_lock(buf_mtxp);
2888
2889         if (ISSET(bp->b_lflags, BL_WANTED)) {
2890                 /*
2891                  * delay the actual wakeup until after we
2892                  * clear BL_BUSY and we've dropped buf_mtxp
2893                  */
2894                 need_wakeup = 1;
2895         }
2896         /*
2897          * Unlock the buffer.
2898          */
2899         CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2900
2901         lck_mtx_unlock(buf_mtxp);
2902
2903         if (need_wakeup) {
2904                 /*
2905                  * Wake up any proceeses waiting for _this_ buffer to become free.
2906                  */
2907                 wakeup(bp);
2908         }
2909 }
2910
2911
2912 errno_t
2913 buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
2914         errno_t error;
2915
2916         lck_mtx_lock(buf_mtxp);
2917
2918         error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
2919
2920         lck_mtx_unlock(buf_mtxp);
2921
2922         return (error);
2923 }
2924
2925
2926 static errno_t
2927 buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
2928 {
2929         errno_t error;
2930         struct timespec ts;
2931
2932         if (ISSET(bp->b_flags, B_LOCKED)) {
2933                 if ((flags & BAC_SKIP_LOCKED))
2934                         return (EDEADLK);
2935         } else {
2936                 if ((flags & BAC_SKIP_NONLOCKED))
2937                         return (EDEADLK);
2938         }
2939         if (ISSET(bp->b_lflags, BL_BUSY)) {
2940                 /*
2941                  * since the mutex_lock may block, the buffer
2942                  * may become BUSY, so we need to
2943                  * recheck for a NOWAIT request
2944                  */
2945                 if (flags & BAC_NOWAIT)
2946                         return (EBUSY);
2947                 SET(bp->b_lflags, BL_WANTED);
2948
2949                 /* the hz value is 100; which leads to 10ms */
2950                 ts.tv_sec = (slptimeo/100);
2951                 ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;
2952                 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), (char *)"buf_acquire", &ts);
2953
2954                 if (error)
2955                         return (error);
2956                 return (EAGAIN);
2957         }
2958         if (flags & BAC_REMOVE)
2959                 bremfree_locked(bp);
2960         SET(bp->b_lflags, BL_BUSY);
2961 #ifdef JOE_DEBUG
2962         bp->b_owner = current_thread();
2963         bp->b_tag   = 5;
2964 #endif
2965         return (0);
2966 }
2967
2968
2969 /*
2970  * Wait for operations on the buffer to complete.
2971  * When they do, extract and return the I/O's error value.
2972  */
2973 errno_t
2974 buf_biowait(buf_t bp)
2975 {
2976         lck_mtx_lock(buf_mtxp);
2977
2978         while (!ISSET(bp->b_flags, B_DONE))
2979                 (void) msleep(bp, buf_mtxp, (PRIBIO+1), (char *)"buf_biowait", 0);
2980
2981         lck_mtx_unlock(buf_mtxp);
2982
2983         /* check for interruption of I/O (e.g. via NFS), then errors. */
2984         if (ISSET(bp->b_flags, B_EINTR)) {
2985                 CLR(bp->b_flags, B_EINTR);
2986                 return (EINTR);
2987         } else if (ISSET(bp->b_flags, B_ERROR))
2988                 return (bp->b_error ? bp->b_error : EIO);
2989         else
2990                 return (0);
2991 }
2992
2993 /*
2994  * Mark I/O complete on a buffer.
2995  *
2996  * If a callback has been requested, e.g. the pageout
2997  * daemon, do so. Otherwise, awaken waiting processes.
2998  *
2999  * [ Leffler, et al., says on p.247:
3000  *      "This routine wakes up the blocked process, frees the buffer
3001  *      for an asynchronous write, or, for a request by the pagedaemon
3002  *      process, invokes a procedure specified in the buffer structure" ]
3003  *
3004  * In real life, the pagedaemon (or other system processes) wants
3005  * to do async stuff to, and doesn't want the buffer buf_brelse()'d.
3006  * (for swap pager, that puts swap buffers on the free lists (!!!),
3007  * for the vn device, that puts malloc'd buffers on the free lists!)
3008  */
3009 extern struct timeval priority_IO_timestamp_for_root;
3010 extern int hard_throttle_on_root;
3011
3012 void
3013 buf_biodone(buf_t bp)
3014 {
3015         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
3016                      (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
3017
3018         if (ISSET(bp->b_flags, B_DONE))
3019                 panic("biodone already");
3020
3021         if (kdebug_enable) {
3022                 int    code = DKIO_DONE;
3023
3024                 if (bp->b_flags & B_READ)
3025                         code |= DKIO_READ;
3026                 if (bp->b_flags & B_ASYNC)
3027                         code |= DKIO_ASYNC;
3028
3029                 if (bp->b_flags & B_META)
3030                         code |= DKIO_META;
3031                 else if (bp->b_flags & B_PAGEIO)
3032                         code |= DKIO_PAGING;
3033
3034                 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3035                                       (unsigned int)bp, (unsigned int)bp->b_vp,
3036                                       bp->b_resid, bp->b_error, 0);
3037         }
3038         if ((bp->b_vp != NULLVP) &&
3039             ((bp->b_flags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
3040             (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
3041                 microuptime(&priority_IO_timestamp_for_root);
3042                 hard_throttle_on_root = 0;
3043         }
3044         /*
3045          * I/O was done, so don't believe
3046          * the DIRTY state from VM anymore
3047          */
3048         CLR(bp->b_flags, B_WASDIRTY);
3049
3050         if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3051                 /*
3052                  * wake up any writer's blocked
3053                  * on throttle or waiting for I/O
3054                  * to drain
3055                  */
3056                 vnode_writedone(bp->b_vp);
3057
3058         if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) {  /* if necessary, call out */
3059                 void    (*iodone_func)(struct buf *, void *) = bp->b_iodone;
3060                 void    *arg = (void *)bp->b_transaction;
3061                 int     callout = ISSET(bp->b_flags, B_CALL);
3062
3063                 CLR(bp->b_flags, (B_CALL | B_FILTER));  /* filters and callouts are one-shot */
3064                 bp->b_iodone = NULL;
3065                 bp->b_transaction = NULL;
3066
3067                 if (iodone_func == NULL) {
3068                         panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
3069                 } else {
3070                         if (callout)
3071                                 SET(bp->b_flags, B_DONE);       /* note that it's done */
3072                         (*iodone_func)(bp, arg);
3073                 }
3074                 if (callout)
3075                         /*
3076                          * assumes that the call back function takes
3077                          * ownership of the bp and deals with releasing it if necessary
3078                          */
3079                         goto biodone_done;
3080                 /*
3081                  * in this case the call back function is acting
3082                  * strictly as a filter... it does not take
3083                  * ownership of the bp and is expecting us
3084                  * to finish cleaning up... this is currently used
3085                  * by the HFS journaling code
3086                  */
3087         }
3088         if (ISSET(bp->b_flags, B_ASYNC)) {      /* if async, release it */
3089                 SET(bp->b_flags, B_DONE);       /* note that it's done */
3090
3091                 buf_brelse(bp);
3092         } else {                                /* or just wakeup the buffer */
3093                 /*
3094                  * by taking the mutex, we serialize
3095                  * the buf owner calling buf_biowait so that we'll
3096                  * only see him in one of 2 states...
3097                  * state 1: B_DONE wasn't set and he's
3098                  * blocked in msleep
3099                  * state 2: he's blocked trying to take the
3100                  * mutex before looking at B_DONE
3101                  * BL_WANTED is cleared in case anyone else
3102                  * is blocked waiting for the buffer... note
3103                  * that we haven't cleared B_BUSY yet, so if
3104                  * they do get to run, their going to re-set
3105                  * BL_WANTED and go back to sleep
3106                  */
3107                 lck_mtx_lock(buf_mtxp);
3108
3109                 CLR(bp->b_lflags, BL_WANTED);
3110                 SET(bp->b_flags, B_DONE);               /* note that it's done */
3111
3112                 lck_mtx_unlock(buf_mtxp);
3113
3114                 wakeup(bp);
3115         }
3116 biodone_done:
3117         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3118                      (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
3119 }
3120
3121 /*
3122  * Return a count of buffers on the "locked" queue.
3123  */
3124 int
3125 count_lock_queue(void)
3126 {
3127         buf_t   bp;
3128         int     n = 0;
3129
3130         lck_mtx_lock(buf_mtxp);
3131
3132         for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3133             bp = bp->b_freelist.tqe_next)
3134                 n++;
3135         lck_mtx_unlock(buf_mtxp);
3136
3137         return (n);
3138 }
3139
3140 /*
3141  * Return a count of 'busy' buffers. Used at the time of shutdown.
3142  */
3143 int
3144 count_busy_buffers(void)
3145 {
3146         buf_t   bp;
3147         int     nbusy = 0;
3148
3149         lck_mtx_lock(buf_mtxp);
3150         for (bp = &buf[boot_nbuf]; --bp >= buf; )
3151                 if (!ISSET(bp->b_flags, B_INVAL) && ISSET(bp->b_lflags, BL_BUSY))
3152                         nbusy++;
3153         lck_mtx_unlock(buf_mtxp);
3154
3155         return (nbusy);
3156 }
3157
3158 #if DIAGNOSTIC
3159 /*
3160  * Print out statistics on the current allocation of the buffer pool.
3161  * Can be enabled to print out on every ``sync'' by setting "syncprt"
3162  * in vfs_syscalls.c using sysctl.
3163  */
3164 void
3165 vfs_bufstats()
3166 {
3167         int i, j, count;
3168         register struct buf *bp;
3169         register struct bqueues *dp;
3170         int counts[MAXBSIZE/CLBYTES+1];
3171         static char *bname[BQUEUES] =
3172                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3173
3174         for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
3175                 count = 0;
3176                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3177                         counts[j] = 0;
3178
3179                 lck_mtx_lock(buf_mtxp);
3180
3181                 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
3182                         counts[bp->b_bufsize/CLBYTES]++;
3183                         count++;
3184                 }
3185                 lck_mtx_unlock(buf_mtxp);
3186
3187                 printf("%s: total-%d", bname[i], count);
3188                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3189                         if (counts[j] != 0)
3190                                 printf(", %d-%d", j * CLBYTES, counts[j]);
3191                 printf("\n");
3192         }
3193 }
3194 #endif /* DIAGNOSTIC */
3195
3196 #define NRESERVEDIOBUFS 64
3197
3198
3199 buf_t
3200 alloc_io_buf(vnode_t vp, int priv)
3201 {
3202         buf_t   bp;
3203
3204         lck_mtx_lock(iobuffer_mtxp);
3205
3206         while (((niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
3207                (bp = iobufqueue.tqh_first) == NULL) {
3208                 bufstats.bufs_iobufsleeps++;
3209
3210                 need_iobuffer = 1;
3211                 (void) msleep(&need_iobuffer, iobuffer_mtxp, (PRIBIO+1), (const char *)"alloc_io_buf", 0);
3212         }
3213         TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
3214
3215         bufstats.bufs_iobufinuse++;
3216         if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
3217                 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
3218
3219         lck_mtx_unlock(iobuffer_mtxp);
3220
3221         /*
3222          * initialize various fields
3223          * we don't need to hold the mutex since the buffer
3224          * is now private... the vp should have a reference
3225          * on it and is not protected by this mutex in any event
3226          */
3227         bp->b_timestamp = 0;
3228         bp->b_proc = NULL;
3229
3230         bp->b_datap = 0;
3231         bp->b_flags = 0;
3232         bp->b_lflags = BL_BUSY | BL_IOBUF;
3233         bp->b_blkno = bp->b_lblkno = 0;
3234 #ifdef JOE_DEBUG
3235         bp->b_owner = current_thread();
3236         bp->b_tag   = 6;
3237 #endif
3238         bp->b_iodone = NULL;
3239         bp->b_error = 0;
3240         bp->b_resid = 0;
3241         bp->b_bcount = 0;
3242         bp->b_bufsize = 0;
3243         bp->b_upl = NULL;
3244         bp->b_vp = vp;
3245
3246         if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
3247                 bp->b_dev = vp->v_rdev;
3248         else
3249                 bp->b_dev = NODEV;
3250
3251         return (bp);
3252 }
3253
3254
3255 void
3256 free_io_buf(buf_t bp)
3257 {
3258         int need_wakeup = 0;
3259
3260         /*
3261          * put buffer back on the head of the iobufqueue
3262          */
3263         bp->b_vp = NULL;
3264         bp->b_flags = B_INVAL;
3265
3266         lck_mtx_lock(iobuffer_mtxp);
3267
3268         binsheadfree(bp, &iobufqueue, -1);
3269
3270         if (need_iobuffer) {
3271                 /*
3272                  * Wake up any processes waiting because they need an io buffer
3273                  *
3274                  * do the wakeup after we drop the mutex... it's possible that the
3275                  * wakeup will be superfluous if need_iobuffer gets set again and
3276                  * another thread runs this path, but it's highly unlikely, doesn't
3277                  * hurt, and it means we don't hold up I/O progress if the wakeup blocks
3278                  * trying to grab a task related lock...
3279                  */
3280                 need_iobuffer = 0;
3281                 need_wakeup = 1;
3282         }
3283         bufstats.bufs_iobufinuse--;
3284
3285         lck_mtx_unlock(iobuffer_mtxp);
3286
3287         if (need_wakeup)
3288                 wakeup(&need_iobuffer);
3289 }
3290
3291
3292
3293 /*
3294  * If getnewbuf() calls bcleanbuf() on the same thread
3295  * there is a potential for stack overrun and deadlocks.
3296  * So we always handoff the work to a worker thread for completion
3297  */
3298 #include <mach/mach_types.h>
3299 #include <mach/memory_object_types.h>
3300 #include <kern/sched_prim.h>
3301
3302
3303 static void
3304 bcleanbuf_thread_init(void)
3305 {
3306         /* create worker thread */
3307         kernel_thread(kernel_task, bcleanbuf_thread);
3308 }
3309
3310 static void
3311 bcleanbuf_thread(void)
3312 {
3313         struct buf *bp;
3314         int error = 0;
3315         int loopcnt = 0;
3316
3317         for (;;) {
3318                 lck_mtx_lock(buf_mtxp);
3319
3320                 while (blaundrycnt == 0)
3321                         (void)msleep((void *)&blaundrycnt, buf_mtxp, PRIBIO, "blaundry", 0);
3322
3323                 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
3324                 /*
3325                  * Remove from the queue
3326                  */
3327                 bremfree_locked(bp);
3328                 blaundrycnt--;
3329
3330                 lck_mtx_unlock(buf_mtxp);
3331                 /*
3332                  * do the IO
3333                  */
3334                 error = bawrite_internal(bp, 0);
3335
3336                 if (error) {
3337                         lck_mtx_lock(buf_mtxp);
3338
3339                         binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
3340                         blaundrycnt++;
3341
3342                         lck_mtx_unlock(buf_mtxp);
3343
3344                         if (loopcnt > 10) {
3345                                 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
3346                                 loopcnt = 0;
3347                         } else {
3348                                 (void)thread_block(THREAD_CONTINUE_NULL);
3349                                 loopcnt++;
3350                         }
3351                 }
3352         }
3353 }
3354
3355
3356 static int
3357 brecover_data(buf_t bp)
3358 {
3359         int     upl_offset;
3360         upl_t   upl;
3361         upl_page_info_t *pl;
3362         kern_return_t kret;
3363         vnode_t vp = bp->b_vp;
3364         int upl_flags;
3365
3366
3367         if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
3368                 goto dump_buffer;
3369
3370         upl_flags = UPL_PRECIOUS;
3371         if (! (buf_flags(bp) & B_READ)) {
3372                 /*
3373                  * "write" operation:  let the UPL subsystem know
3374                  * that we intend to modify the buffer cache pages we're
3375                  * gathering.
3376                  */
3377                 upl_flags |= UPL_WILL_MODIFY;
3378         }
3379
3380         kret = ubc_create_upl(vp,
3381                               ubc_blktooff(vp, bp->b_lblkno),
3382                               bp->b_bufsize,
3383                               &upl,
3384                               &pl,
3385                               upl_flags);
3386         if (kret != KERN_SUCCESS)
3387                 panic("Failed to create UPL");
3388
3389         for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
3390
3391                 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
3392                         ubc_upl_abort(upl, 0);
3393                         goto dump_buffer;
3394                 }
3395         }
3396         bp->b_upl = upl;
3397
3398         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
3399
3400         if (kret != KERN_SUCCESS)
3401                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3402         return (1);
3403
3404 dump_buffer:
3405         bp->b_bufsize = 0;
3406         SET(bp->b_flags, B_INVAL);
3407         buf_brelse(bp);
3408
3409         return(0);
3410 }
3411
3412
3413
3414 /*
3415  * disabled for now
3416  */
3417
3418 #if FLUSH_QUEUES
3419
3420 #define NFLUSH 32
3421
3422 static int
3423 bp_cmp(void *a, void *b)
3424 {
3425     buf_t *bp_a = *(buf_t **)a,
3426           *bp_b = *(buf_t **)b;
3427     daddr64_t res;
3428
3429     // don't have to worry about negative block
3430     // numbers so this is ok to do.
3431     //
3432     res = (bp_a->b_blkno - bp_b->b_blkno);
3433
3434     return (int)res;
3435 }
3436
3437
3438 int
3439 bflushq(int whichq, mount_t mp)
3440 {
3441         buf_t   bp, next;
3442         int     i, buf_count;
3443         int     total_writes = 0;
3444         static buf_t flush_table[NFLUSH];
3445
3446         if (whichq < 0 || whichq >= BQUEUES) {
3447             return (0);
3448         }
3449
3450   restart:
3451         lck_mtx_lock(buf_mtxp);
3452
3453         bp = TAILQ_FIRST(&bufqueues[whichq]);
3454
3455         for (buf_count = 0; bp; bp = next) {
3456             next = bp->b_freelist.tqe_next;
3457
3458             if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
3459                 continue;
3460             }
3461
3462             if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
3463
3464                 bremfree_locked(bp);
3465 #ifdef JOE_DEBUG
3466                 bp->b_owner = current_thread();
3467                 bp->b_tag   = 7;
3468 #endif
3469                 SET(bp->b_lflags, BL_BUSY);
3470                 flush_table[buf_count] = bp;
3471                 buf_count++;
3472                 total_writes++;
3473
3474                 if (buf_count >= NFLUSH) {
3475                     lck_mtx_unlock(buf_mtxp);
3476
3477                     qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3478
3479                     for (i = 0; i < buf_count; i++) {
3480                         buf_bawrite(flush_table[i]);
3481                     }
3482                     goto restart;
3483                 }
3484             }
3485         }
3486         lck_mtx_unlock(buf_mtxp);
3487
3488         if (buf_count > 0) {
3489             qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3490
3491             for (i = 0; i < buf_count; i++) {
3492                 buf_bawrite(flush_table[i]);
3493             }
3494         }
3495
3496         return (total_writes);
3497 }
3498 #endif
3499
3500
3501 #if BALANCE_QUEUES
3502
3503 /* XXX move this to a separate file */
3504
3505 /*
3506  * NOTE: THIS CODE HAS NOT BEEN UPDATED
3507  * WITH RESPECT TO THE NEW LOCKING MODEL
3508  */
3509
3510
3511 /*
3512  * Dynamic Scaling of the Buffer Queues
3513  */
3514
/* Type used for every buffer-header count and limit below. */
typedef long long blsize_t;

/* Upper bound on buffer headers; set to (sane_size / PAGE_SIZE) at init. */
blsize_t MAXNBUF;

/* Global tunable limits */
blsize_t nbufh;		/* number of buffer headers */
blsize_t nbuflow;	/* minimum number of buffer headers required */
blsize_t nbufhigh;	/* maximum number of buffer headers allowed */
blsize_t nbuftarget;	/* preferred number of buffer headers */
3523
3524 /*
3525  * assertions:
3526  *
3527  * 1.   0 < nbuflow <= nbufh <= nbufhigh
3528  * 2.   nbufhigh <= MAXNBUF
3529  * 3.   0 < nbuflow <= nbuftarget <= nbufhigh
3530  * 4.   nbufh can not be set by sysctl().
3531  */
3532
3533 /* Per queue tunable limits */
3534
3535 struct bufqlim {
3536         blsize_t        bl_nlow;        /* minimum number of buffer headers required */
3537         blsize_t        bl_num;         /* number of buffer headers on the queue */
3538         blsize_t        bl_nlhigh;      /* maximum number of buffer headers allowed */
3539         blsize_t        bl_target;      /* preferred number of buffer headers */
3540         long    bl_stale;       /* Seconds after which a buffer is considered stale */
3541 } bufqlim[BQUEUES];
3542
3543 /*
3544  * assertions:
3545  *
3546  * 1.   0 <= bl_nlow <= bl_num <= bl_nlhigh
3547  * 2.   bl_nlhigh <= MAXNBUF
3548  * 3.  bufqlim[BQ_META].bl_nlow != 0
3549  * 4.  bufqlim[BQ_META].bl_nlow > (number of possible concurrent
3550  *                                                                      file system IO operations)
3551  * 5.   bl_num can not be set by sysctl().
3552  * 6.   bl_nlhigh <= nbufhigh
3553  */
3554
3555 /*
3556  * Rationale:
3557  * ----------
3558  * Defining blsize_t as long would permit 2^31 buffer headers per queue,
3559  * which can describe (2^31 * PAGE_SIZE) memory per queue.
3560  *
3561  * These limits are exported by means of sysctl().
3562  * It was decided to define blsize_t as a 64 bit quantity.
3563  * This will make sure that we will not be required to change it
3564  * as long as we do not exceed 64 bit address space for the kernel.
3565  *
3566  * low and high numbers parameters initialized at compile time
3567  * and boot arguments can be used to override them. sysctl()
3568  * would not change the value. sysctl() can get all the values
3569  * but can set only target. num is the current level.
3570  *
3571  * Advantages of having a "bufqscan" thread doing the balancing are,
3572  * Keep enough bufs on BQ_EMPTY.
3573  *      getnewbuf() by default will always select a buffer from the BQ_EMPTY.
3574  *              getnewbuf() performs best if a buffer was found there.
3575  *              Also this minimizes the possibility of starting IO
3576  *              from getnewbuf(). That's a performance win, too.
3577  *
3578  *      Localize complex logic [balancing as well as time aging]
3579  *              to balancebufq().
3580  *
3581  *      Simplify getnewbuf() logic by elimination of time aging code.
3582  */
3583
3584 /*
3585  * Algorithm:
3586  * -----------
3587  * The goal of the dynamic scaling of the buffer queues is to keep
3588  * the size of the LRU close to bl_target. Buffers on a queue would
3589  * be time aged.
3590  *
3591  * There would be a thread which will be responsible for "balancing"
3592  * the buffer cache queues.
3593  *
3594  * The scan order would be:     AGE, LRU, META, EMPTY.
3595  */
3596
/*
 * Incremented by every call to bufq_balance_thread_init(); zero means the
 * balancing globals have not been set up yet.  Its address is also the
 * channel the scan thread sleeps on between passes.
 */
long bufqscanwait = 0;

/*
 * Forward declarations for the queue-balancing code.  Each one is a full
 * prototype (note the explicit "void") so that calls are type-checked.
 */
static void bufqscan_thread(void);
static int balancebufq(int q);
static int btrimempty(int n);
static __inline__ int initbufqscan(void);
static __inline__ int nextbufq(int q);
static void buqlimprt(int all);
3605
3606
3607 static __inline__ void
3608 bufqinc(int q)
3609 {
3610         if ((q < 0) || (q >= BQUEUES))
3611                 return;
3612
3613         bufqlim[q].bl_num++;
3614         return;
3615 }
3616
3617 static __inline__ void
3618 bufqdec(int q)
3619 {
3620         if ((q < 0) || (q >= BQUEUES))
3621                 return;
3622
3623         bufqlim[q].bl_num--;
3624         return;
3625 }
3626
3627 static void
3628 bufq_balance_thread_init()
3629 {
3630
3631         if (bufqscanwait++ == 0) {
3632
3633                 /* Initalize globals */
3634                 MAXNBUF = (sane_size / PAGE_SIZE);
3635                 nbufh = nbuf;
3636                 nbuflow = min(nbufh, 100);
3637                 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
3638                 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
3639                 nbuftarget = max(nbuflow, nbuftarget);
3640                 nbuftarget = min(nbufhigh, nbuftarget);
3641
3642                 /*
3643                  * Initialize the bufqlim
3644                  */
3645
3646                 /* LOCKED queue */
3647                 bufqlim[BQ_LOCKED].bl_nlow = 0;
3648                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3649                 bufqlim[BQ_LOCKED].bl_target = 0;
3650                 bufqlim[BQ_LOCKED].bl_stale = 30;
3651
3652                 /* LRU queue */
3653                 bufqlim[BQ_LRU].bl_nlow = 0;
3654                 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
3655                 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
3656                 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
3657
3658                 /* AGE queue */
3659                 bufqlim[BQ_AGE].bl_nlow = 0;
3660                 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
3661                 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
3662                 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
3663
3664                 /* EMPTY queue */
3665                 bufqlim[BQ_EMPTY].bl_nlow = 0;
3666                 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
3667                 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
3668                 bufqlim[BQ_EMPTY].bl_stale = 600000;
3669
3670                 /* META queue */
3671                 bufqlim[BQ_META].bl_nlow = 0;
3672                 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
3673                 bufqlim[BQ_META].bl_target = nbuftarget/4;
3674                 bufqlim[BQ_META].bl_stale = META_IS_STALE;
3675
3676                 /* LAUNDRY queue */
3677                 bufqlim[BQ_LOCKED].bl_nlow = 0;
3678                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3679                 bufqlim[BQ_LOCKED].bl_target = 0;
3680                 bufqlim[BQ_LOCKED].bl_stale = 30;
3681
3682                 buqlimprt(1);
3683         }
3684
3685         /* create worker thread */
3686         kernel_thread(kernel_task, bufqscan_thread);
3687 }
3688
3689 /* The workloop for the buffer balancing thread */
3690 static void
3691 bufqscan_thread()
3692 {
3693         int moretodo = 0;
3694
3695         for(;;) {
3696                 do {
3697                         int q;  /* buffer queue to process */
3698
3699                         q = initbufqscan();
3700                         for (; q; ) {
3701                                 moretodo |= balancebufq(q);
3702                                 q = nextbufq(q);
3703                         }
3704                 } while (moretodo);
3705
3706 #if DIAGNOSTIC
3707                 vfs_bufstats();
3708                 buqlimprt(0);
3709 #endif
3710                 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
3711                 moretodo = 0;
3712         }
3713 }
3714
3715 /* Seed for the buffer queue balancing */
3716 static __inline__ int
3717 initbufqscan()
3718 {
3719         /* Start with AGE queue */
3720         return (BQ_AGE);
3721 }
3722
3723 /* Pick next buffer queue to balance */
3724 static __inline__ int
3725 nextbufq(int q)
3726 {
3727         int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
3728
3729         q++;
3730         q %= sizeof(order);
3731         return (order[q]);
3732 }
3733
3734 /* function to balance the buffer queues */
3735 static int
3736 balancebufq(int q)
3737 {
3738         int moretodo = 0;
3739         int s = splbio();
3740         int n, t;
3741
3742         /* reject invalid q */
3743         if ((q < 0) || (q >= BQUEUES))
3744                 goto out;
3745
3746         /* LOCKED or LAUNDRY queue MUST not be balanced */
3747         if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
3748                 goto out;
3749
3750         n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
3751
3752         /* If queue has less than target nothing more to do */
3753         if (n < 0)
3754                 goto out;
3755
3756         if ( n > 8 ) {
3757                 /* Balance only a small amount (12.5%) at a time */
3758                 n >>= 3;
3759         }
3760
3761         /* EMPTY queue needs special handling */
3762         if (q == BQ_EMPTY) {
3763                 moretodo |= btrimempty(n);
3764                 goto out;
3765         }
3766
3767         t = buf_timestamp():
3768
3769         for (; n > 0; n--) {
3770                 struct buf *bp = bufqueues[q].tqh_first;
3771                 if (!bp)
3772                         break;
3773
3774                 /* check if it's stale */
3775                 if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
3776                         if (bcleanbuf(bp)) {
3777                                 /* buf_bawrite() issued, bp not ready */
3778                                 moretodo = 1;
3779                         } else {
3780                                 /* release the cleaned buffer to BQ_EMPTY */
3781                                 SET(bp->b_flags, B_INVAL);
3782                                 buf_brelse(bp);
3783                         }
3784                 } else
3785                         break;
3786         }
3787
3788 out:
3789         splx(s);
3790         return (moretodo);
3791 }
3792
/*
 * Placeholder for trimming the empty queue.
 *
 * Once struct buf is allocated dynamically this is where up to 'n'
 * of them would be reclaimed from the empty queue.  For now nothing
 * is reclaimed and 0 is always returned.
 */
static int
btrimempty(int n)
{
	(void)n;	/* not used yet */

	return 0;
}
3803
3804 static void
3805 buqlimprt(int all)
3806 {
3807         int i;
3808     static char *bname[BQUEUES] =
3809                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3810
3811         if (all)
3812                 for (i = 0; i < BQUEUES; i++) {
3813                         printf("%s : ", bname[i]);
3814                         printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
3815                         printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3816                         printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
3817                         printf("target = %ld, ", (long)bufqlim[i].bl_target);
3818                         printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
3819                 }
3820         else
3821                 for (i = 0; i < BQUEUES; i++) {
3822                         printf("%s : ", bname[i]);
3823                         printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3824                 }
3825 }
3826
3827 #endif
3828
3829