bsd/vfs/vfs_bio.c

   1 /*
   2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  23 /*-
  24  * Copyright (c) 1994 Christopher G. Demetriou
  25  * Copyright (c) 1982, 1986, 1989, 1993
  26  *      The Regents of the University of California.  All rights reserved.
  27  * (c) UNIX System Laboratories, Inc.
  28  * All or some portions of this file are derived from material licensed
  29  * to the University of California by American Telephone and Telegraph
  30  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  31  * the permission of UNIX System Laboratories, Inc.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
  62  */
  63
  64 /*
  65  * Some references:
  66  *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
  67  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
  68  *              UNIX Operating System (Addison Welley, 1989)
  69  */
  70
  71 #include <sys/param.h>
  72 #include <sys/systm.h>
  73 #include <sys/proc_internal.h>
  74 #include <sys/buf_internal.h>
  75 #include <sys/vnode_internal.h>
  76 #include <sys/mount_internal.h>
  77 #include <sys/trace.h>
  78 #include <sys/malloc.h>
  79 #include <sys/resourcevar.h>
  80 #include <miscfs/specfs/specdev.h>
  81 #include <sys/ubc.h>
  82 #include <sys/kauth.h>
  83 #if DIAGNOSTIC
  84 #include <kern/assert.h>
  85 #endif /* DIAGNOSTIC */
  86 #include <kern/task.h>
  87 #include <kern/zalloc.h>
  88 #include <kern/lock.h>
  89
  90 #include <vm/vm_kern.h>
  91
  92 #include <sys/kdebug.h>
  93 #include <machine/spl.h>
  94
  95 #if BALANCE_QUEUES
  96 static __inline__ void bufqinc(int q);
  97 static __inline__ void bufqdec(int q);
  98 #endif
  99
 100 static int      bcleanbuf(buf_t bp);
 101 static int      brecover_data(buf_t bp);
 102 static boolean_t incore(vnode_t vp, daddr64_t blkno);
 103 static buf_t    incore_locked(vnode_t vp, daddr64_t blkno);
 104 /* timeout is in msecs */
 105 static buf_t    getnewbuf(int slpflag, int slptimeo, int *queue);
 106 static void     bremfree_locked(buf_t bp);
 107 static void     buf_reassign(buf_t bp, vnode_t newvp);
 108 static errno_t  buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
 109 static int      buf_iterprepare(vnode_t vp, struct buflists *, int flags);
 110 static void     buf_itercomplete(vnode_t vp, struct buflists *, int flags);
 111
 112 __private_extern__ int  bdwrite_internal(buf_t, int);
 113
 114 /* zone allocated buffer headers */
 115 static void     bufzoneinit(void);
 116 static void     bcleanbuf_thread_init(void);
 117 static void     bcleanbuf_thread(void);
 118
 119 static zone_t   buf_hdr_zone;
 120 static int      buf_hdr_count;
 121
 122
 123 /*
 124  * Definitions for the buffer hash lists.
 125  */
 126 #define BUFHASH(dvp, lbn)       \
 127         (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
 128 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
 129 u_long  bufhash;
 130
 131 /* Definitions for the buffer stats. */
 132 struct bufstats bufstats;
 133
 134 /* Number of delayed write buffers */
 135 int nbdwrite = 0;
 136 int blaundrycnt = 0;
 137
 138
 139 static TAILQ_HEAD(ioqueue, buf) iobufqueue;
 140 static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
 141 static int needbuffer;
 142 static int need_iobuffer;
 143
 144 static lck_grp_t        *buf_mtx_grp;
 145 static lck_attr_t       *buf_mtx_attr;
 146 static lck_grp_attr_t   *buf_mtx_grp_attr;
 147 static lck_mtx_t        *iobuffer_mtxp;
 148 static lck_mtx_t        *buf_mtxp;
 149
 150 static __inline__ int
 151 buf_timestamp(void)
 152 {
 153         struct  timeval         t;
 154         microuptime(&t);
 155         return (t.tv_sec);
 156 }
 157
 158 /*
 159  * Insq/Remq for the buffer free lists.
 160  */
 161 #if BALANCE_QUEUES
 162 #define binsheadfree(bp, dp, whichq)    do { \
 163                                     TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
 164                                         bufqinc((whichq));      \
 165                                         (bp)->b_whichq = whichq; \
 166                                     (bp)->b_timestamp = buf_timestamp(); \
 167                                 } while (0)
 168
 169 #define binstailfree(bp, dp, whichq)    do { \
 170                                     TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
 171                                         bufqinc((whichq));      \
 172                                         (bp)->b_whichq = whichq; \
 173                                     (bp)->b_timestamp = buf_timestamp(); \
 174                                 } while (0)
 175 #else
 176 #define binsheadfree(bp, dp, whichq)    do { \
 177                                     TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
 178                                         (bp)->b_whichq = whichq; \
 179                                     (bp)->b_timestamp = buf_timestamp(); \
 180                                 } while (0)
 181
 182 #define binstailfree(bp, dp, whichq)    do { \
 183                                     TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
 184                                         (bp)->b_whichq = whichq; \
 185                                     (bp)->b_timestamp = buf_timestamp(); \
 186                                 } while (0)
 187 #endif
 188
 189
 190 #define BHASHENTCHECK(bp)       \
 191         if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)  \
 192                 panic("%x: b_hash.le_prev is not deadbeef", (bp));
 193
 194 #define BLISTNONE(bp)   \
 195         (bp)->b_hash.le_next = (struct buf *)0; \
 196         (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
 197
 198 /*
 199  * Insq/Remq for the vnode usage lists.
 200  */
 201 #define bufinsvn(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_vnbufs)
 202 #define bufremvn(bp) {                                                  \
 203         LIST_REMOVE(bp, b_vnbufs);                                      \
 204         (bp)->b_vnbufs.le_next = NOLIST;                                \
 205 }
 206
 207 /*
 208  * Time in seconds before a buffer on a list is
 209  * considered as a stale buffer
 210  */
 211 #define LRU_IS_STALE 120 /* default value for the LRU */
 212 #define AGE_IS_STALE 60  /* default value for the AGE */
 213 #define META_IS_STALE 180 /* default value for the BQ_META */
 214
 215 int lru_is_stale = LRU_IS_STALE;
 216 int age_is_stale = AGE_IS_STALE;
 217 int meta_is_stale = META_IS_STALE;
 218
 219
 220
 221 /* LIST_INSERT_HEAD() with assertions */
 222 static __inline__ void
 223 blistenterhead(struct bufhashhdr * head, buf_t bp)
 224 {
 225         if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
 226                 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
 227         (head)->lh_first = bp;
 228         bp->b_hash.le_prev = &(head)->lh_first;
 229         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 230                 panic("blistenterhead: le_prev is deadbeef");
 231 }
 232
 233 static __inline__ void
 234 binshash(buf_t bp, struct bufhashhdr *dp)
 235 {
 236         buf_t   nbp;
 237
 238         BHASHENTCHECK(bp);
 239
 240         nbp = dp->lh_first;
 241         for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
 242                 if(nbp == bp)
 243                         panic("buf already in hashlist");
 244         }
 245
 246         blistenterhead(dp, bp);
 247 }
 248
 249 static __inline__ void
 250 bremhash(buf_t  bp)
 251 {
 252         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 253                 panic("bremhash le_prev is deadbeef");
 254         if (bp->b_hash.le_next == bp)
 255                 panic("bremhash: next points to self");
 256
 257         if (bp->b_hash.le_next != NULL)
 258                 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
 259         *bp->b_hash.le_prev = (bp)->b_hash.le_next;
 260 }
 261
 262
 263
 264
 265 int
 266 buf_valid(buf_t bp) {
 267
 268         if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
 269                 return 1;
 270         return 0;
 271 }
 272
 273 int
 274 buf_fromcache(buf_t bp) {
 275
 276         if ( (bp->b_flags & B_CACHE) )
 277                 return 1;
 278         return 0;
 279 }
 280
 281 void
 282 buf_markinvalid(buf_t bp) {
 283
 284         SET(bp->b_flags, B_INVAL);
 285 }
 286
 287 void
 288 buf_markdelayed(buf_t bp) {
 289
 290         SET(bp->b_flags, B_DELWRI);
 291         buf_reassign(bp, bp->b_vp);
 292 }
 293
 294 void
 295 buf_markeintr(buf_t bp) {
 296
 297         SET(bp->b_flags, B_EINTR);
 298 }
 299
 300 void
 301 buf_markaged(buf_t bp) {
 302
 303         SET(bp->b_flags, B_AGE);
 304 }
 305
 306 errno_t
 307 buf_error(buf_t bp) {
 308
 309         return (bp->b_error);
 310 }
 311
 312 void
 313 buf_seterror(buf_t bp, errno_t error) {
 314
 315         if ((bp->b_error = error))
 316                 SET(bp->b_flags, B_ERROR);
 317         else
 318                 CLR(bp->b_flags, B_ERROR);
 319 }
 320
 321 void
 322 buf_setflags(buf_t bp, int32_t flags) {
 323
 324         SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
 325 }
 326
 327 void
 328 buf_clearflags(buf_t bp, int32_t flags) {
 329
 330         CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
 331 }
 332
 333 int32_t
 334 buf_flags(buf_t bp) {
 335
 336         return ((bp->b_flags & BUF_X_RDFLAGS));
 337 }
 338
 339 void
 340 buf_reset(buf_t bp, int32_t io_flags) {
 341
 342         CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE));
 343         SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
 344
 345         bp->b_error = 0;
 346 }
 347
 348 uint32_t
 349 buf_count(buf_t bp) {
 350
 351         return (bp->b_bcount);
 352 }
 353
 354 void
 355 buf_setcount(buf_t bp, uint32_t bcount) {
 356
 357         bp->b_bcount = bcount;
 358 }
 359
 360 uint32_t
 361 buf_size(buf_t bp) {
 362
 363         return (bp->b_bufsize);
 364 }
 365
 366 void
 367 buf_setsize(buf_t bp, uint32_t bufsize) {
 368
 369         bp->b_bufsize = bufsize;
 370 }
 371
 372 uint32_t
 373 buf_resid(buf_t bp) {
 374
 375         return (bp->b_resid);
 376 }
 377
 378 void
 379 buf_setresid(buf_t bp, uint32_t resid) {
 380
 381         bp->b_resid = resid;
 382 }
 383
 384 uint32_t
 385 buf_dirtyoff(buf_t bp) {
 386
 387         return (bp->b_dirtyoff);
 388 }
 389
 390 uint32_t
 391 buf_dirtyend(buf_t bp) {
 392
 393         return (bp->b_dirtyend);
 394 }
 395
 396 void
 397 buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
 398
 399         bp->b_dirtyoff = dirtyoff;
 400 }
 401
 402 void
 403 buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
 404
 405         bp->b_dirtyend = dirtyend;
 406 }
 407
 408 uintptr_t
 409 buf_dataptr(buf_t bp) {
 410
 411         return (bp->b_datap);
 412 }
 413
 414 void
 415 buf_setdataptr(buf_t bp, uintptr_t data) {
 416
 417         bp->b_datap = data;
 418 }
 419
 420 vnode_t
 421 buf_vnode(buf_t bp) {
 422
 423         return (bp->b_vp);
 424 }
 425
 426 void
 427 buf_setvnode(buf_t bp, vnode_t vp) {
 428
 429         bp->b_vp = vp;
 430 }
 431
 432
 433 void *
 434 buf_callback(buf_t bp)
 435 {
 436         if ( !(bp->b_lflags & BL_IOBUF) )
 437                 return ((void *) NULL);
 438         if ( !(bp->b_flags & B_CALL) )
 439                 return ((void *) NULL);
 440
 441         return ((void *)bp->b_iodone);
 442 }
 443
 444
 445 errno_t
 446 buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
 447 {
 448
 449         if ( !(bp->b_lflags & BL_IOBUF) )
 450                 return (EINVAL);
 451
 452         if (callback)
 453                 bp->b_flags |= (B_CALL | B_ASYNC);
 454         else
 455                 bp->b_flags &= ~B_CALL;
 456         bp->b_transaction = transaction;
 457         bp->b_iodone = callback;
 458
 459         return (0);
 460 }
 461
 462 errno_t
 463 buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
 464 {
 465
 466         if ( !(bp->b_lflags & BL_IOBUF) )
 467                 return (EINVAL);
 468
 469         if (upl)
 470                 bp->b_flags |= B_CLUSTER;
 471         else
 472                 bp->b_flags &= ~B_CLUSTER;
 473         bp->b_upl = upl;
 474         bp->b_uploffset = offset;
 475
 476         return (0);
 477 }
 478
 479 buf_t
 480 buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
 481 {
 482         buf_t   io_bp;
 483
 484         if (io_offset < 0 || io_size < 0)
 485                 return (NULL);
 486
 487         if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
 488                 return (NULL);
 489
 490         if (bp->b_flags & B_CLUSTER) {
 491                 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
 492                         return (NULL);
 493
 494                 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
 495                         return (NULL);
 496         }
 497         io_bp = alloc_io_buf(bp->b_vp, 0);
 498
 499         io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_ASYNC | B_READ);
 500
 501         if (iodone) {
 502                 io_bp->b_transaction = arg;
 503                 io_bp->b_iodone = iodone;
 504                 io_bp->b_flags |= B_CALL;
 505         }
 506         if (bp->b_flags & B_CLUSTER) {
 507                 io_bp->b_upl = bp->b_upl;
 508                 io_bp->b_uploffset = bp->b_uploffset + io_offset;
 509         } else {
 510                 io_bp->b_datap  = (uintptr_t)(((char *)bp->b_datap) + io_offset);
 511         }
 512         io_bp->b_bcount = io_size;
 513
 514         return (io_bp);
 515 }
 516
 517
 518
 519 void
 520 buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
 521               void **old_iodone, void **old_transaction)
 522 {
 523         if (old_iodone)
 524                 *old_iodone = (void *)(bp->b_iodone);
 525         if (old_transaction)
 526                 *old_transaction = (void *)(bp->b_transaction);
 527
 528         bp->b_transaction = transaction;
 529         bp->b_iodone = filter;
 530         bp->b_flags |= B_FILTER;
 531 }
 532
 533
 534 daddr64_t
 535 buf_blkno(buf_t bp) {
 536
 537         return (bp->b_blkno);
 538 }
 539
 540 daddr64_t
 541 buf_lblkno(buf_t bp) {
 542
 543         return (bp->b_lblkno);
 544 }
 545
 546 void
 547 buf_setblkno(buf_t bp, daddr64_t blkno) {
 548
 549         bp->b_blkno = blkno;
 550 }
 551
 552 void
 553 buf_setlblkno(buf_t bp, daddr64_t lblkno) {
 554
 555         bp->b_lblkno = lblkno;
 556 }
 557
 558 dev_t
 559 buf_device(buf_t bp) {
 560
 561         return (bp->b_dev);
 562 }
 563
 564 errno_t
 565 buf_setdevice(buf_t bp, vnode_t vp) {
 566
 567         if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
 568                 return EINVAL;
 569         bp->b_dev = vp->v_rdev;
 570
 571         return 0;
 572 }
 573
 574
 575 void *
 576 buf_drvdata(buf_t bp) {
 577
 578         return (bp->b_drvdata);
 579 }
 580
 581 void
 582 buf_setdrvdata(buf_t bp, void *drvdata) {
 583
 584         bp->b_drvdata = drvdata;
 585 }
 586
 587 void *
 588 buf_fsprivate(buf_t bp) {
 589
 590         return (bp->b_fsprivate);
 591 }
 592
 593 void
 594 buf_setfsprivate(buf_t bp, void *fsprivate) {
 595
 596         bp->b_fsprivate = fsprivate;
 597 }
 598
 599 ucred_t
 600 buf_rcred(buf_t bp) {
 601
 602         return (bp->b_rcred);
 603 }
 604
 605 ucred_t
 606 buf_wcred(buf_t bp) {
 607
 608         return (bp->b_wcred);
 609 }
 610
 611 void *
 612 buf_upl(buf_t bp) {
 613
 614         return (bp->b_upl);
 615 }
 616
 617 uint32_t
 618 buf_uploffset(buf_t bp) {
 619
 620         return ((uint32_t)(bp->b_uploffset));
 621 }
 622
 623 proc_t
 624 buf_proc(buf_t bp) {
 625
 626         return (bp->b_proc);
 627 }
 628
 629
 630 errno_t
 631 buf_map(buf_t bp, caddr_t *io_addr)
 632 {
 633         buf_t           real_bp;
 634         vm_offset_t     vaddr;
 635         kern_return_t   kret;
 636
 637         if ( !(bp->b_flags & B_CLUSTER)) {
 638                 *io_addr = (caddr_t)bp->b_datap;
 639                 return (0);
 640         }
 641         real_bp = (buf_t)(bp->b_real_bp);
 642
 643         if (real_bp && real_bp->b_datap) {
 644                 /*
 645                  * b_real_bp is only valid if B_CLUSTER is SET
 646                  * if it's non-zero, than someone did a cluster_bp call
 647                  * if the backing physical pages were already mapped
 648                  * in before the call to cluster_bp (non-zero b_datap),
 649                  * than we just use that mapping
 650                  */
 651                 *io_addr = (caddr_t)real_bp->b_datap;
 652                 return (0);
 653         }
 654         kret = ubc_upl_map(bp->b_upl, &vaddr);    /* Map it in */
 655
 656         if (kret != KERN_SUCCESS) {
 657                 *io_addr = 0;
 658
 659                 return(ENOMEM);
 660         }
 661         vaddr += bp->b_uploffset;
 662
 663         *io_addr = (caddr_t)vaddr;
 664
 665         return (0);
 666 }
 667
 668 errno_t
 669 buf_unmap(buf_t bp)
 670 {
 671         buf_t           real_bp;
 672         kern_return_t   kret;
 673
 674         if ( !(bp->b_flags & B_CLUSTER))
 675                 return (0);
 676         /*
 677          * see buf_map for the explanation
 678          */
 679         real_bp = (buf_t)(bp->b_real_bp);
 680
 681         if (real_bp && real_bp->b_datap)
 682                 return (0);
 683
 684         if (bp->b_lflags & BL_IOBUF) {
 685                 /*
 686                  * when we commit these pages, we'll hit
 687                  * it with UPL_COMMIT_INACTIVE which
 688                  * will clear the reference bit that got
 689                  * turned on when we touched the mapping
 690                  */
 691                 bp->b_flags |= B_AGE;
 692         }
 693         kret = ubc_upl_unmap(bp->b_upl);
 694
 695         if (kret != KERN_SUCCESS)
 696                 return (EINVAL);
 697         return (0);
 698 }
 699
 700
 701 void
 702 buf_clear(buf_t bp) {
 703         caddr_t baddr;
 704
 705         if (buf_map(bp, &baddr) == 0) {
 706                 bzero(baddr, bp->b_bcount);
 707                 buf_unmap(bp);
 708         }
 709         bp->b_resid = 0;
 710 }
 711
 712
 713
 714 /*
 715  * Read or write a buffer that is not contiguous on disk.
 716  * buffer is marked done/error at the conclusion
 717  */
 718 static int
 719 buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
 720 {
 721         vnode_t vp = buf_vnode(bp);
 722         buf_t   io_bp;                   /* For reading or writing a single block */
 723         int     io_direction;
 724         int     io_resid;
 725         size_t  io_contig_bytes;
 726         daddr64_t io_blkno;
 727         int     error = 0;
 728         int     bmap_flags;
 729
 730         /*
 731          * save our starting point... the bp was already mapped
 732          * in buf_strategy before we got called
 733          * no sense doing it again.
 734          */
 735         io_blkno = bp->b_blkno;
 736         /*
 737          * Make sure we redo this mapping for the next I/O
 738          * i.e. this can never be a 'permanent' mapping
 739          */
 740         bp->b_blkno = bp->b_lblkno;
 741
 742         /*
 743          * Get an io buffer to do the deblocking
 744          */
 745         io_bp = alloc_io_buf(devvp, 0);
 746
 747         io_bp->b_lblkno = bp->b_lblkno;
 748         io_bp->b_datap  = bp->b_datap;
 749         io_resid        = bp->b_bcount;
 750         io_direction    = bp->b_flags & B_READ;
 751         io_contig_bytes = contig_bytes;
 752
 753         if (bp->b_flags & B_READ)
 754                 bmap_flags = VNODE_READ;
 755         else
 756                 bmap_flags = VNODE_WRITE;
 757
 758         for (;;) {
 759                 if (io_blkno == -1)
 760                         /*
 761                          * this is unexepected, but we'll allow for it
 762                          */
 763                         bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
 764                 else {
 765                         io_bp->b_bcount  = io_contig_bytes;
 766                         io_bp->b_bufsize = io_contig_bytes;
 767                         io_bp->b_resid   = io_contig_bytes;
 768                         io_bp->b_blkno   = io_blkno;
 769
 770                         buf_reset(io_bp, io_direction);
 771                         /*
 772                          * Call the device to do the I/O and wait for it
 773                          */
 774                         if ((error = VNOP_STRATEGY(io_bp)))
 775                                 break;
 776                         if ((error = (int)buf_biowait(io_bp)))
 777                                 break;
 778                         if (io_bp->b_resid) {
 779                                 io_resid -= (io_contig_bytes - io_bp->b_resid);
 780                                 break;
 781                         }
 782                 }
 783                 if ((io_resid -= io_contig_bytes) == 0)
 784                         break;
 785                 f_offset       += io_contig_bytes;
 786                 io_bp->b_datap += io_contig_bytes;
 787
 788                 /*
 789                  * Map the current position to a physical block number
 790                  */
 791                 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
 792                         break;
 793         }
 794         buf_free(io_bp);
 795
 796         if (error)
 797                 buf_seterror(bp, error);
 798         bp->b_resid = io_resid;
 799         /*
 800          * This I/O is now complete
 801          */
 802         buf_biodone(bp);
 803
 804         return error;
 805 }
 806
 807
 808 /*
 809  * struct vnop_strategy_args {
 810  *      struct buf *a_bp;
 811  * } *ap;
 812  */
 813 errno_t
 814 buf_strategy(vnode_t devvp, void *ap)
 815 {
 816         buf_t   bp = ((struct vnop_strategy_args *)ap)->a_bp;
 817         vnode_t vp = bp->b_vp;
 818         int     bmap_flags;
 819         errno_t error;
 820
 821         if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
 822                 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
 823         /*
 824          * associate the physical device with
 825          * with this buf_t even if we don't
 826          * end up issuing the I/O...
 827          */
 828         bp->b_dev = devvp->v_rdev;
 829
 830         if (bp->b_flags & B_READ)
 831                 bmap_flags = VNODE_READ;
 832         else
 833                 bmap_flags = VNODE_WRITE;
 834
 835         if ( !(bp->b_flags & B_CLUSTER)) {
 836
 837                 if ( (bp->b_upl) ) {
 838                         /*
 839                          * we have a UPL associated with this bp
 840                          * go through cluster_bp which knows how
 841                          * to deal with filesystem block sizes
 842                          * that aren't equal to the page size
 843                          */
 844                         return (cluster_bp(bp));
 845                 }
 846                 if (bp->b_blkno == bp->b_lblkno) {
 847                         off_t   f_offset;
 848                         size_t  contig_bytes;
 849
 850                         if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
 851                                 buf_seterror(bp, error);
 852                                 buf_biodone(bp);
 853
 854                                 return (error);
 855                         }
 856                         if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
 857                                 buf_seterror(bp, error);
 858                                 buf_biodone(bp);
 859
 860                                 return (error);
 861                         }
 862                         if (bp->b_blkno == -1)
 863                                 buf_clear(bp);
 864                         else if ((long)contig_bytes < bp->b_bcount)
 865                                 return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
 866                 }
 867                 if (bp->b_blkno == -1) {
 868                         buf_biodone(bp);
 869                         return (0);
 870                 }
 871         }
 872         /*
 873          * we can issue the I/O because...
 874          * either B_CLUSTER is set which
 875          * means that the I/O is properly set
 876          * up to be a multiple of the page size, or
 877          * we were able to successfully set up the
 878          * phsyical block mapping
 879          */
 880         return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap));
 881 }
 882
 883
 884
 885 buf_t
 886 buf_alloc(vnode_t vp)
 887 {
 888         return(alloc_io_buf(vp, 0));
 889 }
 890
 891 void
 892 buf_free(buf_t bp) {
 893
 894         free_io_buf(bp);
 895 }
 896
 897
 898
 899 void
 900 buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) {
 901         buf_t   bp;
 902         int     retval;
 903         struct  buflists local_iterblkhd;
 904         int     lock_flags = BAC_NOWAIT | BAC_REMOVE;
 905
 906         if (flags & BUF_SKIP_LOCKED)
 907                 lock_flags |= BAC_SKIP_LOCKED;
 908         if (flags & BUF_SKIP_NONLOCKED)
 909                 lock_flags |= BAC_SKIP_NONLOCKED;
 910
 911         lck_mtx_lock(buf_mtxp);
 912
 913         if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY))  {
 914                 lck_mtx_unlock(buf_mtxp);
 915                 return;
 916         }
 917         while (!LIST_EMPTY(&local_iterblkhd)) {
 918                 bp = LIST_FIRST(&local_iterblkhd);
 919                 LIST_REMOVE(bp, b_vnbufs);
 920                 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
 921
 922                 if (buf_acquire_locked(bp, lock_flags, 0, 0))
 923                         continue;
 924
 925                 lck_mtx_unlock(buf_mtxp);
 926
 927                 retval = callout(bp, arg);
 928
 929                 switch (retval) {
 930                 case BUF_RETURNED:
 931                         buf_brelse(bp);
 932                         break;
 933                 case BUF_CLAIMED:
 934                         break;
 935                 case BUF_RETURNED_DONE:
 936                         buf_brelse(bp);
 937                         lck_mtx_lock(buf_mtxp);
 938                         goto out;
 939                 case BUF_CLAIMED_DONE:
 940                         lck_mtx_lock(buf_mtxp);
 941                         goto out;
 942                 }
 943                 lck_mtx_lock(buf_mtxp);
 944         }
 945 out:
 946         buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
 947
 948         lck_mtx_unlock(buf_mtxp);
 949 }
 950
 951
 952 /*
 953  * Flush out and invalidate all buffers associated with a vnode.
 954  */
 955 int
 956 buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
 957 {
 958         buf_t   bp;
 959         int     error = 0;
 960         int     must_rescan = 1;
 961         struct  buflists local_iterblkhd;
 962
 963         lck_mtx_lock(buf_mtxp);
 964
 965         for (;;) {
 966                 if (must_rescan == 0)
 967                         /*
 968                          * the lists may not be empty, but all that's left at this
 969                          * point are metadata or B_LOCKED buffers which are being
 970                          * skipped... we know this because we made it through both
 971                          * the clean and dirty lists without dropping buf_mtxp...
 972                          * each time we drop buf_mtxp we bump "must_rescan"
 973                          */
 974                         break;
 975                 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
 976                         break;
 977                 must_rescan = 0;
 978                 /*
 979                  * iterate the clean list
 980                  */
 981                 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
 982                         goto try_dirty_list;
 983                 }
 984                 while (!LIST_EMPTY(&local_iterblkhd)) {
 985                         bp = LIST_FIRST(&local_iterblkhd);
 986
 987                         LIST_REMOVE(bp, b_vnbufs);
 988                         LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
 989
 990                         /*
 991                          * some filesystems distinguish meta data blocks with a negative logical block #
 992                          */
 993                         if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
 994                                 continue;
 995
 996                         if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
 997                                 if (error == EDEADLK)
 998                                         /*
 999                                          * this buffer was marked B_LOCKED...
1000                                          * we didn't drop buf_mtxp, so we
1001                                          * we don't need to rescan
1002                                          */
1003                                         continue;
1004                                 if (error == EAGAIN) {
1005                                         /*
1006                                          * found a busy buffer... we blocked and
1007                                          * dropped buf_mtxp, so we're going to
1008                                          * need to rescan after this pass is completed
1009                                          */
1010                                         must_rescan++;
1011                                         continue;
1012                                 }
1013                                 /*
1014                                  * got some kind of 'real' error out of the msleep
1015                                  * in buf_acquire_locked, terminate the scan and return the error
1016                                  */
1017                                 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1018
1019                                 lck_mtx_unlock(buf_mtxp);
1020                                 return (error);
1021                         }
1022                         lck_mtx_unlock(buf_mtxp);
1023
1024                         SET(bp->b_flags, B_INVAL);
1025                         buf_brelse(bp);
1026
1027                         lck_mtx_lock(buf_mtxp);
1028
1029                         /*
1030                          * by dropping buf_mtxp, we allow new
1031                          * buffers to be added to the vnode list(s)
1032                          * we'll have to rescan at least once more
1033                          * if the queues aren't empty
1034                          */
1035                         must_rescan++;
1036                 }
1037                 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1038
1039 try_dirty_list:
1040                 /*
1041                  * Now iterate on dirty blks
1042                  */
1043                 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1044                         continue;
1045                 }
1046                 while (!LIST_EMPTY(&local_iterblkhd)) {
1047                         bp = LIST_FIRST(&local_iterblkhd);
1048
1049                         LIST_REMOVE(bp, b_vnbufs);
1050                         LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1051
1052                         /*
1053                          * some filesystems distinguish meta data blocks with a negative logical block #
1054                          */
1055                         if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1056                                 continue;
1057
1058                         if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
1059                                 if (error == EDEADLK)
1060                                         /*
1061                                          * this buffer was marked B_LOCKED...
1062                                          * we didn't drop buf_mtxp, so we
1063                                          * we don't need to rescan
1064                                          */
1065                                         continue;
1066                                 if (error == EAGAIN) {
1067                                         /*
1068                                          * found a busy buffer... we blocked and
1069                                          * dropped buf_mtxp, so we're going to
1070                                          * need to rescan after this pass is completed
1071                                          */
1072                                         must_rescan++;
1073                                         continue;
1074                                 }
1075                                 /*
1076                                  * got some kind of 'real' error out of the msleep
1077                                  * in buf_acquire_locked, terminate the scan and return the error
1078                                  */
1079                                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1080
1081                                 lck_mtx_unlock(buf_mtxp);
1082                                 return (error);
1083                         }
1084                         lck_mtx_unlock(buf_mtxp);
1085
1086                         SET(bp->b_flags, B_INVAL);
1087
1088                         if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1089                                 (void) VNOP_BWRITE(bp);
1090                         else
1091                                 buf_brelse(bp);
1092
1093                         lck_mtx_lock(buf_mtxp);
1094                         /*
1095                          * by dropping buf_mtxp, we allow new
1096                          * buffers to be added to the vnode list(s)
1097                          * we'll have to rescan at least once more
1098                          * if the queues aren't empty
1099                          */
1100                         must_rescan++;
1101                 }
1102                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1103         }
1104         lck_mtx_unlock(buf_mtxp);
1105
1106         return (0);
1107 }
1108
1109 void
1110 buf_flushdirtyblks(vnode_t vp, int wait, int flags, char *msg) {
1111         buf_t   bp;
1112         int     writes_issued = 0;
1113         errno_t error;
1114         int     busy = 0;
1115         struct  buflists local_iterblkhd;
1116         int     lock_flags = BAC_NOWAIT | BAC_REMOVE;
1117
1118         if (flags & BUF_SKIP_LOCKED)
1119                 lock_flags |= BAC_SKIP_LOCKED;
1120         if (flags & BUF_SKIP_NONLOCKED)
1121                 lock_flags |= BAC_SKIP_NONLOCKED;
1122 loop:
1123         lck_mtx_lock(buf_mtxp);
1124
1125         if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0)  {
1126                 while (!LIST_EMPTY(&local_iterblkhd)) {
1127                         bp = LIST_FIRST(&local_iterblkhd);
1128                         LIST_REMOVE(bp, b_vnbufs);
1129                         LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1130
1131                         if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY)
1132                                 busy++;
1133                         if (error)
1134                                 continue;
1135                         lck_mtx_unlock(buf_mtxp);
1136
1137                         bp->b_flags &= ~B_LOCKED;
1138
1139                         /*
1140                          * Wait for I/O associated with indirect blocks to complete,
1141                          * since there is no way to quickly wait for them below.
1142                          */
1143                         if ((bp->b_vp == vp) || (wait == 0))
1144                                 (void) buf_bawrite(bp);
1145                         else
1146                                 (void) VNOP_BWRITE(bp);
1147                         writes_issued++;
1148
1149                         lck_mtx_lock(buf_mtxp);
1150                 }
1151                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1152         }
1153         lck_mtx_unlock(buf_mtxp);
1154
1155         if (wait) {
1156                 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1157
1158                 if (vp->v_dirtyblkhd.lh_first && busy) {
1159                         /*
1160                          * we had one or more BUSY buffers on
1161                          * the dirtyblock list... most likely
1162                          * these are due to delayed writes that
1163                          * were moved to the bclean queue but
1164                          * have not yet been 'written'.
1165                          * if we issued some writes on the
1166                          * previous pass, we try again immediately
1167                          * if we didn't, we'll sleep for some time
1168                          * to allow the state to change...
1169                          */
1170                         if (writes_issued == 0) {
1171                                 (void)tsleep((caddr_t)&vp->v_numoutput,
1172                                              PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1173                         }
1174                         writes_issued = 0;
1175                         busy = 0;
1176
1177                         goto loop;
1178                 }
1179         }
1180 }
1181
1182
1183 /*
1184  * called with buf_mtxp held...
1185  * this lock protects the queue manipulation
1186  */
1187 static int
1188 buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1189 {
1190         struct buflists * listheadp;
1191
1192         if (flags & VBI_DIRTY)
1193                 listheadp = &vp->v_dirtyblkhd;
1194         else
1195                 listheadp = &vp->v_cleanblkhd;
1196
1197         while (vp->v_iterblkflags & VBI_ITER)   {
1198                 vp->v_iterblkflags |= VBI_ITERWANT;
1199                 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", 0);
1200         }
1201         if (LIST_EMPTY(listheadp)) {
1202                 LIST_INIT(iterheadp);
1203                 return(EINVAL);
1204         }
1205         vp->v_iterblkflags |= VBI_ITER;
1206
1207         iterheadp->lh_first = listheadp->lh_first;
1208         listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1209         LIST_INIT(listheadp);
1210
1211         return(0);
1212 }
1213
1214 /*
1215  * called with buf_mtxp held...
1216  * this lock protects the queue manipulation
1217  */
1218 static void
1219 buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1220 {
1221         struct buflists * listheadp;
1222         buf_t bp;
1223
1224         if (flags & VBI_DIRTY)
1225                 listheadp = &vp->v_dirtyblkhd;
1226         else
1227                 listheadp = &vp->v_cleanblkhd;
1228
1229         while (!LIST_EMPTY(iterheadp)) {
1230                 bp = LIST_FIRST(iterheadp);
1231                 LIST_REMOVE(bp, b_vnbufs);
1232                 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1233         }
1234         vp->v_iterblkflags &= ~VBI_ITER;
1235
1236         if  (vp->v_iterblkflags & VBI_ITERWANT)         {
1237                 vp->v_iterblkflags &= ~VBI_ITERWANT;
1238                 wakeup(&vp->v_iterblkflags);
1239         }
1240 }
1241
1242
1243 static void
1244 bremfree_locked(buf_t bp)
1245 {
1246         struct bqueues *dp = NULL;
1247         int whichq = -1;
1248
1249         /*
1250          * We only calculate the head of the freelist when removing
1251          * the last element of the list as that is the only time that
1252          * it is needed (e.g. to reset the tail pointer).
1253          *
1254          * NB: This makes an assumption about how tailq's are implemented.
1255          */
1256         if (bp->b_freelist.tqe_next == NULL) {
1257                 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1258                         if (dp->tqh_last == &bp->b_freelist.tqe_next)
1259                                 break;
1260                 if (dp == &bufqueues[BQUEUES])
1261                         panic("bremfree: lost tail");
1262         }
1263         TAILQ_REMOVE(dp, bp, b_freelist);
1264         whichq = bp->b_whichq;
1265 #if BALANCE_QUEUES
1266         bufqdec(whichq);
1267 #endif
1268         bp->b_whichq = -1;
1269         bp->b_timestamp = 0;
1270 }
1271
1272 /*
1273  * Associate a buffer with a vnode.
1274  */
1275 static void
1276 bgetvp(vnode_t vp, buf_t bp)
1277 {
1278
1279         if (bp->b_vp != vp)
1280                 panic("bgetvp: not free");
1281
1282         if (vp->v_type == VBLK || vp->v_type == VCHR)
1283                 bp->b_dev = vp->v_rdev;
1284         else
1285                 bp->b_dev = NODEV;
1286         /*
1287          * Insert onto list for new vnode.
1288          */
1289         lck_mtx_lock(buf_mtxp);
1290         bufinsvn(bp, &vp->v_cleanblkhd);
1291         lck_mtx_unlock(buf_mtxp);
1292 }
1293
1294 /*
1295  * Disassociate a buffer from a vnode.
1296  */
1297 static void
1298 brelvp(buf_t bp)
1299 {
1300         vnode_t vp;
1301
1302         if ((vp = bp->b_vp) == (vnode_t)NULL)
1303                 panic("brelvp: NULL vp");
1304         /*
1305          * Delete from old vnode list, if on one.
1306          */
1307         lck_mtx_lock(buf_mtxp);
1308         if (bp->b_vnbufs.le_next != NOLIST)
1309                 bufremvn(bp);
1310         lck_mtx_unlock(buf_mtxp);
1311
1312         bp->b_vp = (vnode_t)NULL;
1313 }
1314
1315 /*
1316  * Reassign a buffer from one vnode to another.
1317  * Used to assign file specific control information
1318  * (indirect blocks) to the vnode to which they belong.
1319  */
1320 static void
1321 buf_reassign(buf_t bp, vnode_t newvp)
1322 {
1323         register struct buflists *listheadp;
1324
1325         if (newvp == NULL) {
1326                 printf("buf_reassign: NULL");
1327                 return;
1328         }
1329         lck_mtx_lock(buf_mtxp);
1330
1331         /*
1332          * Delete from old vnode list, if on one.
1333          */
1334         if (bp->b_vnbufs.le_next != NOLIST)
1335                 bufremvn(bp);
1336         /*
1337          * If dirty, put on list of dirty buffers;
1338          * otherwise insert onto list of clean buffers.
1339          */
1340         if (ISSET(bp->b_flags, B_DELWRI))
1341                 listheadp = &newvp->v_dirtyblkhd;
1342         else
1343                 listheadp = &newvp->v_cleanblkhd;
1344         bufinsvn(bp, listheadp);
1345
1346         lck_mtx_unlock(buf_mtxp);
1347 }
1348
1349 static __inline__ void
1350 bufhdrinit(buf_t bp)
1351 {
1352         bzero((char *)bp, sizeof *bp);
1353         bp->b_dev = NODEV;
1354         bp->b_rcred = NOCRED;
1355         bp->b_wcred = NOCRED;
1356         bp->b_vnbufs.le_next = NOLIST;
1357         bp->b_flags = B_INVAL;
1358
1359         return;
1360 }
1361
1362 /*
1363  * Initialize buffers and hash links for buffers.
1364  */
1365 __private_extern__ void
1366 bufinit()
1367 {
1368         buf_t   bp;
1369         struct bqueues *dp;
1370         int     i;
1371         int     metabuf;
1372         long    whichq;
1373
1374         /* Initialize the buffer queues ('freelists') and the hash table */
1375         for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1376                 TAILQ_INIT(dp);
1377         bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
1378
1379         metabuf = nbuf/8; /* reserved for meta buf */
1380
1381         /* Initialize the buffer headers */
1382         for (i = 0; i < nbuf; i++) {
1383                 bp = &buf[i];
1384                 bufhdrinit(bp);
1385
1386                 /*
1387                  * metabuf buffer headers on the meta-data list and
1388                  * rest of the buffer headers on the empty list
1389                  */
1390                 if (--metabuf)
1391                         whichq = BQ_META;
1392                 else
1393                         whichq = BQ_EMPTY;
1394
1395                 BLISTNONE(bp);
1396                 dp = &bufqueues[whichq];
1397                 binsheadfree(bp, dp, whichq);
1398                 binshash(bp, &invalhash);
1399         }
1400
1401         for (; i < nbuf + niobuf; i++) {
1402                 bp = &buf[i];
1403                 bufhdrinit(bp);
1404                 binsheadfree(bp, &iobufqueue, -1);
1405         }
1406
1407         /*
1408          * allocate lock group attribute and group
1409          */
1410         buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1411         //lck_grp_attr_setstat(buf_mtx_grp_attr);
1412         buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1413
1414         /*
1415          * allocate the lock attribute
1416          */
1417         buf_mtx_attr = lck_attr_alloc_init();
1418         //lck_attr_setdebug(buf_mtx_attr);
1419
1420         /*
1421          * allocate and initialize mutex's for the buffer and iobuffer pools
1422          */
1423         buf_mtxp        = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1424         iobuffer_mtxp   = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1425
1426         if (iobuffer_mtxp == NULL)
1427                 panic("couldn't create iobuffer mutex");
1428
1429         if (buf_mtxp == NULL)
1430                 panic("couldn't create buf mutex");
1431
1432         /*
1433          * allocate and initialize cluster specific global locks...
1434          */
1435         cluster_init();
1436
1437         printf("using %d buffer headers and %d cluster IO buffer headers\n",
1438                 nbuf, niobuf);
1439
1440         /* Set up zones used by the buffer cache */
1441         bufzoneinit();
1442
1443         /* start the bcleanbuf() thread */
1444         bcleanbuf_thread_init();
1445
1446 #if BALANCE_QUEUES
1447         {
1448         static void bufq_balance_thread_init();
1449         /* create a thread to do dynamic buffer queue balancing */
1450         bufq_balance_thread_init();
1451         }
1452 #endif /* notyet */
1453 }
1454
1455 static struct buf *
1456 bio_doread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, int async, int queuetype)
1457 {
1458         buf_t   bp;
1459
1460         bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
1461
1462         /*
1463          * If buffer does not have data valid, start a read.
1464          * Note that if buffer is B_INVAL, buf_getblk() won't return it.
1465          * Therefore, it's valid if it's I/O has completed or been delayed.
1466          */
1467         if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
1468                 struct proc *p;
1469
1470                 p = current_proc();
1471
1472                 /* Start I/O for the buffer (keeping credentials). */
1473                 SET(bp->b_flags, B_READ | async);
1474                 if (cred != NOCRED && bp->b_rcred == NOCRED) {
1475                         kauth_cred_ref(cred);
1476                         bp->b_rcred = cred;
1477                 }
1478
1479                 VNOP_STRATEGY(bp);
1480
1481                 trace(TR_BREADMISS, pack(vp, size), blkno);
1482
1483                 /* Pay for the read. */
1484                 if (p && p->p_stats)
1485                         p->p_stats->p_ru.ru_inblock++;          /* XXX */
1486
1487                 if (async) {
1488                         /*
1489                          * since we asked for an ASYNC I/O
1490                          * the biodone will do the brelse
1491                          * we don't want to pass back a bp
1492                          * that we don't 'own'
1493                          */
1494                         bp = NULL;
1495                 }
1496         } else if (async) {
1497                 buf_brelse(bp);
1498                 bp = NULL;
1499         }
1500
1501         trace(TR_BREADHIT, pack(vp, size), blkno);
1502
1503         return (bp);
1504 }
1505
1506 /*
1507  * Perform the reads for buf_breadn() and buf_meta_breadn().
1508  * Trivial modification to the breada algorithm presented in Bach (p.55).
1509  */
1510 static errno_t
1511 do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
1512                    int nrablks, ucred_t cred, buf_t *bpp, int queuetype)
1513 {
1514         buf_t   bp;
1515         int     i;
1516
1517         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
1518
1519         /*
1520          * For each of the read-ahead blocks, start a read, if necessary.
1521          */
1522         for (i = 0; i < nrablks; i++) {
1523                 /* If it's in the cache, just go on to next one. */
1524                 if (incore(vp, rablks[i]))
1525                         continue;
1526
1527                 /* Get a buffer for the read-ahead block */
1528                 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
1529         }
1530
1531         /* Otherwise, we had to start a read for it; wait until it's valid. */
1532         return (buf_biowait(bp));
1533 }
1534
1535
1536 /*
1537  * Read a disk block.
1538  * This algorithm described in Bach (p.54).
1539  */
1540 errno_t
1541 buf_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1542 {
1543         buf_t   bp;
1544
1545         /* Get buffer for block. */
1546         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
1547
1548         /* Wait for the read to complete, and return result. */
1549         return (buf_biowait(bp));
1550 }
1551
1552 /*
1553  * Read a disk block. [bread() for meta-data]
1554  * This algorithm described in Bach (p.54).
1555  */
1556 errno_t
1557 buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1558 {
1559         buf_t   bp;
1560
1561         /* Get buffer for block. */
1562         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
1563
1564         /* Wait for the read to complete, and return result. */
1565         return (buf_biowait(bp));
1566 }
1567
1568 /*
1569  * Read-ahead multiple disk blocks. The first is sync, the rest async.
1570  */
1571 errno_t
1572 buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1573 {
1574         return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
1575 }
1576
1577 /*
1578  * Read-ahead multiple disk blocks. The first is sync, the rest async.
1579  * [buf_breadn() for meta-data]
1580  */
1581 errno_t
1582 buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1583 {
1584         return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
1585 }
1586
1587 /*
1588  * Block write.  Described in Bach (p.56)
1589  */
1590 errno_t
1591 buf_bwrite(buf_t bp)
1592 {
1593         int     sync, wasdelayed;
1594         errno_t rv;
1595         proc_t  p = current_proc();
1596         vnode_t vp = bp->b_vp;
1597
1598         if (bp->b_datap == 0) {
1599                 if (brecover_data(bp) == 0)
1600                         return (0);
1601         }
1602         /* Remember buffer type, to switch on it later. */
1603         sync = !ISSET(bp->b_flags, B_ASYNC);
1604         wasdelayed = ISSET(bp->b_flags, B_DELWRI);
1605         CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
1606
1607         if (wasdelayed)
1608                 OSAddAtomic(-1, &nbdwrite);
1609
1610         if (!sync) {
1611                 /*
1612                  * If not synchronous, pay for the I/O operation and make
1613                  * sure the buf is on the correct vnode queue.  We have
1614                  * to do this now, because if we don't, the vnode may not
1615                  * be properly notified that its I/O has completed.
1616                  */
1617                 if (wasdelayed)
1618                         buf_reassign(bp, vp);
1619                 else
1620                 if (p && p->p_stats)
1621                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
1622         }
1623         trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
1624
1625         /* Initiate disk write.  Make sure the appropriate party is charged. */
1626
1627         OSAddAtomic(1, &vp->v_numoutput);
1628
1629         VNOP_STRATEGY(bp);
1630
1631         if (sync) {
1632                 /*
1633                  * If I/O was synchronous, wait for it to complete.
1634                  */
1635                 rv = buf_biowait(bp);
1636
1637                 /*
1638                  * Pay for the I/O operation, if it's not been paid for, and
1639                  * make sure it's on the correct vnode queue. (async operatings
1640                  * were payed for above.)
1641                  */
1642                 if (wasdelayed)
1643                         buf_reassign(bp, vp);
1644                 else
1645                 if (p && p->p_stats)
1646                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
1647
1648                 /* Release the buffer. */
1649                 // XXXdbg - only if the unused bit is set
1650                 if (!ISSET(bp->b_flags, B_NORELSE)) {
1651                     buf_brelse(bp);
1652                 } else {
1653                     CLR(bp->b_flags, B_NORELSE);
1654                 }
1655
1656                 return (rv);
1657         } else {
1658                 return (0);
1659         }
1660 }
1661
1662 int
1663 vn_bwrite(ap)
1664         struct vnop_bwrite_args *ap;
1665 {
1666         return (buf_bwrite(ap->a_bp));
1667 }
1668
1669 /*
1670  * Delayed write.
1671  *
1672  * The buffer is marked dirty, but is not queued for I/O.
1673  * This routine should be used when the buffer is expected
1674  * to be modified again soon, typically a small write that
1675  * partially fills a buffer.
1676  *
1677  * NB: magnetic tapes cannot be delayed; they must be
1678  * written in the order that the writes are requested.
1679  *
1680  * Described in Leffler, et al. (pp. 208-213).
1681  *
1682  * Note: With the abilitty to allocate additional buffer
1683  * headers, we can get in to the situation where "too" many
1684  * buf_bdwrite()s can create situation where the kernel can create
1685  * buffers faster than the disks can service. Doing a buf_bawrite() in
1686  * cases were we have "too many" outstanding buf_bdwrite()s avoids that.
1687  */
1688 __private_extern__ int
1689 bdwrite_internal(buf_t bp, int return_error)
1690 {
1691         proc_t  p  = current_proc();
1692         vnode_t vp = bp->b_vp;
1693
1694         /*
1695          * If the block hasn't been seen before:
1696          *      (1) Mark it as having been seen,
1697          *      (2) Charge for the write.
1698          *      (3) Make sure it's on its vnode's correct block list,
1699          */
1700         if (!ISSET(bp->b_flags, B_DELWRI)) {
1701                 SET(bp->b_flags, B_DELWRI);
1702                 if (p && p->p_stats)
1703                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
1704                 OSAddAtomic(1, &nbdwrite);
1705                 buf_reassign(bp, vp);
1706         }
1707
1708         /* If this is a tape block, write it the block now. */
1709         if (ISSET(bp->b_flags, B_TAPE)) {
1710                 VNOP_BWRITE(bp);
1711                 return (0);
1712         }
1713
1714         /*
1715          * if we're not LOCKED, but the total number of delayed writes
1716          * has climbed above 75% of the total buffers in the system
1717          * return an error if the caller has indicated that it can
1718          * handle one in this case, otherwise schedule the I/O now
1719          * this is done to prevent us from allocating tons of extra
1720          * buffers when dealing with virtual disks (i.e. DiskImages),
1721          * because additional buffers are dynamically allocated to prevent
1722          * deadlocks from occurring
1723          *
1724          * however, can't do a buf_bawrite() if the LOCKED bit is set because the
1725          * buffer is part of a transaction and can't go to disk until
1726          * the LOCKED bit is cleared.
1727          */
1728         if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
1729                 if (return_error)
1730                         return (EAGAIN);
1731                 /*
1732                  * If the vnode has "too many" write operations in progress
1733                  * wait for them to finish the IO
1734                  */
1735                 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (char *)"buf_bdwrite");
1736
1737                 return (buf_bawrite(bp));
1738         }
1739
1740         /* Otherwise, the "write" is done, so mark and release the buffer. */
1741         SET(bp->b_flags, B_DONE);
1742         buf_brelse(bp);
1743         return (0);
1744 }
1745
1746 errno_t
1747 buf_bdwrite(buf_t bp)
1748 {
1749         return (bdwrite_internal(bp, 0));
1750 }
1751
1752
1753 /*
1754  * Asynchronous block write; just an asynchronous buf_bwrite().
1755  *
1756  * Note: With the abilitty to allocate additional buffer
1757  * headers, we can get in to the situation where "too" many
1758  * buf_bawrite()s can create situation where the kernel can create
1759  * buffers faster than the disks can service.
1760  * We limit the number of "in flight" writes a vnode can have to
1761  * avoid this.
1762  */
1763 static int
1764 bawrite_internal(buf_t bp, int throttle)
1765 {
1766         vnode_t vp = bp->b_vp;
1767
1768         if (vp) {
1769                 if (throttle)
1770                         /*
1771                          * If the vnode has "too many" write operations in progress
1772                          * wait for them to finish the IO
1773                          */
1774                         (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
1775                 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
1776                         /*
1777                          * return to the caller and
1778                          * let him decide what to do
1779                          */
1780                         return (EWOULDBLOCK);
1781         }
1782         SET(bp->b_flags, B_ASYNC);
1783
1784         return (VNOP_BWRITE(bp));
1785 }
1786
1787 errno_t
1788 buf_bawrite(buf_t bp)
1789 {
1790         return (bawrite_internal(bp, 1));
1791 }
1792
1793
1794 /*
1795  * Release a buffer on to the free lists.
1796  * Described in Bach (p. 46).
1797  */
1798 void
1799 buf_brelse(buf_t bp)
1800 {
1801         struct bqueues *bufq;
1802         long    whichq;
1803         upl_t   upl;
1804         int need_wakeup = 0;
1805         int need_bp_wakeup = 0;
1806
1807
1808         if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
1809                 panic("buf_brelse: bad buffer = %x\n", bp);
1810
1811 #ifdef JOE_DEBUG
1812         bp->b_stackbrelse[0] = __builtin_return_address(0);
1813         bp->b_stackbrelse[1] = __builtin_return_address(1);
1814         bp->b_stackbrelse[2] = __builtin_return_address(2);
1815         bp->b_stackbrelse[3] = __builtin_return_address(3);
1816         bp->b_stackbrelse[4] = __builtin_return_address(4);
1817         bp->b_stackbrelse[5] = __builtin_return_address(5);
1818
1819         bp->b_lastbrelse = current_thread();
1820         bp->b_tag = 0;
1821 #endif
1822         if (bp->b_lflags & BL_IOBUF) {
1823                 free_io_buf(bp);
1824                 return;
1825         }
1826
1827         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
1828                      bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_datap,
1829                      bp->b_flags, 0);
1830
1831         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1832
1833         /*
1834          * if we're invalidating a buffer that has the B_FILTER bit
1835          * set then call the b_iodone function so it gets cleaned
1836          * up properly.
1837          *
1838          * the HFS journal code depends on this
1839          */
1840         if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
1841                 if (ISSET(bp->b_flags, B_FILTER)) {     /* if necessary, call out */
1842                         void    (*iodone_func)(struct buf *, void *) = bp->b_iodone;
1843                         void    *arg = (void *)bp->b_transaction;
1844
1845                         CLR(bp->b_flags, B_FILTER);     /* but note callout done */
1846                         bp->b_iodone = NULL;
1847                         bp->b_transaction = NULL;
1848
1849                         if (iodone_func == NULL) {
1850                                 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
1851                         }
1852                         (*iodone_func)(bp, arg);
1853                 }
1854         }
1855         /*
1856          * I/O is done. Cleanup the UPL state
1857          */
1858         upl = bp->b_upl;
1859
1860         if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
1861                 kern_return_t kret;
1862                 int           upl_flags;
1863
1864                 if ( (upl == NULL) ) {
1865                         if ( !ISSET(bp->b_flags, B_INVAL)) {
1866                                 kret = ubc_create_upl(bp->b_vp,
1867                                                       ubc_blktooff(bp->b_vp, bp->b_lblkno),
1868                                                       bp->b_bufsize,
1869                                                       &upl,
1870                                                       NULL,
1871                                                       UPL_PRECIOUS);
1872
1873                                 if (kret != KERN_SUCCESS)
1874                                         panic("brelse: Failed to create UPL");
1875 #ifdef  UPL_DEBUG
1876                                 upl_ubc_alias_set(upl, bp, 5);
1877 #endif /* UPL_DEBUG */
1878                         }
1879                 } else {
1880                         if (bp->b_datap) {
1881                                 kret = ubc_upl_unmap(upl);
1882
1883                                 if (kret != KERN_SUCCESS)
1884                                         panic("ubc_upl_unmap failed");
1885                                 bp->b_datap = (uintptr_t)NULL;
1886                         }
1887                 }
1888                 if (upl) {
1889                         if (bp->b_flags & (B_ERROR | B_INVAL)) {
1890                                 if (bp->b_flags & (B_READ | B_INVAL))
1891                                         upl_flags = UPL_ABORT_DUMP_PAGES;
1892                                 else
1893                                         upl_flags = 0;
1894
1895                                 ubc_upl_abort(upl, upl_flags);
1896                         } else {
1897                                 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
1898                                         upl_flags = UPL_COMMIT_SET_DIRTY ;
1899                                 else
1900                                         upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
1901
1902                                 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
1903                                                      UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1904                         }
1905                         bp->b_upl = NULL;
1906                 }
1907         } else {
1908                 if ( (upl) )
1909                         panic("brelse: UPL set for non VREG; vp=%x", bp->b_vp);
1910         }
1911
1912         /*
1913          * If it's locked, don't report an error; try again later.
1914          */
1915         if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
1916                 CLR(bp->b_flags, B_ERROR);
1917         /*
1918          * If it's not cacheable, or an error, mark it invalid.
1919          */
1920         if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
1921                 SET(bp->b_flags, B_INVAL);
1922
1923         if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
1924                 /*
1925                  * If it's invalid or empty, dissociate it from its vnode
1926                  * and put on the head of the appropriate queue.
1927                  */
1928                 if (bp->b_vp)
1929                         brelvp(bp);
1930
1931                 if (ISSET(bp->b_flags, B_DELWRI))
1932                         OSAddAtomic(-1, &nbdwrite);
1933
1934                 CLR(bp->b_flags, (B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE));
1935                 /*
1936                  * Determine which queue the buffer should be on, then put it there.
1937                  */
1938                 if (bp->b_bufsize <= 0)
1939                         whichq = BQ_EMPTY;      /* no data */
1940                 else if (ISSET(bp->b_flags, B_META))
1941                         whichq = BQ_META;               /* meta-data */
1942                 else
1943                         whichq = BQ_AGE;        /* invalid data */
1944                 bufq = &bufqueues[whichq];
1945
1946                 lck_mtx_lock(buf_mtxp);
1947
1948                 binsheadfree(bp, bufq, whichq);
1949         } else {
1950                 /*
1951                  * It has valid data.  Put it on the end of the appropriate
1952                  * queue, so that it'll stick around for as long as possible.
1953                  */
1954                 if (ISSET(bp->b_flags, B_LOCKED))
1955                         whichq = BQ_LOCKED;             /* locked in core */
1956                 else if (ISSET(bp->b_flags, B_META))
1957                         whichq = BQ_META;               /* meta-data */
1958                 else if (ISSET(bp->b_flags, B_AGE))
1959                         whichq = BQ_AGE;                /* stale but valid data */
1960                 else
1961                         whichq = BQ_LRU;                /* valid data */
1962                 bufq = &bufqueues[whichq];
1963
1964                 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
1965
1966                 lck_mtx_lock(buf_mtxp);
1967
1968                 binstailfree(bp, bufq, whichq);
1969         }
1970         if (needbuffer) {
1971                 /*
1972                  * needbuffer is a global
1973                  * we're currently using buf_mtxp to protect it
1974                  * delay doing the actual wakeup until after
1975                  * we drop buf_mtxp
1976                  */
1977                 needbuffer = 0;
1978                 need_wakeup = 1;
1979         }
1980         if (ISSET(bp->b_lflags, BL_WANTED)) {
1981                 /*
1982                  * delay the actual wakeup until after we
1983                  * clear BL_BUSY and we've dropped buf_mtxp
1984                  */
1985                 need_bp_wakeup = 1;
1986         }
1987         /*
1988          * Unlock the buffer.
1989          */
1990         CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
1991
1992         lck_mtx_unlock(buf_mtxp);
1993
1994         if (need_wakeup) {
1995                 /*
1996                  * Wake up any processes waiting for any buffer to become free.
1997                  */
1998                 wakeup(&needbuffer);
1999         }
2000         if (need_bp_wakeup) {
2001                 /*
2002                  * Wake up any proceeses waiting for _this_ buffer to become free.
2003                  */
2004                 wakeup(bp);
2005         }
2006         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2007                      (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
2008 }
2009
2010 /*
2011  * Determine if a block is in the cache.
2012  * Just look on what would be its hash chain.  If it's there, return
2013  * a pointer to it, unless it's marked invalid.  If it's marked invalid,
2014  * we normally don't return the buffer, unless the caller explicitly
2015  * wants us to.
2016  */
2017 static boolean_t
2018 incore(vnode_t vp, daddr64_t blkno)
2019 {
2020         boolean_t retval;
2021
2022         lck_mtx_lock(buf_mtxp);
2023
2024         if (incore_locked(vp, blkno))
2025                 retval = TRUE;
2026         else
2027                 retval = FALSE;
2028         lck_mtx_unlock(buf_mtxp);
2029
2030         return (retval);
2031 }
2032
2033
2034 static buf_t
2035 incore_locked(vnode_t vp, daddr64_t blkno)
2036 {
2037         struct buf *bp;
2038
2039         bp = BUFHASH(vp, blkno)->lh_first;
2040
2041         /* Search hash chain */
2042         for (; bp != NULL; bp = bp->b_hash.le_next) {
2043                 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2044                     !ISSET(bp->b_flags, B_INVAL)) {
2045                         return (bp);
2046                 }
2047         }
2048         return (0);
2049 }
2050
2051
2052 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2053 /*
2054  * Get a block of requested size that is associated with
2055  * a given vnode and block offset. If it is found in the
2056  * block cache, mark it as having been found, make it busy
2057  * and return it. Otherwise, return an empty block of the
2058  * correct size. It is up to the caller to insure that the
2059  * cached blocks be of the correct size.
2060  */
2061 buf_t
2062 buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2063 {
2064         buf_t bp;
2065         int   err;
2066         upl_t upl;
2067         upl_page_info_t *pl;
2068         kern_return_t kret;
2069         int ret_only_valid;
2070         struct timespec ts;
2071         int upl_flags;
2072
2073         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2074                      (int)(blkno * PAGE_SIZE), size, operation, 0, 0);
2075
2076         ret_only_valid = operation & BLK_ONLYVALID;
2077         operation &= ~BLK_ONLYVALID;
2078 start:
2079         lck_mtx_lock(buf_mtxp);
2080 start_locked:
2081         if ((bp = incore_locked(vp, blkno))) {
2082                 /*
2083                  * Found in the Buffer Cache
2084                  */
2085                 if (ISSET(bp->b_lflags, BL_BUSY)) {
2086                         /*
2087                          * but is busy
2088                          */
2089                         switch (operation) {
2090                         case BLK_READ:
2091                         case BLK_WRITE:
2092                         case BLK_META:
2093                                 SET(bp->b_lflags, BL_WANTED);
2094                                 bufstats.bufs_busyincore++;
2095
2096                                 /*
2097                                  * don't retake the mutex after being awakened...
2098                                  * the time out is in msecs
2099                                  */
2100                                 ts.tv_sec = (slptimeo/1000);
2101                                 ts.tv_nsec = (slptimeo % 1000) * 10  * NSEC_PER_USEC * 1000;
2102
2103                                 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2104
2105                                 /*
2106                                  * Callers who call with PCATCH or timeout are
2107                                  * willing to deal with the NULL pointer
2108                                  */
2109                                 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2110                                         return (NULL);
2111                                 goto start;
2112                                 /*NOTREACHED*/
2113                                 break;
2114
2115                         default:
2116                                 /*
2117                                  * unknown operation requested
2118                                  */
2119                                 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2120                                 /*NOTREACHED*/
2121                                 break;
2122                         }
2123                 } else {
2124                         /*
2125                          * buffer in core and not busy
2126                          */
2127                         if ( (bp->b_upl) )
2128                                 panic("buffer has UPL, but not marked BUSY: %x", bp);
2129                         SET(bp->b_lflags, BL_BUSY);
2130                         SET(bp->b_flags, B_CACHE);
2131 #ifdef JOE_DEBUG
2132                         bp->b_owner = current_thread();
2133                         bp->b_tag   = 1;
2134 #endif
2135                         bremfree_locked(bp);
2136                         bufstats.bufs_incore++;
2137
2138                         lck_mtx_unlock(buf_mtxp);
2139
2140                         if ( !ret_only_valid)
2141                                 allocbuf(bp, size);
2142
2143                         upl_flags = 0;
2144                         switch (operation) {
2145                         case BLK_WRITE:
2146                                 /*
2147                                  * "write" operation:  let the UPL subsystem
2148                                  * know that we intend to modify the buffer
2149                                  * cache pages we're gathering.
2150                                  */
2151                                 upl_flags |= UPL_WILL_MODIFY;
2152                         case BLK_READ:
2153                                 upl_flags |= UPL_PRECIOUS;
2154                                 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2155                                         kret = ubc_create_upl(vp,
2156                                                               ubc_blktooff(vp, bp->b_lblkno),
2157                                                               bp->b_bufsize,
2158                                                               &upl,
2159                                                               &pl,
2160                                                               upl_flags);
2161                                         if (kret != KERN_SUCCESS)
2162                                                 panic("Failed to create UPL");
2163
2164                                         bp->b_upl = upl;
2165
2166                                         if (upl_valid_page(pl, 0)) {
2167                                                 if (upl_dirty_page(pl, 0))
2168                                                         SET(bp->b_flags, B_WASDIRTY);
2169                                                 else
2170                                                         CLR(bp->b_flags, B_WASDIRTY);
2171                                         } else
2172                                                 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
2173
2174                                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2175
2176                                         if (kret != KERN_SUCCESS)
2177                                                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2178                                 }
2179                                 break;
2180
2181                         case BLK_META:
2182                                 /*
2183                                  * VM is not involved in IO for the meta data
2184                                  * buffer already has valid data
2185                                  */
2186                                 break;
2187
2188                         default:
2189                                 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
2190                                 /*NOTREACHED*/
2191                                 break;
2192                         }
2193                 }
2194         } else { /* not incore() */
2195                 int queue = BQ_EMPTY; /* Start with no preference */
2196
2197                 if (ret_only_valid) {
2198                         lck_mtx_unlock(buf_mtxp);
2199                         return (NULL);
2200                 }
2201
2202                 if ((UBCINVALID(vp)) || !(UBCINFOEXISTS(vp)))
2203                         operation = BLK_META;
2204
2205                 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
2206                         goto start_locked;
2207
2208                 /*
2209                  * getnewbuf may block for a number of different reasons...
2210                  * if it does, it's then possible for someone else to
2211                  * create a buffer for the same block and insert it into
2212                  * the hash... if we see it incore at this point we dump
2213                  * the buffer we were working on and start over
2214                  */
2215                 if (incore_locked(vp, blkno)) {
2216                         SET(bp->b_flags, B_INVAL);
2217                         binshash(bp, &invalhash);
2218
2219                         lck_mtx_unlock(buf_mtxp);
2220
2221                         buf_brelse(bp);
2222                         goto start;
2223                 }
2224                 /*
2225                  * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
2226                  *       CALLED!  BE CAREFUL.
2227                  */
2228
2229                 /*
2230                  * mark the buffer as B_META if indicated
2231                  * so that when buffer is released it will goto META queue
2232                  */
2233                 if (operation == BLK_META)
2234                         SET(bp->b_flags, B_META);
2235
2236                 bp->b_blkno = bp->b_lblkno = blkno;
2237                 bp->b_vp = vp;
2238
2239                 /*
2240                  * Insert in the hash so that incore() can find it
2241                  */
2242                 binshash(bp, BUFHASH(vp, blkno));
2243
2244                 lck_mtx_unlock(buf_mtxp);
2245
2246                 bgetvp(vp, bp);
2247
2248                 allocbuf(bp, size);
2249
2250                 upl_flags = 0;
2251                 switch (operation) {
2252                 case BLK_META:
2253                         /*
2254                          * buffer data is invalid...
2255                          *
2256                          * I don't want to have to retake buf_mtxp,
2257                          * so the miss and vmhits counters are done
2258                          * with Atomic updates... all other counters
2259                          * in bufstats are protected with either
2260                          * buf_mtxp or iobuffer_mtxp
2261                          */
2262                         OSAddAtomic(1, &bufstats.bufs_miss);
2263                         break;
2264
2265                 case BLK_WRITE:
2266                         /*
2267                          * "write" operation:  let the UPL subsystem know
2268                          * that we intend to modify the buffer cache pages
2269                          * we're gathering.
2270                          */
2271                         upl_flags |= UPL_WILL_MODIFY;
2272                 case BLK_READ:
2273                   {     off_t   f_offset;
2274                         size_t  contig_bytes;
2275                         int     bmap_flags;
2276
2277                         if ( (bp->b_upl) )
2278                                 panic("bp already has UPL: %x",bp);
2279
2280                         f_offset = ubc_blktooff(vp, blkno);
2281
2282                         upl_flags |= UPL_PRECIOUS;
2283                         kret = ubc_create_upl(vp,
2284                                               f_offset,
2285                                               bp->b_bufsize,
2286                                               &upl,
2287                                               &pl,
2288                                               upl_flags);
2289
2290                         if (kret != KERN_SUCCESS)
2291                                 panic("Failed to create UPL");
2292 #ifdef  UPL_DEBUG
2293                         upl_ubc_alias_set(upl, bp, 4);
2294 #endif /* UPL_DEBUG */
2295                         bp->b_upl = upl;
2296
2297                         if (upl_valid_page(pl, 0)) {
2298
2299                                 if (operation == BLK_READ)
2300                                         bmap_flags = VNODE_READ;
2301                                 else
2302                                         bmap_flags = VNODE_WRITE;
2303
2304                                 SET(bp->b_flags, B_CACHE | B_DONE);
2305
2306                                 OSAddAtomic(1, &bufstats.bufs_vmhits);
2307
2308                                 bp->b_validoff = 0;
2309                                 bp->b_dirtyoff = 0;
2310
2311                                 if (upl_dirty_page(pl, 0)) {
2312                                         /* page is dirty */
2313                                         SET(bp->b_flags, B_WASDIRTY);
2314
2315                                         bp->b_validend = bp->b_bcount;
2316                                         bp->b_dirtyend = bp->b_bcount;
2317                                 } else {
2318                                         /* page is clean */
2319                                         bp->b_validend = bp->b_bcount;
2320                                         bp->b_dirtyend = 0;
2321                                 }
2322                                 /*
2323                                  * try to recreate the physical block number associated with
2324                                  * this buffer...
2325                                  */
2326                                 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
2327                                         panic("getblk: VNOP_BLOCKMAP failed");
2328                                 /*
2329                                  * if the extent represented by this buffer
2330                                  * is not completely physically contiguous on
2331                                  * disk, than we can't cache the physical mapping
2332                                  * in the buffer header
2333                                  */
2334                                 if ((long)contig_bytes < bp->b_bcount)
2335                                         bp->b_blkno = bp->b_lblkno;
2336                         } else {
2337                                 OSAddAtomic(1, &bufstats.bufs_miss);
2338                         }
2339                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2340
2341                         if (kret != KERN_SUCCESS)
2342                                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2343                         break;
2344                   }
2345                 default:
2346                         panic("getblk: paging or unknown operation - %x", operation);
2347                         /*NOTREACHED*/
2348                         break;
2349                 }
2350         }
2351         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
2352                      (int)bp, (int)bp->b_datap, bp->b_flags, 3, 0);
2353
2354 #ifdef JOE_DEBUG
2355         bp->b_stackgetblk[0] = __builtin_return_address(0);
2356         bp->b_stackgetblk[1] = __builtin_return_address(1);
2357         bp->b_stackgetblk[2] = __builtin_return_address(2);
2358         bp->b_stackgetblk[3] = __builtin_return_address(3);
2359         bp->b_stackgetblk[4] = __builtin_return_address(4);
2360         bp->b_stackgetblk[5] = __builtin_return_address(5);
2361 #endif
2362         return (bp);
2363 }
2364
2365 /*
2366  * Get an empty, disassociated buffer of given size.
2367  */
2368 buf_t
2369 buf_geteblk(size)
2370         int size;
2371 {
2372         buf_t   bp;
2373         int queue = BQ_EMPTY;
2374
2375         lck_mtx_lock(buf_mtxp);
2376
2377         while ((bp = getnewbuf(0, 0, &queue)) == 0)
2378                 ;
2379         SET(bp->b_flags, (B_META|B_INVAL));
2380
2381 #if DIAGNOSTIC
2382         assert(queue == BQ_EMPTY);
2383 #endif /* DIAGNOSTIC */
2384         /* XXX need to implement logic to deal with other queues */
2385
2386         binshash(bp, &invalhash);
2387         bufstats.bufs_eblk++;
2388
2389         lck_mtx_unlock(buf_mtxp);
2390
2391         allocbuf(bp, size);
2392
2393         return (bp);
2394 }
2395
2396 /*
2397  * Zones for the meta data buffers
2398  */
2399
2400 #define MINMETA 512
2401 #define MAXMETA 4096
2402
2403 struct meta_zone_entry {
2404         zone_t mz_zone;
2405         vm_size_t mz_size;
2406         vm_size_t mz_max;
2407         char *mz_name;
2408 };
2409
2410 struct meta_zone_entry meta_zones[] = {
2411         {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2412         {NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
2413         {NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
2414         {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2415         {NULL, 0, 0, "" } /* End */
2416 };
2417
2418 /*
2419  * Initialize the meta data zones
2420  */
2421 static void
2422 bufzoneinit(void)
2423 {
2424         int i;
2425
2426         for (i = 0; meta_zones[i].mz_size != 0; i++) {
2427                 meta_zones[i].mz_zone =
2428                                 zinit(meta_zones[i].mz_size,
2429                                         meta_zones[i].mz_max,
2430                                         PAGE_SIZE,
2431                                         meta_zones[i].mz_name);
2432         }
2433         buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2434 }
2435
2436 static __inline__ zone_t
2437 getbufzone(size_t size)
2438 {
2439         int i;
2440
2441         if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2442                 panic("getbufzone: incorect size = %d", size);
2443
2444         for (i = 0; meta_zones[i].mz_size != 0; i++) {
2445                 if (meta_zones[i].mz_size >= size)
2446                         break;
2447         }
2448
2449         return (meta_zones[i].mz_zone);
2450 }
2451
2452 /*
2453  * With UBC, there is no need to expand / shrink the file data
2454  * buffer. The VM uses the same pages, hence no waste.
2455  * All the file data buffers can have one size.
2456  * In fact expand / shrink would be an expensive operation.
2457  *
2458  * Only exception to this is meta-data buffers. Most of the
2459  * meta data operations are smaller than PAGE_SIZE. Having the
2460  * meta-data buffers grow and shrink as needed, optimizes use
2461  * of the kernel wired memory.
2462  */
2463
2464 int
2465 allocbuf(buf_t bp, int size)
2466 {
2467         vm_size_t desired_size;
2468
2469         desired_size = roundup(size, CLBYTES);
2470
2471         if (desired_size < PAGE_SIZE)
2472                 desired_size = PAGE_SIZE;
2473         if (desired_size > MAXBSIZE)
2474                 panic("allocbuf: buffer larger than MAXBSIZE requested");
2475
2476         if (ISSET(bp->b_flags, B_META)) {
2477                 zone_t zprev, z;
2478                 int    nsize = roundup(size, MINMETA);
2479
2480                 if (bp->b_datap) {
2481                         vm_offset_t elem = (vm_offset_t)bp->b_datap;
2482
2483                         if (ISSET(bp->b_flags, B_ZALLOC)) {
2484                                 if (bp->b_bufsize < nsize) {
2485                                         /* reallocate to a bigger size */
2486
2487                                         zprev = getbufzone(bp->b_bufsize);
2488                                         if (nsize <= MAXMETA) {
2489                                                 desired_size = nsize;
2490                                                 z = getbufzone(nsize);
2491                                                 bp->b_datap = (uintptr_t)zalloc(z);
2492                                         } else {
2493                                                 bp->b_datap = (uintptr_t)NULL;
2494                                                 kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2495                                                 CLR(bp->b_flags, B_ZALLOC);
2496                                         }
2497                                         bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2498                                         zfree(zprev, (void *)elem);
2499                                 } else {
2500                                         desired_size = bp->b_bufsize;
2501                                 }
2502
2503                         } else {
2504                                 if ((vm_size_t)bp->b_bufsize < desired_size) {
2505                                         /* reallocate to a bigger size */
2506                                         bp->b_datap = (uintptr_t)NULL;
2507                                         kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2508                                         bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2509                                         kmem_free(kernel_map, elem, bp->b_bufsize);
2510                                 } else {
2511                                         desired_size = bp->b_bufsize;
2512                                 }
2513                         }
2514                 } else {
2515                         /* new allocation */
2516                         if (nsize <= MAXMETA) {
2517                                 desired_size = nsize;
2518                                 z = getbufzone(nsize);
2519                                 bp->b_datap = (uintptr_t)zalloc(z);
2520                                 SET(bp->b_flags, B_ZALLOC);
2521                         } else
2522                                 kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2523                 }
2524         }
2525         bp->b_bufsize = desired_size;
2526         bp->b_bcount = size;
2527
2528         return (0);
2529 }
2530
2531 /*
2532  *      Get a new buffer from one of the free lists.
2533  *
2534  *      Request for a queue is passes in. The queue from which the buffer was taken
2535  *      from is returned. Out of range queue requests get BQ_EMPTY. Request for
2536  *      BQUEUE means no preference. Use heuristics in that case.
2537  *      Heuristics is as follows:
2538  *      Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
2539  *      If none available block till one is made available.
2540  *      If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
2541  *      Pick the most stale buffer.
2542  *      If found buffer was marked delayed write, start the async. write
2543  *      and restart the search.
2544  *      Initialize the fields and disassociate the buffer from the vnode.
2545  *      Remove the buffer from the hash. Return the buffer and the queue
2546  *      on which it was found.
2547  *
2548  *      buf_mtxp is held upon entry
2549  *      returns with buf_mtxp locked
2550  */
2551
2552 static buf_t
2553 getnewbuf(int slpflag, int slptimeo, int * queue)
2554 {
2555         buf_t   bp;
2556         buf_t   lru_bp;
2557         buf_t   age_bp;
2558         buf_t   meta_bp;
2559         int     age_time, lru_time, bp_time, meta_time;
2560         int     req = *queue;   /* save it for restarts */
2561         struct timespec ts;
2562
2563 start:
2564         /*
2565          * invalid request gets empty queue
2566          */
2567         if ((*queue > BQUEUES) || (*queue < 0)
2568                 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
2569                 *queue = BQ_EMPTY;
2570
2571         /*
2572          * (*queue == BQUEUES) means no preference
2573          */
2574         if (*queue != BQUEUES) {
2575                 /* Try for the requested queue first */
2576                 bp = bufqueues[*queue].tqh_first;
2577                 if (bp)
2578                         goto found;
2579         }
2580
2581         /* Unable to use requested queue */
2582         age_bp = bufqueues[BQ_AGE].tqh_first;
2583         lru_bp = bufqueues[BQ_LRU].tqh_first;
2584         meta_bp = bufqueues[BQ_META].tqh_first;
2585
2586         if (!age_bp && !lru_bp && !meta_bp) {
2587                 /*
2588                  * Unavailble on AGE or LRU or META queues
2589                  * Try the empty list first
2590                  */
2591                 bp = bufqueues[BQ_EMPTY].tqh_first;
2592                 if (bp) {
2593                         *queue = BQ_EMPTY;
2594                         goto found;
2595                 }
2596                 lck_mtx_unlock(buf_mtxp);
2597
2598                 /* Create a new temporary buffer header */
2599                 bp = (struct buf *)zalloc(buf_hdr_zone);
2600
2601                 lck_mtx_lock(buf_mtxp);
2602
2603                 if (bp) {
2604                         bufhdrinit(bp);
2605                         BLISTNONE(bp);
2606                         binshash(bp, &invalhash);
2607                         SET(bp->b_flags, B_HDRALLOC);
2608                         *queue = BQ_EMPTY;
2609                         binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2610                         buf_hdr_count++;
2611                         goto found;
2612                 }
2613                 bufstats.bufs_sleeps++;
2614
2615                 /* wait for a free buffer of any kind */
2616                 needbuffer = 1;
2617                 /* hz value is 100 */
2618                 ts.tv_sec = (slptimeo/1000);
2619                 /* the hz value is 100; which leads to 10ms */
2620                 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
2621                 msleep(&needbuffer, buf_mtxp, slpflag|(PRIBIO+1), (char *)"getnewbuf", &ts);
2622
2623                 return (0);
2624         }
2625
2626         /* Buffer available either on AGE or LRU or META */
2627         bp = NULL;
2628         *queue = -1;
2629
2630         /* Buffer available either on AGE or LRU */
2631         if (!age_bp) {
2632                 bp = lru_bp;
2633                 *queue = BQ_LRU;
2634         } else if (!lru_bp) {
2635                 bp = age_bp;
2636                 *queue = BQ_AGE;
2637         } else { /* buffer available on both AGE and LRU */
2638                 int             t = buf_timestamp();
2639
2640                 age_time = t - age_bp->b_timestamp;
2641                 lru_time = t - lru_bp->b_timestamp;
2642                 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
2643                         bp = age_bp;
2644                         *queue = BQ_AGE;
2645                         /*
2646                          * we should probably re-timestamp eveything in the
2647                          * queues at this point with the current time
2648                          */
2649                 } else {
2650                         if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
2651                                 bp = lru_bp;
2652                                 *queue = BQ_LRU;
2653                         } else {
2654                                 bp = age_bp;
2655                                 *queue = BQ_AGE;
2656                         }
2657                 }
2658         }
2659
2660         if (!bp) { /* Neither on AGE nor on LRU */
2661                 bp = meta_bp;
2662                 *queue = BQ_META;
2663         }  else if (meta_bp) {
2664                 int             t = buf_timestamp();
2665
2666                 bp_time = t - bp->b_timestamp;
2667                 meta_time = t - meta_bp->b_timestamp;
2668
2669                 if (!(bp_time < 0) && !(meta_time < 0)) {
2670                         /* time not set backwards */
2671                         int bp_is_stale;
2672                         bp_is_stale = (*queue == BQ_LRU) ?
2673                                         lru_is_stale : age_is_stale;
2674
2675                         if ((meta_time >= meta_is_stale) &&
2676                                         (bp_time < bp_is_stale)) {
2677                                 bp = meta_bp;
2678                                 *queue = BQ_META;
2679                         }
2680                 }
2681         }
2682 found:
2683         if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
2684                 panic("getnewbuf: bp @ 0x%x is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
2685
2686         /* Clean it */
2687         if (bcleanbuf(bp)) {
2688                 /*
2689                  * moved to the laundry thread, buffer not ready
2690                  */
2691                 *queue = req;
2692                 goto start;
2693         }
2694         return (bp);
2695 }
2696
2697
2698 /*
2699  * Clean a buffer.
2700  * Returns 0 is buffer is ready to use,
2701  * Returns 1 if issued a buf_bawrite() to indicate
2702  * that the buffer is not ready.
2703  *
2704  * buf_mtxp is held upon entry
2705  * returns with buf_mtxp locked
2706  */
2707 static int
2708 bcleanbuf(buf_t bp)
2709 {
2710         ucred_t cred;
2711
2712
2713         /* Remove from the queue */
2714         bremfree_locked(bp);
2715
2716         /* Buffer is no longer on free lists. */
2717         SET(bp->b_lflags, BL_BUSY);
2718 #ifdef JOE_DEBUG
2719         bp->b_owner = current_thread();
2720         bp->b_tag   = 2;
2721 #endif
2722         /*
2723          * If buffer was a delayed write, start the IO by queuing
2724          * it on the LAUNDRY queue, and return 1
2725          */
2726         if (ISSET(bp->b_flags, B_DELWRI)) {
2727                 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2728                 blaundrycnt++;
2729
2730                 lck_mtx_unlock(buf_mtxp);
2731
2732                 wakeup(&blaundrycnt);
2733                 /* and give it a chance to run */
2734                 (void)thread_block(THREAD_CONTINUE_NULL);
2735
2736                 lck_mtx_lock(buf_mtxp);
2737                 return (1);
2738         }
2739         bremhash(bp);
2740
2741         lck_mtx_unlock(buf_mtxp);
2742
2743         BLISTNONE(bp);
2744         /*
2745          * disassociate us from our vnode, if we had one...
2746          */
2747         if (bp->b_vp)
2748                 brelvp(bp);
2749
2750         if (ISSET(bp->b_flags, B_META)) {
2751                 vm_offset_t elem;
2752
2753                 elem = (vm_offset_t)bp->b_datap;
2754                 bp->b_datap = (uintptr_t)0xdeadbeef;
2755
2756                 if (ISSET(bp->b_flags, B_ZALLOC)) {
2757                         zone_t z;
2758
2759                         z = getbufzone(bp->b_bufsize);
2760                         zfree(z, (void *)elem);
2761                 } else
2762                         kmem_free(kernel_map, elem, bp->b_bufsize);
2763         }
2764
2765         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2766
2767         /* clear out various other fields */
2768         bp->b_bufsize = 0;
2769         bp->b_datap = (uintptr_t)NULL;
2770         bp->b_upl = (void *)NULL;
2771         /*
2772          * preserve the state of whether this buffer
2773          * was allocated on the fly or not...
2774          * the only other flag that should be set at
2775          * this point is BL_BUSY...
2776          */
2777 #ifdef JOE_DEBUG
2778         bp->b_owner = current_thread();
2779         bp->b_tag   = 3;
2780 #endif
2781         bp->b_lflags = BL_BUSY;
2782         bp->b_flags = (bp->b_flags & B_HDRALLOC);
2783         bp->b_dev = NODEV;
2784         bp->b_blkno = bp->b_lblkno = 0;
2785         bp->b_iodone = NULL;
2786         bp->b_error = 0;
2787         bp->b_resid = 0;
2788         bp->b_bcount = 0;
2789         bp->b_dirtyoff = bp->b_dirtyend = 0;
2790         bp->b_validoff = bp->b_validend = 0;
2791
2792         /* nuke any credentials we were holding */
2793         cred = bp->b_rcred;
2794         if (cred != NOCRED) {
2795                 bp->b_rcred = NOCRED;
2796                 kauth_cred_rele(cred);
2797         }
2798         cred = bp->b_wcred;
2799         if (cred != NOCRED) {
2800                 bp->b_wcred = NOCRED;
2801                 kauth_cred_rele(cred);
2802         }
2803         lck_mtx_lock(buf_mtxp);
2804
2805         return (0);
2806 }
2807
2808
2809
2810 errno_t
2811 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
2812 {
2813         buf_t   bp;
2814         errno_t error;
2815
2816         lck_mtx_lock(buf_mtxp);
2817 relook:
2818         if ((bp = incore_locked(vp, lblkno)) == (struct buf *)0) {
2819                 lck_mtx_unlock(buf_mtxp);
2820                 return (0);
2821         }
2822         if (ISSET(bp->b_lflags, BL_BUSY)) {
2823                 if ( !ISSET(flags, BUF_WAIT)) {
2824                         lck_mtx_unlock(buf_mtxp);
2825                         return (EBUSY);
2826                 }
2827                 SET(bp->b_lflags, BL_WANTED);
2828
2829                 error = msleep((caddr_t)bp, buf_mtxp, (PRIBIO + 1), (char *)"buf_invalblkno", 0);
2830
2831                 if (error)
2832                         return (error);
2833                 goto relook;
2834         }
2835         bremfree_locked(bp);
2836         SET(bp->b_lflags, BL_BUSY);
2837         SET(bp->b_flags, B_INVAL);
2838 #ifdef JOE_DEBUG
2839         bp->b_owner = current_thread();
2840         bp->b_tag   = 4;
2841 #endif
2842         lck_mtx_unlock(buf_mtxp);
2843         buf_brelse(bp);
2844
2845         return (0);
2846 }
2847
2848
2849 void
2850 buf_drop(buf_t bp)
2851 {
2852         int need_wakeup = 0;
2853
2854         lck_mtx_lock(buf_mtxp);
2855
2856         if (ISSET(bp->b_lflags, BL_WANTED)) {
2857                 /*
2858                  * delay the actual wakeup until after we
2859                  * clear BL_BUSY and we've dropped buf_mtxp
2860                  */
2861                 need_wakeup = 1;
2862         }
2863         /*
2864          * Unlock the buffer.
2865          */
2866         CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2867
2868         lck_mtx_unlock(buf_mtxp);
2869
2870         if (need_wakeup) {
2871                 /*
2872                  * Wake up any proceeses waiting for _this_ buffer to become free.
2873                  */
2874                 wakeup(bp);
2875         }
2876 }
2877
2878
2879 errno_t
2880 buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
2881         errno_t error;
2882
2883         lck_mtx_lock(buf_mtxp);
2884
2885         error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
2886
2887         lck_mtx_unlock(buf_mtxp);
2888
2889         return (error);
2890 }
2891
2892
2893 static errno_t
2894 buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
2895 {
2896         errno_t error;
2897         struct timespec ts;
2898
2899         if (ISSET(bp->b_flags, B_LOCKED)) {
2900                 if ((flags & BAC_SKIP_LOCKED))
2901                         return (EDEADLK);
2902         } else {
2903                 if ((flags & BAC_SKIP_NONLOCKED))
2904                         return (EDEADLK);
2905         }
2906         if (ISSET(bp->b_lflags, BL_BUSY)) {
2907                 /*
2908                  * since the mutex_lock may block, the buffer
2909                  * may become BUSY, so we need to
2910                  * recheck for a NOWAIT request
2911                  */
2912                 if (flags & BAC_NOWAIT)
2913                         return (EBUSY);
2914                 SET(bp->b_lflags, BL_WANTED);
2915
2916                 /* the hz value is 100; which leads to 10ms */
2917                 ts.tv_sec = (slptimeo/100);
2918                 ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;
2919                 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), (char *)"buf_acquire", &ts);
2920
2921                 if (error)
2922                         return (error);
2923                 return (EAGAIN);
2924         }
2925         if (flags & BAC_REMOVE)
2926                 bremfree_locked(bp);
2927         SET(bp->b_lflags, BL_BUSY);
2928 #ifdef JOE_DEBUG
2929         bp->b_owner = current_thread();
2930         bp->b_tag   = 5;
2931 #endif
2932         return (0);
2933 }
2934
2935
2936 /*
2937  * Wait for operations on the buffer to complete.
2938  * When they do, extract and return the I/O's error value.
2939  */
2940 errno_t
2941 buf_biowait(buf_t bp)
2942 {
2943         lck_mtx_lock(buf_mtxp);
2944
2945         while (!ISSET(bp->b_flags, B_DONE))
2946                 (void) msleep(bp, buf_mtxp, (PRIBIO+1), (char *)"buf_biowait", 0);
2947
2948         lck_mtx_unlock(buf_mtxp);
2949
2950         /* check for interruption of I/O (e.g. via NFS), then errors. */
2951         if (ISSET(bp->b_flags, B_EINTR)) {
2952                 CLR(bp->b_flags, B_EINTR);
2953                 return (EINTR);
2954         } else if (ISSET(bp->b_flags, B_ERROR))
2955                 return (bp->b_error ? bp->b_error : EIO);
2956         else
2957                 return (0);
2958 }
2959
2960 /*
2961  * Mark I/O complete on a buffer.
2962  *
2963  * If a callback has been requested, e.g. the pageout
2964  * daemon, do so. Otherwise, awaken waiting processes.
2965  *
2966  * [ Leffler, et al., says on p.247:
2967  *      "This routine wakes up the blocked process, frees the buffer
2968  *      for an asynchronous write, or, for a request by the pagedaemon
2969  *      process, invokes a procedure specified in the buffer structure" ]
2970  *
2971  * In real life, the pagedaemon (or other system processes) wants
2972  * to do async stuff to, and doesn't want the buffer buf_brelse()'d.
2973  * (for swap pager, that puts swap buffers on the free lists (!!!),
2974  * for the vn device, that puts malloc'd buffers on the free lists!)
2975  */
2976 extern struct timeval priority_IO_timestamp_for_root;
2977 extern int hard_throttle_on_root;
2978
2979 void
2980 buf_biodone(buf_t bp)
2981 {
2982         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
2983                      (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
2984
2985         if (ISSET(bp->b_flags, B_DONE))
2986                 panic("biodone already");
2987
2988         if (kdebug_enable) {
2989                 int    code = DKIO_DONE;
2990
2991                 if (bp->b_flags & B_READ)
2992                         code |= DKIO_READ;
2993                 if (bp->b_flags & B_ASYNC)
2994                         code |= DKIO_ASYNC;
2995
2996                 if (bp->b_flags & B_META)
2997                         code |= DKIO_META;
2998                 else if (bp->b_flags & B_PAGEIO)
2999                         code |= DKIO_PAGING;
3000
3001                 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3002                                       (unsigned int)bp, (unsigned int)bp->b_vp,
3003                                       bp->b_resid, bp->b_error, 0);
3004         }
3005         if ((bp->b_vp != NULLVP) &&
3006             ((bp->b_flags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
3007             (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
3008                 microuptime(&priority_IO_timestamp_for_root);
3009                 hard_throttle_on_root = 0;
3010         }
3011         /*
3012          * I/O was done, so don't believe
3013          * the DIRTY state from VM anymore
3014          */
3015         CLR(bp->b_flags, B_WASDIRTY);
3016
3017         if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3018                 /*
3019                  * wake up any writer's blocked
3020                  * on throttle or waiting for I/O
3021                  * to drain
3022                  */
3023                 vnode_writedone(bp->b_vp);
3024
3025         if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) {  /* if necessary, call out */
3026                 void    (*iodone_func)(struct buf *, void *) = bp->b_iodone;
3027                 void    *arg = (void *)bp->b_transaction;
3028                 int     callout = ISSET(bp->b_flags, B_CALL);
3029
3030                 CLR(bp->b_flags, (B_CALL | B_FILTER));  /* filters and callouts are one-shot */
3031                 bp->b_iodone = NULL;
3032                 bp->b_transaction = NULL;
3033
3034                 if (iodone_func == NULL) {
3035                         panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
3036                 } else {
3037                         if (callout)
3038                                 SET(bp->b_flags, B_DONE);       /* note that it's done */
3039                         (*iodone_func)(bp, arg);
3040                 }
3041                 if (callout)
3042                         /*
3043                          * assumes that the call back function takes
3044                          * ownership of the bp and deals with releasing it if necessary
3045                          */
3046                         goto biodone_done;
3047                 /*
3048                  * in this case the call back function is acting
3049                  * strictly as a filter... it does not take
3050                  * ownership of the bp and is expecting us
3051                  * to finish cleaning up... this is currently used
3052                  * by the HFS journaling code
3053                  */
3054         }
3055         if (ISSET(bp->b_flags, B_ASYNC)) {      /* if async, release it */
3056                 SET(bp->b_flags, B_DONE);       /* note that it's done */
3057
3058                 buf_brelse(bp);
3059         } else {                                /* or just wakeup the buffer */
3060                 /*
3061                  * by taking the mutex, we serialize
3062                  * the buf owner calling buf_biowait so that we'll
3063                  * only see him in one of 2 states...
3064                  * state 1: B_DONE wasn't set and he's
3065                  * blocked in msleep
3066                  * state 2: he's blocked trying to take the
3067                  * mutex before looking at B_DONE
3068                  * BL_WANTED is cleared in case anyone else
3069                  * is blocked waiting for the buffer... note
3070                  * that we haven't cleared B_BUSY yet, so if
3071                  * they do get to run, their going to re-set
3072                  * BL_WANTED and go back to sleep
3073                  */
3074                 lck_mtx_lock(buf_mtxp);
3075
3076                 CLR(bp->b_lflags, BL_WANTED);
3077                 SET(bp->b_flags, B_DONE);               /* note that it's done */
3078
3079                 lck_mtx_unlock(buf_mtxp);
3080
3081                 wakeup(bp);
3082         }
3083 biodone_done:
3084         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3085                      (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
3086 }
3087
3088 /*
3089  * Return a count of buffers on the "locked" queue.
3090  */
3091 int
3092 count_lock_queue(void)
3093 {
3094         buf_t   bp;
3095         int     n = 0;
3096
3097         lck_mtx_lock(buf_mtxp);
3098
3099         for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3100             bp = bp->b_freelist.tqe_next)
3101                 n++;
3102         lck_mtx_unlock(buf_mtxp);
3103
3104         return (n);
3105 }
3106
3107 /*
3108  * Return a count of 'busy' buffers. Used at the time of shutdown.
3109  */
3110 int
3111 count_busy_buffers(void)
3112 {
3113         buf_t   bp;
3114         int     nbusy = 0;
3115
3116         for (bp = &buf[nbuf]; --bp >= buf; )
3117                 if (!ISSET(bp->b_flags, B_INVAL) && ISSET(bp->b_lflags, BL_BUSY))
3118                         nbusy++;
3119         return (nbusy);
3120 }
3121
3122 #if DIAGNOSTIC
3123 /*
3124  * Print out statistics on the current allocation of the buffer pool.
3125  * Can be enabled to print out on every ``sync'' by setting "syncprt"
3126  * in vfs_syscalls.c using sysctl.
3127  */
3128 void
3129 vfs_bufstats()
3130 {
3131         int i, j, count;
3132         register struct buf *bp;
3133         register struct bqueues *dp;
3134         int counts[MAXBSIZE/CLBYTES+1];
3135         static char *bname[BQUEUES] =
3136                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3137
3138         for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
3139                 count = 0;
3140                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3141                         counts[j] = 0;
3142
3143                 lck_mtx_lock(buf_mtxp);
3144
3145                 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
3146                         counts[bp->b_bufsize/CLBYTES]++;
3147                         count++;
3148                 }
3149                 lck_mtx_unlock(buf_mtxp);
3150
3151                 printf("%s: total-%d", bname[i], count);
3152                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3153                         if (counts[j] != 0)
3154                                 printf(", %d-%d", j * CLBYTES, counts[j]);
3155                 printf("\n");
3156         }
3157 }
3158 #endif /* DIAGNOSTIC */
3159
3160 #define NRESERVEDIOBUFS 64
3161
3162
3163 buf_t
3164 alloc_io_buf(vnode_t vp, int priv)
3165 {
3166         buf_t   bp;
3167
3168         lck_mtx_lock(iobuffer_mtxp);
3169
3170         while (((niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
3171                (bp = iobufqueue.tqh_first) == NULL) {
3172                 bufstats.bufs_iobufsleeps++;
3173
3174                 need_iobuffer = 1;
3175                 (void) msleep(&need_iobuffer, iobuffer_mtxp, (PRIBIO+1), (const char *)"alloc_io_buf", 0);
3176         }
3177         TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
3178
3179         bufstats.bufs_iobufinuse++;
3180         if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
3181                 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
3182
3183         lck_mtx_unlock(iobuffer_mtxp);
3184
3185         /*
3186          * initialize various fields
3187          * we don't need to hold the mutex since the buffer
3188          * is now private... the vp should have a reference
3189          * on it and is not protected by this mutex in any event
3190          */
3191         bp->b_timestamp = 0;
3192         bp->b_proc = NULL;
3193
3194         bp->b_datap = 0;
3195         bp->b_flags = 0;
3196         bp->b_lflags = BL_BUSY | BL_IOBUF;
3197         bp->b_blkno = bp->b_lblkno = 0;
3198 #ifdef JOE_DEBUG
3199         bp->b_owner = current_thread();
3200         bp->b_tag   = 6;
3201 #endif
3202         bp->b_iodone = NULL;
3203         bp->b_error = 0;
3204         bp->b_resid = 0;
3205         bp->b_bcount = 0;
3206         bp->b_bufsize = 0;
3207         bp->b_upl = NULL;
3208         bp->b_vp = vp;
3209
3210         if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
3211                 bp->b_dev = vp->v_rdev;
3212         else
3213                 bp->b_dev = NODEV;
3214
3215         return (bp);
3216 }
3217
3218
3219 void
3220 free_io_buf(buf_t bp)
3221 {
3222         int need_wakeup = 0;
3223
3224         /*
3225          * put buffer back on the head of the iobufqueue
3226          */
3227         bp->b_vp = NULL;
3228         bp->b_flags = B_INVAL;
3229
3230         lck_mtx_lock(iobuffer_mtxp);
3231
3232         binsheadfree(bp, &iobufqueue, -1);
3233
3234         if (need_iobuffer) {
3235                 /*
3236                  * Wake up any processes waiting because they need an io buffer
3237                  *
3238                  * do the wakeup after we drop the mutex... it's possible that the
3239                  * wakeup will be superfluous if need_iobuffer gets set again and
3240                  * another thread runs this path, but it's highly unlikely, doesn't
3241                  * hurt, and it means we don't hold up I/O progress if the wakeup blocks
3242                  * trying to grab a task related lock...
3243                  */
3244                 need_iobuffer = 0;
3245                 need_wakeup = 1;
3246         }
3247         bufstats.bufs_iobufinuse--;
3248
3249         lck_mtx_unlock(iobuffer_mtxp);
3250
3251         if (need_wakeup)
3252                 wakeup(&need_iobuffer);
3253 }
3254
3255
3256
3257 /*
3258  * If getnewbuf() calls bcleanbuf() on the same thread
3259  * there is a potential for stack overrun and deadlocks.
3260  * So we always handoff the work to a worker thread for completion
3261  */
3262 #include <mach/mach_types.h>
3263 #include <mach/memory_object_types.h>
3264 #include <kern/sched_prim.h>
3265
3266
3267 static void
3268 bcleanbuf_thread_init(void)
3269 {
3270         /* create worker thread */
3271         kernel_thread(kernel_task, bcleanbuf_thread);
3272 }
3273
3274 static void
3275 bcleanbuf_thread(void)
3276 {
3277         struct buf *bp;
3278         int error = 0;
3279         int loopcnt = 0;
3280
3281         for (;;) {
3282                 lck_mtx_lock(buf_mtxp);
3283
3284                 while (blaundrycnt == 0)
3285                         (void)msleep((void *)&blaundrycnt, buf_mtxp, PRIBIO, "blaundry", 0);
3286
3287                 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
3288                 /*
3289                  * Remove from the queue
3290                  */
3291                 bremfree_locked(bp);
3292                 blaundrycnt--;
3293
3294                 lck_mtx_unlock(buf_mtxp);
3295                 /*
3296                  * do the IO
3297                  */
3298                 error = bawrite_internal(bp, 0);
3299
3300                 if (error) {
3301                         lck_mtx_lock(buf_mtxp);
3302
3303                         binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
3304                         blaundrycnt++;
3305
3306                         lck_mtx_unlock(buf_mtxp);
3307
3308                         if (loopcnt > 10) {
3309                                 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
3310                                 loopcnt = 0;
3311                         } else {
3312                                 (void)thread_block(THREAD_CONTINUE_NULL);
3313                                 loopcnt++;
3314                         }
3315                 }
3316         }
3317 }
3318
3319
3320 static int
3321 brecover_data(buf_t bp)
3322 {
3323         int     upl_offset;
3324         upl_t   upl;
3325         upl_page_info_t *pl;
3326         kern_return_t kret;
3327         vnode_t vp = bp->b_vp;
3328         int upl_flags;
3329
3330
3331         if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
3332                 goto dump_buffer;
3333
3334         upl_flags = UPL_PRECIOUS;
3335         if (! (buf_flags(bp) & B_READ)) {
3336                 /*
3337                  * "write" operation:  let the UPL subsystem know
3338                  * that we intend to modify the buffer cache pages we're
3339                  * gathering.
3340                  */
3341                 upl_flags |= UPL_WILL_MODIFY;
3342         }
3343
3344         kret = ubc_create_upl(vp,
3345                               ubc_blktooff(vp, bp->b_lblkno),
3346                               bp->b_bufsize,
3347                               &upl,
3348                               &pl,
3349                               upl_flags);
3350         if (kret != KERN_SUCCESS)
3351                 panic("Failed to create UPL");
3352
3353         for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
3354
3355                 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
3356                         ubc_upl_abort(upl, 0);
3357                         goto dump_buffer;
3358                 }
3359         }
3360         bp->b_upl = upl;
3361
3362         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
3363
3364         if (kret != KERN_SUCCESS)
3365                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3366         return (1);
3367
3368 dump_buffer:
3369         bp->b_bufsize = 0;
3370         SET(bp->b_flags, B_INVAL);
3371         buf_brelse(bp);
3372
3373         return(0);
3374 }
3375
3376
3377
3378 /*
3379  * disabled for now
3380  */
3381
3382 #if FLUSH_QUEUES
3383
3384 #define NFLUSH 32
3385
3386 static int
3387 bp_cmp(void *a, void *b)
3388 {
3389     buf_t *bp_a = *(buf_t **)a,
3390           *bp_b = *(buf_t **)b;
3391     daddr64_t res;
3392
3393     // don't have to worry about negative block
3394     // numbers so this is ok to do.
3395     //
3396     res = (bp_a->b_blkno - bp_b->b_blkno);
3397
3398     return (int)res;
3399 }
3400
3401
3402 int
3403 bflushq(int whichq, mount_t mp)
3404 {
3405         buf_t   bp, next;
3406         int     i, buf_count;
3407         int     total_writes = 0;
3408         static buf_t flush_table[NFLUSH];
3409
3410         if (whichq < 0 || whichq >= BQUEUES) {
3411             return (0);
3412         }
3413
3414   restart:
3415         lck_mtx_lock(buf_mtxp);
3416
3417         bp = TAILQ_FIRST(&bufqueues[whichq]);
3418
3419         for (buf_count = 0; bp; bp = next) {
3420             next = bp->b_freelist.tqe_next;
3421
3422             if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
3423                 continue;
3424             }
3425
3426             if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
3427
3428                 bremfree_locked(bp);
3429 #ifdef JOE_DEBUG
3430                 bp->b_owner = current_thread();
3431                 bp->b_tag   = 7;
3432 #endif
3433                 SET(bp->b_lflags, BL_BUSY);
3434                 flush_table[buf_count] = bp;
3435                 buf_count++;
3436                 total_writes++;
3437
3438                 if (buf_count >= NFLUSH) {
3439                     lck_mtx_unlock(buf_mtxp);
3440
3441                     qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3442
3443                     for (i = 0; i < buf_count; i++) {
3444                         buf_bawrite(flush_table[i]);
3445                     }
3446                     goto restart;
3447                 }
3448             }
3449         }
3450         lck_mtx_unlock(buf_mtxp);
3451
3452         if (buf_count > 0) {
3453             qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3454
3455             for (i = 0; i < buf_count; i++) {
3456                 buf_bawrite(flush_table[i]);
3457             }
3458         }
3459
3460         return (total_writes);
3461 }
3462 #endif
3463
3464
3465 #if BALANCE_QUEUES
3466
3467 /* XXX move this to a separate file */
3468
3469 /*
3470  * NOTE: THIS CODE HAS NOT BEEN UPDATED
3471  * WITH RESPECT TO THE NEW LOCKING MODEL
3472  */
3473
3474
3475 /*
3476  * Dynamic Scaling of the Buffer Queues
3477  */
3478
3479 typedef long long blsize_t;
3480
3481 blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
3482 /* Global tunable limits */
3483 blsize_t nbufh;                 /* number of buffer headers */
3484 blsize_t nbuflow;               /* minimum number of buffer headers required */
3485 blsize_t nbufhigh;              /* maximum number of buffer headers allowed */
3486 blsize_t nbuftarget;    /* preferred number of buffer headers */
3487
3488 /*
3489  * assertions:
3490  *
3491  * 1.   0 < nbuflow <= nbufh <= nbufhigh
3492  * 2.   nbufhigh <= MAXNBUF
3493  * 3.   0 < nbuflow <= nbuftarget <= nbufhigh
3494  * 4.   nbufh can not be set by sysctl().
3495  */
3496
3497 /* Per queue tunable limits */
3498
3499 struct bufqlim {
3500         blsize_t        bl_nlow;        /* minimum number of buffer headers required */
3501         blsize_t        bl_num;         /* number of buffer headers on the queue */
3502         blsize_t        bl_nlhigh;      /* maximum number of buffer headers allowed */
3503         blsize_t        bl_target;      /* preferred number of buffer headers */
3504         long    bl_stale;       /* Seconds after which a buffer is considered stale */
3505 } bufqlim[BQUEUES];
3506
3507 /*
3508  * assertions:
3509  *
3510  * 1.   0 <= bl_nlow <= bl_num <= bl_nlhigh
3511  * 2.   bl_nlhigh <= MAXNBUF
3512  * 3.  bufqlim[BQ_META].bl_nlow != 0
3513  * 4.  bufqlim[BQ_META].bl_nlow > (number of possible concurrent
3514  *                                                                      file system IO operations)
3515  * 5.   bl_num can not be set by sysctl().
3516  * 6.   bl_nhigh <= nbufhigh
3517  */
3518
3519 /*
3520  * Rationale:
3521  * ----------
3522  * Defining it blsize_t as long permits 2^31 buffer headers per queue.
3523  * Which can describe (2^31 * PAGE_SIZE) memory per queue.
3524  *
3525  * These limits are exported to by means of sysctl().
3526  * It was decided to define blsize_t as a 64 bit quantity.
3527  * This will make sure that we will not be required to change it
3528  * as long as we do not exceed 64 bit address space for the kernel.
3529  *
3530  * low and high numbers parameters initialized at compile time
3531  * and boot arguments can be used to override them. sysctl()
3532  * would not change the value. sysctl() can get all the values
3533  * but can set only target. num is the current level.
3534  *
3535  * Advantages of having a "bufqscan" thread doing the balancing are,
3536  * Keep enough bufs on BQ_EMPTY.
3537  *      getnewbuf() by default will always select a buffer from the BQ_EMPTY.
3538  *              getnewbuf() perfoms best if a buffer was found there.
3539  *              Also this minimizes the possibility of starting IO
3540  *              from getnewbuf(). That's a performance win, too.
3541  *
3542  *      Localize complex logic [balancing as well as time aging]
3543  *              to balancebufq().
3544  *
3545  *      Simplify getnewbuf() logic by elimination of time aging code.
3546  */
3547
3548 /*
3549  * Algorithm:
3550  * -----------
3551  * The goal of the dynamic scaling of the buffer queues to to keep
3552  * the size of the LRU close to bl_target. Buffers on a queue would
3553  * be time aged.
3554  *
3555  * There would be a thread which will be responsible for "balancing"
3556  * the buffer cache queues.
3557  *
3558  * The scan order would be:     AGE, LRU, META, EMPTY.
3559  */
3560
3561 long bufqscanwait = 0;
3562
3563 static void bufqscan_thread();
3564 static int balancebufq(int q);
3565 static int btrimempty(int n);
3566 static __inline__ int initbufqscan(void);
3567 static __inline__ int nextbufq(int q);
3568 static void buqlimprt(int all);
3569
3570
3571 static __inline__ void
3572 bufqinc(int q)
3573 {
3574         if ((q < 0) || (q >= BQUEUES))
3575                 return;
3576
3577         bufqlim[q].bl_num++;
3578         return;
3579 }
3580
3581 static __inline__ void
3582 bufqdec(int q)
3583 {
3584         if ((q < 0) || (q >= BQUEUES))
3585                 return;
3586
3587         bufqlim[q].bl_num--;
3588         return;
3589 }
3590
3591 static void
3592 bufq_balance_thread_init()
3593 {
3594
3595         if (bufqscanwait++ == 0) {
3596
3597                 /* Initalize globals */
3598                 MAXNBUF = (sane_size / PAGE_SIZE);
3599                 nbufh = nbuf;
3600                 nbuflow = min(nbufh, 100);
3601                 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
3602                 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
3603                 nbuftarget = max(nbuflow, nbuftarget);
3604                 nbuftarget = min(nbufhigh, nbuftarget);
3605
3606                 /*
3607                  * Initialize the bufqlim
3608                  */
3609
3610                 /* LOCKED queue */
3611                 bufqlim[BQ_LOCKED].bl_nlow = 0;
3612                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3613                 bufqlim[BQ_LOCKED].bl_target = 0;
3614                 bufqlim[BQ_LOCKED].bl_stale = 30;
3615
3616                 /* LRU queue */
3617                 bufqlim[BQ_LRU].bl_nlow = 0;
3618                 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
3619                 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
3620                 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
3621
3622                 /* AGE queue */
3623                 bufqlim[BQ_AGE].bl_nlow = 0;
3624                 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
3625                 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
3626                 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
3627
3628                 /* EMPTY queue */
3629                 bufqlim[BQ_EMPTY].bl_nlow = 0;
3630                 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
3631                 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
3632                 bufqlim[BQ_EMPTY].bl_stale = 600000;
3633
3634                 /* META queue */
3635                 bufqlim[BQ_META].bl_nlow = 0;
3636                 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
3637                 bufqlim[BQ_META].bl_target = nbuftarget/4;
3638                 bufqlim[BQ_META].bl_stale = META_IS_STALE;
3639
3640                 /* LAUNDRY queue */
3641                 bufqlim[BQ_LOCKED].bl_nlow = 0;
3642                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3643                 bufqlim[BQ_LOCKED].bl_target = 0;
3644                 bufqlim[BQ_LOCKED].bl_stale = 30;
3645
3646                 buqlimprt(1);
3647         }
3648
3649         /* create worker thread */
3650         kernel_thread(kernel_task, bufqscan_thread);
3651 }
3652
3653 /* The workloop for the buffer balancing thread */
3654 static void
3655 bufqscan_thread()
3656 {
3657         int moretodo = 0;
3658
3659         for(;;) {
3660                 do {
3661                         int q;  /* buffer queue to process */
3662
3663                         q = initbufqscan();
3664                         for (; q; ) {
3665                                 moretodo |= balancebufq(q);
3666                                 q = nextbufq(q);
3667                         }
3668                 } while (moretodo);
3669
3670 #if DIAGNOSTIC
3671                 vfs_bufstats();
3672                 buqlimprt(0);
3673 #endif
3674                 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
3675                 moretodo = 0;
3676         }
3677 }
3678
3679 /* Seed for the buffer queue balancing */
3680 static __inline__ int
3681 initbufqscan()
3682 {
3683         /* Start with AGE queue */
3684         return (BQ_AGE);
3685 }
3686
3687 /* Pick next buffer queue to balance */
3688 static __inline__ int
3689 nextbufq(int q)
3690 {
3691         int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
3692
3693         q++;
3694         q %= sizeof(order);
3695         return (order[q]);
3696 }
3697
3698 /* function to balance the buffer queues */
3699 static int
3700 balancebufq(int q)
3701 {
3702         int moretodo = 0;
3703         int s = splbio();
3704         int n, t;
3705
3706         /* reject invalid q */
3707         if ((q < 0) || (q >= BQUEUES))
3708                 goto out;
3709
3710         /* LOCKED or LAUNDRY queue MUST not be balanced */
3711         if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
3712                 goto out;
3713
3714         n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
3715
3716         /* If queue has less than target nothing more to do */
3717         if (n < 0)
3718                 goto out;
3719
3720         if ( n > 8 ) {
3721                 /* Balance only a small amount (12.5%) at a time */
3722                 n >>= 3;
3723         }
3724
3725         /* EMPTY queue needs special handling */
3726         if (q == BQ_EMPTY) {
3727                 moretodo |= btrimempty(n);
3728                 goto out;
3729         }
3730
3731         t = buf_timestamp():
3732
3733         for (; n > 0; n--) {
3734                 struct buf *bp = bufqueues[q].tqh_first;
3735                 if (!bp)
3736                         break;
3737
3738                 /* check if it's stale */
3739                 if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
3740                         if (bcleanbuf(bp)) {
3741                                 /* buf_bawrite() issued, bp not ready */
3742                                 moretodo = 1;
3743                         } else {
3744                                 /* release the cleaned buffer to BQ_EMPTY */
3745                                 SET(bp->b_flags, B_INVAL);
3746                                 buf_brelse(bp);
3747                         }
3748                 } else
3749                         break;
3750         }
3751
3752 out:
3753         splx(s);
3754         return (moretodo);
3755 }
3756
3757 static int
3758 btrimempty(int n)
3759 {
3760         /*
3761          * When struct buf are allocated dynamically, this would
3762          * reclaim upto 'n' struct buf from the empty queue.
3763          */
3764
3765          return (0);
3766 }
3767
3768 static void
3769 buqlimprt(int all)
3770 {
3771         int i;
3772     static char *bname[BQUEUES] =
3773                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3774
3775         if (all)
3776                 for (i = 0; i < BQUEUES; i++) {
3777                         printf("%s : ", bname[i]);
3778                         printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
3779                         printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3780                         printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
3781                         printf("target = %ld, ", (long)bufqlim[i].bl_target);
3782                         printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
3783                 }
3784         else
3785                 for (i = 0; i < BQUEUES; i++) {
3786                         printf("%s : ", bname[i]);
3787                         printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3788                 }
3789 }
3790
3791 #endif
3792
3793