bsd/vfs/vfs_bio.c

   1 /*
   2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*-
  30  * Copyright (c) 1994 Christopher G. Demetriou
  31  * Copyright (c) 1982, 1986, 1989, 1993
  32  *      The Regents of the University of California.  All rights reserved.
  33  * (c) UNIX System Laboratories, Inc.
  34  * All or some portions of this file are derived from material licensed
  35  * to the University of California by American Telephone and Telegraph
  36  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  37  * the permission of UNIX System Laboratories, Inc.
  38  *
  39  * Redistribution and use in source and binary forms, with or without
  40  * modification, are permitted provided that the following conditions
  41  * are met:
  42  * 1. Redistributions of source code must retain the above copyright
  43  *    notice, this list of conditions and the following disclaimer.
  44  * 2. Redistributions in binary form must reproduce the above copyright
  45  *    notice, this list of conditions and the following disclaimer in the
  46  *    documentation and/or other materials provided with the distribution.
  47  * 3. All advertising materials mentioning features or use of this software
  48  *    must display the following acknowledgement:
  49  *      This product includes software developed by the University of
  50  *      California, Berkeley and its contributors.
  51  * 4. Neither the name of the University nor the names of its contributors
  52  *    may be used to endorse or promote products derived from this software
  53  *    without specific prior written permission.
  54  *
  55  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  56  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  57  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  58  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  59  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  60  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  61  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  62  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  63  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  65  * SUCH DAMAGE.
  66  *
  67  *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
  68  */
  69
  70 /*
  71  * Some references:
  72  *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
  73  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
  74  *              UNIX Operating System (Addison Welley, 1989)
  75  */
  76
  77 #include <sys/param.h>
  78 #include <sys/systm.h>
  79 #include <sys/proc_internal.h>
  80 #include <sys/buf_internal.h>
  81 #include <sys/vnode_internal.h>
  82 #include <sys/mount_internal.h>
  83 #include <sys/trace.h>
  84 #include <sys/malloc.h>
  85 #include <sys/resourcevar.h>
  86 #include <miscfs/specfs/specdev.h>
  87 #include <sys/ubc.h>
  88 #include <sys/kauth.h>
  89 #if DIAGNOSTIC
  90 #include <kern/assert.h>
  91 #endif /* DIAGNOSTIC */
  92 #include <kern/task.h>
  93 #include <kern/zalloc.h>
  94 #include <kern/lock.h>
  95
  96 #include <vm/vm_kern.h>
  97
  98 #include <sys/kdebug.h>
  99 #include <machine/spl.h>
 100
 101 #if BALANCE_QUEUES
 102 static __inline__ void bufqinc(int q);
 103 static __inline__ void bufqdec(int q);
 104 #endif
 105
 106 static int      bcleanbuf(buf_t bp);
 107 static int      brecover_data(buf_t bp);
 108 static boolean_t incore(vnode_t vp, daddr64_t blkno);
 109 static buf_t    incore_locked(vnode_t vp, daddr64_t blkno);
 110 /* timeout is in msecs */
 111 static buf_t    getnewbuf(int slpflag, int slptimeo, int *queue);
 112 static void     bremfree_locked(buf_t bp);
 113 static void     buf_reassign(buf_t bp, vnode_t newvp);
 114 static errno_t  buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
 115 static int      buf_iterprepare(vnode_t vp, struct buflists *, int flags);
 116 static void     buf_itercomplete(vnode_t vp, struct buflists *, int flags);
 117
 118 __private_extern__ int  bdwrite_internal(buf_t, int);
 119
 120 /* zone allocated buffer headers */
 121 static void     bufzoneinit(void);
 122 static void     bcleanbuf_thread_init(void);
 123 static void     bcleanbuf_thread(void);
 124
 125 static zone_t   buf_hdr_zone;
 126 static int      buf_hdr_count;
 127
 128
 129 /*
 130  * Definitions for the buffer hash lists.
 131  */
 132 #define BUFHASH(dvp, lbn)       \
 133         (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
 134 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
 135 u_long  bufhash;
 136
 137 /* Definitions for the buffer stats. */
 138 struct bufstats bufstats;
 139
 140 /* Number of delayed write buffers */
 141 int nbdwrite = 0;
 142 int blaundrycnt = 0;
 143
 144
 145 static TAILQ_HEAD(ioqueue, buf) iobufqueue;
 146 static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
 147 static int needbuffer;
 148 static int need_iobuffer;
 149
 150 static lck_grp_t        *buf_mtx_grp;
 151 static lck_attr_t       *buf_mtx_attr;
 152 static lck_grp_attr_t   *buf_mtx_grp_attr;
 153 static lck_mtx_t        *iobuffer_mtxp;
 154 static lck_mtx_t        *buf_mtxp;
 155
 156 static __inline__ int
 157 buf_timestamp(void)
 158 {
 159         struct  timeval         t;
 160         microuptime(&t);
 161         return (t.tv_sec);
 162 }
 163
 164 /*
 165  * Insq/Remq for the buffer free lists.
 166  */
 167 #if BALANCE_QUEUES
 168 #define binsheadfree(bp, dp, whichq)    do { \
 169                                     TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
 170                                         bufqinc((whichq));      \
 171                                         (bp)->b_whichq = whichq; \
 172                                     (bp)->b_timestamp = buf_timestamp(); \
 173                                 } while (0)
 174
 175 #define binstailfree(bp, dp, whichq)    do { \
 176                                     TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
 177                                         bufqinc((whichq));      \
 178                                         (bp)->b_whichq = whichq; \
 179                                     (bp)->b_timestamp = buf_timestamp(); \
 180                                 } while (0)
 181 #else
 182 #define binsheadfree(bp, dp, whichq)    do { \
 183                                     TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
 184                                         (bp)->b_whichq = whichq; \
 185                                     (bp)->b_timestamp = buf_timestamp(); \
 186                                 } while (0)
 187
 188 #define binstailfree(bp, dp, whichq)    do { \
 189                                     TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
 190                                         (bp)->b_whichq = whichq; \
 191                                     (bp)->b_timestamp = buf_timestamp(); \
 192                                 } while (0)
 193 #endif
 194
 195
 196 #define BHASHENTCHECK(bp)       \
 197         if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)  \
 198                 panic("%x: b_hash.le_prev is not deadbeef", (bp));
 199
 200 #define BLISTNONE(bp)   \
 201         (bp)->b_hash.le_next = (struct buf *)0; \
 202         (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
 203
 204 /*
 205  * Insq/Remq for the vnode usage lists.
 206  */
 207 #define bufinsvn(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_vnbufs)
 208 #define bufremvn(bp) {                                                  \
 209         LIST_REMOVE(bp, b_vnbufs);                                      \
 210         (bp)->b_vnbufs.le_next = NOLIST;                                \
 211 }
 212
 213 /*
 214  * Time in seconds before a buffer on a list is
 215  * considered as a stale buffer
 216  */
 217 #define LRU_IS_STALE 120 /* default value for the LRU */
 218 #define AGE_IS_STALE 60  /* default value for the AGE */
 219 #define META_IS_STALE 180 /* default value for the BQ_META */
 220
 221 int lru_is_stale = LRU_IS_STALE;
 222 int age_is_stale = AGE_IS_STALE;
 223 int meta_is_stale = META_IS_STALE;
 224 static int boot_nbuf = 0;
 225
 226
 227 /* LIST_INSERT_HEAD() with assertions */
 228 static __inline__ void
 229 blistenterhead(struct bufhashhdr * head, buf_t bp)
 230 {
 231         if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
 232                 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
 233         (head)->lh_first = bp;
 234         bp->b_hash.le_prev = &(head)->lh_first;
 235         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 236                 panic("blistenterhead: le_prev is deadbeef");
 237 }
 238
 239 static __inline__ void
 240 binshash(buf_t bp, struct bufhashhdr *dp)
 241 {
 242 #if DIAGNOSTIC
 243         buf_t   nbp;
 244 #endif /* DIAGNOSTIC */
 245
 246         BHASHENTCHECK(bp);
 247
 248 #if DIAGNOSTIC
 249         nbp = dp->lh_first;
 250         for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
 251                 if(nbp == bp)
 252                         panic("buf already in hashlist");
 253         }
 254 #endif /* DIAGNOSTIC */
 255
 256         blistenterhead(dp, bp);
 257 }
 258
 259 static __inline__ void
 260 bremhash(buf_t  bp)
 261 {
 262         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 263                 panic("bremhash le_prev is deadbeef");
 264         if (bp->b_hash.le_next == bp)
 265                 panic("bremhash: next points to self");
 266
 267         if (bp->b_hash.le_next != NULL)
 268                 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
 269         *bp->b_hash.le_prev = (bp)->b_hash.le_next;
 270 }
 271
 272
 273
 274
 275 int
 276 buf_valid(buf_t bp) {
 277
 278         if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
 279                 return 1;
 280         return 0;
 281 }
 282
 283 int
 284 buf_fromcache(buf_t bp) {
 285
 286         if ( (bp->b_flags & B_CACHE) )
 287                 return 1;
 288         return 0;
 289 }
 290
 291 void
 292 buf_markinvalid(buf_t bp) {
 293
 294         SET(bp->b_flags, B_INVAL);
 295 }
 296
 297 void
 298 buf_markdelayed(buf_t bp) {
 299
 300         SET(bp->b_flags, B_DELWRI);
 301         buf_reassign(bp, bp->b_vp);
 302 }
 303
 304 void
 305 buf_markeintr(buf_t bp) {
 306
 307         SET(bp->b_flags, B_EINTR);
 308 }
 309
 310 void
 311 buf_markaged(buf_t bp) {
 312
 313         SET(bp->b_flags, B_AGE);
 314 }
 315
 316 errno_t
 317 buf_error(buf_t bp) {
 318
 319         return (bp->b_error);
 320 }
 321
 322 void
 323 buf_seterror(buf_t bp, errno_t error) {
 324
 325         if ((bp->b_error = error))
 326                 SET(bp->b_flags, B_ERROR);
 327         else
 328                 CLR(bp->b_flags, B_ERROR);
 329 }
 330
 331 void
 332 buf_setflags(buf_t bp, int32_t flags) {
 333
 334         SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
 335 }
 336
 337 void
 338 buf_clearflags(buf_t bp, int32_t flags) {
 339
 340         CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
 341 }
 342
 343 int32_t
 344 buf_flags(buf_t bp) {
 345
 346         return ((bp->b_flags & BUF_X_RDFLAGS));
 347 }
 348
 349 void
 350 buf_reset(buf_t bp, int32_t io_flags) {
 351
 352         CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE));
 353         SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
 354
 355         bp->b_error = 0;
 356 }
 357
 358 uint32_t
 359 buf_count(buf_t bp) {
 360
 361         return (bp->b_bcount);
 362 }
 363
 364 void
 365 buf_setcount(buf_t bp, uint32_t bcount) {
 366
 367         bp->b_bcount = bcount;
 368 }
 369
 370 uint32_t
 371 buf_size(buf_t bp) {
 372
 373         return (bp->b_bufsize);
 374 }
 375
 376 void
 377 buf_setsize(buf_t bp, uint32_t bufsize) {
 378
 379         bp->b_bufsize = bufsize;
 380 }
 381
 382 uint32_t
 383 buf_resid(buf_t bp) {
 384
 385         return (bp->b_resid);
 386 }
 387
 388 void
 389 buf_setresid(buf_t bp, uint32_t resid) {
 390
 391         bp->b_resid = resid;
 392 }
 393
 394 uint32_t
 395 buf_dirtyoff(buf_t bp) {
 396
 397         return (bp->b_dirtyoff);
 398 }
 399
 400 uint32_t
 401 buf_dirtyend(buf_t bp) {
 402
 403         return (bp->b_dirtyend);
 404 }
 405
 406 void
 407 buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
 408
 409         bp->b_dirtyoff = dirtyoff;
 410 }
 411
 412 void
 413 buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
 414
 415         bp->b_dirtyend = dirtyend;
 416 }
 417
 418 uintptr_t
 419 buf_dataptr(buf_t bp) {
 420
 421         return (bp->b_datap);
 422 }
 423
 424 void
 425 buf_setdataptr(buf_t bp, uintptr_t data) {
 426
 427         bp->b_datap = data;
 428 }
 429
 430 vnode_t
 431 buf_vnode(buf_t bp) {
 432
 433         return (bp->b_vp);
 434 }
 435
 436 void
 437 buf_setvnode(buf_t bp, vnode_t vp) {
 438
 439         bp->b_vp = vp;
 440 }
 441
 442
 443 void *
 444 buf_callback(buf_t bp)
 445 {
 446         if ( !(bp->b_lflags & BL_IOBUF) )
 447                 return ((void *) NULL);
 448         if ( !(bp->b_flags & B_CALL) )
 449                 return ((void *) NULL);
 450
 451         return ((void *)bp->b_iodone);
 452 }
 453
 454
 455 errno_t
 456 buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
 457 {
 458
 459         if ( !(bp->b_lflags & BL_IOBUF) )
 460                 return (EINVAL);
 461
 462         if (callback)
 463                 bp->b_flags |= (B_CALL | B_ASYNC);
 464         else
 465                 bp->b_flags &= ~B_CALL;
 466         bp->b_transaction = transaction;
 467         bp->b_iodone = callback;
 468
 469         return (0);
 470 }
 471
 472 errno_t
 473 buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
 474 {
 475
 476         if ( !(bp->b_lflags & BL_IOBUF) )
 477                 return (EINVAL);
 478
 479         if (upl)
 480                 bp->b_flags |= B_CLUSTER;
 481         else
 482                 bp->b_flags &= ~B_CLUSTER;
 483         bp->b_upl = upl;
 484         bp->b_uploffset = offset;
 485
 486         return (0);
 487 }
 488
 489 buf_t
 490 buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
 491 {
 492         buf_t   io_bp;
 493
 494         if (io_offset < 0 || io_size < 0)
 495                 return (NULL);
 496
 497         if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
 498                 return (NULL);
 499
 500         if (bp->b_flags & B_CLUSTER) {
 501                 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
 502                         return (NULL);
 503
 504                 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
 505                         return (NULL);
 506         }
 507         io_bp = alloc_io_buf(bp->b_vp, 0);
 508
 509         io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_ASYNC | B_READ);
 510
 511         if (iodone) {
 512                 io_bp->b_transaction = arg;
 513                 io_bp->b_iodone = iodone;
 514                 io_bp->b_flags |= B_CALL;
 515         }
 516         if (bp->b_flags & B_CLUSTER) {
 517                 io_bp->b_upl = bp->b_upl;
 518                 io_bp->b_uploffset = bp->b_uploffset + io_offset;
 519         } else {
 520                 io_bp->b_datap  = (uintptr_t)(((char *)bp->b_datap) + io_offset);
 521         }
 522         io_bp->b_bcount = io_size;
 523
 524         return (io_bp);
 525 }
 526
 527
 528
 529 void
 530 buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
 531               void **old_iodone, void **old_transaction)
 532 {
 533         if (old_iodone)
 534                 *old_iodone = (void *)(bp->b_iodone);
 535         if (old_transaction)
 536                 *old_transaction = (void *)(bp->b_transaction);
 537
 538         bp->b_transaction = transaction;
 539         bp->b_iodone = filter;
 540         bp->b_flags |= B_FILTER;
 541 }
 542
 543
 544 daddr64_t
 545 buf_blkno(buf_t bp) {
 546
 547         return (bp->b_blkno);
 548 }
 549
 550 daddr64_t
 551 buf_lblkno(buf_t bp) {
 552
 553         return (bp->b_lblkno);
 554 }
 555
 556 void
 557 buf_setblkno(buf_t bp, daddr64_t blkno) {
 558
 559         bp->b_blkno = blkno;
 560 }
 561
 562 void
 563 buf_setlblkno(buf_t bp, daddr64_t lblkno) {
 564
 565         bp->b_lblkno = lblkno;
 566 }
 567
 568 dev_t
 569 buf_device(buf_t bp) {
 570
 571         return (bp->b_dev);
 572 }
 573
 574 errno_t
 575 buf_setdevice(buf_t bp, vnode_t vp) {
 576
 577         if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
 578                 return EINVAL;
 579         bp->b_dev = vp->v_rdev;
 580
 581         return 0;
 582 }
 583
 584
 585 void *
 586 buf_drvdata(buf_t bp) {
 587
 588         return (bp->b_drvdata);
 589 }
 590
 591 void
 592 buf_setdrvdata(buf_t bp, void *drvdata) {
 593
 594         bp->b_drvdata = drvdata;
 595 }
 596
 597 void *
 598 buf_fsprivate(buf_t bp) {
 599
 600         return (bp->b_fsprivate);
 601 }
 602
 603 void
 604 buf_setfsprivate(buf_t bp, void *fsprivate) {
 605
 606         bp->b_fsprivate = fsprivate;
 607 }
 608
 609 ucred_t
 610 buf_rcred(buf_t bp) {
 611
 612         return (bp->b_rcred);
 613 }
 614
 615 ucred_t
 616 buf_wcred(buf_t bp) {
 617
 618         return (bp->b_wcred);
 619 }
 620
 621 void *
 622 buf_upl(buf_t bp) {
 623
 624         return (bp->b_upl);
 625 }
 626
 627 uint32_t
 628 buf_uploffset(buf_t bp) {
 629
 630         return ((uint32_t)(bp->b_uploffset));
 631 }
 632
 633 proc_t
 634 buf_proc(buf_t bp) {
 635
 636         return (bp->b_proc);
 637 }
 638
 639
 640 errno_t
 641 buf_map(buf_t bp, caddr_t *io_addr)
 642 {
 643         buf_t           real_bp;
 644         vm_offset_t     vaddr;
 645         kern_return_t   kret;
 646
 647         if ( !(bp->b_flags & B_CLUSTER)) {
 648                 *io_addr = (caddr_t)bp->b_datap;
 649                 return (0);
 650         }
 651         real_bp = (buf_t)(bp->b_real_bp);
 652
 653         if (real_bp && real_bp->b_datap) {
 654                 /*
 655                  * b_real_bp is only valid if B_CLUSTER is SET
 656                  * if it's non-zero, than someone did a cluster_bp call
 657                  * if the backing physical pages were already mapped
 658                  * in before the call to cluster_bp (non-zero b_datap),
 659                  * than we just use that mapping
 660                  */
 661                 *io_addr = (caddr_t)real_bp->b_datap;
 662                 return (0);
 663         }
 664         kret = ubc_upl_map(bp->b_upl, &vaddr);    /* Map it in */
 665
 666         if (kret != KERN_SUCCESS) {
 667                 *io_addr = 0;
 668
 669                 return(ENOMEM);
 670         }
 671         vaddr += bp->b_uploffset;
 672
 673         *io_addr = (caddr_t)vaddr;
 674
 675         return (0);
 676 }
 677
 678 errno_t
 679 buf_unmap(buf_t bp)
 680 {
 681         buf_t           real_bp;
 682         kern_return_t   kret;
 683
 684         if ( !(bp->b_flags & B_CLUSTER))
 685                 return (0);
 686         /*
 687          * see buf_map for the explanation
 688          */
 689         real_bp = (buf_t)(bp->b_real_bp);
 690
 691         if (real_bp && real_bp->b_datap)
 692                 return (0);
 693
 694         if (bp->b_lflags & BL_IOBUF) {
 695                 /*
 696                  * when we commit these pages, we'll hit
 697                  * it with UPL_COMMIT_INACTIVE which
 698                  * will clear the reference bit that got
 699                  * turned on when we touched the mapping
 700                  */
 701                 bp->b_flags |= B_AGE;
 702         }
 703         kret = ubc_upl_unmap(bp->b_upl);
 704
 705         if (kret != KERN_SUCCESS)
 706                 return (EINVAL);
 707         return (0);
 708 }
 709
 710
 711 void
 712 buf_clear(buf_t bp) {
 713         caddr_t baddr;
 714
 715         if (buf_map(bp, &baddr) == 0) {
 716                 bzero(baddr, bp->b_bcount);
 717                 buf_unmap(bp);
 718         }
 719         bp->b_resid = 0;
 720 }
 721
 722
 723
 724 /*
 725  * Read or write a buffer that is not contiguous on disk.
 726  * buffer is marked done/error at the conclusion
 727  */
 728 static int
 729 buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
 730 {
 731         vnode_t vp = buf_vnode(bp);
 732         buf_t   io_bp;                   /* For reading or writing a single block */
 733         int     io_direction;
 734         int     io_resid;
 735         size_t  io_contig_bytes;
 736         daddr64_t io_blkno;
 737         int     error = 0;
 738         int     bmap_flags;
 739
 740         /*
 741          * save our starting point... the bp was already mapped
 742          * in buf_strategy before we got called
 743          * no sense doing it again.
 744          */
 745         io_blkno = bp->b_blkno;
 746         /*
 747          * Make sure we redo this mapping for the next I/O
 748          * i.e. this can never be a 'permanent' mapping
 749          */
 750         bp->b_blkno = bp->b_lblkno;
 751
 752         /*
 753          * Get an io buffer to do the deblocking
 754          */
 755         io_bp = alloc_io_buf(devvp, 0);
 756
 757         io_bp->b_lblkno = bp->b_lblkno;
 758         io_bp->b_datap  = bp->b_datap;
 759         io_resid        = bp->b_bcount;
 760         io_direction    = bp->b_flags & B_READ;
 761         io_contig_bytes = contig_bytes;
 762
 763         if (bp->b_flags & B_READ)
 764                 bmap_flags = VNODE_READ;
 765         else
 766                 bmap_flags = VNODE_WRITE;
 767
 768         for (;;) {
 769                 if (io_blkno == -1)
 770                         /*
 771                          * this is unexepected, but we'll allow for it
 772                          */
 773                         bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
 774                 else {
 775                         io_bp->b_bcount  = io_contig_bytes;
 776                         io_bp->b_bufsize = io_contig_bytes;
 777                         io_bp->b_resid   = io_contig_bytes;
 778                         io_bp->b_blkno   = io_blkno;
 779
 780                         buf_reset(io_bp, io_direction);
 781                         /*
 782                          * Call the device to do the I/O and wait for it
 783                          */
 784                         if ((error = VNOP_STRATEGY(io_bp)))
 785                                 break;
 786                         if ((error = (int)buf_biowait(io_bp)))
 787                                 break;
 788                         if (io_bp->b_resid) {
 789                                 io_resid -= (io_contig_bytes - io_bp->b_resid);
 790                                 break;
 791                         }
 792                 }
 793                 if ((io_resid -= io_contig_bytes) == 0)
 794                         break;
 795                 f_offset       += io_contig_bytes;
 796                 io_bp->b_datap += io_contig_bytes;
 797
 798                 /*
 799                  * Map the current position to a physical block number
 800                  */
 801                 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
 802                         break;
 803         }
 804         buf_free(io_bp);
 805
 806         if (error)
 807                 buf_seterror(bp, error);
 808         bp->b_resid = io_resid;
 809         /*
 810          * This I/O is now complete
 811          */
 812         buf_biodone(bp);
 813
 814         return error;
 815 }
 816
 817
 818 /*
 819  * struct vnop_strategy_args {
 820  *      struct buf *a_bp;
 821  * } *ap;
 822  */
 823 errno_t
 824 buf_strategy(vnode_t devvp, void *ap)
 825 {
 826         buf_t   bp = ((struct vnop_strategy_args *)ap)->a_bp;
 827         vnode_t vp = bp->b_vp;
 828         int     bmap_flags;
 829         errno_t error;
 830
 831         if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
 832                 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
 833         /*
 834          * associate the physical device with
 835          * with this buf_t even if we don't
 836          * end up issuing the I/O...
 837          */
 838         bp->b_dev = devvp->v_rdev;
 839
 840         if (bp->b_flags & B_READ)
 841                 bmap_flags = VNODE_READ;
 842         else
 843                 bmap_flags = VNODE_WRITE;
 844
 845         if ( !(bp->b_flags & B_CLUSTER)) {
 846
 847                 if ( (bp->b_upl) ) {
 848                         /*
 849                          * we have a UPL associated with this bp
 850                          * go through cluster_bp which knows how
 851                          * to deal with filesystem block sizes
 852                          * that aren't equal to the page size
 853                          */
 854                         return (cluster_bp(bp));
 855                 }
 856                 if (bp->b_blkno == bp->b_lblkno) {
 857                         off_t   f_offset;
 858                         size_t  contig_bytes;
 859
 860                         if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
 861                                 buf_seterror(bp, error);
 862                                 buf_biodone(bp);
 863
 864                                 return (error);
 865                         }
 866                         if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
 867                                 buf_seterror(bp, error);
 868                                 buf_biodone(bp);
 869
 870                                 return (error);
 871                         }
 872                         if (bp->b_blkno == -1)
 873                                 buf_clear(bp);
 874                         else if ((long)contig_bytes < bp->b_bcount)
 875                                 return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
 876                 }
 877                 if (bp->b_blkno == -1) {
 878                         buf_biodone(bp);
 879                         return (0);
 880                 }
 881         }
 882         /*
 883          * we can issue the I/O because...
 884          * either B_CLUSTER is set which
 885          * means that the I/O is properly set
 886          * up to be a multiple of the page size, or
 887          * we were able to successfully set up the
 888          * phsyical block mapping
 889          */
 890         return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap));
 891 }
 892
 893
 894
 895 buf_t
 896 buf_alloc(vnode_t vp)
 897 {
 898         return(alloc_io_buf(vp, 0));
 899 }
 900
 901 void
 902 buf_free(buf_t bp) {
 903
 904         free_io_buf(bp);
 905 }
 906
 907
 908
 909 void
 910 buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) {
 911         buf_t   bp;
 912         int     retval;
 913         struct  buflists local_iterblkhd;
 914         int     lock_flags = BAC_NOWAIT | BAC_REMOVE;
 915
 916         if (flags & BUF_SKIP_LOCKED)
 917                 lock_flags |= BAC_SKIP_LOCKED;
 918         if (flags & BUF_SKIP_NONLOCKED)
 919                 lock_flags |= BAC_SKIP_NONLOCKED;
 920
 921         lck_mtx_lock(buf_mtxp);
 922
 923         if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY))  {
 924                 lck_mtx_unlock(buf_mtxp);
 925                 return;
 926         }
 927         while (!LIST_EMPTY(&local_iterblkhd)) {
 928                 bp = LIST_FIRST(&local_iterblkhd);
 929                 LIST_REMOVE(bp, b_vnbufs);
 930                 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
 931
 932                 if (buf_acquire_locked(bp, lock_flags, 0, 0))
 933                         continue;
 934
 935                 lck_mtx_unlock(buf_mtxp);
 936
 937                 retval = callout(bp, arg);
 938
 939                 switch (retval) {
 940                 case BUF_RETURNED:
 941                         buf_brelse(bp);
 942                         break;
 943                 case BUF_CLAIMED:
 944                         break;
 945                 case BUF_RETURNED_DONE:
 946                         buf_brelse(bp);
 947                         lck_mtx_lock(buf_mtxp);
 948                         goto out;
 949                 case BUF_CLAIMED_DONE:
 950                         lck_mtx_lock(buf_mtxp);
 951                         goto out;
 952                 }
 953                 lck_mtx_lock(buf_mtxp);
 954         }
 955 out:
 956         buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
 957
 958         lck_mtx_unlock(buf_mtxp);
 959 }
 960
 961
 962 /*
 963  * Flush out and invalidate all buffers associated with a vnode.
 964  */
 965 int
 966 buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
 967 {
 968         buf_t   bp;
 969         int     error = 0;
 970         int     must_rescan = 1;
 971         struct  buflists local_iterblkhd;
 972
 973         lck_mtx_lock(buf_mtxp);
 974
 975         for (;;) {
 976                 if (must_rescan == 0)
 977                         /*
 978                          * the lists may not be empty, but all that's left at this
 979                          * point are metadata or B_LOCKED buffers which are being
 980                          * skipped... we know this because we made it through both
 981                          * the clean and dirty lists without dropping buf_mtxp...
 982                          * each time we drop buf_mtxp we bump "must_rescan"
 983                          */
 984                         break;
 985                 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
 986                         break;
 987                 must_rescan = 0;
 988                 /*
 989                  * iterate the clean list
 990                  */
 991                 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
 992                         goto try_dirty_list;
 993                 }
 994                 while (!LIST_EMPTY(&local_iterblkhd)) {
 995                         bp = LIST_FIRST(&local_iterblkhd);
 996
 997                         LIST_REMOVE(bp, b_vnbufs);
 998                         LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
 999
1000                         /*
1001                          * some filesystems distinguish meta data blocks with a negative logical block #
1002                          */
1003                         if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1004                                 continue;
1005
1006                         if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
1007                                 if (error == EDEADLK)
1008                                         /*
1009                                          * this buffer was marked B_LOCKED...
1010                                          * we didn't drop buf_mtxp, so we
1011                                          * we don't need to rescan
1012                                          */
1013                                         continue;
1014                                 if (error == EAGAIN) {
1015                                         /*
1016                                          * found a busy buffer... we blocked and
1017                                          * dropped buf_mtxp, so we're going to
1018                                          * need to rescan after this pass is completed
1019                                          */
1020                                         must_rescan++;
1021                                         continue;
1022                                 }
1023                                 /*
1024                                  * got some kind of 'real' error out of the msleep
1025                                  * in buf_acquire_locked, terminate the scan and return the error
1026                                  */
1027                                 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1028
1029                                 lck_mtx_unlock(buf_mtxp);
1030                                 return (error);
1031                         }
1032                         lck_mtx_unlock(buf_mtxp);
1033
1034                         SET(bp->b_flags, B_INVAL);
1035                         buf_brelse(bp);
1036
1037                         lck_mtx_lock(buf_mtxp);
1038
1039                         /*
1040                          * by dropping buf_mtxp, we allow new
1041                          * buffers to be added to the vnode list(s)
1042                          * we'll have to rescan at least once more
1043                          * if the queues aren't empty
1044                          */
1045                         must_rescan++;
1046                 }
1047                 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1048
1049 try_dirty_list:
1050                 /*
1051                  * Now iterate on dirty blks
1052                  */
1053                 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1054                         continue;
1055                 }
1056                 while (!LIST_EMPTY(&local_iterblkhd)) {
1057                         bp = LIST_FIRST(&local_iterblkhd);
1058
1059                         LIST_REMOVE(bp, b_vnbufs);
1060                         LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1061
1062                         /*
1063                          * some filesystems distinguish meta data blocks with a negative logical block #
1064                          */
1065                         if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1066                                 continue;
1067
1068                         if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
1069                                 if (error == EDEADLK)
1070                                         /*
1071                                          * this buffer was marked B_LOCKED...
1072                                          * we didn't drop buf_mtxp, so we
1073                                          * we don't need to rescan
1074                                          */
1075                                         continue;
1076                                 if (error == EAGAIN) {
1077                                         /*
1078                                          * found a busy buffer... we blocked and
1079                                          * dropped buf_mtxp, so we're going to
1080                                          * need to rescan after this pass is completed
1081                                          */
1082                                         must_rescan++;
1083                                         continue;
1084                                 }
1085                                 /*
1086                                  * got some kind of 'real' error out of the msleep
1087                                  * in buf_acquire_locked, terminate the scan and return the error
1088                                  */
1089                                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1090
1091                                 lck_mtx_unlock(buf_mtxp);
1092                                 return (error);
1093                         }
1094                         lck_mtx_unlock(buf_mtxp);
1095
1096                         SET(bp->b_flags, B_INVAL);
1097
1098                         if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1099                                 (void) VNOP_BWRITE(bp);
1100                         else
1101                                 buf_brelse(bp);
1102
1103                         lck_mtx_lock(buf_mtxp);
1104                         /*
1105                          * by dropping buf_mtxp, we allow new
1106                          * buffers to be added to the vnode list(s)
1107                          * we'll have to rescan at least once more
1108                          * if the queues aren't empty
1109                          */
1110                         must_rescan++;
1111                 }
1112                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1113         }
1114         lck_mtx_unlock(buf_mtxp);
1115
1116         return (0);
1117 }
1118
1119 void
1120 buf_flushdirtyblks(vnode_t vp, int wait, int flags, char *msg) {
1121         buf_t   bp;
1122         int     writes_issued = 0;
1123         errno_t error;
1124         int     busy = 0;
1125         struct  buflists local_iterblkhd;
1126         int     lock_flags = BAC_NOWAIT | BAC_REMOVE;
1127
1128         if (flags & BUF_SKIP_LOCKED)
1129                 lock_flags |= BAC_SKIP_LOCKED;
1130         if (flags & BUF_SKIP_NONLOCKED)
1131                 lock_flags |= BAC_SKIP_NONLOCKED;
1132 loop:
1133         lck_mtx_lock(buf_mtxp);
1134
1135         if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0)  {
1136                 while (!LIST_EMPTY(&local_iterblkhd)) {
1137                         bp = LIST_FIRST(&local_iterblkhd);
1138                         LIST_REMOVE(bp, b_vnbufs);
1139                         LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1140
1141                         if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY)
1142                                 busy++;
1143                         if (error)
1144                                 continue;
1145                         lck_mtx_unlock(buf_mtxp);
1146
1147                         bp->b_flags &= ~B_LOCKED;
1148
1149                         /*
1150                          * Wait for I/O associated with indirect blocks to complete,
1151                          * since there is no way to quickly wait for them below.
1152                          */
1153                         if ((bp->b_vp == vp) || (wait == 0))
1154                                 (void) buf_bawrite(bp);
1155                         else
1156                                 (void) VNOP_BWRITE(bp);
1157                         writes_issued++;
1158
1159                         lck_mtx_lock(buf_mtxp);
1160                 }
1161                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1162         }
1163         lck_mtx_unlock(buf_mtxp);
1164
1165         if (wait) {
1166                 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1167
1168                 if (vp->v_dirtyblkhd.lh_first && busy) {
1169                         /*
1170                          * we had one or more BUSY buffers on
1171                          * the dirtyblock list... most likely
1172                          * these are due to delayed writes that
1173                          * were moved to the bclean queue but
1174                          * have not yet been 'written'.
1175                          * if we issued some writes on the
1176                          * previous pass, we try again immediately
1177                          * if we didn't, we'll sleep for some time
1178                          * to allow the state to change...
1179                          */
1180                         if (writes_issued == 0) {
1181                                 (void)tsleep((caddr_t)&vp->v_numoutput,
1182                                              PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1183                         }
1184                         writes_issued = 0;
1185                         busy = 0;
1186
1187                         goto loop;
1188                 }
1189         }
1190 }
1191
1192
1193 /*
1194  * called with buf_mtxp held...
1195  * this lock protects the queue manipulation
1196  */
1197 static int
1198 buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1199 {
1200         struct buflists * listheadp;
1201
1202         if (flags & VBI_DIRTY)
1203                 listheadp = &vp->v_dirtyblkhd;
1204         else
1205                 listheadp = &vp->v_cleanblkhd;
1206
1207         while (vp->v_iterblkflags & VBI_ITER)   {
1208                 vp->v_iterblkflags |= VBI_ITERWANT;
1209                 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", 0);
1210         }
1211         if (LIST_EMPTY(listheadp)) {
1212                 LIST_INIT(iterheadp);
1213                 return(EINVAL);
1214         }
1215         vp->v_iterblkflags |= VBI_ITER;
1216
1217         iterheadp->lh_first = listheadp->lh_first;
1218         listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1219         LIST_INIT(listheadp);
1220
1221         return(0);
1222 }
1223
1224 /*
1225  * called with buf_mtxp held...
1226  * this lock protects the queue manipulation
1227  */
1228 static void
1229 buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1230 {
1231         struct buflists * listheadp;
1232         buf_t bp;
1233
1234         if (flags & VBI_DIRTY)
1235                 listheadp = &vp->v_dirtyblkhd;
1236         else
1237                 listheadp = &vp->v_cleanblkhd;
1238
1239         while (!LIST_EMPTY(iterheadp)) {
1240                 bp = LIST_FIRST(iterheadp);
1241                 LIST_REMOVE(bp, b_vnbufs);
1242                 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1243         }
1244         vp->v_iterblkflags &= ~VBI_ITER;
1245
1246         if  (vp->v_iterblkflags & VBI_ITERWANT)         {
1247                 vp->v_iterblkflags &= ~VBI_ITERWANT;
1248                 wakeup(&vp->v_iterblkflags);
1249         }
1250 }
1251
1252
1253 static void
1254 bremfree_locked(buf_t bp)
1255 {
1256         struct bqueues *dp = NULL;
1257         int whichq = -1;
1258
1259         /*
1260          * We only calculate the head of the freelist when removing
1261          * the last element of the list as that is the only time that
1262          * it is needed (e.g. to reset the tail pointer).
1263          *
1264          * NB: This makes an assumption about how tailq's are implemented.
1265          */
1266         if (bp->b_freelist.tqe_next == NULL) {
1267                 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1268                         if (dp->tqh_last == &bp->b_freelist.tqe_next)
1269                                 break;
1270                 if (dp == &bufqueues[BQUEUES])
1271                         panic("bremfree: lost tail");
1272         }
1273         TAILQ_REMOVE(dp, bp, b_freelist);
1274         whichq = bp->b_whichq;
1275 #if BALANCE_QUEUES
1276         bufqdec(whichq);
1277 #endif
1278         bp->b_whichq = -1;
1279         bp->b_timestamp = 0;
1280 }
1281
1282 /*
1283  * Associate a buffer with a vnode.
1284  */
1285 static void
1286 bgetvp(vnode_t vp, buf_t bp)
1287 {
1288
1289         if (bp->b_vp != vp)
1290                 panic("bgetvp: not free");
1291
1292         if (vp->v_type == VBLK || vp->v_type == VCHR)
1293                 bp->b_dev = vp->v_rdev;
1294         else
1295                 bp->b_dev = NODEV;
1296         /*
1297          * Insert onto list for new vnode.
1298          */
1299         lck_mtx_lock(buf_mtxp);
1300         bufinsvn(bp, &vp->v_cleanblkhd);
1301         lck_mtx_unlock(buf_mtxp);
1302 }
1303
1304 /*
1305  * Disassociate a buffer from a vnode.
1306  */
1307 static void
1308 brelvp(buf_t bp)
1309 {
1310         vnode_t vp;
1311
1312         if ((vp = bp->b_vp) == (vnode_t)NULL)
1313                 panic("brelvp: NULL vp");
1314         /*
1315          * Delete from old vnode list, if on one.
1316          */
1317         lck_mtx_lock(buf_mtxp);
1318         if (bp->b_vnbufs.le_next != NOLIST)
1319                 bufremvn(bp);
1320         lck_mtx_unlock(buf_mtxp);
1321
1322         bp->b_vp = (vnode_t)NULL;
1323 }
1324
1325 /*
1326  * Reassign a buffer from one vnode to another.
1327  * Used to assign file specific control information
1328  * (indirect blocks) to the vnode to which they belong.
1329  */
1330 static void
1331 buf_reassign(buf_t bp, vnode_t newvp)
1332 {
1333         register struct buflists *listheadp;
1334
1335         if (newvp == NULL) {
1336                 printf("buf_reassign: NULL");
1337                 return;
1338         }
1339         lck_mtx_lock(buf_mtxp);
1340
1341         /*
1342          * Delete from old vnode list, if on one.
1343          */
1344         if (bp->b_vnbufs.le_next != NOLIST)
1345                 bufremvn(bp);
1346         /*
1347          * If dirty, put on list of dirty buffers;
1348          * otherwise insert onto list of clean buffers.
1349          */
1350         if (ISSET(bp->b_flags, B_DELWRI))
1351                 listheadp = &newvp->v_dirtyblkhd;
1352         else
1353                 listheadp = &newvp->v_cleanblkhd;
1354         bufinsvn(bp, listheadp);
1355
1356         lck_mtx_unlock(buf_mtxp);
1357 }
1358
1359 static __inline__ void
1360 bufhdrinit(buf_t bp)
1361 {
1362         bzero((char *)bp, sizeof *bp);
1363         bp->b_dev = NODEV;
1364         bp->b_rcred = NOCRED;
1365         bp->b_wcred = NOCRED;
1366         bp->b_vnbufs.le_next = NOLIST;
1367         bp->b_flags = B_INVAL;
1368
1369         return;
1370 }
1371
1372 /*
1373  * Initialize buffers and hash links for buffers.
1374  */
1375 __private_extern__ void
1376 bufinit()
1377 {
1378         buf_t   bp;
1379         struct bqueues *dp;
1380         int     i;
1381         int     metabuf;
1382         long    whichq;
1383
1384         nbuf = 0;
1385         /* Initialize the buffer queues ('freelists') and the hash table */
1386         for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1387                 TAILQ_INIT(dp);
1388         bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
1389
1390         metabuf = max_nbuf_headers/8; /* reserved for meta buf */
1391
1392         /* Initialize the buffer headers */
1393         for (i = 0; i < max_nbuf_headers; i++) {
1394                 nbuf++;
1395                 bp = &buf[i];
1396                 bufhdrinit(bp);
1397
1398                 /*
1399                  * metabuf buffer headers on the meta-data list and
1400                  * rest of the buffer headers on the empty list
1401                  */
1402                 if (--metabuf)
1403                         whichq = BQ_META;
1404                 else
1405                         whichq = BQ_EMPTY;
1406
1407                 BLISTNONE(bp);
1408                 dp = &bufqueues[whichq];
1409                 binsheadfree(bp, dp, whichq);
1410                 binshash(bp, &invalhash);
1411         }
1412
1413         boot_nbuf = nbuf;
1414
1415         for (; i < nbuf + niobuf; i++) {
1416                 bp = &buf[i];
1417                 bufhdrinit(bp);
1418                 binsheadfree(bp, &iobufqueue, -1);
1419         }
1420
1421     /*
1422          * allocate lock group attribute and group
1423          */
1424     buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1425         buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1426
1427         /*
1428          * allocate the lock attribute
1429          */
1430         buf_mtx_attr = lck_attr_alloc_init();
1431
1432         /*
1433          * allocate and initialize mutex's for the buffer and iobuffer pools
1434          */
1435         buf_mtxp        = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1436         iobuffer_mtxp   = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1437
1438         if (iobuffer_mtxp == NULL)
1439                 panic("couldn't create iobuffer mutex");
1440
1441         if (buf_mtxp == NULL)
1442                 panic("couldn't create buf mutex");
1443
1444         /*
1445          * allocate and initialize cluster specific global locks...
1446          */
1447         cluster_init();
1448
1449         printf("using %d buffer headers and %d cluster IO buffer headers\n",
1450                 nbuf, niobuf);
1451
1452         /* Set up zones used by the buffer cache */
1453         bufzoneinit();
1454
1455         /* start the bcleanbuf() thread */
1456         bcleanbuf_thread_init();
1457
1458 #if BALANCE_QUEUES
1459         {
1460         static void bufq_balance_thread_init();
1461         /* create a thread to do dynamic buffer queue balancing */
1462         bufq_balance_thread_init();
1463         }
1464 #endif /* notyet */
1465 }
1466
1467 static struct buf *
1468 bio_doread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, int async, int queuetype)
1469 {
1470         buf_t   bp;
1471
1472         bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
1473
1474         /*
1475          * If buffer does not have data valid, start a read.
1476          * Note that if buffer is B_INVAL, buf_getblk() won't return it.
1477          * Therefore, it's valid if it's I/O has completed or been delayed.
1478          */
1479         if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
1480                 struct proc *p;
1481
1482                 p = current_proc();
1483
1484                 /* Start I/O for the buffer (keeping credentials). */
1485                 SET(bp->b_flags, B_READ | async);
1486                 if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
1487                         kauth_cred_ref(cred);
1488                         bp->b_rcred = cred;
1489                 }
1490
1491                 VNOP_STRATEGY(bp);
1492
1493                 trace(TR_BREADMISS, pack(vp, size), blkno);
1494
1495                 /* Pay for the read. */
1496                 if (p && p->p_stats)
1497                         p->p_stats->p_ru.ru_inblock++;          /* XXX */
1498
1499                 if (async) {
1500                         /*
1501                          * since we asked for an ASYNC I/O
1502                          * the biodone will do the brelse
1503                          * we don't want to pass back a bp
1504                          * that we don't 'own'
1505                          */
1506                         bp = NULL;
1507                 }
1508         } else if (async) {
1509                 buf_brelse(bp);
1510                 bp = NULL;
1511         }
1512
1513         trace(TR_BREADHIT, pack(vp, size), blkno);
1514
1515         return (bp);
1516 }
1517
1518 /*
1519  * Perform the reads for buf_breadn() and buf_meta_breadn().
1520  * Trivial modification to the breada algorithm presented in Bach (p.55).
1521  */
1522 static errno_t
1523 do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
1524                    int nrablks, ucred_t cred, buf_t *bpp, int queuetype)
1525 {
1526         buf_t   bp;
1527         int     i;
1528
1529         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
1530
1531         /*
1532          * For each of the read-ahead blocks, start a read, if necessary.
1533          */
1534         for (i = 0; i < nrablks; i++) {
1535                 /* If it's in the cache, just go on to next one. */
1536                 if (incore(vp, rablks[i]))
1537                         continue;
1538
1539                 /* Get a buffer for the read-ahead block */
1540                 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
1541         }
1542
1543         /* Otherwise, we had to start a read for it; wait until it's valid. */
1544         return (buf_biowait(bp));
1545 }
1546
1547
1548 /*
1549  * Read a disk block.
1550  * This algorithm described in Bach (p.54).
1551  */
1552 errno_t
1553 buf_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1554 {
1555         buf_t   bp;
1556
1557         /* Get buffer for block. */
1558         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
1559
1560         /* Wait for the read to complete, and return result. */
1561         return (buf_biowait(bp));
1562 }
1563
1564 /*
1565  * Read a disk block. [bread() for meta-data]
1566  * This algorithm described in Bach (p.54).
1567  */
1568 errno_t
1569 buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1570 {
1571         buf_t   bp;
1572
1573         /* Get buffer for block. */
1574         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
1575
1576         /* Wait for the read to complete, and return result. */
1577         return (buf_biowait(bp));
1578 }
1579
1580 /*
1581  * Read-ahead multiple disk blocks. The first is sync, the rest async.
1582  */
1583 errno_t
1584 buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1585 {
1586         return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
1587 }
1588
1589 /*
1590  * Read-ahead multiple disk blocks. The first is sync, the rest async.
1591  * [buf_breadn() for meta-data]
1592  */
1593 errno_t
1594 buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1595 {
1596         return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
1597 }
1598
1599 /*
1600  * Block write.  Described in Bach (p.56)
1601  */
1602 errno_t
1603 buf_bwrite(buf_t bp)
1604 {
1605         int     sync, wasdelayed;
1606         errno_t rv;
1607         proc_t  p = current_proc();
1608         vnode_t vp = bp->b_vp;
1609
1610         if (bp->b_datap == 0) {
1611                 if (brecover_data(bp) == 0)
1612                         return (0);
1613         }
1614         /* Remember buffer type, to switch on it later. */
1615         sync = !ISSET(bp->b_flags, B_ASYNC);
1616         wasdelayed = ISSET(bp->b_flags, B_DELWRI);
1617         CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
1618
1619         if (wasdelayed)
1620                 OSAddAtomic(-1, &nbdwrite);
1621
1622         if (!sync) {
1623                 /*
1624                  * If not synchronous, pay for the I/O operation and make
1625                  * sure the buf is on the correct vnode queue.  We have
1626                  * to do this now, because if we don't, the vnode may not
1627                  * be properly notified that its I/O has completed.
1628                  */
1629                 if (wasdelayed)
1630                         buf_reassign(bp, vp);
1631                 else
1632                 if (p && p->p_stats)
1633                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
1634         }
1635         trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
1636
1637         /* Initiate disk write.  Make sure the appropriate party is charged. */
1638
1639         OSAddAtomic(1, &vp->v_numoutput);
1640
1641         VNOP_STRATEGY(bp);
1642
1643         if (sync) {
1644                 /*
1645                  * If I/O was synchronous, wait for it to complete.
1646                  */
1647                 rv = buf_biowait(bp);
1648
1649                 /*
1650                  * Pay for the I/O operation, if it's not been paid for, and
1651                  * make sure it's on the correct vnode queue. (async operatings
1652                  * were payed for above.)
1653                  */
1654                 if (wasdelayed)
1655                         buf_reassign(bp, vp);
1656                 else
1657                 if (p && p->p_stats)
1658                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
1659
1660                 /* Release the buffer. */
1661                 // XXXdbg - only if the unused bit is set
1662                 if (!ISSET(bp->b_flags, B_NORELSE)) {
1663                     buf_brelse(bp);
1664                 } else {
1665                     CLR(bp->b_flags, B_NORELSE);
1666                 }
1667
1668                 return (rv);
1669         } else {
1670                 return (0);
1671         }
1672 }
1673
1674 int
1675 vn_bwrite(ap)
1676         struct vnop_bwrite_args *ap;
1677 {
1678         return (buf_bwrite(ap->a_bp));
1679 }
1680
1681 /*
1682  * Delayed write.
1683  *
1684  * The buffer is marked dirty, but is not queued for I/O.
1685  * This routine should be used when the buffer is expected
1686  * to be modified again soon, typically a small write that
1687  * partially fills a buffer.
1688  *
1689  * NB: magnetic tapes cannot be delayed; they must be
1690  * written in the order that the writes are requested.
1691  *
1692  * Described in Leffler, et al. (pp. 208-213).
1693  *
1694  * Note: With the abilitty to allocate additional buffer
1695  * headers, we can get in to the situation where "too" many
1696  * buf_bdwrite()s can create situation where the kernel can create
1697  * buffers faster than the disks can service. Doing a buf_bawrite() in
1698  * cases were we have "too many" outstanding buf_bdwrite()s avoids that.
1699  */
1700 __private_extern__ int
1701 bdwrite_internal(buf_t bp, int return_error)
1702 {
1703         proc_t  p  = current_proc();
1704         vnode_t vp = bp->b_vp;
1705
1706         /*
1707          * If the block hasn't been seen before:
1708          *      (1) Mark it as having been seen,
1709          *      (2) Charge for the write.
1710          *      (3) Make sure it's on its vnode's correct block list,
1711          */
1712         if (!ISSET(bp->b_flags, B_DELWRI)) {
1713                 SET(bp->b_flags, B_DELWRI);
1714                 if (p && p->p_stats)
1715                         p->p_stats->p_ru.ru_oublock++;          /* XXX */
1716                 OSAddAtomic(1, &nbdwrite);
1717                 buf_reassign(bp, vp);
1718         }
1719
1720         /* If this is a tape block, write it the block now. */
1721         if (ISSET(bp->b_flags, B_TAPE)) {
1722                 VNOP_BWRITE(bp);
1723                 return (0);
1724         }
1725
1726         /*
1727          * if we're not LOCKED, but the total number of delayed writes
1728          * has climbed above 75% of the total buffers in the system
1729          * return an error if the caller has indicated that it can
1730          * handle one in this case, otherwise schedule the I/O now
1731          * this is done to prevent us from allocating tons of extra
1732          * buffers when dealing with virtual disks (i.e. DiskImages),
1733          * because additional buffers are dynamically allocated to prevent
1734          * deadlocks from occurring
1735          *
1736          * however, can't do a buf_bawrite() if the LOCKED bit is set because the
1737          * buffer is part of a transaction and can't go to disk until
1738          * the LOCKED bit is cleared.
1739          */
1740         if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
1741                 if (return_error)
1742                         return (EAGAIN);
1743                 /*
1744                  * If the vnode has "too many" write operations in progress
1745                  * wait for them to finish the IO
1746                  */
1747                 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (char *)"buf_bdwrite");
1748
1749                 return (buf_bawrite(bp));
1750         }
1751
1752         /* Otherwise, the "write" is done, so mark and release the buffer. */
1753         SET(bp->b_flags, B_DONE);
1754         buf_brelse(bp);
1755         return (0);
1756 }
1757
1758 errno_t
1759 buf_bdwrite(buf_t bp)
1760 {
1761         return (bdwrite_internal(bp, 0));
1762 }
1763
1764
1765 /*
1766  * Asynchronous block write; just an asynchronous buf_bwrite().
1767  *
1768  * Note: With the abilitty to allocate additional buffer
1769  * headers, we can get in to the situation where "too" many
1770  * buf_bawrite()s can create situation where the kernel can create
1771  * buffers faster than the disks can service.
1772  * We limit the number of "in flight" writes a vnode can have to
1773  * avoid this.
1774  */
1775 static int
1776 bawrite_internal(buf_t bp, int throttle)
1777 {
1778         vnode_t vp = bp->b_vp;
1779
1780         if (vp) {
1781                 if (throttle)
1782                         /*
1783                          * If the vnode has "too many" write operations in progress
1784                          * wait for them to finish the IO
1785                          */
1786                         (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
1787                 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
1788                         /*
1789                          * return to the caller and
1790                          * let him decide what to do
1791                          */
1792                         return (EWOULDBLOCK);
1793         }
1794         SET(bp->b_flags, B_ASYNC);
1795
1796         return (VNOP_BWRITE(bp));
1797 }
1798
1799 errno_t
1800 buf_bawrite(buf_t bp)
1801 {
1802         return (bawrite_internal(bp, 1));
1803 }
1804
1805
1806 /*
1807  * Release a buffer on to the free lists.
1808  * Described in Bach (p. 46).
1809  */
1810 void
1811 buf_brelse(buf_t bp)
1812 {
1813         struct bqueues *bufq;
1814         long    whichq;
1815         upl_t   upl;
1816         int need_wakeup = 0;
1817         int need_bp_wakeup = 0;
1818
1819
1820         if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
1821                 panic("buf_brelse: bad buffer = %x\n", bp);
1822
1823 #ifdef JOE_DEBUG
1824         bp->b_stackbrelse[0] = __builtin_return_address(0);
1825         bp->b_stackbrelse[1] = __builtin_return_address(1);
1826         bp->b_stackbrelse[2] = __builtin_return_address(2);
1827         bp->b_stackbrelse[3] = __builtin_return_address(3);
1828         bp->b_stackbrelse[4] = __builtin_return_address(4);
1829         bp->b_stackbrelse[5] = __builtin_return_address(5);
1830
1831         bp->b_lastbrelse = current_thread();
1832         bp->b_tag = 0;
1833 #endif
1834         if (bp->b_lflags & BL_IOBUF) {
1835                 free_io_buf(bp);
1836                 return;
1837         }
1838
1839         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
1840                      bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_datap,
1841                      bp->b_flags, 0);
1842
1843         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1844
1845         /*
1846          * if we're invalidating a buffer that has the B_FILTER bit
1847          * set then call the b_iodone function so it gets cleaned
1848          * up properly.
1849          *
1850          * the HFS journal code depends on this
1851          */
1852         if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
1853                 if (ISSET(bp->b_flags, B_FILTER)) {     /* if necessary, call out */
1854                         void    (*iodone_func)(struct buf *, void *) = bp->b_iodone;
1855                         void    *arg = (void *)bp->b_transaction;
1856
1857                         CLR(bp->b_flags, B_FILTER);     /* but note callout done */
1858                         bp->b_iodone = NULL;
1859                         bp->b_transaction = NULL;
1860
1861                         if (iodone_func == NULL) {
1862                                 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
1863                         }
1864                         (*iodone_func)(bp, arg);
1865                 }
1866         }
1867         /*
1868          * I/O is done. Cleanup the UPL state
1869          */
1870         upl = bp->b_upl;
1871
1872         if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
1873                 kern_return_t kret;
1874                 int           upl_flags;
1875
1876                 if ( (upl == NULL) ) {
1877                         if ( !ISSET(bp->b_flags, B_INVAL)) {
1878                                 kret = ubc_create_upl(bp->b_vp,
1879                                                       ubc_blktooff(bp->b_vp, bp->b_lblkno),
1880                                                       bp->b_bufsize,
1881                                                       &upl,
1882                                                       NULL,
1883                                                       UPL_PRECIOUS);
1884
1885                                 if (kret != KERN_SUCCESS)
1886                                         panic("brelse: Failed to create UPL");
1887 #ifdef  UPL_DEBUG
1888                                 upl_ubc_alias_set(upl, bp, 5);
1889 #endif /* UPL_DEBUG */
1890                         }
1891                 } else {
1892                         if (bp->b_datap) {
1893                                 kret = ubc_upl_unmap(upl);
1894
1895                                 if (kret != KERN_SUCCESS)
1896                                         panic("ubc_upl_unmap failed");
1897                                 bp->b_datap = (uintptr_t)NULL;
1898                         }
1899                 }
1900                 if (upl) {
1901                         if (bp->b_flags & (B_ERROR | B_INVAL)) {
1902                                 if (bp->b_flags & (B_READ | B_INVAL))
1903                                         upl_flags = UPL_ABORT_DUMP_PAGES;
1904                                 else
1905                                         upl_flags = 0;
1906
1907                                 ubc_upl_abort(upl, upl_flags);
1908                         } else {
1909                                 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
1910                                         upl_flags = UPL_COMMIT_SET_DIRTY ;
1911                                 else
1912                                         upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
1913
1914                                 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
1915                                                      UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1916                         }
1917                         bp->b_upl = NULL;
1918                 }
1919         } else {
1920                 if ( (upl) )
1921                         panic("brelse: UPL set for non VREG; vp=%x", bp->b_vp);
1922         }
1923
1924         /*
1925          * If it's locked, don't report an error; try again later.
1926          */
1927         if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
1928                 CLR(bp->b_flags, B_ERROR);
1929         /*
1930          * If it's not cacheable, or an error, mark it invalid.
1931          */
1932         if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
1933                 SET(bp->b_flags, B_INVAL);
1934
1935         if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
1936                 /*
1937                  * If it's invalid or empty, dissociate it from its vnode
1938                  * and put on the head of the appropriate queue.
1939                  */
1940                 if (bp->b_vp)
1941                         brelvp(bp);
1942
1943                 if (ISSET(bp->b_flags, B_DELWRI))
1944                         OSAddAtomic(-1, &nbdwrite);
1945
1946                 CLR(bp->b_flags, (B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE));
1947                 /*
1948                  * Determine which queue the buffer should be on, then put it there.
1949                  */
1950                 if (bp->b_bufsize <= 0)
1951                         whichq = BQ_EMPTY;      /* no data */
1952                 else if (ISSET(bp->b_flags, B_META))
1953                         whichq = BQ_META;               /* meta-data */
1954                 else
1955                         whichq = BQ_AGE;        /* invalid data */
1956                 bufq = &bufqueues[whichq];
1957
1958                 lck_mtx_lock(buf_mtxp);
1959
1960                 binsheadfree(bp, bufq, whichq);
1961         } else {
1962                 /*
1963                  * It has valid data.  Put it on the end of the appropriate
1964                  * queue, so that it'll stick around for as long as possible.
1965                  */
1966                 if (ISSET(bp->b_flags, B_LOCKED))
1967                         whichq = BQ_LOCKED;             /* locked in core */
1968                 else if (ISSET(bp->b_flags, B_META))
1969                         whichq = BQ_META;               /* meta-data */
1970                 else if (ISSET(bp->b_flags, B_AGE))
1971                         whichq = BQ_AGE;                /* stale but valid data */
1972                 else
1973                         whichq = BQ_LRU;                /* valid data */
1974                 bufq = &bufqueues[whichq];
1975
1976                 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
1977
1978                 lck_mtx_lock(buf_mtxp);
1979
1980                 binstailfree(bp, bufq, whichq);
1981         }
1982         if (needbuffer) {
1983                 /*
1984                  * needbuffer is a global
1985                  * we're currently using buf_mtxp to protect it
1986                  * delay doing the actual wakeup until after
1987                  * we drop buf_mtxp
1988                  */
1989                 needbuffer = 0;
1990                 need_wakeup = 1;
1991         }
1992         if (ISSET(bp->b_lflags, BL_WANTED)) {
1993                 /*
1994                  * delay the actual wakeup until after we
1995                  * clear BL_BUSY and we've dropped buf_mtxp
1996                  */
1997                 need_bp_wakeup = 1;
1998         }
1999         /*
2000          * Unlock the buffer.
2001          */
2002         CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2003
2004         lck_mtx_unlock(buf_mtxp);
2005
2006         if (need_wakeup) {
2007                 /*
2008                  * Wake up any processes waiting for any buffer to become free.
2009                  */
2010                 wakeup(&needbuffer);
2011         }
2012         if (need_bp_wakeup) {
2013                 /*
2014                  * Wake up any proceeses waiting for _this_ buffer to become free.
2015                  */
2016                 wakeup(bp);
2017         }
2018         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2019                      (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
2020 }
2021
2022 /*
2023  * Determine if a block is in the cache.
2024  * Just look on what would be its hash chain.  If it's there, return
2025  * a pointer to it, unless it's marked invalid.  If it's marked invalid,
2026  * we normally don't return the buffer, unless the caller explicitly
2027  * wants us to.
2028  */
2029 static boolean_t
2030 incore(vnode_t vp, daddr64_t blkno)
2031 {
2032         boolean_t retval;
2033
2034         lck_mtx_lock(buf_mtxp);
2035
2036         if (incore_locked(vp, blkno))
2037                 retval = TRUE;
2038         else
2039                 retval = FALSE;
2040         lck_mtx_unlock(buf_mtxp);
2041
2042         return (retval);
2043 }
2044
2045
2046 static buf_t
2047 incore_locked(vnode_t vp, daddr64_t blkno)
2048 {
2049         struct buf *bp;
2050
2051         bp = BUFHASH(vp, blkno)->lh_first;
2052
2053         /* Search hash chain */
2054         for (; bp != NULL; bp = bp->b_hash.le_next) {
2055                 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2056                     !ISSET(bp->b_flags, B_INVAL)) {
2057                         return (bp);
2058                 }
2059         }
2060         return (0);
2061 }
2062
2063
2064 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2065 /*
2066  * Get a block of requested size that is associated with
2067  * a given vnode and block offset. If it is found in the
2068  * block cache, mark it as having been found, make it busy
2069  * and return it. Otherwise, return an empty block of the
2070  * correct size. It is up to the caller to insure that the
2071  * cached blocks be of the correct size.
2072  */
2073 buf_t
2074 buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2075 {
2076         buf_t bp;
2077         int   err;
2078         upl_t upl;
2079         upl_page_info_t *pl;
2080         kern_return_t kret;
2081         int ret_only_valid;
2082         struct timespec ts;
2083         int upl_flags;
2084
2085         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2086                      (int)(blkno * PAGE_SIZE), size, operation, 0, 0);
2087
2088         ret_only_valid = operation & BLK_ONLYVALID;
2089         operation &= ~BLK_ONLYVALID;
2090 start:
2091         lck_mtx_lock(buf_mtxp);
2092 start_locked:
2093         if ((bp = incore_locked(vp, blkno))) {
2094                 /*
2095                  * Found in the Buffer Cache
2096                  */
2097                 if (ISSET(bp->b_lflags, BL_BUSY)) {
2098                         /*
2099                          * but is busy
2100                          */
2101                         switch (operation) {
2102                         case BLK_READ:
2103                         case BLK_WRITE:
2104                         case BLK_META:
2105                                 SET(bp->b_lflags, BL_WANTED);
2106                                 bufstats.bufs_busyincore++;
2107
2108                                 /*
2109                                  * don't retake the mutex after being awakened...
2110                                  * the time out is in msecs
2111                                  */
2112                                 ts.tv_sec = (slptimeo/1000);
2113                                 ts.tv_nsec = (slptimeo % 1000) * 10  * NSEC_PER_USEC * 1000;
2114
2115                                 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2116
2117                                 /*
2118                                  * Callers who call with PCATCH or timeout are
2119                                  * willing to deal with the NULL pointer
2120                                  */
2121                                 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2122                                         return (NULL);
2123                                 goto start;
2124                                 /*NOTREACHED*/
2125                                 break;
2126
2127                         default:
2128                                 /*
2129                                  * unknown operation requested
2130                                  */
2131                                 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2132                                 /*NOTREACHED*/
2133                                 break;
2134                         }
2135                 } else {
2136                         /*
2137                          * buffer in core and not busy
2138                          */
2139                         if ( (bp->b_upl) )
2140                                 panic("buffer has UPL, but not marked BUSY: %x", bp);
2141                         SET(bp->b_lflags, BL_BUSY);
2142                         SET(bp->b_flags, B_CACHE);
2143 #ifdef JOE_DEBUG
2144                         bp->b_owner = current_thread();
2145                         bp->b_tag   = 1;
2146 #endif
2147                         bremfree_locked(bp);
2148                         bufstats.bufs_incore++;
2149
2150                         lck_mtx_unlock(buf_mtxp);
2151
2152                         if ( !ret_only_valid)
2153                                 allocbuf(bp, size);
2154
2155                         upl_flags = 0;
2156                         switch (operation) {
2157                         case BLK_WRITE:
2158                                 /*
2159                                  * "write" operation:  let the UPL subsystem
2160                                  * know that we intend to modify the buffer
2161                                  * cache pages we're gathering.
2162                                  */
2163                                 upl_flags |= UPL_WILL_MODIFY;
2164                         case BLK_READ:
2165                                 upl_flags |= UPL_PRECIOUS;
2166                                 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2167                                         kret = ubc_create_upl(vp,
2168                                                               ubc_blktooff(vp, bp->b_lblkno),
2169                                                               bp->b_bufsize,
2170                                                               &upl,
2171                                                               &pl,
2172                                                               upl_flags);
2173                                         if (kret != KERN_SUCCESS)
2174                                                 panic("Failed to create UPL");
2175
2176                                         bp->b_upl = upl;
2177
2178                                         if (upl_valid_page(pl, 0)) {
2179                                                 if (upl_dirty_page(pl, 0))
2180                                                         SET(bp->b_flags, B_WASDIRTY);
2181                                                 else
2182                                                         CLR(bp->b_flags, B_WASDIRTY);
2183                                         } else
2184                                                 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
2185
2186                                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2187
2188                                         if (kret != KERN_SUCCESS)
2189                                                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2190                                 }
2191                                 break;
2192
2193                         case BLK_META:
2194                                 /*
2195                                  * VM is not involved in IO for the meta data
2196                                  * buffer already has valid data
2197                                  */
2198                                 break;
2199
2200                         default:
2201                                 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
2202                                 /*NOTREACHED*/
2203                                 break;
2204                         }
2205                 }
2206         } else { /* not incore() */
2207                 int queue = BQ_EMPTY; /* Start with no preference */
2208
2209                 if (ret_only_valid) {
2210                         lck_mtx_unlock(buf_mtxp);
2211                         return (NULL);
2212                 }
2213
2214                 if ((UBCINVALID(vp)) || !(UBCINFOEXISTS(vp)))
2215                         operation = BLK_META;
2216
2217                 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
2218                         goto start_locked;
2219
2220                 /*
2221                  * getnewbuf may block for a number of different reasons...
2222                  * if it does, it's then possible for someone else to
2223                  * create a buffer for the same block and insert it into
2224                  * the hash... if we see it incore at this point we dump
2225                  * the buffer we were working on and start over
2226                  */
2227                 if (incore_locked(vp, blkno)) {
2228                         SET(bp->b_flags, B_INVAL);
2229                         binshash(bp, &invalhash);
2230
2231                         lck_mtx_unlock(buf_mtxp);
2232
2233                         buf_brelse(bp);
2234                         goto start;
2235                 }
2236                 /*
2237                  * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
2238                  *       CALLED!  BE CAREFUL.
2239                  */
2240
2241                 /*
2242                  * mark the buffer as B_META if indicated
2243                  * so that when buffer is released it will goto META queue
2244                  */
2245                 if (operation == BLK_META)
2246                         SET(bp->b_flags, B_META);
2247
2248                 bp->b_blkno = bp->b_lblkno = blkno;
2249                 bp->b_vp = vp;
2250
2251                 /*
2252                  * Insert in the hash so that incore() can find it
2253                  */
2254                 binshash(bp, BUFHASH(vp, blkno));
2255
2256                 lck_mtx_unlock(buf_mtxp);
2257
2258                 bgetvp(vp, bp);
2259
2260                 allocbuf(bp, size);
2261
2262                 upl_flags = 0;
2263                 switch (operation) {
2264                 case BLK_META:
2265                         /*
2266                          * buffer data is invalid...
2267                          *
2268                          * I don't want to have to retake buf_mtxp,
2269                          * so the miss and vmhits counters are done
2270                          * with Atomic updates... all other counters
2271                          * in bufstats are protected with either
2272                          * buf_mtxp or iobuffer_mtxp
2273                          */
2274                         OSAddAtomic(1, &bufstats.bufs_miss);
2275                         break;
2276
2277                 case BLK_WRITE:
2278                         /*
2279                          * "write" operation:  let the UPL subsystem know
2280                          * that we intend to modify the buffer cache pages
2281                          * we're gathering.
2282                          */
2283                         upl_flags |= UPL_WILL_MODIFY;
2284                 case BLK_READ:
2285                   {     off_t   f_offset;
2286                         size_t  contig_bytes;
2287                         int     bmap_flags;
2288
2289                         if ( (bp->b_upl) )
2290                                 panic("bp already has UPL: %x",bp);
2291
2292                         f_offset = ubc_blktooff(vp, blkno);
2293
2294                         upl_flags |= UPL_PRECIOUS;
2295                         kret = ubc_create_upl(vp,
2296                                               f_offset,
2297                                               bp->b_bufsize,
2298                                               &upl,
2299                                               &pl,
2300                                               upl_flags);
2301
2302                         if (kret != KERN_SUCCESS)
2303                                 panic("Failed to create UPL");
2304 #ifdef  UPL_DEBUG
2305                         upl_ubc_alias_set(upl, bp, 4);
2306 #endif /* UPL_DEBUG */
2307                         bp->b_upl = upl;
2308
2309                         if (upl_valid_page(pl, 0)) {
2310
2311                                 if (operation == BLK_READ)
2312                                         bmap_flags = VNODE_READ;
2313                                 else
2314                                         bmap_flags = VNODE_WRITE;
2315
2316                                 SET(bp->b_flags, B_CACHE | B_DONE);
2317
2318                                 OSAddAtomic(1, &bufstats.bufs_vmhits);
2319
2320                                 bp->b_validoff = 0;
2321                                 bp->b_dirtyoff = 0;
2322
2323                                 if (upl_dirty_page(pl, 0)) {
2324                                         /* page is dirty */
2325                                         SET(bp->b_flags, B_WASDIRTY);
2326
2327                                         bp->b_validend = bp->b_bcount;
2328                                         bp->b_dirtyend = bp->b_bcount;
2329                                 } else {
2330                                         /* page is clean */
2331                                         bp->b_validend = bp->b_bcount;
2332                                         bp->b_dirtyend = 0;
2333                                 }
2334                                 /*
2335                                  * try to recreate the physical block number associated with
2336                                  * this buffer...
2337                                  */
2338                                 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
2339                                         panic("getblk: VNOP_BLOCKMAP failed");
2340                                 /*
2341                                  * if the extent represented by this buffer
2342                                  * is not completely physically contiguous on
2343                                  * disk, than we can't cache the physical mapping
2344                                  * in the buffer header
2345                                  */
2346                                 if ((long)contig_bytes < bp->b_bcount)
2347                                         bp->b_blkno = bp->b_lblkno;
2348                         } else {
2349                                 OSAddAtomic(1, &bufstats.bufs_miss);
2350                         }
2351                         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2352
2353                         if (kret != KERN_SUCCESS)
2354                                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2355                         break;
2356                   }
2357                 default:
2358                         panic("getblk: paging or unknown operation - %x", operation);
2359                         /*NOTREACHED*/
2360                         break;
2361                 }
2362         }
2363         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
2364                      (int)bp, (int)bp->b_datap, bp->b_flags, 3, 0);
2365
2366 #ifdef JOE_DEBUG
2367         bp->b_stackgetblk[0] = __builtin_return_address(0);
2368         bp->b_stackgetblk[1] = __builtin_return_address(1);
2369         bp->b_stackgetblk[2] = __builtin_return_address(2);
2370         bp->b_stackgetblk[3] = __builtin_return_address(3);
2371         bp->b_stackgetblk[4] = __builtin_return_address(4);
2372         bp->b_stackgetblk[5] = __builtin_return_address(5);
2373 #endif
2374         return (bp);
2375 }
2376
2377 /*
2378  * Get an empty, disassociated buffer of given size.
2379  */
2380 buf_t
2381 buf_geteblk(size)
2382         int size;
2383 {
2384         buf_t   bp;
2385         int queue = BQ_EMPTY;
2386
2387         lck_mtx_lock(buf_mtxp);
2388
2389         while ((bp = getnewbuf(0, 0, &queue)) == 0)
2390                 ;
2391         SET(bp->b_flags, (B_META|B_INVAL));
2392
2393 #if DIAGNOSTIC
2394         assert(queue == BQ_EMPTY);
2395 #endif /* DIAGNOSTIC */
2396         /* XXX need to implement logic to deal with other queues */
2397
2398         binshash(bp, &invalhash);
2399         bufstats.bufs_eblk++;
2400
2401         lck_mtx_unlock(buf_mtxp);
2402
2403         allocbuf(bp, size);
2404
2405         return (bp);
2406 }
2407
2408 /*
2409  * Zones for the meta data buffers
2410  */
2411
2412 #define MINMETA 512
2413 #define MAXMETA 4096
2414
2415 struct meta_zone_entry {
2416         zone_t mz_zone;
2417         vm_size_t mz_size;
2418         vm_size_t mz_max;
2419         char *mz_name;
2420 };
2421
2422 struct meta_zone_entry meta_zones[] = {
2423         {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2424         {NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
2425         {NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
2426         {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2427         {NULL, 0, 0, "" } /* End */
2428 };
2429
2430 /*
2431  * Initialize the meta data zones
2432  */
2433 static void
2434 bufzoneinit(void)
2435 {
2436         int i;
2437
2438         for (i = 0; meta_zones[i].mz_size != 0; i++) {
2439                 meta_zones[i].mz_zone =
2440                                 zinit(meta_zones[i].mz_size,
2441                                         meta_zones[i].mz_max,
2442                                         PAGE_SIZE,
2443                                         meta_zones[i].mz_name);
2444         }
2445         buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2446 }
2447
2448 static __inline__ zone_t
2449 getbufzone(size_t size)
2450 {
2451         int i;
2452
2453         if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2454                 panic("getbufzone: incorect size = %d", size);
2455
2456         for (i = 0; meta_zones[i].mz_size != 0; i++) {
2457                 if (meta_zones[i].mz_size >= size)
2458                         break;
2459         }
2460
2461         return (meta_zones[i].mz_zone);
2462 }
2463
2464 /*
2465  * With UBC, there is no need to expand / shrink the file data
2466  * buffer. The VM uses the same pages, hence no waste.
2467  * All the file data buffers can have one size.
2468  * In fact expand / shrink would be an expensive operation.
2469  *
2470  * Only exception to this is meta-data buffers. Most of the
2471  * meta data operations are smaller than PAGE_SIZE. Having the
2472  * meta-data buffers grow and shrink as needed, optimizes use
2473  * of the kernel wired memory.
2474  */
2475
2476 int
2477 allocbuf(buf_t bp, int size)
2478 {
2479         vm_size_t desired_size;
2480
2481         desired_size = roundup(size, CLBYTES);
2482
2483         if (desired_size < PAGE_SIZE)
2484                 desired_size = PAGE_SIZE;
2485         if (desired_size > MAXBSIZE)
2486                 panic("allocbuf: buffer larger than MAXBSIZE requested");
2487
2488         if (ISSET(bp->b_flags, B_META)) {
2489                 zone_t zprev, z;
2490                 int    nsize = roundup(size, MINMETA);
2491
2492                 if (bp->b_datap) {
2493                         vm_offset_t elem = (vm_offset_t)bp->b_datap;
2494
2495                         if (ISSET(bp->b_flags, B_ZALLOC)) {
2496                                 if (bp->b_bufsize < nsize) {
2497                                         /* reallocate to a bigger size */
2498
2499                                         zprev = getbufzone(bp->b_bufsize);
2500                                         if (nsize <= MAXMETA) {
2501                                                 desired_size = nsize;
2502                                                 z = getbufzone(nsize);
2503                                                 bp->b_datap = (uintptr_t)zalloc(z);
2504                                         } else {
2505                                                 bp->b_datap = (uintptr_t)NULL;
2506                                                 kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2507                                                 CLR(bp->b_flags, B_ZALLOC);
2508                                         }
2509                                         bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2510                                         zfree(zprev, (void *)elem);
2511                                 } else {
2512                                         desired_size = bp->b_bufsize;
2513                                 }
2514
2515                         } else {
2516                                 if ((vm_size_t)bp->b_bufsize < desired_size) {
2517                                         /* reallocate to a bigger size */
2518                                         bp->b_datap = (uintptr_t)NULL;
2519                                         kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2520                                         bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2521                                         kmem_free(kernel_map, elem, bp->b_bufsize);
2522                                 } else {
2523                                         desired_size = bp->b_bufsize;
2524                                 }
2525                         }
2526                 } else {
2527                         /* new allocation */
2528                         if (nsize <= MAXMETA) {
2529                                 desired_size = nsize;
2530                                 z = getbufzone(nsize);
2531                                 bp->b_datap = (uintptr_t)zalloc(z);
2532                                 SET(bp->b_flags, B_ZALLOC);
2533                         } else
2534                                 kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2535                 }
2536         }
2537         bp->b_bufsize = desired_size;
2538         bp->b_bcount = size;
2539
2540         return (0);
2541 }
2542
2543 /*
2544  *      Get a new buffer from one of the free lists.
2545  *
2546  *      Request for a queue is passes in. The queue from which the buffer was taken
2547  *      from is returned. Out of range queue requests get BQ_EMPTY. Request for
2548  *      BQUEUE means no preference. Use heuristics in that case.
2549  *      Heuristics is as follows:
2550  *      Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
2551  *      If none available block till one is made available.
2552  *      If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
2553  *      Pick the most stale buffer.
2554  *      If found buffer was marked delayed write, start the async. write
2555  *      and restart the search.
2556  *      Initialize the fields and disassociate the buffer from the vnode.
2557  *      Remove the buffer from the hash. Return the buffer and the queue
2558  *      on which it was found.
2559  *
2560  *      buf_mtxp is held upon entry
2561  *      returns with buf_mtxp locked
2562  */
2563
2564 static buf_t
2565 getnewbuf(int slpflag, int slptimeo, int * queue)
2566 {
2567         buf_t   bp;
2568         buf_t   lru_bp;
2569         buf_t   age_bp;
2570         buf_t   meta_bp;
2571         int     age_time, lru_time, bp_time, meta_time;
2572         int     req = *queue;   /* save it for restarts */
2573         struct timespec ts;
2574
2575 start:
2576         /*
2577          * invalid request gets empty queue
2578          */
2579         if ((*queue > BQUEUES) || (*queue < 0)
2580                 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
2581                 *queue = BQ_EMPTY;
2582         /* need to grow number of bufs, add another one rather than recycling */
2583         if (nbuf < max_nbuf_headers) {
2584                 /*
2585                  * Increment  count now as lock
2586                  * is dropped for allocation.
2587                  * That avoids over commits
2588                  */
2589                 nbuf++;
2590                 goto add_newbufs;
2591         }
2592
2593         /*
2594          * (*queue == BQUEUES) means no preference
2595          */
2596         if (*queue != BQUEUES) {
2597                 /* Try for the requested queue first */
2598                 bp = bufqueues[*queue].tqh_first;
2599                 if (bp)
2600                         goto found;
2601         }
2602
2603         /* Unable to use requested queue */
2604         age_bp = bufqueues[BQ_AGE].tqh_first;
2605         lru_bp = bufqueues[BQ_LRU].tqh_first;
2606         meta_bp = bufqueues[BQ_META].tqh_first;
2607
2608         if (!age_bp && !lru_bp && !meta_bp) {
2609                 /*
2610                  * Unavailble on AGE or LRU or META queues
2611                  * Try the empty list first
2612                  */
2613                 bp = bufqueues[BQ_EMPTY].tqh_first;
2614                 if (bp) {
2615                         *queue = BQ_EMPTY;
2616                         goto found;
2617                 }
2618                 /*
2619                  * We have seen is this is hard to trigger.
2620                  * This is an overcommit of nbufs but needed
2621                  * in some scenarios with diskiamges
2622                  */
2623
2624 add_newbufs:
2625                 lck_mtx_unlock(buf_mtxp);
2626
2627                 /* Create a new temporary buffer header */
2628                 bp = (struct buf *)zalloc(buf_hdr_zone);
2629
2630                 lck_mtx_lock(buf_mtxp);
2631
2632                 if (bp) {
2633                         bufhdrinit(bp);
2634                         BLISTNONE(bp);
2635                         binshash(bp, &invalhash);
2636                         SET(bp->b_flags, B_HDRALLOC);
2637                         *queue = BQ_EMPTY;
2638                         binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2639                         buf_hdr_count++;
2640                         goto found;
2641                 }
2642                 /* subtract already accounted bufcount */
2643                 nbuf--;
2644
2645                 bufstats.bufs_sleeps++;
2646
2647                 /* wait for a free buffer of any kind */
2648                 needbuffer = 1;
2649                 /* hz value is 100 */
2650                 ts.tv_sec = (slptimeo/1000);
2651                 /* the hz value is 100; which leads to 10ms */
2652                 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
2653                 msleep(&needbuffer, buf_mtxp, slpflag|(PRIBIO+1), (char *)"getnewbuf", &ts);
2654                 return (0);
2655         }
2656
2657         /* Buffer available either on AGE or LRU or META */
2658         bp = NULL;
2659         *queue = -1;
2660
2661         /* Buffer available either on AGE or LRU */
2662         if (!age_bp) {
2663                 bp = lru_bp;
2664                 *queue = BQ_LRU;
2665         } else if (!lru_bp) {
2666                 bp = age_bp;
2667                 *queue = BQ_AGE;
2668         } else { /* buffer available on both AGE and LRU */
2669                 int             t = buf_timestamp();
2670
2671                 age_time = t - age_bp->b_timestamp;
2672                 lru_time = t - lru_bp->b_timestamp;
2673                 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
2674                         bp = age_bp;
2675                         *queue = BQ_AGE;
2676                         /*
2677                          * we should probably re-timestamp eveything in the
2678                          * queues at this point with the current time
2679                          */
2680                 } else {
2681                         if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
2682                                 bp = lru_bp;
2683                                 *queue = BQ_LRU;
2684                         } else {
2685                                 bp = age_bp;
2686                                 *queue = BQ_AGE;
2687                         }
2688                 }
2689         }
2690
2691         if (!bp) { /* Neither on AGE nor on LRU */
2692                 bp = meta_bp;
2693                 *queue = BQ_META;
2694         }  else if (meta_bp) {
2695                 int             t = buf_timestamp();
2696
2697                 bp_time = t - bp->b_timestamp;
2698                 meta_time = t - meta_bp->b_timestamp;
2699
2700                 if (!(bp_time < 0) && !(meta_time < 0)) {
2701                         /* time not set backwards */
2702                         int bp_is_stale;
2703                         bp_is_stale = (*queue == BQ_LRU) ?
2704                                         lru_is_stale : age_is_stale;
2705
2706                         if ((meta_time >= meta_is_stale) &&
2707                                         (bp_time < bp_is_stale)) {
2708                                 bp = meta_bp;
2709                                 *queue = BQ_META;
2710                         }
2711                 }
2712         }
2713 found:
2714         if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
2715                 panic("getnewbuf: bp @ 0x%x is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
2716
2717         /* Clean it */
2718         if (bcleanbuf(bp)) {
2719                 /*
2720                  * moved to the laundry thread, buffer not ready
2721                  */
2722                 *queue = req;
2723                 goto start;
2724         }
2725         return (bp);
2726 }
2727
2728
2729 /*
2730  * Clean a buffer.
2731  * Returns 0 is buffer is ready to use,
2732  * Returns 1 if issued a buf_bawrite() to indicate
2733  * that the buffer is not ready.
2734  *
2735  * buf_mtxp is held upon entry
2736  * returns with buf_mtxp locked
2737  */
2738 static int
2739 bcleanbuf(buf_t bp)
2740 {
2741         /* Remove from the queue */
2742         bremfree_locked(bp);
2743
2744         /* Buffer is no longer on free lists. */
2745         SET(bp->b_lflags, BL_BUSY);
2746 #ifdef JOE_DEBUG
2747         bp->b_owner = current_thread();
2748         bp->b_tag   = 2;
2749 #endif
2750         /*
2751          * If buffer was a delayed write, start the IO by queuing
2752          * it on the LAUNDRY queue, and return 1
2753          */
2754         if (ISSET(bp->b_flags, B_DELWRI)) {
2755                 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2756                 blaundrycnt++;
2757
2758                 lck_mtx_unlock(buf_mtxp);
2759
2760                 wakeup(&blaundrycnt);
2761                 /* and give it a chance to run */
2762                 (void)thread_block(THREAD_CONTINUE_NULL);
2763
2764                 lck_mtx_lock(buf_mtxp);
2765                 return (1);
2766         }
2767         bremhash(bp);
2768
2769         lck_mtx_unlock(buf_mtxp);
2770
2771         BLISTNONE(bp);
2772         /*
2773          * disassociate us from our vnode, if we had one...
2774          */
2775         if (bp->b_vp)
2776                 brelvp(bp);
2777
2778         if (ISSET(bp->b_flags, B_META)) {
2779                 vm_offset_t elem;
2780
2781                 elem = (vm_offset_t)bp->b_datap;
2782                 bp->b_datap = (uintptr_t)0xdeadbeef;
2783
2784                 if (ISSET(bp->b_flags, B_ZALLOC)) {
2785                         zone_t z;
2786
2787                         z = getbufzone(bp->b_bufsize);
2788                         zfree(z, (void *)elem);
2789                 } else
2790                         kmem_free(kernel_map, elem, bp->b_bufsize);
2791         }
2792
2793         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2794
2795         /* clear out various other fields */
2796         bp->b_bufsize = 0;
2797         bp->b_datap = (uintptr_t)NULL;
2798         bp->b_upl = (void *)NULL;
2799         /*
2800          * preserve the state of whether this buffer
2801          * was allocated on the fly or not...
2802          * the only other flag that should be set at
2803          * this point is BL_BUSY...
2804          */
2805 #ifdef JOE_DEBUG
2806         bp->b_owner = current_thread();
2807         bp->b_tag   = 3;
2808 #endif
2809         bp->b_lflags = BL_BUSY;
2810         bp->b_flags = (bp->b_flags & B_HDRALLOC);
2811         bp->b_dev = NODEV;
2812         bp->b_blkno = bp->b_lblkno = 0;
2813         bp->b_iodone = NULL;
2814         bp->b_error = 0;
2815         bp->b_resid = 0;
2816         bp->b_bcount = 0;
2817         bp->b_dirtyoff = bp->b_dirtyend = 0;
2818         bp->b_validoff = bp->b_validend = 0;
2819
2820         /* nuke any credentials we were holding */
2821         if (IS_VALID_CRED(bp->b_rcred)) {
2822                 kauth_cred_unref(&bp->b_rcred);
2823         }
2824         if (IS_VALID_CRED(bp->b_wcred)) {
2825                 kauth_cred_unref(&bp->b_wcred);
2826         }
2827         lck_mtx_lock(buf_mtxp);
2828
2829         return (0);
2830 }
2831
2832
2833
2834 errno_t
2835 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
2836 {
2837         buf_t   bp;
2838         errno_t error;
2839
2840         lck_mtx_lock(buf_mtxp);
2841 relook:
2842         if ((bp = incore_locked(vp, lblkno)) == (struct buf *)0) {
2843                 lck_mtx_unlock(buf_mtxp);
2844                 return (0);
2845         }
2846         if (ISSET(bp->b_lflags, BL_BUSY)) {
2847                 if ( !ISSET(flags, BUF_WAIT)) {
2848                         lck_mtx_unlock(buf_mtxp);
2849                         return (EBUSY);
2850                 }
2851                 SET(bp->b_lflags, BL_WANTED);
2852
2853                 error = msleep((caddr_t)bp, buf_mtxp, (PRIBIO + 1), (char *)"buf_invalblkno", 0);
2854
2855                 if (error)
2856                         return (error);
2857                 goto relook;
2858         }
2859         bremfree_locked(bp);
2860         SET(bp->b_lflags, BL_BUSY);
2861         SET(bp->b_flags, B_INVAL);
2862 #ifdef JOE_DEBUG
2863         bp->b_owner = current_thread();
2864         bp->b_tag   = 4;
2865 #endif
2866         lck_mtx_unlock(buf_mtxp);
2867         buf_brelse(bp);
2868
2869         return (0);
2870 }
2871
2872
2873 void
2874 buf_drop(buf_t bp)
2875 {
2876         int need_wakeup = 0;
2877
2878         lck_mtx_lock(buf_mtxp);
2879
2880         if (ISSET(bp->b_lflags, BL_WANTED)) {
2881                 /*
2882                  * delay the actual wakeup until after we
2883                  * clear BL_BUSY and we've dropped buf_mtxp
2884                  */
2885                 need_wakeup = 1;
2886         }
2887         /*
2888          * Unlock the buffer.
2889          */
2890         CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2891
2892         lck_mtx_unlock(buf_mtxp);
2893
2894         if (need_wakeup) {
2895                 /*
2896                  * Wake up any proceeses waiting for _this_ buffer to become free.
2897                  */
2898                 wakeup(bp);
2899         }
2900 }
2901
2902
2903 errno_t
2904 buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
2905         errno_t error;
2906
2907         lck_mtx_lock(buf_mtxp);
2908
2909         error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
2910
2911         lck_mtx_unlock(buf_mtxp);
2912
2913         return (error);
2914 }
2915
2916
2917 static errno_t
2918 buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
2919 {
2920         errno_t error;
2921         struct timespec ts;
2922
2923         if (ISSET(bp->b_flags, B_LOCKED)) {
2924                 if ((flags & BAC_SKIP_LOCKED))
2925                         return (EDEADLK);
2926         } else {
2927                 if ((flags & BAC_SKIP_NONLOCKED))
2928                         return (EDEADLK);
2929         }
2930         if (ISSET(bp->b_lflags, BL_BUSY)) {
2931                 /*
2932                  * since the mutex_lock may block, the buffer
2933                  * may become BUSY, so we need to
2934                  * recheck for a NOWAIT request
2935                  */
2936                 if (flags & BAC_NOWAIT)
2937                         return (EBUSY);
2938                 SET(bp->b_lflags, BL_WANTED);
2939
2940                 /* the hz value is 100; which leads to 10ms */
2941                 ts.tv_sec = (slptimeo/100);
2942                 ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;
2943                 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), (char *)"buf_acquire", &ts);
2944
2945                 if (error)
2946                         return (error);
2947                 return (EAGAIN);
2948         }
2949         if (flags & BAC_REMOVE)
2950                 bremfree_locked(bp);
2951         SET(bp->b_lflags, BL_BUSY);
2952 #ifdef JOE_DEBUG
2953         bp->b_owner = current_thread();
2954         bp->b_tag   = 5;
2955 #endif
2956         return (0);
2957 }
2958
2959
2960 /*
2961  * Wait for operations on the buffer to complete.
2962  * When they do, extract and return the I/O's error value.
2963  */
2964 errno_t
2965 buf_biowait(buf_t bp)
2966 {
2967         lck_mtx_lock(buf_mtxp);
2968
2969         while (!ISSET(bp->b_flags, B_DONE))
2970                 (void) msleep(bp, buf_mtxp, (PRIBIO+1), (char *)"buf_biowait", 0);
2971
2972         lck_mtx_unlock(buf_mtxp);
2973
2974         /* check for interruption of I/O (e.g. via NFS), then errors. */
2975         if (ISSET(bp->b_flags, B_EINTR)) {
2976                 CLR(bp->b_flags, B_EINTR);
2977                 return (EINTR);
2978         } else if (ISSET(bp->b_flags, B_ERROR))
2979                 return (bp->b_error ? bp->b_error : EIO);
2980         else
2981                 return (0);
2982 }
2983
2984 /*
2985  * Mark I/O complete on a buffer.
2986  *
2987  * If a callback has been requested, e.g. the pageout
2988  * daemon, do so. Otherwise, awaken waiting processes.
2989  *
2990  * [ Leffler, et al., says on p.247:
2991  *      "This routine wakes up the blocked process, frees the buffer
2992  *      for an asynchronous write, or, for a request by the pagedaemon
2993  *      process, invokes a procedure specified in the buffer structure" ]
2994  *
2995  * In real life, the pagedaemon (or other system processes) wants
2996  * to do async stuff to, and doesn't want the buffer buf_brelse()'d.
2997  * (for swap pager, that puts swap buffers on the free lists (!!!),
2998  * for the vn device, that puts malloc'd buffers on the free lists!)
2999  */
3000 extern struct timeval priority_IO_timestamp_for_root;
3001 extern int hard_throttle_on_root;
3002
3003 void
3004 buf_biodone(buf_t bp)
3005 {
3006         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
3007                      (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
3008
3009         if (ISSET(bp->b_flags, B_DONE))
3010                 panic("biodone already");
3011
3012         if (kdebug_enable) {
3013                 int    code = DKIO_DONE;
3014
3015                 if (bp->b_flags & B_READ)
3016                         code |= DKIO_READ;
3017                 if (bp->b_flags & B_ASYNC)
3018                         code |= DKIO_ASYNC;
3019
3020                 if (bp->b_flags & B_META)
3021                         code |= DKIO_META;
3022                 else if (bp->b_flags & B_PAGEIO)
3023                         code |= DKIO_PAGING;
3024
3025                 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3026                                       (unsigned int)bp, (unsigned int)bp->b_vp,
3027                                       bp->b_resid, bp->b_error, 0);
3028         }
3029         if ((bp->b_vp != NULLVP) &&
3030             ((bp->b_flags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
3031             (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
3032                 microuptime(&priority_IO_timestamp_for_root);
3033                 hard_throttle_on_root = 0;
3034         }
3035         /*
3036          * I/O was done, so don't believe
3037          * the DIRTY state from VM anymore
3038          */
3039         CLR(bp->b_flags, B_WASDIRTY);
3040
3041         if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3042                 /*
3043                  * wake up any writer's blocked
3044                  * on throttle or waiting for I/O
3045                  * to drain
3046                  */
3047                 vnode_writedone(bp->b_vp);
3048
3049         if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) {  /* if necessary, call out */
3050                 void    (*iodone_func)(struct buf *, void *) = bp->b_iodone;
3051                 void    *arg = (void *)bp->b_transaction;
3052                 int     callout = ISSET(bp->b_flags, B_CALL);
3053
3054                 CLR(bp->b_flags, (B_CALL | B_FILTER));  /* filters and callouts are one-shot */
3055                 bp->b_iodone = NULL;
3056                 bp->b_transaction = NULL;
3057
3058                 if (iodone_func == NULL) {
3059                         panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
3060                 } else {
3061                         if (callout)
3062                                 SET(bp->b_flags, B_DONE);       /* note that it's done */
3063                         (*iodone_func)(bp, arg);
3064                 }
3065                 if (callout)
3066                         /*
3067                          * assumes that the call back function takes
3068                          * ownership of the bp and deals with releasing it if necessary
3069                          */
3070                         goto biodone_done;
3071                 /*
3072                  * in this case the call back function is acting
3073                  * strictly as a filter... it does not take
3074                  * ownership of the bp and is expecting us
3075                  * to finish cleaning up... this is currently used
3076                  * by the HFS journaling code
3077                  */
3078         }
3079         if (ISSET(bp->b_flags, B_ASYNC)) {      /* if async, release it */
3080                 SET(bp->b_flags, B_DONE);       /* note that it's done */
3081
3082                 buf_brelse(bp);
3083         } else {                                /* or just wakeup the buffer */
3084                 /*
3085                  * by taking the mutex, we serialize
3086                  * the buf owner calling buf_biowait so that we'll
3087                  * only see him in one of 2 states...
3088                  * state 1: B_DONE wasn't set and he's
3089                  * blocked in msleep
3090                  * state 2: he's blocked trying to take the
3091                  * mutex before looking at B_DONE
3092                  * BL_WANTED is cleared in case anyone else
3093                  * is blocked waiting for the buffer... note
3094                  * that we haven't cleared B_BUSY yet, so if
3095                  * they do get to run, their going to re-set
3096                  * BL_WANTED and go back to sleep
3097                  */
3098                 lck_mtx_lock(buf_mtxp);
3099
3100                 CLR(bp->b_lflags, BL_WANTED);
3101                 SET(bp->b_flags, B_DONE);               /* note that it's done */
3102
3103                 lck_mtx_unlock(buf_mtxp);
3104
3105                 wakeup(bp);
3106         }
3107 biodone_done:
3108         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3109                      (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
3110 }
3111
3112 /*
3113  * Return a count of buffers on the "locked" queue.
3114  */
3115 int
3116 count_lock_queue(void)
3117 {
3118         buf_t   bp;
3119         int     n = 0;
3120
3121         lck_mtx_lock(buf_mtxp);
3122
3123         for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3124             bp = bp->b_freelist.tqe_next)
3125                 n++;
3126         lck_mtx_unlock(buf_mtxp);
3127
3128         return (n);
3129 }
3130
3131 /*
3132  * Return a count of 'busy' buffers. Used at the time of shutdown.
3133  */
3134 int
3135 count_busy_buffers(void)
3136 {
3137         buf_t   bp;
3138         int     nbusy = 0;
3139
3140         lck_mtx_lock(buf_mtxp);
3141         for (bp = &buf[boot_nbuf]; --bp >= buf; )
3142                 if (!ISSET(bp->b_flags, B_INVAL) && ISSET(bp->b_lflags, BL_BUSY))
3143                         nbusy++;
3144         lck_mtx_unlock(buf_mtxp);
3145
3146         return (nbusy);
3147 }
3148
3149 #if DIAGNOSTIC
3150 /*
3151  * Print out statistics on the current allocation of the buffer pool.
3152  * Can be enabled to print out on every ``sync'' by setting "syncprt"
3153  * in vfs_syscalls.c using sysctl.
3154  */
3155 void
3156 vfs_bufstats()
3157 {
3158         int i, j, count;
3159         register struct buf *bp;
3160         register struct bqueues *dp;
3161         int counts[MAXBSIZE/CLBYTES+1];
3162         static char *bname[BQUEUES] =
3163                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3164
3165         for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
3166                 count = 0;
3167                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3168                         counts[j] = 0;
3169
3170                 lck_mtx_lock(buf_mtxp);
3171
3172                 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
3173                         counts[bp->b_bufsize/CLBYTES]++;
3174                         count++;
3175                 }
3176                 lck_mtx_unlock(buf_mtxp);
3177
3178                 printf("%s: total-%d", bname[i], count);
3179                 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3180                         if (counts[j] != 0)
3181                                 printf(", %d-%d", j * CLBYTES, counts[j]);
3182                 printf("\n");
3183         }
3184 }
3185 #endif /* DIAGNOSTIC */
3186
3187 #define NRESERVEDIOBUFS 64
3188
3189
3190 buf_t
3191 alloc_io_buf(vnode_t vp, int priv)
3192 {
3193         buf_t   bp;
3194
3195         lck_mtx_lock(iobuffer_mtxp);
3196
3197         while (((niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
3198                (bp = iobufqueue.tqh_first) == NULL) {
3199                 bufstats.bufs_iobufsleeps++;
3200
3201                 need_iobuffer = 1;
3202                 (void) msleep(&need_iobuffer, iobuffer_mtxp, (PRIBIO+1), (const char *)"alloc_io_buf", 0);
3203         }
3204         TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
3205
3206         bufstats.bufs_iobufinuse++;
3207         if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
3208                 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
3209
3210         lck_mtx_unlock(iobuffer_mtxp);
3211
3212         /*
3213          * initialize various fields
3214          * we don't need to hold the mutex since the buffer
3215          * is now private... the vp should have a reference
3216          * on it and is not protected by this mutex in any event
3217          */
3218         bp->b_timestamp = 0;
3219         bp->b_proc = NULL;
3220
3221         bp->b_datap = 0;
3222         bp->b_flags = 0;
3223         bp->b_lflags = BL_BUSY | BL_IOBUF;
3224         bp->b_blkno = bp->b_lblkno = 0;
3225 #ifdef JOE_DEBUG
3226         bp->b_owner = current_thread();
3227         bp->b_tag   = 6;
3228 #endif
3229         bp->b_iodone = NULL;
3230         bp->b_error = 0;
3231         bp->b_resid = 0;
3232         bp->b_bcount = 0;
3233         bp->b_bufsize = 0;
3234         bp->b_upl = NULL;
3235         bp->b_vp = vp;
3236
3237         if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
3238                 bp->b_dev = vp->v_rdev;
3239         else
3240                 bp->b_dev = NODEV;
3241
3242         return (bp);
3243 }
3244
3245
3246 void
3247 free_io_buf(buf_t bp)
3248 {
3249         int need_wakeup = 0;
3250
3251         /*
3252          * put buffer back on the head of the iobufqueue
3253          */
3254         bp->b_vp = NULL;
3255         bp->b_flags = B_INVAL;
3256
3257         lck_mtx_lock(iobuffer_mtxp);
3258
3259         binsheadfree(bp, &iobufqueue, -1);
3260
3261         if (need_iobuffer) {
3262                 /*
3263                  * Wake up any processes waiting because they need an io buffer
3264                  *
3265                  * do the wakeup after we drop the mutex... it's possible that the
3266                  * wakeup will be superfluous if need_iobuffer gets set again and
3267                  * another thread runs this path, but it's highly unlikely, doesn't
3268                  * hurt, and it means we don't hold up I/O progress if the wakeup blocks
3269                  * trying to grab a task related lock...
3270                  */
3271                 need_iobuffer = 0;
3272                 need_wakeup = 1;
3273         }
3274         bufstats.bufs_iobufinuse--;
3275
3276         lck_mtx_unlock(iobuffer_mtxp);
3277
3278         if (need_wakeup)
3279                 wakeup(&need_iobuffer);
3280 }
3281
3282
3283
3284 /*
3285  * If getnewbuf() calls bcleanbuf() on the same thread
3286  * there is a potential for stack overrun and deadlocks.
3287  * So we always handoff the work to a worker thread for completion
3288  */
3289 #include <mach/mach_types.h>
3290 #include <mach/memory_object_types.h>
3291 #include <kern/sched_prim.h>
3292
3293
3294 static void
3295 bcleanbuf_thread_init(void)
3296 {
3297         /* create worker thread */
3298         kernel_thread(kernel_task, bcleanbuf_thread);
3299 }
3300
3301 static void
3302 bcleanbuf_thread(void)
3303 {
3304         struct buf *bp;
3305         int error = 0;
3306         int loopcnt = 0;
3307
3308         for (;;) {
3309                 lck_mtx_lock(buf_mtxp);
3310
3311                 while (blaundrycnt == 0)
3312                         (void)msleep((void *)&blaundrycnt, buf_mtxp, PRIBIO, "blaundry", 0);
3313
3314                 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
3315                 /*
3316                  * Remove from the queue
3317                  */
3318                 bremfree_locked(bp);
3319                 blaundrycnt--;
3320
3321                 lck_mtx_unlock(buf_mtxp);
3322                 /*
3323                  * do the IO
3324                  */
3325                 error = bawrite_internal(bp, 0);
3326
3327                 if (error) {
3328                         lck_mtx_lock(buf_mtxp);
3329
3330                         binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
3331                         blaundrycnt++;
3332
3333                         lck_mtx_unlock(buf_mtxp);
3334
3335                         if (loopcnt > 10) {
3336                                 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
3337                                 loopcnt = 0;
3338                         } else {
3339                                 (void)thread_block(THREAD_CONTINUE_NULL);
3340                                 loopcnt++;
3341                         }
3342                 }
3343         }
3344 }
3345
3346
3347 static int
3348 brecover_data(buf_t bp)
3349 {
3350         int     upl_offset;
3351         upl_t   upl;
3352         upl_page_info_t *pl;
3353         kern_return_t kret;
3354         vnode_t vp = bp->b_vp;
3355         int upl_flags;
3356
3357
3358         if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
3359                 goto dump_buffer;
3360
3361         upl_flags = UPL_PRECIOUS;
3362         if (! (buf_flags(bp) & B_READ)) {
3363                 /*
3364                  * "write" operation:  let the UPL subsystem know
3365                  * that we intend to modify the buffer cache pages we're
3366                  * gathering.
3367                  */
3368                 upl_flags |= UPL_WILL_MODIFY;
3369         }
3370
3371         kret = ubc_create_upl(vp,
3372                               ubc_blktooff(vp, bp->b_lblkno),
3373                               bp->b_bufsize,
3374                               &upl,
3375                               &pl,
3376                               upl_flags);
3377         if (kret != KERN_SUCCESS)
3378                 panic("Failed to create UPL");
3379
3380         for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
3381
3382                 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
3383                         ubc_upl_abort(upl, 0);
3384                         goto dump_buffer;
3385                 }
3386         }
3387         bp->b_upl = upl;
3388
3389         kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
3390
3391         if (kret != KERN_SUCCESS)
3392                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3393         return (1);
3394
3395 dump_buffer:
3396         bp->b_bufsize = 0;
3397         SET(bp->b_flags, B_INVAL);
3398         buf_brelse(bp);
3399
3400         return(0);
3401 }
3402
3403
3404
3405 /*
3406  * disabled for now
3407  */
3408
3409 #if FLUSH_QUEUES
3410
3411 #define NFLUSH 32
3412
3413 static int
3414 bp_cmp(void *a, void *b)
3415 {
3416     buf_t *bp_a = *(buf_t **)a,
3417           *bp_b = *(buf_t **)b;
3418     daddr64_t res;
3419
3420     // don't have to worry about negative block
3421     // numbers so this is ok to do.
3422     //
3423     res = (bp_a->b_blkno - bp_b->b_blkno);
3424
3425     return (int)res;
3426 }
3427
3428
3429 int
3430 bflushq(int whichq, mount_t mp)
3431 {
3432         buf_t   bp, next;
3433         int     i, buf_count;
3434         int     total_writes = 0;
3435         static buf_t flush_table[NFLUSH];
3436
3437         if (whichq < 0 || whichq >= BQUEUES) {
3438             return (0);
3439         }
3440
3441   restart:
3442         lck_mtx_lock(buf_mtxp);
3443
3444         bp = TAILQ_FIRST(&bufqueues[whichq]);
3445
3446         for (buf_count = 0; bp; bp = next) {
3447             next = bp->b_freelist.tqe_next;
3448
3449             if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
3450                 continue;
3451             }
3452
3453             if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
3454
3455                 bremfree_locked(bp);
3456 #ifdef JOE_DEBUG
3457                 bp->b_owner = current_thread();
3458                 bp->b_tag   = 7;
3459 #endif
3460                 SET(bp->b_lflags, BL_BUSY);
3461                 flush_table[buf_count] = bp;
3462                 buf_count++;
3463                 total_writes++;
3464
3465                 if (buf_count >= NFLUSH) {
3466                     lck_mtx_unlock(buf_mtxp);
3467
3468                     qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3469
3470                     for (i = 0; i < buf_count; i++) {
3471                         buf_bawrite(flush_table[i]);
3472                     }
3473                     goto restart;
3474                 }
3475             }
3476         }
3477         lck_mtx_unlock(buf_mtxp);
3478
3479         if (buf_count > 0) {
3480             qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3481
3482             for (i = 0; i < buf_count; i++) {
3483                 buf_bawrite(flush_table[i]);
3484             }
3485         }
3486
3487         return (total_writes);
3488 }
3489 #endif
3490
3491
3492 #if BALANCE_QUEUES
3493
3494 /* XXX move this to a separate file */
3495
3496 /*
3497  * NOTE: THIS CODE HAS NOT BEEN UPDATED
3498  * WITH RESPECT TO THE NEW LOCKING MODEL
3499  */
3500
3501
3502 /*
3503  * Dynamic Scaling of the Buffer Queues
3504  */
3505
3506 typedef long long blsize_t;
3507
3508 blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
3509 /* Global tunable limits */
3510 blsize_t nbufh;                 /* number of buffer headers */
3511 blsize_t nbuflow;               /* minimum number of buffer headers required */
3512 blsize_t nbufhigh;              /* maximum number of buffer headers allowed */
3513 blsize_t nbuftarget;    /* preferred number of buffer headers */
3514
3515 /*
3516  * assertions:
3517  *
3518  * 1.   0 < nbuflow <= nbufh <= nbufhigh
3519  * 2.   nbufhigh <= MAXNBUF
3520  * 3.   0 < nbuflow <= nbuftarget <= nbufhigh
3521  * 4.   nbufh can not be set by sysctl().
3522  */
3523
3524 /* Per queue tunable limits */
3525
3526 struct bufqlim {
3527         blsize_t        bl_nlow;        /* minimum number of buffer headers required */
3528         blsize_t        bl_num;         /* number of buffer headers on the queue */
3529         blsize_t        bl_nlhigh;      /* maximum number of buffer headers allowed */
3530         blsize_t        bl_target;      /* preferred number of buffer headers */
3531         long    bl_stale;       /* Seconds after which a buffer is considered stale */
3532 } bufqlim[BQUEUES];
3533
3534 /*
3535  * assertions:
3536  *
3537  * 1.   0 <= bl_nlow <= bl_num <= bl_nlhigh
3538  * 2.   bl_nlhigh <= MAXNBUF
3539  * 3.  bufqlim[BQ_META].bl_nlow != 0
3540  * 4.  bufqlim[BQ_META].bl_nlow > (number of possible concurrent
3541  *                                                                      file system IO operations)
3542  * 5.   bl_num can not be set by sysctl().
3543  * 6.   bl_nhigh <= nbufhigh
3544  */
3545
3546 /*
3547  * Rationale:
3548  * ----------
3549  * Defining it blsize_t as long permits 2^31 buffer headers per queue.
3550  * Which can describe (2^31 * PAGE_SIZE) memory per queue.
3551  *
3552  * These limits are exported to by means of sysctl().
3553  * It was decided to define blsize_t as a 64 bit quantity.
3554  * This will make sure that we will not be required to change it
3555  * as long as we do not exceed 64 bit address space for the kernel.
3556  *
3557  * low and high numbers parameters initialized at compile time
3558  * and boot arguments can be used to override them. sysctl()
3559  * would not change the value. sysctl() can get all the values
3560  * but can set only target. num is the current level.
3561  *
3562  * Advantages of having a "bufqscan" thread doing the balancing are,
3563  * Keep enough bufs on BQ_EMPTY.
3564  *      getnewbuf() by default will always select a buffer from the BQ_EMPTY.
3565  *              getnewbuf() perfoms best if a buffer was found there.
3566  *              Also this minimizes the possibility of starting IO
3567  *              from getnewbuf(). That's a performance win, too.
3568  *
3569  *      Localize complex logic [balancing as well as time aging]
3570  *              to balancebufq().
3571  *
3572  *      Simplify getnewbuf() logic by elimination of time aging code.
3573  */
3574
3575 /*
3576  * Algorithm:
3577  * -----------
3578  * The goal of the dynamic scaling of the buffer queues to to keep
3579  * the size of the LRU close to bl_target. Buffers on a queue would
3580  * be time aged.
3581  *
3582  * There would be a thread which will be responsible for "balancing"
3583  * the buffer cache queues.
3584  *
3585  * The scan order would be:     AGE, LRU, META, EMPTY.
3586  */
3587
3588 long bufqscanwait = 0;
3589
3590 static void bufqscan_thread();
3591 static int balancebufq(int q);
3592 static int btrimempty(int n);
3593 static __inline__ int initbufqscan(void);
3594 static __inline__ int nextbufq(int q);
3595 static void buqlimprt(int all);
3596
3597
3598 static __inline__ void
3599 bufqinc(int q)
3600 {
3601         if ((q < 0) || (q >= BQUEUES))
3602                 return;
3603
3604         bufqlim[q].bl_num++;
3605         return;
3606 }
3607
3608 static __inline__ void
3609 bufqdec(int q)
3610 {
3611         if ((q < 0) || (q >= BQUEUES))
3612                 return;
3613
3614         bufqlim[q].bl_num--;
3615         return;
3616 }
3617
3618 static void
3619 bufq_balance_thread_init()
3620 {
3621
3622         if (bufqscanwait++ == 0) {
3623
3624                 /* Initalize globals */
3625                 MAXNBUF = (sane_size / PAGE_SIZE);
3626                 nbufh = nbuf;
3627                 nbuflow = min(nbufh, 100);
3628                 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
3629                 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
3630                 nbuftarget = max(nbuflow, nbuftarget);
3631                 nbuftarget = min(nbufhigh, nbuftarget);
3632
3633                 /*
3634                  * Initialize the bufqlim
3635                  */
3636
3637                 /* LOCKED queue */
3638                 bufqlim[BQ_LOCKED].bl_nlow = 0;
3639                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3640                 bufqlim[BQ_LOCKED].bl_target = 0;
3641                 bufqlim[BQ_LOCKED].bl_stale = 30;
3642
3643                 /* LRU queue */
3644                 bufqlim[BQ_LRU].bl_nlow = 0;
3645                 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
3646                 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
3647                 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
3648
3649                 /* AGE queue */
3650                 bufqlim[BQ_AGE].bl_nlow = 0;
3651                 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
3652                 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
3653                 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
3654
3655                 /* EMPTY queue */
3656                 bufqlim[BQ_EMPTY].bl_nlow = 0;
3657                 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
3658                 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
3659                 bufqlim[BQ_EMPTY].bl_stale = 600000;
3660
3661                 /* META queue */
3662                 bufqlim[BQ_META].bl_nlow = 0;
3663                 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
3664                 bufqlim[BQ_META].bl_target = nbuftarget/4;
3665                 bufqlim[BQ_META].bl_stale = META_IS_STALE;
3666
3667                 /* LAUNDRY queue */
3668                 bufqlim[BQ_LOCKED].bl_nlow = 0;
3669                 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3670                 bufqlim[BQ_LOCKED].bl_target = 0;
3671                 bufqlim[BQ_LOCKED].bl_stale = 30;
3672
3673                 buqlimprt(1);
3674         }
3675
3676         /* create worker thread */
3677         kernel_thread(kernel_task, bufqscan_thread);
3678 }
3679
3680 /* The workloop for the buffer balancing thread */
3681 static void
3682 bufqscan_thread()
3683 {
3684         int moretodo = 0;
3685
3686         for(;;) {
3687                 do {
3688                         int q;  /* buffer queue to process */
3689
3690                         q = initbufqscan();
3691                         for (; q; ) {
3692                                 moretodo |= balancebufq(q);
3693                                 q = nextbufq(q);
3694                         }
3695                 } while (moretodo);
3696
3697 #if DIAGNOSTIC
3698                 vfs_bufstats();
3699                 buqlimprt(0);
3700 #endif
3701                 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
3702                 moretodo = 0;
3703         }
3704 }
3705
3706 /* Seed for the buffer queue balancing */
3707 static __inline__ int
3708 initbufqscan()
3709 {
3710         /* Start with AGE queue */
3711         return (BQ_AGE);
3712 }
3713
3714 /* Pick next buffer queue to balance */
3715 static __inline__ int
3716 nextbufq(int q)
3717 {
3718         int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
3719
3720         q++;
3721         q %= sizeof(order);
3722         return (order[q]);
3723 }
3724
3725 /* function to balance the buffer queues */
3726 static int
3727 balancebufq(int q)
3728 {
3729         int moretodo = 0;
3730         int s = splbio();
3731         int n, t;
3732
3733         /* reject invalid q */
3734         if ((q < 0) || (q >= BQUEUES))
3735                 goto out;
3736
3737         /* LOCKED or LAUNDRY queue MUST not be balanced */
3738         if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
3739                 goto out;
3740
3741         n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
3742
3743         /* If queue has less than target nothing more to do */
3744         if (n < 0)
3745                 goto out;
3746
3747         if ( n > 8 ) {
3748                 /* Balance only a small amount (12.5%) at a time */
3749                 n >>= 3;
3750         }
3751
3752         /* EMPTY queue needs special handling */
3753         if (q == BQ_EMPTY) {
3754                 moretodo |= btrimempty(n);
3755                 goto out;
3756         }
3757
3758         t = buf_timestamp():
3759
3760         for (; n > 0; n--) {
3761                 struct buf *bp = bufqueues[q].tqh_first;
3762                 if (!bp)
3763                         break;
3764
3765                 /* check if it's stale */
3766                 if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
3767                         if (bcleanbuf(bp)) {
3768                                 /* buf_bawrite() issued, bp not ready */
3769                                 moretodo = 1;
3770                         } else {
3771                                 /* release the cleaned buffer to BQ_EMPTY */
3772                                 SET(bp->b_flags, B_INVAL);
3773                                 buf_brelse(bp);
3774                         }
3775                 } else
3776                         break;
3777         }
3778
3779 out:
3780         splx(s);
3781         return (moretodo);
3782 }
3783
3784 static int
3785 btrimempty(int n)
3786 {
3787         /*
3788          * When struct buf are allocated dynamically, this would
3789          * reclaim upto 'n' struct buf from the empty queue.
3790          */
3791
3792          return (0);
3793 }
3794
3795 static void
3796 buqlimprt(int all)
3797 {
3798         int i;
3799     static char *bname[BQUEUES] =
3800                 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3801
3802         if (all)
3803                 for (i = 0; i < BQUEUES; i++) {
3804                         printf("%s : ", bname[i]);
3805                         printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
3806                         printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3807                         printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
3808                         printf("target = %ld, ", (long)bufqlim[i].bl_target);
3809                         printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
3810                 }
3811         else
3812                 for (i = 0; i < BQUEUES; i++) {
3813                         printf("%s : ", bname[i]);
3814                         printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3815                 }
3816 }
3817
3818 #endif
3819
3820