bsd/vfs/vfs_bio.c

   1 /*
   2  * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*-
  30  * Copyright (c) 1994 Christopher G. Demetriou
  31  * Copyright (c) 1982, 1986, 1989, 1993
  32  *      The Regents of the University of California.  All rights reserved.
  33  * (c) UNIX System Laboratories, Inc.
  34  * All or some portions of this file are derived from material licensed
  35  * to the University of California by American Telephone and Telegraph
  36  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  37  * the permission of UNIX System Laboratories, Inc.
  38  *
  39  * Redistribution and use in source and binary forms, with or without
  40  * modification, are permitted provided that the following conditions
  41  * are met:
  42  * 1. Redistributions of source code must retain the above copyright
  43  *    notice, this list of conditions and the following disclaimer.
  44  * 2. Redistributions in binary form must reproduce the above copyright
  45  *    notice, this list of conditions and the following disclaimer in the
  46  *    documentation and/or other materials provided with the distribution.
  47  * 3. All advertising materials mentioning features or use of this software
  48  *    must display the following acknowledgement:
  49  *      This product includes software developed by the University of
  50  *      California, Berkeley and its contributors.
  51  * 4. Neither the name of the University nor the names of its contributors
  52  *    may be used to endorse or promote products derived from this software
  53  *    without specific prior written permission.
  54  *
  55  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  56  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  57  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  58  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  59  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  60  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  61  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  62  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  63  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  65  * SUCH DAMAGE.
  66  *
  67  *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
  68  */
  69
  70 /*
  71  * Some references:
  72  *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
  73  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
   74  *              UNIX Operating System (Addison-Wesley, 1989)
  75  */
  76
  77 #include <sys/param.h>
  78 #include <sys/systm.h>
  79 #include <sys/proc_internal.h>
  80 #include <sys/buf_internal.h>
  81 #include <sys/vnode_internal.h>
  82 #include <sys/mount_internal.h>
  83 #include <sys/trace.h>
  84 #include <sys/malloc.h>
  85 #include <sys/resourcevar.h>
  86 #include <miscfs/specfs/specdev.h>
  87 #include <sys/ubc.h>
  88 #include <sys/kauth.h>
  89 #if DIAGNOSTIC
  90 #include <kern/assert.h>
  91 #endif /* DIAGNOSTIC */
  92 #include <kern/task.h>
  93 #include <kern/zalloc.h>
  94 #include <kern/locks.h>
  95 #include <kern/thread.h>
  96
  97 #include <sys/fslog.h>          /* fslog_io_error() */
  98 #include <sys/disk.h>           /* dk_error_description_t */
  99
 100 #include <mach/mach_types.h>
 101 #include <mach/memory_object_types.h>
 102 #include <kern/sched_prim.h>    /* thread_block() */
 103
 104 #include <vm/vm_kern.h>
 105 #include <vm/vm_pageout.h>
 106
 107 #include <sys/kdebug.h>
 108
 109 #include <libkern/OSAtomic.h>
 110 #include <libkern/OSDebug.h>
 111 #include <sys/ubc_internal.h>
 112
 113 #include <sys/sdt.h>
 114
      /*
       * Prototypes for routines whose definitions are not in this part of
       * the file (buf_create_shadow_internal() is defined further down).
       */
  115 int     bcleanbuf(buf_t bp, boolean_t discard);
  116 static int      brecover_data(buf_t bp);
  117 static boolean_t incore(vnode_t vp, daddr64_t blkno);
  118 /* timeout is in msecs */
  119 static buf_t    getnewbuf(int slpflag, int slptimeo, int *queue);
  120 static void     bremfree_locked(buf_t bp);
  121 static void     buf_reassign(buf_t bp, vnode_t newvp);
  122 static errno_t  buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
  123 static int      buf_iterprepare(vnode_t vp, struct buflists *, int flags);
  124 static void     buf_itercomplete(vnode_t vp, struct buflists *, int flags);
  125 static boolean_t buffer_cache_gc(int);
  126 static buf_t    buf_brelse_shadow(buf_t bp);
  127 static void     buf_free_meta_store(buf_t bp);
  128
  129 static buf_t    buf_create_shadow_internal(buf_t bp, boolean_t force_copy,
  130                                            uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);
  131
  132
  133 int  bdwrite_internal(buf_t, int);
  134
  135 /* zone allocated buffer headers */
  136 static void     bufzoneinit(void);
  137 static void     bcleanbuf_thread_init(void);
  138 static void     bcleanbuf_thread(void);
  139
  140 static zone_t   buf_hdr_zone;
  141 static int      buf_hdr_count;
  142
  143
  144 /*
  145  * Definitions for the buffer hash lists.
  146  */
      /*
       * BUFHASH() selects a bucket in bufhashtbl: the vnode pointer value is
       * divided by the size of the object it points to, the logical block
       * number (truncated to int) is added, and the sum is masked with
       * bufhash.  invalhash is a separate, single list head.
       */
  147 #define BUFHASH(dvp, lbn)       \
  148         (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
  149 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
  150 u_long  bufhash;
  151
  152 static buf_t    incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);
  153
  154 /* Definitions for the buffer stats. */
  155 struct bufstats bufstats;
  156
  157 /* Number of delayed write buffers */
  158 long nbdwrite = 0;
      /* Incremented by bmovelaundry() each time a buffer is queued on BQ_LAUNDRY. */
  159 int blaundrycnt = 0;
  160 static int boot_nbuf_headers = 0;
  161
  162 static TAILQ_HEAD(delayqueue, buf) delaybufqueue;
  163
  164 static TAILQ_HEAD(ioqueue, buf) iobufqueue;
      /* One tail queue per buffer queue index (e.g. BQ_LAUNDRY). */
  165 static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
  166 static int needbuffer;
  167 static int need_iobuffer;
  168
  169 static lck_grp_t        *buf_mtx_grp;
  170 static lck_attr_t       *buf_mtx_attr;
  171 static lck_grp_attr_t   *buf_mtx_grp_attr;
  172 static lck_mtx_t        *iobuffer_mtxp;
      /*
       * buf_mtxp is taken around the shadow-list updates in
       * buf_create_shadow_internal() and must be held by callers of
       * bmovelaundry().
       * NOTE(review): the full set of state it protects is not visible in
       * this part of the file.
       */
  173 static lck_mtx_t        *buf_mtxp;
  174 static lck_mtx_t        *buf_gc_callout;
  175
  176 static int buf_busycount;
  177
      /*
       * Fixed-capacity table of (callout, context) pairs.
       * NOTE(review): registration and invocation are outside this part of
       * the file; presumably driven by buffer_cache_gc() -- confirm.
       */
  178 #define FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE 16
  179 typedef struct {
  180         void (* callout)(int, void *);
  181         void *context;
  182 } fs_buffer_cache_gc_callout_t;
  183
  184 fs_buffer_cache_gc_callout_t fs_callouts[FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE] = { {NULL, NULL} };
 185
 186 static __inline__ int
 187 buf_timestamp(void)
 188 {
 189         struct  timeval         t;
 190         microuptime(&t);
 191         return (t.tv_sec);
 192 }
 193
 194 /*
 195  * Insq/Remq for the buffer free lists.
 196  */
 197 #define binsheadfree(bp, dp, whichq)    do { \
 198                                     TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
 199                                 } while (0)
 200
 201 #define binstailfree(bp, dp, whichq)    do { \
 202                                     TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
 203                                 } while (0)
 204
 205 #define BHASHENTCHECK(bp)       \
 206         if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)  \
 207                 panic("%p: b_hash.le_prev is not deadbeef", (bp));
 208
 209 #define BLISTNONE(bp)   \
 210         (bp)->b_hash.le_next = (struct buf *)0; \
 211         (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
 212
 213 /*
 214  * Insq/Remq for the vnode usage lists.
 215  */
 216 #define bufinsvn(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_vnbufs)
 217 #define bufremvn(bp) {                                                  \
 218         LIST_REMOVE(bp, b_vnbufs);                                      \
 219         (bp)->b_vnbufs.le_next = NOLIST;                                \
 220 }
 221
  222 /*
  223  * Time in seconds before a buffer on a list is
  224  * considered as a stale buffer
  225  */
  226 #define LRU_IS_STALE 120 /* default value for the LRU */
  227 #define AGE_IS_STALE 60  /* default value for the AGE */
  228 #define META_IS_STALE 180 /* default value for the BQ_META */
  229
      /* Run-time variables, initialised from the compile-time defaults above. */
  230 int lru_is_stale = LRU_IS_STALE;
  231 int age_is_stale = AGE_IS_STALE;
  232 int meta_is_stale = META_IS_STALE;
  233
      /*
       * NOTE(review): MAXLAUNDRY is not referenced in this part of the file;
       * presumably a bound related to the BQ_LAUNDRY queue -- confirm at its
       * use site.
       */
  234 #define MAXLAUNDRY      10
 235
 236 /* LIST_INSERT_HEAD() with assertions */
 237 static __inline__ void
 238 blistenterhead(struct bufhashhdr * head, buf_t bp)
 239 {
 240         if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
 241                 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
 242         (head)->lh_first = bp;
 243         bp->b_hash.le_prev = &(head)->lh_first;
 244         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 245                 panic("blistenterhead: le_prev is deadbeef");
 246 }
 247
 248 static __inline__ void
 249 binshash(buf_t bp, struct bufhashhdr *dp)
 250 {
 251 #if DIAGNOSTIC
 252         buf_t   nbp;
 253 #endif /* DIAGNOSTIC */
 254
 255         BHASHENTCHECK(bp);
 256
 257 #if DIAGNOSTIC
 258         nbp = dp->lh_first;
 259         for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
 260                 if(nbp == bp)
 261                         panic("buf already in hashlist");
 262         }
 263 #endif /* DIAGNOSTIC */
 264
 265         blistenterhead(dp, bp);
 266 }
 267
 268 static __inline__ void
 269 bremhash(buf_t  bp)
 270 {
 271         if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
 272                 panic("bremhash le_prev is deadbeef");
 273         if (bp->b_hash.le_next == bp)
 274                 panic("bremhash: next points to self");
 275
 276         if (bp->b_hash.le_next != NULL)
 277                 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
 278         *bp->b_hash.le_prev = (bp)->b_hash.le_next;
 279 }
 280
 281 /*
 282  * buf_mtxp held.
 283  */
 284 static __inline__ void
 285 bmovelaundry(buf_t bp)
 286 {
 287         bp->b_whichq = BQ_LAUNDRY;
 288         bp->b_timestamp = buf_timestamp();
 289         binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
 290         blaundrycnt++;
 291 }
 292
 293 static __inline__ void
 294 buf_release_credentials(buf_t bp)
 295 {
 296         if (IS_VALID_CRED(bp->b_rcred)) {
 297                 kauth_cred_unref(&bp->b_rcred);
 298         }
 299         if (IS_VALID_CRED(bp->b_wcred)) {
 300                 kauth_cred_unref(&bp->b_wcred);
 301         }
 302 }
 303
 304
 305 int
 306 buf_valid(buf_t bp) {
 307
 308         if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
 309                 return 1;
 310         return 0;
 311 }
 312
 313 int
 314 buf_fromcache(buf_t bp) {
 315
 316         if ( (bp->b_flags & B_CACHE) )
 317                 return 1;
 318         return 0;
 319 }
 320
 321 void
 322 buf_markinvalid(buf_t bp) {
 323
 324         SET(bp->b_flags, B_INVAL);
 325 }
 326
 327 void
 328 buf_markdelayed(buf_t bp) {
 329
 330         if (!ISSET(bp->b_flags, B_DELWRI)) {
 331                 SET(bp->b_flags, B_DELWRI);
 332
 333                 OSAddAtomicLong(1, &nbdwrite);
 334                 buf_reassign(bp, bp->b_vp);
 335         }
 336         SET(bp->b_flags, B_DONE);
 337 }
 338
 339 void
 340 buf_markclean(buf_t bp) {
 341
 342         if (ISSET(bp->b_flags, B_DELWRI)) {
 343                 CLR(bp->b_flags, B_DELWRI);
 344
 345                 OSAddAtomicLong(-1, &nbdwrite);
 346                 buf_reassign(bp, bp->b_vp);
 347         }
 348 }
 349
 350 void
 351 buf_markeintr(buf_t bp) {
 352
 353         SET(bp->b_flags, B_EINTR);
 354 }
 355
 356
 357 void
 358 buf_markaged(buf_t bp) {
 359
 360         SET(bp->b_flags, B_AGE);
 361 }
 362
 363 int
 364 buf_fua(buf_t bp) {
 365
 366         if ((bp->b_flags & B_FUA) == B_FUA)
 367                 return 1;
 368         return 0;
 369 }
 370
 371 void
 372 buf_markfua(buf_t bp) {
 373
 374         SET(bp->b_flags, B_FUA);
 375 }
 376
 377 #if CONFIG_PROTECT
 378 cpx_t bufattr_cpx(bufattr_t bap)
 379 {
 380         return bap->ba_cpx;
 381 }
 382
 383 void bufattr_setcpx(bufattr_t bap, cpx_t cpx)
 384 {
 385         bap->ba_cpx = cpx;
 386 }
 387
 388 void
 389 buf_setcpoff (buf_t bp, uint64_t foffset) {
 390         bp->b_attr.ba_cp_file_off = foffset;
 391 }
 392
 393 uint64_t
 394 bufattr_cpoff(bufattr_t bap) {
 395         return bap->ba_cp_file_off;
 396 }
 397
 398 void
 399 bufattr_setcpoff(bufattr_t bap, uint64_t foffset) {
 400         bap->ba_cp_file_off = foffset;
 401 }
 402
 403 #else // !CONTECT_PROTECT
 404
 405 uint64_t
 406 bufattr_cpoff(bufattr_t bap __unused) {
 407         return 0;
 408 }
 409
 410 void
 411 bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) {
 412         return;
 413 }
 414
 415 struct cpx *bufattr_cpx(__unused bufattr_t bap)
 416 {
 417         return NULL;
 418 }
 419
 420 void bufattr_setcpx(__unused bufattr_t bap, __unused struct cpx *cpx)
 421 {
 422 }
 423
 424 #endif /* !CONFIG_PROTECT */
 425
 426 bufattr_t
 427 bufattr_alloc() {
 428         bufattr_t bap;
 429         MALLOC(bap, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
 430         if (bap == NULL)
 431                 return NULL;
 432
 433         bzero(bap, sizeof(struct bufattr));
 434         return bap;
 435 }
 436
 437 void
 438 bufattr_free(bufattr_t bap) {
 439         if (bap)
 440                 FREE(bap, M_TEMP);
 441 }
 442
 443 bufattr_t
 444 bufattr_dup(bufattr_t bap) {
 445         bufattr_t new_bufattr;
 446         MALLOC(new_bufattr, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
 447         if (new_bufattr == NULL)
 448                 return NULL;
 449
 450         /* Copy the provided one into the new copy */
 451         memcpy (new_bufattr, bap, sizeof(struct bufattr));
 452         return new_bufattr;
 453 }
 454
 455 int
 456 bufattr_rawencrypted(bufattr_t bap) {
 457         if ( (bap->ba_flags & BA_RAW_ENCRYPTED_IO) )
 458                 return 1;
 459         return 0;
 460 }
 461
 462 int
 463 bufattr_throttled(bufattr_t bap) {
 464         return (GET_BUFATTR_IO_TIER(bap));
 465 }
 466
 467 int
 468 bufattr_passive(bufattr_t bap) {
 469         if ( (bap->ba_flags & BA_PASSIVE) )
 470                 return 1;
 471         return 0;
 472 }
 473
 474 int
 475 bufattr_nocache(bufattr_t bap) {
 476         if ( (bap->ba_flags & BA_NOCACHE) )
 477                 return 1;
 478         return 0;
 479 }
 480
 481 int
 482 bufattr_meta(bufattr_t bap) {
 483         if ( (bap->ba_flags & BA_META) )
 484                 return 1;
 485         return 0;
 486 }
 487
 488 void
 489 bufattr_markmeta(bufattr_t bap) {
 490         SET(bap->ba_flags,  BA_META);
 491 }
 492
 493 int
 494 bufattr_delayidlesleep(bufattr_t bap)
 495 {
 496         if ( (bap->ba_flags & BA_DELAYIDLESLEEP) )
 497                 return 1;
 498         return 0;
 499 }
 500
 501 bufattr_t
 502 buf_attr(buf_t bp) {
 503         return &bp->b_attr;
 504 }
 505
 506 void
 507 buf_markstatic(buf_t bp __unused) {
 508         SET(bp->b_flags, B_STATICCONTENT);
 509 }
 510
 511 int
 512 buf_static(buf_t bp) {
 513     if ( (bp->b_flags & B_STATICCONTENT) )
 514         return 1;
 515     return 0;
 516 }
 517
 518 void
 519 bufattr_markgreedymode(bufattr_t bap) {
 520         SET(bap->ba_flags, BA_GREEDY_MODE);
 521 }
 522
 523 int
 524 bufattr_greedymode(bufattr_t bap) {
 525     if ( (bap->ba_flags & BA_GREEDY_MODE) )
 526         return 1;
 527     return 0;
 528 }
 529
 530 void
 531 bufattr_markisochronous(bufattr_t bap) {
 532         SET(bap->ba_flags, BA_ISOCHRONOUS);
 533 }
 534
 535 int
 536 bufattr_isochronous(bufattr_t bap) {
 537     if ( (bap->ba_flags & BA_ISOCHRONOUS) )
 538         return 1;
 539     return 0;
 540 }
 541
 542 void
 543 bufattr_markquickcomplete(bufattr_t bap) {
 544         SET(bap->ba_flags, BA_QUICK_COMPLETE);
 545 }
 546
 547 int
 548 bufattr_quickcomplete(bufattr_t bap) {
 549     if ( (bap->ba_flags & BA_QUICK_COMPLETE) )
 550         return 1;
 551     return 0;
 552 }
 553
 554 errno_t
 555 buf_error(buf_t bp) {
 556
 557         return (bp->b_error);
 558 }
 559
 560 void
 561 buf_seterror(buf_t bp, errno_t error) {
 562
 563         if ((bp->b_error = error))
 564                 SET(bp->b_flags, B_ERROR);
 565         else
 566                 CLR(bp->b_flags, B_ERROR);
 567 }
 568
 569 void
 570 buf_setflags(buf_t bp, int32_t flags) {
 571
 572         SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
 573 }
 574
 575 void
 576 buf_clearflags(buf_t bp, int32_t flags) {
 577
 578         CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
 579 }
 580
 581 int32_t
 582 buf_flags(buf_t bp) {
 583
 584         return ((bp->b_flags & BUF_X_RDFLAGS));
 585 }
 586
 587 void
 588 buf_reset(buf_t bp, int32_t io_flags) {
 589
 590         CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
 591         SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
 592
 593         bp->b_error = 0;
 594 }
 595
 596 uint32_t
 597 buf_count(buf_t bp) {
 598
 599         return (bp->b_bcount);
 600 }
 601
 602 void
 603 buf_setcount(buf_t bp, uint32_t bcount) {
 604
 605         bp->b_bcount = bcount;
 606 }
 607
 608 uint32_t
 609 buf_size(buf_t bp) {
 610
 611         return (bp->b_bufsize);
 612 }
 613
 614 void
 615 buf_setsize(buf_t bp, uint32_t bufsize) {
 616
 617         bp->b_bufsize = bufsize;
 618 }
 619
 620 uint32_t
 621 buf_resid(buf_t bp) {
 622
 623         return (bp->b_resid);
 624 }
 625
 626 void
 627 buf_setresid(buf_t bp, uint32_t resid) {
 628
 629         bp->b_resid = resid;
 630 }
 631
 632 uint32_t
 633 buf_dirtyoff(buf_t bp) {
 634
 635         return (bp->b_dirtyoff);
 636 }
 637
 638 uint32_t
 639 buf_dirtyend(buf_t bp) {
 640
 641         return (bp->b_dirtyend);
 642 }
 643
 644 void
 645 buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
 646
 647         bp->b_dirtyoff = dirtyoff;
 648 }
 649
 650 void
 651 buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
 652
 653         bp->b_dirtyend = dirtyend;
 654 }
 655
 656 uintptr_t
 657 buf_dataptr(buf_t bp) {
 658
 659         return (bp->b_datap);
 660 }
 661
 662 void
 663 buf_setdataptr(buf_t bp, uintptr_t data) {
 664
 665         bp->b_datap = data;
 666 }
 667
 668 vnode_t
 669 buf_vnode(buf_t bp) {
 670
 671         return (bp->b_vp);
 672 }
 673
 674 void
 675 buf_setvnode(buf_t bp, vnode_t vp) {
 676
 677         bp->b_vp = vp;
 678 }
 679
 680
 681 void *
 682 buf_callback(buf_t bp)
 683 {
 684         if ( !(bp->b_flags & B_CALL) )
 685                 return ((void *) NULL);
 686
 687         return ((void *)bp->b_iodone);
 688 }
 689
 690
 691 errno_t
 692 buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
 693 {
 694         assert(!ISSET(bp->b_flags, B_FILTER) && ISSET(bp->b_lflags, BL_BUSY));
 695
 696         if (callback)
 697                 bp->b_flags |= (B_CALL | B_ASYNC);
 698         else
 699                 bp->b_flags &= ~B_CALL;
 700         bp->b_transaction = transaction;
 701         bp->b_iodone = callback;
 702
 703         return (0);
 704 }
 705
 706 errno_t
 707 buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
 708 {
 709
 710         if ( !(bp->b_lflags & BL_IOBUF) )
 711                 return (EINVAL);
 712
 713         if (upl)
 714                 bp->b_flags |= B_CLUSTER;
 715         else
 716                 bp->b_flags &= ~B_CLUSTER;
 717         bp->b_upl = upl;
 718         bp->b_uploffset = offset;
 719
 720         return (0);
 721 }
 722
 723 buf_t
 724 buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
 725 {
 726         buf_t   io_bp;
 727
 728         if (io_offset < 0 || io_size < 0)
 729                 return (NULL);
 730
 731         if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
 732                 return (NULL);
 733
 734         if (bp->b_flags & B_CLUSTER) {
 735                 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
 736                         return (NULL);
 737
 738                 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
 739                         return (NULL);
 740         }
 741         io_bp = alloc_io_buf(bp->b_vp, 0);
 742
 743         io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);
 744
 745         if (iodone) {
 746                 io_bp->b_transaction = arg;
 747                 io_bp->b_iodone = iodone;
 748                 io_bp->b_flags |= B_CALL;
 749         }
 750         if (bp->b_flags & B_CLUSTER) {
 751                 io_bp->b_upl = bp->b_upl;
 752                 io_bp->b_uploffset = bp->b_uploffset + io_offset;
 753         } else {
 754                 io_bp->b_datap  = (uintptr_t)(((char *)bp->b_datap) + io_offset);
 755         }
 756         io_bp->b_bcount = io_size;
 757
 758         return (io_bp);
 759 }
 760
 761
 762 int
 763 buf_shadow(buf_t bp)
 764 {
 765         if (bp->b_lflags & BL_SHADOW)
 766                 return 1;
 767         return 0;
 768 }
 769
 770
 771 buf_t
 772 buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
 773 {
 774         return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1));
 775 }
 776
 777 buf_t
 778 buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
 779 {
 780         return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0));
 781 }
 782
 783
 784 static buf_t
 785 buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv)
 786 {
 787         buf_t   io_bp;
 788
 789         KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0);
 790
 791         if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) {
 792
 793                 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0);
 794                 return (NULL);
 795         }
 796 #ifdef BUF_MAKE_PRIVATE
 797         if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0)
 798                 panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref);
 799 #endif
 800         io_bp = alloc_io_buf(bp->b_vp, priv);
 801
 802         io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA);
 803         io_bp->b_blkno = bp->b_blkno;
 804         io_bp->b_lblkno = bp->b_lblkno;
 805
 806         if (iodone) {
 807                 io_bp->b_transaction = arg;
 808                 io_bp->b_iodone = iodone;
 809                 io_bp->b_flags |= B_CALL;
 810         }
 811         if (force_copy == FALSE) {
 812                 io_bp->b_bcount = bp->b_bcount;
 813                 io_bp->b_bufsize = bp->b_bufsize;
 814
 815                 if (external_storage) {
 816                         io_bp->b_datap = external_storage;
 817 #ifdef BUF_MAKE_PRIVATE
 818                         io_bp->b_data_store = NULL;
 819 #endif
 820                 } else {
 821                         io_bp->b_datap = bp->b_datap;
 822 #ifdef BUF_MAKE_PRIVATE
 823                         io_bp->b_data_store = bp;
 824 #endif
 825                 }
 826                 *(buf_t *)(&io_bp->b_orig) = bp;
 827
 828                 lck_mtx_lock_spin(buf_mtxp);
 829
 830                 io_bp->b_lflags |= BL_SHADOW;
 831                 io_bp->b_shadow = bp->b_shadow;
 832                 bp->b_shadow = io_bp;
 833                 bp->b_shadow_ref++;
 834
 835 #ifdef BUF_MAKE_PRIVATE
 836                 if (external_storage)
 837                         io_bp->b_lflags |= BL_EXTERNAL;
 838                 else
 839                         bp->b_data_ref++;
 840 #endif
 841                 lck_mtx_unlock(buf_mtxp);
 842         } else {
 843                 if (external_storage) {
 844 #ifdef BUF_MAKE_PRIVATE
 845                         io_bp->b_lflags |= BL_EXTERNAL;
 846 #endif
 847                         io_bp->b_bcount = bp->b_bcount;
 848                         io_bp->b_bufsize = bp->b_bufsize;
 849                         io_bp->b_datap = external_storage;
 850                 } else {
 851                         allocbuf(io_bp, bp->b_bcount);
 852
 853                         io_bp->b_lflags |= BL_IOBUF_ALLOC;
 854                 }
 855                 bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount);
 856
 857 #ifdef BUF_MAKE_PRIVATE
 858                 io_bp->b_data_store = NULL;
 859 #endif
 860         }
 861         KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0);
 862
 863         return (io_bp);
 864 }
 865
 866
 867 #ifdef BUF_MAKE_PRIVATE
 868 errno_t
 869 buf_make_private(buf_t bp)
 870 {
 871         buf_t   ds_bp;
 872         buf_t   t_bp;
 873         struct buf my_buf;
 874
 875         KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0);
 876
 877         if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) {
 878
 879                 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
 880                 return (EINVAL);
 881         }
 882         my_buf.b_flags = B_META;
 883         my_buf.b_datap = (uintptr_t)NULL;
 884         allocbuf(&my_buf, bp->b_bcount);
 885
 886         bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);
 887
 888         lck_mtx_lock_spin(buf_mtxp);
 889
 890         for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
 891                 if ( !ISSET(bp->b_lflags, BL_EXTERNAL))
 892                         break;
 893         }
 894         ds_bp = t_bp;
 895
 896         if (ds_bp == NULL && bp->b_data_ref)
 897                 panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL");
 898
 899         if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0))
 900                 panic("buf_make_private: ref_count == 0 && ds_bp != NULL");
 901
 902         if (ds_bp == NULL) {
 903                 lck_mtx_unlock(buf_mtxp);
 904
 905                 buf_free_meta_store(&my_buf);
 906
 907                 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
 908                 return (EINVAL);
 909         }
 910         for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
 911                 if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
 912                         t_bp->b_data_store = ds_bp;
 913         }
 914         ds_bp->b_data_ref = bp->b_data_ref;
 915
 916         bp->b_data_ref = 0;
 917         bp->b_datap = my_buf.b_datap;
 918
 919         lck_mtx_unlock(buf_mtxp);
 920
 921         KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
 922         return (0);
 923 }
 924 #endif
 925
 926
 927 void
 928 buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
 929                           void (**old_iodone)(buf_t, void *), void **old_transaction)
 930 {
 931         assert(ISSET(bp->b_lflags, BL_BUSY));
 932
 933         if (old_iodone)
 934                 *old_iodone = bp->b_iodone;
 935         if (old_transaction)
 936                 *old_transaction = bp->b_transaction;
 937
 938         bp->b_transaction = transaction;
 939         bp->b_iodone = filter;
 940         if (filter)
 941                 bp->b_flags |= B_FILTER;
 942         else
 943                 bp->b_flags &= ~B_FILTER;
 944 }
 945
 946
 947 daddr64_t
 948 buf_blkno(buf_t bp) {
 949
 950         return (bp->b_blkno);
 951 }
 952
 953 daddr64_t
 954 buf_lblkno(buf_t bp) {
 955
 956         return (bp->b_lblkno);
 957 }
 958
 959 void
 960 buf_setblkno(buf_t bp, daddr64_t blkno) {
 961
 962         bp->b_blkno = blkno;
 963 }
 964
 965 void
 966 buf_setlblkno(buf_t bp, daddr64_t lblkno) {
 967
 968         bp->b_lblkno = lblkno;
 969 }
 970
 971 dev_t
 972 buf_device(buf_t bp) {
 973
 974         return (bp->b_dev);
 975 }
 976
 977 errno_t
 978 buf_setdevice(buf_t bp, vnode_t vp) {
 979
 980         if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
 981                 return EINVAL;
 982         bp->b_dev = vp->v_rdev;
 983
 984         return 0;
 985 }
 986
 987
 988 void *
 989 buf_drvdata(buf_t bp) {
 990
 991         return (bp->b_drvdata);
 992 }
 993
 994 void
 995 buf_setdrvdata(buf_t bp, void *drvdata) {
 996
 997         bp->b_drvdata = drvdata;
 998 }
 999
1000 void *
1001 buf_fsprivate(buf_t bp) {
1002
1003         return (bp->b_fsprivate);
1004 }
1005
1006 void
1007 buf_setfsprivate(buf_t bp, void *fsprivate) {
1008
1009         bp->b_fsprivate = fsprivate;
1010 }
1011
1012 kauth_cred_t
1013 buf_rcred(buf_t bp) {
1014
1015         return (bp->b_rcred);
1016 }
1017
1018 kauth_cred_t
1019 buf_wcred(buf_t bp) {
1020
1021         return (bp->b_wcred);
1022 }
1023
1024 void *
1025 buf_upl(buf_t bp) {
1026
1027         return (bp->b_upl);
1028 }
1029
1030 uint32_t
1031 buf_uploffset(buf_t bp) {
1032
1033         return ((uint32_t)(bp->b_uploffset));
1034 }
1035
1036 proc_t
1037 buf_proc(buf_t bp) {
1038
1039         return (bp->b_proc);
1040 }
1041
1042
1043 errno_t
1044 buf_map(buf_t bp, caddr_t *io_addr)
1045 {
1046         buf_t           real_bp;
1047         vm_offset_t     vaddr;
1048         kern_return_t   kret;
1049
1050         if ( !(bp->b_flags & B_CLUSTER)) {
1051                 *io_addr = (caddr_t)bp->b_datap;
1052                 return (0);
1053         }
1054         real_bp = (buf_t)(bp->b_real_bp);
1055
1056         if (real_bp && real_bp->b_datap) {
1057                 /*
1058                  * b_real_bp is only valid if B_CLUSTER is SET
1059                  * if it's non-zero, than someone did a cluster_bp call
1060                  * if the backing physical pages were already mapped
1061                  * in before the call to cluster_bp (non-zero b_datap),
1062                  * than we just use that mapping
1063                  */
1064                 *io_addr = (caddr_t)real_bp->b_datap;
1065                 return (0);
1066         }
1067         kret = ubc_upl_map(bp->b_upl, &vaddr);    /* Map it in */
1068
1069         if (kret != KERN_SUCCESS) {
1070                 *io_addr = NULL;
1071
1072                 return(ENOMEM);
1073         }
1074         vaddr += bp->b_uploffset;
1075
1076         *io_addr = (caddr_t)vaddr;
1077
1078         return (0);
1079 }
1080
1081 errno_t
1082 buf_unmap(buf_t bp)
1083 {
1084         buf_t           real_bp;
1085         kern_return_t   kret;
1086
1087         if ( !(bp->b_flags & B_CLUSTER))
1088                 return (0);
1089         /*
1090          * see buf_map for the explanation
1091          */
1092         real_bp = (buf_t)(bp->b_real_bp);
1093
1094         if (real_bp && real_bp->b_datap)
1095                 return (0);
1096
1097         if ((bp->b_lflags & BL_IOBUF) &&
1098             ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
1099                 /*
1100                  * ignore pageins... the 'right' thing will
1101                  * happen due to the way we handle speculative
1102                  * clusters...
1103                  *
1104                  * when we commit these pages, we'll hit
1105                  * it with UPL_COMMIT_INACTIVE which
1106                  * will clear the reference bit that got
1107                  * turned on when we touched the mapping
1108                  */
1109                 bp->b_flags |= B_AGE;
1110         }
1111         kret = ubc_upl_unmap(bp->b_upl);
1112
1113         if (kret != KERN_SUCCESS)
1114                 return (EINVAL);
1115         return (0);
1116 }
1117
1118
1119 void
1120 buf_clear(buf_t bp) {
1121         caddr_t baddr;
1122
1123         if (buf_map(bp, &baddr) == 0) {
1124                 bzero(baddr, bp->b_bcount);
1125                 buf_unmap(bp);
1126         }
1127         bp->b_resid = 0;
1128 }
1129
1130 /*
1131  * Read or write a buffer that is not contiguous on disk.
1132  * buffer is marked done/error at the conclusion
1133  */
1134 static int
1135 buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
1136 {
1137         vnode_t vp = buf_vnode(bp);
1138         buf_t   io_bp;                   /* For reading or writing a single block */
1139         int     io_direction;
1140         int     io_resid;
1141         size_t  io_contig_bytes;
1142         daddr64_t io_blkno;
1143         int     error = 0;
1144         int     bmap_flags;
1145
1146         /*
1147          * save our starting point... the bp was already mapped
1148          * in buf_strategy before we got called
1149          * no sense doing it again.
1150          */
1151         io_blkno = bp->b_blkno;
1152         /*
1153          * Make sure we redo this mapping for the next I/O
1154          * i.e. this can never be a 'permanent' mapping
1155          */
1156         bp->b_blkno = bp->b_lblkno;
1157
1158         /*
1159          * Get an io buffer to do the deblocking
1160          */
1161         io_bp = alloc_io_buf(devvp, 0);
1162
1163         io_bp->b_lblkno = bp->b_lblkno;
1164         io_bp->b_datap  = bp->b_datap;
1165         io_resid        = bp->b_bcount;
1166         io_direction    = bp->b_flags & B_READ;
1167         io_contig_bytes = contig_bytes;
1168
1169         if (bp->b_flags & B_READ)
1170                 bmap_flags = VNODE_READ;
1171         else
1172                 bmap_flags = VNODE_WRITE;
1173
1174         for (;;) {
1175                 if (io_blkno == -1)
1176                         /*
1177                          * this is unexepected, but we'll allow for it
1178                          */
1179                         bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
1180                 else {
1181                         io_bp->b_bcount  = io_contig_bytes;
1182                         io_bp->b_bufsize = io_contig_bytes;
1183                         io_bp->b_resid   = io_contig_bytes;
1184                         io_bp->b_blkno   = io_blkno;
1185
1186                         buf_reset(io_bp, io_direction);
1187
1188                         /*
1189                          * Call the device to do the I/O and wait for it.  Make sure the appropriate party is charged for write
1190                          */
1191
1192                         if (!ISSET(bp->b_flags, B_READ))
1193                                 OSAddAtomic(1, &devvp->v_numoutput);
1194
1195                         if ((error = VNOP_STRATEGY(io_bp)))
1196                                 break;
1197                         if ((error = (int)buf_biowait(io_bp)))
1198                                 break;
1199                         if (io_bp->b_resid) {
1200                                 io_resid -= (io_contig_bytes - io_bp->b_resid);
1201                                 break;
1202                         }
1203                 }
1204                 if ((io_resid -= io_contig_bytes) == 0)
1205                         break;
1206                 f_offset       += io_contig_bytes;
1207                 io_bp->b_datap += io_contig_bytes;
1208
1209                 /*
1210                  * Map the current position to a physical block number
1211                  */
1212                 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
1213                         break;
1214         }
1215         buf_free(io_bp);
1216
1217         if (error)
1218                 buf_seterror(bp, error);
1219         bp->b_resid = io_resid;
1220         /*
1221          * This I/O is now complete
1222          */
1223         buf_biodone(bp);
1224
1225         return error;
1226 }
1227
1228
1229 /*
1230  * struct vnop_strategy_args {
1231  *      struct buf *a_bp;
1232  * } *ap;
1233  */
1234 errno_t
1235 buf_strategy(vnode_t devvp, void *ap)
1236 {
1237         buf_t   bp = ((struct vnop_strategy_args *)ap)->a_bp;
1238         vnode_t vp = bp->b_vp;
1239         int     bmap_flags;
1240         errno_t error;
1241 #if CONFIG_DTRACE
1242         int dtrace_io_start_flag = 0;    /* We only want to trip the io:::start
1243                                           * probe once, with the true physical
1244                                           * block in place (b_blkno)
1245                                           */
1246
1247 #endif
1248
1249         if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
1250                 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
1251         /*
1252          * associate the physical device with
1253          * with this buf_t even if we don't
1254          * end up issuing the I/O...
1255          */
1256         bp->b_dev = devvp->v_rdev;
1257
1258         if (bp->b_flags & B_READ)
1259                 bmap_flags = VNODE_READ;
1260         else
1261                 bmap_flags = VNODE_WRITE;
1262
1263         if ( !(bp->b_flags & B_CLUSTER)) {
1264
1265                 if ( (bp->b_upl) ) {
1266                         /*
1267                          * we have a UPL associated with this bp
1268                          * go through cluster_bp which knows how
1269                          * to deal with filesystem block sizes
1270                          * that aren't equal to the page size
1271                          */
1272                         DTRACE_IO1(start, buf_t, bp);
1273                         return (cluster_bp(bp));
1274                 }
1275                 if (bp->b_blkno == bp->b_lblkno) {
1276                     off_t       f_offset;
1277                         size_t  contig_bytes;
1278
1279                         if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
1280                                 DTRACE_IO1(start, buf_t, bp);
1281                                 buf_seterror(bp, error);
1282                                 buf_biodone(bp);
1283
1284                             return (error);
1285                         }
1286
1287                 if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
1288                                 DTRACE_IO1(start, buf_t, bp);
1289                                 buf_seterror(bp, error);
1290                                 buf_biodone(bp);
1291
1292                                 return (error);
1293                         }
1294
1295                         DTRACE_IO1(start, buf_t, bp);
1296 #if CONFIG_DTRACE
1297                         dtrace_io_start_flag = 1;
1298 #endif /* CONFIG_DTRACE */
1299
1300                         if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
1301                                 /* Set block number to force biodone later */
1302                                 bp->b_blkno = -1;
1303                                 buf_clear(bp);
1304                         }
1305                         else if ((long)contig_bytes < bp->b_bcount) {
1306                                 return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
1307                         }
1308                 }
1309
1310 #if CONFIG_DTRACE
1311                 if (dtrace_io_start_flag == 0) {
1312                         DTRACE_IO1(start, buf_t, bp);
1313                         dtrace_io_start_flag = 1;
1314                 }
1315 #endif /* CONFIG_DTRACE */
1316
1317                 if (bp->b_blkno == -1) {
1318                         buf_biodone(bp);
1319                         return (0);
1320                 }
1321         }
1322
1323 #if CONFIG_DTRACE
1324         if (dtrace_io_start_flag == 0)
1325                 DTRACE_IO1(start, buf_t, bp);
1326 #endif /* CONFIG_DTRACE */
1327
1328 #if CONFIG_PROTECT
1329         /* Capture f_offset in the bufattr*/
1330         cpx_t cpx = bufattr_cpx(buf_attr(bp));
1331         if (cpx) {
1332                 /* No need to go here for older EAs */
1333                 if(cpx_use_offset_for_iv(cpx) && !cpx_synthetic_offset_for_iv(cpx)) {
1334                         off_t f_offset;
1335                         if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
1336                                 return error;
1337
1338                         /*
1339                          * Attach the file offset to this buffer.  The
1340                          * bufattr attributes will be passed down the stack
1341                          * until they reach the storage driver (whether
1342                          * IOFlashStorage, ASP, or IONVMe). The driver
1343                          * will retain the offset in a local variable when it
1344                          * issues its I/Os to the NAND controller.
1345                          *
1346                          * Note that LwVM may end up splitting this I/O
1347                          * into sub-I/Os if it crosses a chunk boundary.  In this
1348                          * case, LwVM will update this field when it dispatches
1349                          * each I/O to IOFlashStorage.  But from our perspective
1350                          * we have only issued a single I/O.
1351                          *
1352                          * In the case of APFS we do not bounce through another
1353                          * intermediate layer (such as CoreStorage). APFS will
1354                          * issue the I/Os directly to the block device / IOMedia
1355                          * via buf_strategy on the specfs node.
1356                          */
1357                         buf_setcpoff(bp, f_offset);
1358                         CP_DEBUG((CPDBG_OFFSET_IO | DBG_FUNC_NONE), (uint32_t) f_offset, (uint32_t) bp->b_lblkno, (uint32_t) bp->b_blkno, (uint32_t) bp->b_bcount, 0);
1359                 }
1360         }
1361 #endif
1362
1363         /*
1364          * we can issue the I/O because...
1365          * either B_CLUSTER is set which
1366          * means that the I/O is properly set
1367          * up to be a multiple of the page size, or
1368          * we were able to successfully set up the
1369          * physical block mapping
1370          */
1371         error = VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap);
1372         DTRACE_FSINFO(strategy, vnode_t, vp);
1373         return (error);
1374 }
1375
1376
1377
1378 buf_t
1379 buf_alloc(vnode_t vp)
1380 {
1381         return(alloc_io_buf(vp, is_vm_privileged()));
1382 }
1383
1384 void
1385 buf_free(buf_t bp) {
1386
1387         free_io_buf(bp);
1388 }
1389
1390
1391 /*
1392  * iterate buffers for the specified vp.
1393  *   if BUF_SCAN_DIRTY is set, do the dirty list
1394  *   if BUF_SCAN_CLEAN is set, do the clean list
1395  *   if neither flag is set, default to BUF_SCAN_DIRTY
1396  *   if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
1397  */
1398
1399 struct buf_iterate_info_t {
1400         int flag;
1401         struct buflists *listhead;
1402 };
1403
1404 void
1405 buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
1406 {
1407         buf_t   bp;
1408         int     retval;
1409         struct  buflists local_iterblkhd;
1410         int     lock_flags = BAC_NOWAIT | BAC_REMOVE;
1411         int     notify_busy = flags & BUF_NOTIFY_BUSY;
1412         struct buf_iterate_info_t list[2];
1413         int     num_lists, i;
1414
1415         if (flags & BUF_SKIP_LOCKED)
1416                 lock_flags |= BAC_SKIP_LOCKED;
1417         if (flags & BUF_SKIP_NONLOCKED)
1418                 lock_flags |= BAC_SKIP_NONLOCKED;
1419
1420         if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN)))
1421                 flags |= BUF_SCAN_DIRTY;
1422
1423         num_lists = 0;
1424
1425         if (flags & BUF_SCAN_DIRTY) {
1426                 list[num_lists].flag = VBI_DIRTY;
1427                 list[num_lists].listhead = &vp->v_dirtyblkhd;
1428                 num_lists++;
1429         }
1430         if (flags & BUF_SCAN_CLEAN) {
1431                 list[num_lists].flag = VBI_CLEAN;
1432                 list[num_lists].listhead = &vp->v_cleanblkhd;
1433                 num_lists++;
1434         }
1435
1436         for (i = 0; i < num_lists; i++) {
1437                 lck_mtx_lock(buf_mtxp);
1438
1439                 if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag))  {
1440                         lck_mtx_unlock(buf_mtxp);
1441                         continue;
1442                 }
1443                 while (!LIST_EMPTY(&local_iterblkhd)) {
1444                         bp = LIST_FIRST(&local_iterblkhd);
1445                         LIST_REMOVE(bp, b_vnbufs);
1446                         LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
1447
1448                         if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
1449                                 if (notify_busy) {
1450                                         bp = NULL;
1451                                 } else {
1452                                         continue;
1453                                 }
1454                         }
1455
1456                         lck_mtx_unlock(buf_mtxp);
1457
1458                         retval = callout(bp, arg);
1459
1460                         switch (retval) {
1461                         case BUF_RETURNED:
1462                                 if (bp)
1463                                         buf_brelse(bp);
1464                                 break;
1465                         case BUF_CLAIMED:
1466                                 break;
1467                         case BUF_RETURNED_DONE:
1468                                 if (bp)
1469                                         buf_brelse(bp);
1470                                 lck_mtx_lock(buf_mtxp);
1471                                 goto out;
1472                         case BUF_CLAIMED_DONE:
1473                                 lck_mtx_lock(buf_mtxp);
1474                                 goto out;
1475                         }
1476                         lck_mtx_lock(buf_mtxp);
1477                 } /* while list has more nodes */
1478           out:
1479                 buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
1480                 lck_mtx_unlock(buf_mtxp);
1481         } /* for each list */
1482 } /* buf_iterate */
1483
1484
1485 /*
1486  * Flush out and invalidate all buffers associated with a vnode.
1487  */
1488 int
1489 buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
1490 {
1491         buf_t   bp;
1492         int     aflags;
1493         int     error = 0;
1494         int     must_rescan = 1;
1495         struct  buflists local_iterblkhd;
1496
1497
1498         if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1499                 return (0);
1500
1501         lck_mtx_lock(buf_mtxp);
1502
1503         for (;;) {
1504                 if (must_rescan == 0)
1505                         /*
1506                          * the lists may not be empty, but all that's left at this
1507                          * point are metadata or B_LOCKED buffers which are being
1508                          * skipped... we know this because we made it through both
1509                          * the clean and dirty lists without dropping buf_mtxp...
1510                          * each time we drop buf_mtxp we bump "must_rescan"
1511                          */
1512                         break;
1513                 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1514                         break;
1515                 must_rescan = 0;
1516                 /*
1517                  * iterate the clean list
1518                  */
1519                 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
1520                         goto try_dirty_list;
1521                 }
1522                 while (!LIST_EMPTY(&local_iterblkhd)) {
1523
1524                         bp = LIST_FIRST(&local_iterblkhd);
1525
1526                         LIST_REMOVE(bp, b_vnbufs);
1527                         LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
1528
1529                         /*
1530                          * some filesystems distinguish meta data blocks with a negative logical block #
1531                          */
1532                         if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1533                                 continue;
1534
1535                         aflags = BAC_REMOVE;
1536
1537                         if ( !(flags & BUF_INVALIDATE_LOCKED) )
1538                                 aflags |= BAC_SKIP_LOCKED;
1539
1540                         if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1541                                 if (error == EDEADLK)
1542                                         /*
1543                                          * this buffer was marked B_LOCKED...
1544                                          * we didn't drop buf_mtxp, so we
1545                                          * we don't need to rescan
1546                                          */
1547                                         continue;
1548                                 if (error == EAGAIN) {
1549                                         /*
1550                                          * found a busy buffer... we blocked and
1551                                          * dropped buf_mtxp, so we're going to
1552                                          * need to rescan after this pass is completed
1553                                          */
1554                                         must_rescan++;
1555                                         continue;
1556                                 }
1557                                 /*
1558                                  * got some kind of 'real' error out of the msleep
1559                                  * in buf_acquire_locked, terminate the scan and return the error
1560                                  */
1561                                 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1562
1563                                 lck_mtx_unlock(buf_mtxp);
1564                                 return (error);
1565                         }
1566                         lck_mtx_unlock(buf_mtxp);
1567
1568                         if (bp->b_flags & B_LOCKED)
1569                                 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);
1570
1571                         CLR(bp->b_flags, B_LOCKED);
1572                         SET(bp->b_flags, B_INVAL);
1573                         buf_brelse(bp);
1574
1575                         lck_mtx_lock(buf_mtxp);
1576
1577                         /*
1578                          * by dropping buf_mtxp, we allow new
1579                          * buffers to be added to the vnode list(s)
1580                          * we'll have to rescan at least once more
1581                          * if the queues aren't empty
1582                          */
1583                         must_rescan++;
1584                 }
1585                 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1586
1587 try_dirty_list:
1588                 /*
1589                  * Now iterate on dirty blks
1590                  */
1591                 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1592                         continue;
1593                 }
1594                 while (!LIST_EMPTY(&local_iterblkhd)) {
1595                         bp = LIST_FIRST(&local_iterblkhd);
1596
1597                         LIST_REMOVE(bp, b_vnbufs);
1598                         LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1599
1600                         /*
1601                          * some filesystems distinguish meta data blocks with a negative logical block #
1602                          */
1603                         if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1604                                 continue;
1605
1606                         aflags = BAC_REMOVE;
1607
1608                         if ( !(flags & BUF_INVALIDATE_LOCKED) )
1609                                 aflags |= BAC_SKIP_LOCKED;
1610
1611                         if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1612                                 if (error == EDEADLK)
1613                                         /*
1614                                          * this buffer was marked B_LOCKED...
1615                                          * we didn't drop buf_mtxp, so we
1616                                          * we don't need to rescan
1617                                          */
1618                                         continue;
1619                                 if (error == EAGAIN) {
1620                                         /*
1621                                          * found a busy buffer... we blocked and
1622                                          * dropped buf_mtxp, so we're going to
1623                                          * need to rescan after this pass is completed
1624                                          */
1625                                         must_rescan++;
1626                                         continue;
1627                                 }
1628                                 /*
1629                                  * got some kind of 'real' error out of the msleep
1630                                  * in buf_acquire_locked, terminate the scan and return the error
1631                                  */
1632                                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1633
1634                                 lck_mtx_unlock(buf_mtxp);
1635                                 return (error);
1636                         }
1637                         lck_mtx_unlock(buf_mtxp);
1638
1639                         if (bp->b_flags & B_LOCKED)
1640                                 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);
1641
1642                         CLR(bp->b_flags, B_LOCKED);
1643                         SET(bp->b_flags, B_INVAL);
1644
1645                         if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1646                                 (void) VNOP_BWRITE(bp);
1647                         else
1648                                 buf_brelse(bp);
1649
1650                         lck_mtx_lock(buf_mtxp);
1651                         /*
1652                          * by dropping buf_mtxp, we allow new
1653                          * buffers to be added to the vnode list(s)
1654                          * we'll have to rescan at least once more
1655                          * if the queues aren't empty
1656                          */
1657                         must_rescan++;
1658                 }
1659                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1660         }
1661         lck_mtx_unlock(buf_mtxp);
1662
1663         return (0);
1664 }
1665
1666 void
1667 buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {
1668
1669         (void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg);
1670         return;
1671 }
1672
1673 int
1674 buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) {
1675         buf_t   bp;
1676         int     writes_issued = 0;
1677         errno_t error;
1678         int     busy = 0;
1679         struct  buflists local_iterblkhd;
1680         int     lock_flags = BAC_NOWAIT | BAC_REMOVE;
1681         int any_locked = 0;
1682
1683         if (flags & BUF_SKIP_LOCKED)
1684                 lock_flags |= BAC_SKIP_LOCKED;
1685         if (flags & BUF_SKIP_NONLOCKED)
1686                 lock_flags |= BAC_SKIP_NONLOCKED;
1687 loop:
1688         lck_mtx_lock(buf_mtxp);
1689
1690         if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0)  {
1691                 while (!LIST_EMPTY(&local_iterblkhd)) {
1692                         bp = LIST_FIRST(&local_iterblkhd);
1693                         LIST_REMOVE(bp, b_vnbufs);
1694                         LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1695
1696                         if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) {
1697                                 busy++;
1698                         }
1699                         if (error) {
1700                                 /*
1701                                  * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED,
1702                                  * we may want to do somethign differently if a locked or unlocked
1703                                  * buffer was encountered (depending on the arg specified).
1704                                  * In this case, we know that one of those two was set, and the
1705                                  * buf acquisition failed above.
1706                                  *
1707                                  * If it failed with EDEADLK, then save state which can be emitted
1708                                  * later on to the caller.  Most callers should not care.
1709                                  */
1710                                 if (error == EDEADLK) {
1711                                         any_locked++;
1712                                 }
1713                                 continue;
1714                         }
1715                         lck_mtx_unlock(buf_mtxp);
1716
1717                         bp->b_flags &= ~B_LOCKED;
1718
1719                         /*
1720                          * Wait for I/O associated with indirect blocks to complete,
1721                          * since there is no way to quickly wait for them below.
1722                          */
1723                         if ((bp->b_vp == vp) || (wait == 0))
1724                                 (void) buf_bawrite(bp);
1725                         else
1726                                 (void) VNOP_BWRITE(bp);
1727                         writes_issued++;
1728
1729                         lck_mtx_lock(buf_mtxp);
1730                 }
1731                 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1732         }
1733         lck_mtx_unlock(buf_mtxp);
1734
1735         if (wait) {
1736                 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1737
1738                 if (vp->v_dirtyblkhd.lh_first && busy) {
1739                         /*
1740                          * we had one or more BUSY buffers on
1741                          * the dirtyblock list... most likely
1742                          * these are due to delayed writes that
1743                          * were moved to the bclean queue but
1744                          * have not yet been 'written'.
1745                          * if we issued some writes on the
1746                          * previous pass, we try again immediately
1747                          * if we didn't, we'll sleep for some time
1748                          * to allow the state to change...
1749                          */
1750                         if (writes_issued == 0) {
1751                                 (void)tsleep((caddr_t)&vp->v_numoutput,
1752                                              PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1753                         }
1754                         writes_issued = 0;
1755                         busy = 0;
1756
1757                         goto loop;
1758                 }
1759         }
1760
1761         return any_locked;
1762 }
1763
1764
1765 /*
1766  * called with buf_mtxp held...
1767  * this lock protects the queue manipulation
1768  */
1769 static int
1770 buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1771 {
1772         struct buflists * listheadp;
1773
1774         if (flags & VBI_DIRTY)
1775                 listheadp = &vp->v_dirtyblkhd;
1776         else
1777                 listheadp = &vp->v_cleanblkhd;
1778
1779         while (vp->v_iterblkflags & VBI_ITER)   {
1780                 vp->v_iterblkflags |= VBI_ITERWANT;
1781                 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
1782         }
1783         if (LIST_EMPTY(listheadp)) {
1784                 LIST_INIT(iterheadp);
1785                 return(EINVAL);
1786         }
1787         vp->v_iterblkflags |= VBI_ITER;
1788
1789         iterheadp->lh_first = listheadp->lh_first;
1790         listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1791         LIST_INIT(listheadp);
1792
1793         return(0);
1794 }
1795
1796 /*
1797  * called with buf_mtxp held...
1798  * this lock protects the queue manipulation
1799  */
1800 static void
1801 buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1802 {
1803         struct buflists * listheadp;
1804         buf_t bp;
1805
1806         if (flags & VBI_DIRTY)
1807                 listheadp = &vp->v_dirtyblkhd;
1808         else
1809                 listheadp = &vp->v_cleanblkhd;
1810
1811         while (!LIST_EMPTY(iterheadp)) {
1812                 bp = LIST_FIRST(iterheadp);
1813                 LIST_REMOVE(bp, b_vnbufs);
1814                 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1815         }
1816         vp->v_iterblkflags &= ~VBI_ITER;
1817
1818         if  (vp->v_iterblkflags & VBI_ITERWANT)         {
1819                 vp->v_iterblkflags &= ~VBI_ITERWANT;
1820                 wakeup(&vp->v_iterblkflags);
1821         }
1822 }
1823
1824
1825 static void
1826 bremfree_locked(buf_t bp)
1827 {
1828         struct bqueues *dp = NULL;
1829         int whichq;
1830
1831         whichq = bp->b_whichq;
1832
1833         if (whichq == -1) {
1834                 if (bp->b_shadow_ref == 0)
1835                         panic("bremfree_locked: %p not on freelist", bp);
1836                 /*
1837                  * there are clones pointing to 'bp'...
1838                  * therefore, it was not put on a freelist
1839                  * when buf_brelse was last called on 'bp'
1840                  */
1841                 return;
1842         }
1843         /*
1844          * We only calculate the head of the freelist when removing
1845          * the last element of the list as that is the only time that
1846          * it is needed (e.g. to reset the tail pointer).
1847          *
1848          * NB: This makes an assumption about how tailq's are implemented.
1849          */
1850         if (bp->b_freelist.tqe_next == NULL) {
1851                 dp = &bufqueues[whichq];
1852
1853                 if (dp->tqh_last != &bp->b_freelist.tqe_next)
1854                         panic("bremfree: lost tail");
1855         }
1856         TAILQ_REMOVE(dp, bp, b_freelist);
1857
1858         if (whichq == BQ_LAUNDRY)
1859                 blaundrycnt--;
1860
1861         bp->b_whichq = -1;
1862         bp->b_timestamp = 0;
1863         bp->b_shadow = 0;
1864 }
1865
1866 /*
1867  * Associate a buffer with a vnode.
1868  * buf_mtxp must be locked on entry
1869  */
1870 static void
1871 bgetvp_locked(vnode_t vp, buf_t bp)
1872 {
1873
1874         if (bp->b_vp != vp)
1875                 panic("bgetvp_locked: not free");
1876
1877         if (vp->v_type == VBLK || vp->v_type == VCHR)
1878                 bp->b_dev = vp->v_rdev;
1879         else
1880                 bp->b_dev = NODEV;
1881         /*
1882          * Insert onto list for new vnode.
1883          */
1884         bufinsvn(bp, &vp->v_cleanblkhd);
1885 }
1886
1887 /*
1888  * Disassociate a buffer from a vnode.
1889  * buf_mtxp must be locked on entry
1890  */
1891 static void
1892 brelvp_locked(buf_t bp)
1893 {
1894         /*
1895          * Delete from old vnode list, if on one.
1896          */
1897         if (bp->b_vnbufs.le_next != NOLIST)
1898                 bufremvn(bp);
1899
1900         bp->b_vp = (vnode_t)NULL;
1901 }
1902
1903 /*
1904  * Reassign a buffer from one vnode to another.
1905  * Used to assign file specific control information
1906  * (indirect blocks) to the vnode to which they belong.
1907  */
1908 static void
1909 buf_reassign(buf_t bp, vnode_t newvp)
1910 {
1911         struct buflists *listheadp;
1912
1913         if (newvp == NULL) {
1914                 printf("buf_reassign: NULL");
1915                 return;
1916         }
1917         lck_mtx_lock_spin(buf_mtxp);
1918
1919         /*
1920          * Delete from old vnode list, if on one.
1921          */
1922         if (bp->b_vnbufs.le_next != NOLIST)
1923                 bufremvn(bp);
1924         /*
1925          * If dirty, put on list of dirty buffers;
1926          * otherwise insert onto list of clean buffers.
1927          */
1928         if (ISSET(bp->b_flags, B_DELWRI))
1929                 listheadp = &newvp->v_dirtyblkhd;
1930         else
1931                 listheadp = &newvp->v_cleanblkhd;
1932         bufinsvn(bp, listheadp);
1933
1934         lck_mtx_unlock(buf_mtxp);
1935 }
1936
1937 static __inline__ void
1938 bufhdrinit(buf_t bp)
1939 {
1940         bzero((char *)bp, sizeof *bp);
1941         bp->b_dev = NODEV;
1942         bp->b_rcred = NOCRED;
1943         bp->b_wcred = NOCRED;
1944         bp->b_vnbufs.le_next = NOLIST;
1945         bp->b_flags = B_INVAL;
1946
1947         return;
1948 }
1949
1950 /*
1951  * Initialize buffers and hash links for buffers.
1952  */
1953 __private_extern__ void
1954 bufinit(void)
1955 {
1956         buf_t   bp;
1957         struct bqueues *dp;
1958         int     i;
1959
1960         nbuf_headers = 0;
1961         /* Initialize the buffer queues ('freelists') and the hash table */
1962         for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1963                 TAILQ_INIT(dp);
1964         bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
1965
1966         buf_busycount = 0;
1967
1968         /* Initialize the buffer headers */
1969         for (i = 0; i < max_nbuf_headers; i++) {
1970                 nbuf_headers++;
1971                 bp = &buf_headers[i];
1972                 bufhdrinit(bp);
1973
1974                 BLISTNONE(bp);
1975                 dp = &bufqueues[BQ_EMPTY];
1976                 bp->b_whichq = BQ_EMPTY;
1977                 bp->b_timestamp = buf_timestamp();
1978                 binsheadfree(bp, dp, BQ_EMPTY);
1979                 binshash(bp, &invalhash);
1980         }
1981         boot_nbuf_headers = nbuf_headers;
1982
1983         TAILQ_INIT(&iobufqueue);
1984         TAILQ_INIT(&delaybufqueue);
1985
1986         for (; i < nbuf_headers + niobuf_headers; i++) {
1987                 bp = &buf_headers[i];
1988                 bufhdrinit(bp);
1989                 bp->b_whichq = -1;
1990                 binsheadfree(bp, &iobufqueue, -1);
1991         }
1992
1993         /*
1994          * allocate lock group attribute and group
1995          */
1996         buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1997         buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1998
1999         /*
2000          * allocate the lock attribute
2001          */
2002         buf_mtx_attr = lck_attr_alloc_init();
2003
2004         /*
2005          * allocate and initialize mutex's for the buffer and iobuffer pools
2006          */
2007         buf_mtxp        = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
2008         iobuffer_mtxp   = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
2009         buf_gc_callout  = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
2010
2011         if (iobuffer_mtxp == NULL)
2012                 panic("couldn't create iobuffer mutex");
2013
2014         if (buf_mtxp == NULL)
2015                 panic("couldn't create buf mutex");
2016
2017         if (buf_gc_callout == NULL)
2018                 panic("couldn't create buf_gc_callout mutex");
2019
2020         /*
2021          * allocate and initialize cluster specific global locks...
2022          */
2023         cluster_init();
2024
2025         printf("using %d buffer headers and %d cluster IO buffer headers\n",
2026                 nbuf_headers, niobuf_headers);
2027
2028         /* Set up zones used by the buffer cache */
2029         bufzoneinit();
2030
2031         /* start the bcleanbuf() thread */
2032         bcleanbuf_thread_init();
2033
2034         /* Register a callout for relieving vm pressure */
2035         if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
2036                 panic("Couldn't register buffer cache callout for vm pressure!\n");
2037         }
2038
2039 }
2040
2041 /*
2042  * Zones for the meta data buffers
2043  */
2044
2045 #define MINMETA 512
2046 #define MAXMETA 16384
2047
2048 struct meta_zone_entry {
2049         zone_t mz_zone;
2050         vm_size_t mz_size;
2051         vm_size_t mz_max;
2052         const char *mz_name;
2053 };
2054
2055 struct meta_zone_entry meta_zones[] = {
2056         {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2057         {NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
2058         {NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
2059         {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2060         {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
2061         {NULL, (MINMETA * 32), 512 * (MINMETA * 32), "buf.16384" },
2062         {NULL, 0, 0, "" } /* End */
2063 };
2064
2065 /*
2066  * Initialize the meta data zones
2067  */
2068 static void
2069 bufzoneinit(void)
2070 {
2071         int i;
2072
2073         for (i = 0; meta_zones[i].mz_size != 0; i++) {
2074                 meta_zones[i].mz_zone =
2075                                 zinit(meta_zones[i].mz_size,
2076                                         meta_zones[i].mz_max,
2077                                         PAGE_SIZE,
2078                                         meta_zones[i].mz_name);
2079                 zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE);
2080         }
2081         buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2082         zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE);
2083 }
2084
2085 static __inline__ zone_t
2086 getbufzone(size_t size)
2087 {
2088         int i;
2089
2090         if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2091                 panic("getbufzone: incorect size = %lu", size);
2092
2093         for (i = 0; meta_zones[i].mz_size != 0; i++) {
2094                 if (meta_zones[i].mz_size >= size)
2095                         break;
2096         }
2097
2098         return (meta_zones[i].mz_zone);
2099 }
2100
2101
2102
2103 static struct buf *
2104 bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
2105 {
2106         buf_t   bp;
2107
2108         bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
2109
2110         /*
2111          * If buffer does not have data valid, start a read.
2112          * Note that if buffer is B_INVAL, buf_getblk() won't return it.
2113          * Therefore, it's valid if it's I/O has completed or been delayed.
2114          */
2115         if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
2116                 struct proc *p;
2117
2118                 p = current_proc();
2119
2120                 /* Start I/O for the buffer (keeping credentials). */
2121                 SET(bp->b_flags, B_READ | async);
2122                 if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
2123                         kauth_cred_ref(cred);
2124                         bp->b_rcred = cred;
2125                 }
2126
2127                 VNOP_STRATEGY(bp);
2128
2129                 trace(TR_BREADMISS, pack(vp, size), blkno);
2130
2131                 /* Pay for the read. */
2132                 if (p && p->p_stats) {
2133                         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock);            /* XXX */
2134                 }
2135
2136                 if (async) {
2137                         /*
2138                          * since we asked for an ASYNC I/O
2139                          * the biodone will do the brelse
2140                          * we don't want to pass back a bp
2141                          * that we don't 'own'
2142                          */
2143                         bp = NULL;
2144                 }
2145         } else if (async) {
2146                 buf_brelse(bp);
2147                 bp = NULL;
2148         }
2149
2150         trace(TR_BREADHIT, pack(vp, size), blkno);
2151
2152         return (bp);
2153 }
2154
2155 /*
2156  * Perform the reads for buf_breadn() and buf_meta_breadn().
2157  * Trivial modification to the breada algorithm presented in Bach (p.55).
2158  */
2159 static errno_t
2160 do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
2161                    int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
2162 {
2163         buf_t   bp;
2164         int     i;
2165
2166         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
2167
2168         /*
2169          * For each of the read-ahead blocks, start a read, if necessary.
2170          */
2171         for (i = 0; i < nrablks; i++) {
2172                 /* If it's in the cache, just go on to next one. */
2173                 if (incore(vp, rablks[i]))
2174                         continue;
2175
2176                 /* Get a buffer for the read-ahead block */
2177                 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
2178         }
2179
2180         /* Otherwise, we had to start a read for it; wait until it's valid. */
2181         return (buf_biowait(bp));
2182 }
2183
2184
2185 /*
2186  * Read a disk block.
2187  * This algorithm described in Bach (p.54).
2188  */
2189 errno_t
2190 buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2191 {
2192         buf_t   bp;
2193
2194         /* Get buffer for block. */
2195         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
2196
2197         /* Wait for the read to complete, and return result. */
2198         return (buf_biowait(bp));
2199 }
2200
2201 /*
2202  * Read a disk block. [bread() for meta-data]
2203  * This algorithm described in Bach (p.54).
2204  */
2205 errno_t
2206 buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2207 {
2208         buf_t   bp;
2209
2210         /* Get buffer for block. */
2211         bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
2212
2213         /* Wait for the read to complete, and return result. */
2214         return (buf_biowait(bp));
2215 }
2216
2217 /*
2218  * Read-ahead multiple disk blocks. The first is sync, the rest async.
2219  */
2220 errno_t
2221 buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2222 {
2223         return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
2224 }
2225
2226 /*
2227  * Read-ahead multiple disk blocks. The first is sync, the rest async.
2228  * [buf_breadn() for meta-data]
2229  */
2230 errno_t
2231 buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2232 {
2233         return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
2234 }
2235
2236 /*
2237  * Block write.  Described in Bach (p.56)
2238  */
2239 errno_t
2240 buf_bwrite(buf_t bp)
2241 {
2242         int     sync, wasdelayed;
2243         errno_t rv;
2244         proc_t  p = current_proc();
2245         vnode_t vp = bp->b_vp;
2246
2247         if (bp->b_datap == 0) {
2248                 if (brecover_data(bp) == 0)
2249                         return (0);
2250         }
2251         /* Remember buffer type, to switch on it later. */
2252         sync = !ISSET(bp->b_flags, B_ASYNC);
2253         wasdelayed = ISSET(bp->b_flags, B_DELWRI);
2254         CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
2255
2256         if (wasdelayed)
2257                 OSAddAtomicLong(-1, &nbdwrite);
2258
2259         if (!sync) {
2260                 /*
2261                  * If not synchronous, pay for the I/O operation and make
2262                  * sure the buf is on the correct vnode queue.  We have
2263                  * to do this now, because if we don't, the vnode may not
2264                  * be properly notified that its I/O has completed.
2265                  */
2266                 if (wasdelayed)
2267                         buf_reassign(bp, vp);
2268                 else
2269                         if (p && p->p_stats) {
2270                                 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);    /* XXX */
2271                         }
2272         }
2273         trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
2274
2275         /* Initiate disk write.  Make sure the appropriate party is charged. */
2276
2277         OSAddAtomic(1, &vp->v_numoutput);
2278
2279         VNOP_STRATEGY(bp);
2280
2281         if (sync) {
2282                 /*
2283                  * If I/O was synchronous, wait for it to complete.
2284                  */
2285                 rv = buf_biowait(bp);
2286
2287                 /*
2288                  * Pay for the I/O operation, if it's not been paid for, and
2289                  * make sure it's on the correct vnode queue. (async operatings
2290                  * were payed for above.)
2291                  */
2292                 if (wasdelayed)
2293                         buf_reassign(bp, vp);
2294                 else
2295                         if (p && p->p_stats) {
2296                                 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);    /* XXX */
2297                         }
2298
2299                 /* Release the buffer. */
2300                 buf_brelse(bp);
2301
2302                 return (rv);
2303         } else {
2304                 return (0);
2305         }
2306 }
2307
2308 int
2309 vn_bwrite(struct vnop_bwrite_args *ap)
2310 {
2311         return (buf_bwrite(ap->a_bp));
2312 }
2313
2314 /*
2315  * Delayed write.
2316  *
2317  * The buffer is marked dirty, but is not queued for I/O.
2318  * This routine should be used when the buffer is expected
2319  * to be modified again soon, typically a small write that
2320  * partially fills a buffer.
2321  *
2322  * NB: magnetic tapes cannot be delayed; they must be
2323  * written in the order that the writes are requested.
2324  *
2325  * Described in Leffler, et al. (pp. 208-213).
2326  *
2327  * Note: With the ability to allocate additional buffer
2328  * headers, we can get in to the situation where "too" many
2329  * buf_bdwrite()s can create situation where the kernel can create
2330  * buffers faster than the disks can service. Doing a buf_bawrite() in
2331  * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
2332  */
2333 int
2334 bdwrite_internal(buf_t bp, int return_error)
2335 {
2336         proc_t  p  = current_proc();
2337         vnode_t vp = bp->b_vp;
2338
2339         /*
2340          * If the block hasn't been seen before:
2341          *      (1) Mark it as having been seen,
2342          *      (2) Charge for the write.
2343          *      (3) Make sure it's on its vnode's correct block list,
2344          */
2345         if (!ISSET(bp->b_flags, B_DELWRI)) {
2346                 SET(bp->b_flags, B_DELWRI);
2347                 if (p && p->p_stats) {
2348                         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);    /* XXX */
2349                 }
2350                 OSAddAtomicLong(1, &nbdwrite);
2351                 buf_reassign(bp, vp);
2352         }
2353
2354         /*
2355          * if we're not LOCKED, but the total number of delayed writes
2356          * has climbed above 75% of the total buffers in the system
2357          * return an error if the caller has indicated that it can
2358          * handle one in this case, otherwise schedule the I/O now
2359          * this is done to prevent us from allocating tons of extra
2360          * buffers when dealing with virtual disks (i.e. DiskImages),
2361          * because additional buffers are dynamically allocated to prevent
2362          * deadlocks from occurring
2363          *
2364          * however, can't do a buf_bawrite() if the LOCKED bit is set because the
2365          * buffer is part of a transaction and can't go to disk until
2366          * the LOCKED bit is cleared.
2367          */
2368         if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers/4)*3)) {
2369                 if (return_error)
2370                         return (EAGAIN);
2371                 /*
2372                  * If the vnode has "too many" write operations in progress
2373                  * wait for them to finish the IO
2374                  */
2375                 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");
2376
2377                 return (buf_bawrite(bp));
2378         }
2379
2380         /* Otherwise, the "write" is done, so mark and release the buffer. */
2381         SET(bp->b_flags, B_DONE);
2382         buf_brelse(bp);
2383         return (0);
2384 }
2385
2386 errno_t
2387 buf_bdwrite(buf_t bp)
2388 {
2389         return (bdwrite_internal(bp, 0));
2390 }
2391
2392
2393 /*
2394  * Asynchronous block write; just an asynchronous buf_bwrite().
2395  *
2396  * Note: With the abilitty to allocate additional buffer
2397  * headers, we can get in to the situation where "too" many
2398  * buf_bawrite()s can create situation where the kernel can create
2399  * buffers faster than the disks can service.
2400  * We limit the number of "in flight" writes a vnode can have to
2401  * avoid this.
2402  */
2403 static int
2404 bawrite_internal(buf_t bp, int throttle)
2405 {
2406         vnode_t vp = bp->b_vp;
2407
2408         if (vp) {
2409                 if (throttle)
2410                         /*
2411                          * If the vnode has "too many" write operations in progress
2412                          * wait for them to finish the IO
2413                          */
2414                         (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
2415                 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
2416                         /*
2417                          * return to the caller and
2418                          * let him decide what to do
2419                          */
2420                         return (EWOULDBLOCK);
2421         }
2422         SET(bp->b_flags, B_ASYNC);
2423
2424         return (VNOP_BWRITE(bp));
2425 }
2426
2427 errno_t
2428 buf_bawrite(buf_t bp)
2429 {
2430         return (bawrite_internal(bp, 1));
2431 }
2432
2433
2434
2435 static void
2436 buf_free_meta_store(buf_t bp)
2437 {
2438         if (bp->b_bufsize) {
2439                 if (ISSET(bp->b_flags, B_ZALLOC)) {
2440                         zone_t z;
2441
2442                         z = getbufzone(bp->b_bufsize);
2443                         zfree(z, (void *)bp->b_datap);
2444                 } else
2445                         kmem_free(kernel_map, bp->b_datap, bp->b_bufsize);
2446
2447                 bp->b_datap = (uintptr_t)NULL;
2448                 bp->b_bufsize = 0;
2449         }
2450 }
2451
2452
2453 static buf_t
2454 buf_brelse_shadow(buf_t bp)
2455 {
2456         buf_t   bp_head;
2457         buf_t   bp_temp;
2458         buf_t   bp_return = NULL;
2459 #ifdef BUF_MAKE_PRIVATE
2460         buf_t   bp_data;
2461         int     data_ref = 0;
2462 #endif
2463         int need_wakeup = 0;
2464
2465         lck_mtx_lock_spin(buf_mtxp);
2466
2467         __IGNORE_WCASTALIGN(bp_head = (buf_t)bp->b_orig);
2468
2469         if (bp_head->b_whichq != -1)
2470                 panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq);
2471
2472 #ifdef BUF_MAKE_PRIVATE
2473         if (bp_data = bp->b_data_store) {
2474                 bp_data->b_data_ref--;
2475                 /*
2476                  * snapshot the ref count so that we can check it
2477                  * outside of the lock... we only want the guy going
2478                  * from 1 -> 0 to try and release the storage
2479                  */
2480                 data_ref = bp_data->b_data_ref;
2481         }
2482 #endif
2483         KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0);
2484
2485         bp_head->b_shadow_ref--;
2486
2487         for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow);
2488
2489         if (bp_temp == NULL)
2490                 panic("buf_brelse_shadow: bp not on list %p", bp_head);
2491
2492         bp_temp->b_shadow = bp_temp->b_shadow->b_shadow;
2493
2494 #ifdef BUF_MAKE_PRIVATE
2495         /*
2496          * we're about to free the current 'owner' of the data buffer and
2497          * there is at least one other shadow buf_t still pointing at it
2498          * so transfer it to the first shadow buf left in the chain
2499          */
2500         if (bp == bp_data && data_ref) {
2501                 if ((bp_data = bp_head->b_shadow) == NULL)
2502                         panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp);
2503
2504                 for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow)
2505                         bp_temp->b_data_store = bp_data;
2506                 bp_data->b_data_ref = data_ref;
2507         }
2508 #endif
2509         if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow)
2510                 panic("buf_relse_shadow: b_shadow != NULL && b_shadow_ref == 0  bp(%p)", bp);
2511         if (bp_head->b_shadow_ref && bp_head->b_shadow == 0)
2512                 panic("buf_relse_shadow: b_shadow == NULL && b_shadow_ref != 0  bp(%p)", bp);
2513
2514         if (bp_head->b_shadow_ref == 0) {
2515                 if (!ISSET(bp_head->b_lflags, BL_BUSY)) {
2516
2517                         CLR(bp_head->b_flags, B_AGE);
2518                         bp_head->b_timestamp = buf_timestamp();
2519
2520                         if (ISSET(bp_head->b_flags, B_LOCKED)) {
2521                                 bp_head->b_whichq = BQ_LOCKED;
2522                                 binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED);
2523                         } else {
2524                                 bp_head->b_whichq = BQ_META;
2525                                 binstailfree(bp_head, &bufqueues[BQ_META], BQ_META);
2526                         }
2527                 } else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) {
2528                         CLR(bp_head->b_lflags, BL_WAITSHADOW);
2529
2530                         bp_return = bp_head;
2531                 }
2532                 if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) {
2533                         CLR(bp_head->b_lflags, BL_WANTED_REF);
2534                         need_wakeup = 1;
2535                 }
2536         }
2537         lck_mtx_unlock(buf_mtxp);
2538
2539         if (need_wakeup)
2540                 wakeup(bp_head);
2541
2542 #ifdef BUF_MAKE_PRIVATE
2543         if (bp == bp_data && data_ref == 0)
2544                 buf_free_meta_store(bp);
2545
2546         bp->b_data_store = NULL;
2547 #endif
2548         KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0);
2549
2550         return (bp_return);
2551 }
2552
2553
2554 /*
2555  * Release a buffer on to the free lists.
2556  * Described in Bach (p. 46).
2557  */
2558 void
2559 buf_brelse(buf_t bp)
2560 {
2561         struct bqueues *bufq;
2562         long    whichq;
2563         upl_t   upl;
2564         int need_wakeup = 0;
2565         int need_bp_wakeup = 0;
2566
2567
2568         if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
2569                 panic("buf_brelse: bad buffer = %p\n", bp);
2570
2571 #ifdef JOE_DEBUG
2572         (void) OSBacktrace(&bp->b_stackbrelse[0], 6);
2573
2574         bp->b_lastbrelse = current_thread();
2575         bp->b_tag = 0;
2576 #endif
2577         if (bp->b_lflags & BL_IOBUF) {
2578                 buf_t   shadow_master_bp = NULL;
2579
2580                 if (ISSET(bp->b_lflags, BL_SHADOW))
2581                         shadow_master_bp = buf_brelse_shadow(bp);
2582                 else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC))
2583                          buf_free_meta_store(bp);
2584                 free_io_buf(bp);
2585
2586                 if (shadow_master_bp) {
2587                         bp = shadow_master_bp;
2588                         goto finish_shadow_master;
2589                 }
2590                 return;
2591         }
2592
2593         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
2594                      bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap,
2595                      bp->b_flags, 0);
2596
2597         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2598
2599         /*
2600          * if we're invalidating a buffer that has the B_FILTER bit
2601          * set then call the b_iodone function so it gets cleaned
2602          * up properly.
2603          *
2604          * the HFS journal code depends on this
2605          */
2606         if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
2607                 if (ISSET(bp->b_flags, B_FILTER)) {     /* if necessary, call out */
2608                         void    (*iodone_func)(struct buf *, void *) = bp->b_iodone;
2609                         void    *arg = bp->b_transaction;
2610
2611                         CLR(bp->b_flags, B_FILTER);     /* but note callout done */
2612                         bp->b_iodone = NULL;
2613                         bp->b_transaction = NULL;
2614
2615                         if (iodone_func == NULL) {
2616                                 panic("brelse: bp @ %p has NULL b_iodone!\n", bp);
2617                         }
2618                         (*iodone_func)(bp, arg);
2619                 }
2620         }
2621         /*
2622          * I/O is done. Cleanup the UPL state
2623          */
2624         upl = bp->b_upl;
2625
2626         if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2627                 kern_return_t kret;
2628                 int           upl_flags;
2629
2630                 if (upl == NULL) {
2631                         if ( !ISSET(bp->b_flags, B_INVAL)) {
2632                                 kret = ubc_create_upl(bp->b_vp,
2633                                                       ubc_blktooff(bp->b_vp, bp->b_lblkno),
2634                                                       bp->b_bufsize,
2635                                                       &upl,
2636                                                       NULL,
2637                                                       UPL_PRECIOUS);
2638
2639                                 if (kret != KERN_SUCCESS)
2640                                         panic("brelse: Failed to create UPL");
2641 #if  UPL_DEBUG
2642                                 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5);
2643 #endif /* UPL_DEBUG */
2644                         }
2645                 } else {
2646                         if (bp->b_datap) {
2647                                 kret = ubc_upl_unmap(upl);
2648
2649                                 if (kret != KERN_SUCCESS)
2650                                         panic("ubc_upl_unmap failed");
2651                                 bp->b_datap = (uintptr_t)NULL;
2652                         }
2653                 }
2654                 if (upl) {
2655                         if (bp->b_flags & (B_ERROR | B_INVAL)) {
2656                                 if (bp->b_flags & (B_READ | B_INVAL))
2657                                         upl_flags = UPL_ABORT_DUMP_PAGES;
2658                                 else
2659                                         upl_flags = 0;
2660
2661                                 ubc_upl_abort(upl, upl_flags);
2662                         } else {
2663                                 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
2664                                         upl_flags = UPL_COMMIT_SET_DIRTY ;
2665                                 else
2666                                         upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
2667
2668                                 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
2669                                                      UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2670                         }
2671                         bp->b_upl = NULL;
2672                 }
2673         } else {
2674                 if ( (upl) )
2675                         panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp);
2676         }
2677
2678         /*
2679          * If it's locked, don't report an error; try again later.
2680          */
2681         if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
2682                 CLR(bp->b_flags, B_ERROR);
2683         /*
2684          * If it's not cacheable, or an error, mark it invalid.
2685          */
2686         if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
2687                 SET(bp->b_flags, B_INVAL);
2688
2689         if ((bp->b_bufsize <= 0) ||
2690                         ISSET(bp->b_flags, B_INVAL) ||
2691                         (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) {
2692
2693                 boolean_t       delayed_buf_free_meta_store = FALSE;
2694
2695                 /*
2696                  * If it's invalid or empty, dissociate it from its vnode,
2697                  * release its storage if B_META, and
2698                  * clean it up a bit and put it on the EMPTY queue
2699                  */
2700                 if (ISSET(bp->b_flags, B_DELWRI))
2701                         OSAddAtomicLong(-1, &nbdwrite);
2702
2703                 if (ISSET(bp->b_flags, B_META)) {
2704                         if (bp->b_shadow_ref)
2705                                 delayed_buf_free_meta_store = TRUE;
2706                         else
2707                                 buf_free_meta_store(bp);
2708                 }
2709                 /*
2710                  * nuke any credentials we were holding
2711                  */
2712                 buf_release_credentials(bp);
2713
2714                 lck_mtx_lock_spin(buf_mtxp);
2715
2716                 if (bp->b_shadow_ref) {
2717                         SET(bp->b_lflags, BL_WAITSHADOW);
2718
2719                         lck_mtx_unlock(buf_mtxp);
2720
2721                         return;
2722                 }
2723                 if (delayed_buf_free_meta_store == TRUE) {
2724
2725                         lck_mtx_unlock(buf_mtxp);
2726 finish_shadow_master:
2727                         buf_free_meta_store(bp);
2728
2729                         lck_mtx_lock_spin(buf_mtxp);
2730                 }
2731                 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
2732
2733                 if (bp->b_vp)
2734                         brelvp_locked(bp);
2735
2736                 bremhash(bp);
2737                 BLISTNONE(bp);
2738                 binshash(bp, &invalhash);
2739
2740                 bp->b_whichq = BQ_EMPTY;
2741                 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2742         } else {
2743
2744                 /*
2745                  * It has valid data.  Put it on the end of the appropriate
2746                  * queue, so that it'll stick around for as long as possible.
2747                  */
2748                 if (ISSET(bp->b_flags, B_LOCKED))
2749                         whichq = BQ_LOCKED;             /* locked in core */
2750                 else if (ISSET(bp->b_flags, B_META))
2751                         whichq = BQ_META;               /* meta-data */
2752                 else if (ISSET(bp->b_flags, B_AGE))
2753                         whichq = BQ_AGE;                /* stale but valid data */
2754                 else
2755                         whichq = BQ_LRU;                /* valid data */
2756                 bufq = &bufqueues[whichq];
2757
2758                 bp->b_timestamp = buf_timestamp();
2759
2760                 lck_mtx_lock_spin(buf_mtxp);
2761
2762                 /*
2763                  * the buf_brelse_shadow routine doesn't take 'ownership'
2764                  * of the parent buf_t... it updates state that is protected by
2765                  * the buf_mtxp, and checks for BL_BUSY to determine whether to
2766                  * put the buf_t back on a free list.  b_shadow_ref is protected
2767                  * by the lock, and since we have not yet cleared B_BUSY, we need
2768                  * to check it while holding the lock to insure that one of us
2769                  * puts this buf_t back on a free list when it is safe to do so
2770                  */
2771                 if (bp->b_shadow_ref == 0) {
2772                         CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
2773                         bp->b_whichq = whichq;
2774                         binstailfree(bp, bufq, whichq);
2775                 } else {
2776                         /*
2777                          * there are still cloned buf_t's pointing
2778                          * at this guy... need to keep it off the
2779                          * freelists until a buf_brelse is done on
2780                          * the last clone
2781                          */
2782                         CLR(bp->b_flags, (B_ASYNC | B_NOCACHE));
2783                 }
2784         }
2785         if (needbuffer) {
2786                 /*
2787                  * needbuffer is a global
2788                  * we're currently using buf_mtxp to protect it
2789                  * delay doing the actual wakeup until after
2790                  * we drop buf_mtxp
2791                  */
2792                 needbuffer = 0;
2793                 need_wakeup = 1;
2794         }
2795         if (ISSET(bp->b_lflags, BL_WANTED)) {
2796                 /*
2797                  * delay the actual wakeup until after we
2798                  * clear BL_BUSY and we've dropped buf_mtxp
2799                  */
2800                 need_bp_wakeup = 1;
2801         }
2802         /*
2803          * Unlock the buffer.
2804          */
2805         CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2806         buf_busycount--;
2807
2808         lck_mtx_unlock(buf_mtxp);
2809
2810         if (need_wakeup) {
2811                 /*
2812                  * Wake up any processes waiting for any buffer to become free.
2813                  */
2814                 wakeup(&needbuffer);
2815         }
2816         if (need_bp_wakeup) {
2817                 /*
2818                  * Wake up any proceeses waiting for _this_ buffer to become free.
2819                  */
2820                 wakeup(bp);
2821         }
2822         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2823                      bp, bp->b_datap, bp->b_flags, 0, 0);
2824 }
2825
2826 /*
2827  * Determine if a block is in the cache.
2828  * Just look on what would be its hash chain.  If it's there, return
2829  * a pointer to it, unless it's marked invalid.  If it's marked invalid,
2830  * we normally don't return the buffer, unless the caller explicitly
2831  * wants us to.
2832  */
2833 static boolean_t
2834 incore(vnode_t vp, daddr64_t blkno)
2835 {
2836         boolean_t retval;
2837         struct  bufhashhdr *dp;
2838
2839         dp = BUFHASH(vp, blkno);
2840
2841         lck_mtx_lock_spin(buf_mtxp);
2842
2843         if (incore_locked(vp, blkno, dp))
2844                 retval = TRUE;
2845         else
2846                 retval = FALSE;
2847         lck_mtx_unlock(buf_mtxp);
2848
2849         return (retval);
2850 }
2851
2852
2853 static buf_t
2854 incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp)
2855 {
2856         struct buf *bp;
2857
2858         /* Search hash chain */
2859         for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
2860                 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2861                     !ISSET(bp->b_flags, B_INVAL)) {
2862                         return (bp);
2863                 }
2864         }
2865         return (NULL);
2866 }
2867
2868
2869 void
2870 buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno)
2871 {
2872         buf_t bp;
2873         struct  bufhashhdr *dp;
2874
2875         dp = BUFHASH(vp, blkno);
2876
2877         lck_mtx_lock_spin(buf_mtxp);
2878
2879         for (;;) {
2880                 if ((bp = incore_locked(vp, blkno, dp)) == NULL)
2881                         break;
2882
2883                 if (bp->b_shadow_ref == 0)
2884                         break;
2885
2886                 SET(bp->b_lflags, BL_WANTED_REF);
2887
2888                 (void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO+1), "buf_wait_for_shadow", NULL);
2889         }
2890         lck_mtx_unlock(buf_mtxp);
2891 }
2892
2893 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2894 /*
2895  * Get a block of requested size that is associated with
2896  * a given vnode and block offset. If it is found in the
2897  * block cache, mark it as having been found, make it busy
2898  * and return it. Otherwise, return an empty block of the
2899  * correct size. It is up to the caller to insure that the
2900  * cached blocks be of the correct size.
2901  */
2902 buf_t
2903 buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2904 {
2905         buf_t bp;
2906         int   err;
2907         upl_t upl;
2908         upl_page_info_t *pl;
2909         kern_return_t kret;
2910         int ret_only_valid;
2911         struct timespec ts;
2912         int upl_flags;
2913         struct  bufhashhdr *dp;
2914
2915         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2916                      (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0);
2917
2918         ret_only_valid = operation & BLK_ONLYVALID;
2919         operation &= ~BLK_ONLYVALID;
2920         dp = BUFHASH(vp, blkno);
2921 start:
2922         lck_mtx_lock_spin(buf_mtxp);
2923
2924         if ((bp = incore_locked(vp, blkno, dp))) {
2925                 /*
2926                  * Found in the Buffer Cache
2927                  */
2928                 if (ISSET(bp->b_lflags, BL_BUSY)) {
2929                         /*
2930                          * but is busy
2931                          */
2932                         switch (operation) {
2933                         case BLK_READ:
2934                         case BLK_WRITE:
2935                         case BLK_META:
2936                                 SET(bp->b_lflags, BL_WANTED);
2937                                 bufstats.bufs_busyincore++;
2938
2939                                 /*
2940                                  * don't retake the mutex after being awakened...
2941                                  * the time out is in msecs
2942                                  */
2943                                 ts.tv_sec = (slptimeo/1000);
2944                                 ts.tv_nsec = (slptimeo % 1000) * 10  * NSEC_PER_USEC * 1000;
2945
2946                                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE,
2947                                              (uintptr_t)blkno, size, operation, 0, 0);
2948
2949                                 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2950
2951                                 /*
2952                                  * Callers who call with PCATCH or timeout are
2953                                  * willing to deal with the NULL pointer
2954                                  */
2955                                 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2956                                         return (NULL);
2957                                 goto start;
2958                                 /*NOTREACHED*/
2959
2960                         default:
2961                                 /*
2962                                  * unknown operation requested
2963                                  */
2964                                 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2965                                 /*NOTREACHED*/
2966                                 break;
2967                         }
2968                 } else {
2969                         int clear_bdone;
2970
2971                         /*
2972                          * buffer in core and not busy
2973                          */
2974                         SET(bp->b_lflags, BL_BUSY);
2975                         SET(bp->b_flags, B_CACHE);
2976                         buf_busycount++;
2977
2978                         bremfree_locked(bp);
2979                         bufstats.bufs_incore++;
2980
2981                         lck_mtx_unlock(buf_mtxp);
2982 #ifdef JOE_DEBUG
2983                         bp->b_owner = current_thread();
2984                         bp->b_tag   = 1;
2985 #endif
2986                         if ( (bp->b_upl) )
2987                                 panic("buffer has UPL, but not marked BUSY: %p", bp);
2988
2989                         clear_bdone = FALSE;
2990                         if (!ret_only_valid) {
2991                                 /*
2992                                  * If the number bytes that are valid is going
2993                                  * to increase (even if we end up not doing a
2994                                  * reallocation through allocbuf) we have to read
2995                                  * the new size first.
2996                                  *
2997                                  * This is required in cases where we doing a read
2998                                  * modify write of a already valid data on disk but
2999                                  * in cases where the data on disk beyond (blkno + b_bcount)
3000                                  * is invalid, we may end up doing extra I/O.
3001                                  */
3002                                 if (operation == BLK_META && bp->b_bcount < size) {
3003                                         /*
3004                                          * Since we are going to read in the whole size first
3005                                          * we first have to ensure that any pending delayed write
3006                                          * is flushed to disk first.
3007                                          */
3008                                         if (ISSET(bp->b_flags, B_DELWRI)) {
3009                                                 CLR(bp->b_flags, B_CACHE);
3010                                                 buf_bwrite(bp);
3011                                                 goto start;
3012                                         }
3013                                         /*
3014                                          * clear B_DONE before returning from
3015                                          * this function so that the caller can
3016                                          * can issue a read for the new size.
3017                                          */
3018                                         clear_bdone = TRUE;
3019                                 }
3020
3021                                 if (bp->b_bufsize != size)
3022                                         allocbuf(bp, size);
3023                         }
3024
3025                         upl_flags = 0;
3026                         switch (operation) {
3027                         case BLK_WRITE:
3028                                 /*
3029                                  * "write" operation:  let the UPL subsystem
3030                                  * know that we intend to modify the buffer
3031                                  * cache pages we're gathering.
3032                                  */
3033                                 upl_flags |= UPL_WILL_MODIFY;
3034                         case BLK_READ:
3035                                 upl_flags |= UPL_PRECIOUS;
3036                                 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
3037                                         kret = ubc_create_upl(vp,
3038                                                               ubc_blktooff(vp, bp->b_lblkno),
3039                                                               bp->b_bufsize,
3040                                                               &upl,
3041                                                               &pl,
3042                                                               upl_flags);
3043                                         if (kret != KERN_SUCCESS)
3044                                                 panic("Failed to create UPL");
3045
3046                                         bp->b_upl = upl;
3047
3048                                         if (upl_valid_page(pl, 0)) {
3049                                                 if (upl_dirty_page(pl, 0))
3050                                                         SET(bp->b_flags, B_WASDIRTY);
3051                                                 else
3052                                                         CLR(bp->b_flags, B_WASDIRTY);
3053                                         } else
3054                                                 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
3055
3056                                         kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap));
3057
3058                                         if (kret != KERN_SUCCESS)
3059                                                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3060                                 }
3061                                 break;
3062
3063                         case BLK_META:
3064                                 /*
3065                                  * VM is not involved in IO for the meta data
3066                                  * buffer already has valid data
3067                                  */
3068                                 break;
3069
3070                         default:
3071                                 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
3072                                 /*NOTREACHED*/
3073                                 break;
3074                         }
3075
3076                         if (clear_bdone)
3077                                 CLR(bp->b_flags, B_DONE);
3078                 }
3079         } else { /* not incore() */
3080                 int queue = BQ_EMPTY; /* Start with no preference */
3081
3082                 if (ret_only_valid) {
3083                         lck_mtx_unlock(buf_mtxp);
3084                         return (NULL);
3085                 }
3086                 if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/)
3087                         operation = BLK_META;
3088
3089                 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
3090                         goto start;
3091
3092                 /*
3093                  * getnewbuf may block for a number of different reasons...
3094                  * if it does, it's then possible for someone else to
3095                  * create a buffer for the same block and insert it into
3096                  * the hash... if we see it incore at this point we dump
3097                  * the buffer we were working on and start over
3098                  */
3099                 if (incore_locked(vp, blkno, dp)) {
3100                         SET(bp->b_flags, B_INVAL);
3101                         binshash(bp, &invalhash);
3102
3103                         lck_mtx_unlock(buf_mtxp);
3104
3105                         buf_brelse(bp);
3106                         goto start;
3107                 }
3108                 /*
3109                  * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
3110                  *       CALLED!  BE CAREFUL.
3111                  */
3112
3113                 /*
3114                  * mark the buffer as B_META if indicated
3115                  * so that when buffer is released it will goto META queue
3116                  */
3117                 if (operation == BLK_META)
3118                         SET(bp->b_flags, B_META);
3119
3120                 bp->b_blkno = bp->b_lblkno = blkno;
3121                 bp->b_vp = vp;
3122
3123                 /*
3124                  * Insert in the hash so that incore() can find it
3125                  */
3126                 binshash(bp, BUFHASH(vp, blkno));
3127
3128                 bgetvp_locked(vp, bp);
3129
3130                 lck_mtx_unlock(buf_mtxp);
3131
3132                 allocbuf(bp, size);
3133
3134                 upl_flags = 0;
3135                 switch (operation) {
3136                 case BLK_META:
3137                         /*
3138                          * buffer data is invalid...
3139                          *
3140                          * I don't want to have to retake buf_mtxp,
3141                          * so the miss and vmhits counters are done
3142                          * with Atomic updates... all other counters
3143                          * in bufstats are protected with either
3144                          * buf_mtxp or iobuffer_mtxp
3145                          */
3146                         OSAddAtomicLong(1, &bufstats.bufs_miss);
3147                         break;
3148
3149                 case BLK_WRITE:
3150                         /*
3151                          * "write" operation:  let the UPL subsystem know
3152                          * that we intend to modify the buffer cache pages
3153                          * we're gathering.
3154                          */
3155                         upl_flags |= UPL_WILL_MODIFY;
3156                 case BLK_READ:
3157                   {     off_t   f_offset;
3158                         size_t  contig_bytes;
3159                         int     bmap_flags;
3160
3161 #if DEVELOPMENT || DEBUG
3162                         /*
3163                          * Apple implemented file systems use UBC excludively; they should
3164                          * not call in here."
3165                          */
3166                         const char* excldfs[] = {"hfs", "afpfs", "smbfs", "acfs",
3167                                                  "exfat", "msdos", "webdav", NULL};
3168
3169                         for (int i = 0; excldfs[i] != NULL; i++) {
3170                                 if (vp->v_mount &&
3171                                     !strcmp(vp->v_mount->mnt_vfsstat.f_fstypename,
3172                                                 excldfs[i])) {
3173                                         panic("%s %s calls buf_getblk",
3174                                                 excldfs[i],
3175                                                 operation == BLK_READ ? "BLK_READ" : "BLK_WRITE");
3176                                 }
3177                         }
3178 #endif
3179
3180                         if ( (bp->b_upl) )
3181                                 panic("bp already has UPL: %p",bp);
3182
3183                         f_offset = ubc_blktooff(vp, blkno);
3184
3185                         upl_flags |= UPL_PRECIOUS;
3186                         kret = ubc_create_upl(vp,
3187                                               f_offset,
3188                                               bp->b_bufsize,
3189                                               &upl,
3190                                               &pl,
3191                                               upl_flags);
3192
3193                         if (kret != KERN_SUCCESS)
3194                                 panic("Failed to create UPL");
3195 #if  UPL_DEBUG
3196                         upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4);
3197 #endif /* UPL_DEBUG */
3198                         bp->b_upl = upl;
3199
3200                         if (upl_valid_page(pl, 0)) {
3201
3202                                 if (operation == BLK_READ)
3203                                         bmap_flags = VNODE_READ;
3204                                 else
3205                                         bmap_flags = VNODE_WRITE;
3206
3207                                 SET(bp->b_flags, B_CACHE | B_DONE);
3208
3209                                 OSAddAtomicLong(1, &bufstats.bufs_vmhits);
3210
3211                                 bp->b_validoff = 0;
3212                                 bp->b_dirtyoff = 0;
3213
3214                                 if (upl_dirty_page(pl, 0)) {
3215                                         /* page is dirty */
3216                                         SET(bp->b_flags, B_WASDIRTY);
3217
3218                                         bp->b_validend = bp->b_bcount;
3219                                         bp->b_dirtyend = bp->b_bcount;
3220                                 } else {
3221                                         /* page is clean */
3222                                         bp->b_validend = bp->b_bcount;
3223                                         bp->b_dirtyend = 0;
3224                                 }
3225                                 /*
3226                                  * try to recreate the physical block number associated with
3227                                  * this buffer...
3228                                  */
3229                                 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
3230                                         panic("getblk: VNOP_BLOCKMAP failed");
3231                                 /*
3232                                  * if the extent represented by this buffer
3233                                  * is not completely physically contiguous on
3234                                  * disk, than we can't cache the physical mapping
3235                                  * in the buffer header
3236                                  */
3237                                 if ((long)contig_bytes < bp->b_bcount)
3238                                         bp->b_blkno = bp->b_lblkno;
3239                         } else {
3240                                 OSAddAtomicLong(1, &bufstats.bufs_miss);
3241                         }
3242                         kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
3243
3244                         if (kret != KERN_SUCCESS)
3245                                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3246                         break;
3247                   }
3248                 default:
3249                         panic("getblk: paging or unknown operation - %x", operation);
3250                         /*NOTREACHED*/
3251                         break;
3252                 }
3253         }
3254         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
3255                      bp, bp->b_datap, bp->b_flags, 3, 0);
3256
3257 #ifdef JOE_DEBUG
3258         (void) OSBacktrace(&bp->b_stackgetblk[0], 6);
3259 #endif
3260         return (bp);
3261 }
3262
3263 /*
3264  * Get an empty, disassociated buffer of given size.
3265  */
3266 buf_t
3267 buf_geteblk(int size)
3268 {
3269         buf_t   bp = NULL;
3270         int queue = BQ_EMPTY;
3271
3272         do {
3273                 lck_mtx_lock_spin(buf_mtxp);
3274
3275                 bp = getnewbuf(0, 0, &queue);
3276         } while (bp == NULL);
3277
3278         SET(bp->b_flags, (B_META|B_INVAL));
3279
3280 #if DIAGNOSTIC
3281         assert(queue == BQ_EMPTY);
3282 #endif /* DIAGNOSTIC */
3283         /* XXX need to implement logic to deal with other queues */
3284
3285         binshash(bp, &invalhash);
3286         bufstats.bufs_eblk++;
3287
3288         lck_mtx_unlock(buf_mtxp);
3289
3290         allocbuf(bp, size);
3291
3292         return (bp);
3293 }
3294
3295 uint32_t
3296 buf_redundancy_flags(buf_t bp)
3297 {
3298         return bp->b_redundancy_flags;
3299 }
3300
3301 void
3302 buf_set_redundancy_flags(buf_t bp, uint32_t flags)
3303 {
3304         SET(bp->b_redundancy_flags, flags);
3305 }
3306
3307 void
3308 buf_clear_redundancy_flags(buf_t bp, uint32_t flags)
3309 {
3310         CLR(bp->b_redundancy_flags, flags);
3311 }
3312
3313
3314
3315 static void *
3316 recycle_buf_from_pool(int nsize)
3317 {
3318         buf_t   bp;
3319         void    *ptr = NULL;
3320
3321         lck_mtx_lock_spin(buf_mtxp);
3322
3323         TAILQ_FOREACH(bp, &bufqueues[BQ_META], b_freelist) {
3324                 if (ISSET(bp->b_flags, B_DELWRI) || bp->b_bufsize != nsize)
3325                         continue;
3326                 ptr = (void *)bp->b_datap;
3327                 bp->b_bufsize = 0;
3328
3329                 bcleanbuf(bp, TRUE);
3330                 break;
3331         }
3332         lck_mtx_unlock(buf_mtxp);
3333
3334         return (ptr);
3335 }
3336
3337
3338
3339 int zalloc_nopagewait_failed = 0;
3340 int recycle_buf_failed = 0;
3341
3342 static void *
3343 grab_memory_for_meta_buf(int nsize)
3344 {
3345         zone_t z;
3346         void *ptr;
3347         boolean_t was_vmpriv;
3348
3349         z = getbufzone(nsize);
3350
3351         /*
3352          * make sure we're NOT priviliged so that
3353          * if a vm_page_grab is needed, it won't
3354          * block if we're out of free pages... if
3355          * it blocks, then we can't honor the
3356          * nopagewait request
3357          */
3358         was_vmpriv = set_vm_privilege(FALSE);
3359
3360         ptr = zalloc_nopagewait(z);
3361
3362         if (was_vmpriv == TRUE)
3363                 set_vm_privilege(TRUE);
3364
3365         if (ptr == NULL) {
3366
3367                 zalloc_nopagewait_failed++;
3368
3369                 ptr = recycle_buf_from_pool(nsize);
3370
3371                 if (ptr == NULL) {
3372
3373                         recycle_buf_failed++;
3374
3375                         if (was_vmpriv == FALSE)
3376                                 set_vm_privilege(TRUE);
3377
3378                         ptr = zalloc(z);
3379
3380                         if (was_vmpriv == FALSE)
3381                                 set_vm_privilege(FALSE);
3382                 }
3383         }
3384         return (ptr);
3385 }
3386
3387 /*
3388  * With UBC, there is no need to expand / shrink the file data
3389  * buffer. The VM uses the same pages, hence no waste.
3390  * All the file data buffers can have one size.
3391  * In fact expand / shrink would be an expensive operation.
3392  *
3393  * Only exception to this is meta-data buffers. Most of the
3394  * meta data operations are smaller than PAGE_SIZE. Having the
3395  * meta-data buffers grow and shrink as needed, optimizes use
3396  * of the kernel wired memory.
3397  */
3398
3399 int
3400 allocbuf(buf_t bp, int size)
3401 {
3402         vm_size_t desired_size;
3403
3404         desired_size = roundup(size, CLBYTES);
3405
3406         if (desired_size < PAGE_SIZE)
3407                 desired_size = PAGE_SIZE;
3408         if (desired_size > MAXBSIZE)
3409                 panic("allocbuf: buffer larger than MAXBSIZE requested");
3410
3411         if (ISSET(bp->b_flags, B_META)) {
3412                 int    nsize = roundup(size, MINMETA);
3413
3414                 if (bp->b_datap) {
3415                         vm_offset_t elem = (vm_offset_t)bp->b_datap;
3416
3417                         if (ISSET(bp->b_flags, B_ZALLOC)) {
3418                                 if (bp->b_bufsize < nsize) {
3419                                         zone_t zprev;
3420
3421                                         /* reallocate to a bigger size */
3422
3423                                         zprev = getbufzone(bp->b_bufsize);
3424                                         if (nsize <= MAXMETA) {
3425                                                 desired_size = nsize;
3426
3427                                                 /* b_datap not really a ptr */
3428                                                 *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
3429                                         } else {
3430                                                 bp->b_datap = (uintptr_t)NULL;
3431                                                 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3432                                                 CLR(bp->b_flags, B_ZALLOC);
3433                                         }
3434                                         bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3435                                         zfree(zprev, (void *)elem);
3436                                 } else {
3437                                         desired_size = bp->b_bufsize;
3438                                 }
3439
3440                         } else {
3441                                 if ((vm_size_t)bp->b_bufsize < desired_size) {
3442                                         /* reallocate to a bigger size */
3443                                         bp->b_datap = (uintptr_t)NULL;
3444                                         kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3445                                         bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3446                                         kmem_free(kernel_map, elem, bp->b_bufsize);
3447                                 } else {
3448                                         desired_size = bp->b_bufsize;
3449                                 }
3450                         }
3451                 } else {
3452                         /* new allocation */
3453                         if (nsize <= MAXMETA) {
3454                                 desired_size = nsize;
3455
3456                                 /* b_datap not really a ptr */
3457                                 *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
3458                                 SET(bp->b_flags, B_ZALLOC);
3459                         } else
3460                                 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3461                 }
3462
3463                 if (bp->b_datap == 0)
3464                         panic("allocbuf: NULL b_datap");
3465         }
3466         bp->b_bufsize = desired_size;
3467         bp->b_bcount = size;
3468
3469         return (0);
3470 }
3471
3472 /*
3473  *      Get a new buffer from one of the free lists.
3474  *
3475  *      Request for a queue is passes in. The queue from which the buffer was taken
3476  *      from is returned. Out of range queue requests get BQ_EMPTY. Request for
3477  *      BQUEUE means no preference. Use heuristics in that case.
3478  *      Heuristics is as follows:
3479  *      Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
3480  *      If none available block till one is made available.
3481  *      If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
3482  *      Pick the most stale buffer.
3483  *      If found buffer was marked delayed write, start the async. write
3484  *      and restart the search.
3485  *      Initialize the fields and disassociate the buffer from the vnode.
3486  *      Remove the buffer from the hash. Return the buffer and the queue
3487  *      on which it was found.
3488  *
3489  *      buf_mtxp is held upon entry
3490  *      returns with buf_mtxp locked if new buf available
3491  *      returns with buf_mtxp UNlocked if new buf NOT available
3492  */
3493
3494 static buf_t
3495 getnewbuf(int slpflag, int slptimeo, int * queue)
3496 {
3497         buf_t   bp;
3498         buf_t   lru_bp;
3499         buf_t   age_bp;
3500         buf_t   meta_bp;
3501         int     age_time, lru_time, bp_time, meta_time;
3502         int     req = *queue;   /* save it for restarts */
3503         struct timespec ts;
3504
3505 start:
3506         /*
3507          * invalid request gets empty queue
3508          */
3509         if ((*queue >= BQUEUES) || (*queue < 0)
3510                 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
3511                 *queue = BQ_EMPTY;
3512
3513
3514         if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first))
3515                 goto found;
3516
3517         /*
3518          * need to grow number of bufs, add another one rather than recycling
3519          */
3520         if (nbuf_headers < max_nbuf_headers) {
3521                 /*
3522                  * Increment  count now as lock
3523                  * is dropped for allocation.
3524                  * That avoids over commits
3525                  */
3526                 nbuf_headers++;
3527                 goto add_newbufs;
3528         }
3529         /* Try for the requested queue first */
3530         bp = bufqueues[*queue].tqh_first;
3531         if (bp)
3532                 goto found;
3533
3534         /* Unable to use requested queue */
3535         age_bp = bufqueues[BQ_AGE].tqh_first;
3536         lru_bp = bufqueues[BQ_LRU].tqh_first;
3537         meta_bp = bufqueues[BQ_META].tqh_first;
3538
3539         if (!age_bp && !lru_bp && !meta_bp) {
3540                 /*
3541                  * Unavailble on AGE or LRU or META queues
3542                  * Try the empty list first
3543                  */
3544                 bp = bufqueues[BQ_EMPTY].tqh_first;
3545                 if (bp) {
3546                         *queue = BQ_EMPTY;
3547                         goto found;
3548                 }
3549                 /*
3550                  * We have seen is this is hard to trigger.
3551                  * This is an overcommit of nbufs but needed
3552                  * in some scenarios with diskiamges
3553                  */
3554
3555 add_newbufs:
3556                 lck_mtx_unlock(buf_mtxp);
3557
3558                 /* Create a new temporary buffer header */
3559                 bp = (struct buf *)zalloc(buf_hdr_zone);
3560
3561                 if (bp) {
3562                         bufhdrinit(bp);
3563                         bp->b_whichq = BQ_EMPTY;
3564                         bp->b_timestamp = buf_timestamp();
3565                         BLISTNONE(bp);
3566                         SET(bp->b_flags, B_HDRALLOC);
3567                         *queue = BQ_EMPTY;
3568                 }
3569                 lck_mtx_lock_spin(buf_mtxp);
3570
3571                 if (bp) {
3572                         binshash(bp, &invalhash);
3573                         binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3574                         buf_hdr_count++;
3575                         goto found;
3576                 }
3577                 /* subtract already accounted bufcount */
3578                 nbuf_headers--;
3579
3580                 bufstats.bufs_sleeps++;
3581
3582                 /* wait for a free buffer of any kind */
3583                 needbuffer = 1;
3584                 /* hz value is 100 */
3585                 ts.tv_sec = (slptimeo/1000);
3586                 /* the hz value is 100; which leads to 10ms */
3587                 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
3588
3589                 msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO+1), "getnewbuf", &ts);
3590                 return (NULL);
3591         }
3592
3593         /* Buffer available either on AGE or LRU or META */
3594         bp = NULL;
3595         *queue = -1;
3596
3597         /* Buffer available either on AGE or LRU */
3598         if (!age_bp) {
3599                 bp = lru_bp;
3600                 *queue = BQ_LRU;
3601         } else if (!lru_bp) {
3602                 bp = age_bp;
3603                 *queue = BQ_AGE;
3604         } else { /* buffer available on both AGE and LRU */
3605                 int             t = buf_timestamp();
3606
3607                 age_time = t - age_bp->b_timestamp;
3608                 lru_time = t - lru_bp->b_timestamp;
3609                 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
3610                         bp = age_bp;
3611                         *queue = BQ_AGE;
3612                         /*
3613                          * we should probably re-timestamp eveything in the
3614                          * queues at this point with the current time
3615                          */
3616                 } else {
3617                         if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
3618                                 bp = lru_bp;
3619                                 *queue = BQ_LRU;
3620                         } else {
3621                                 bp = age_bp;
3622                                 *queue = BQ_AGE;
3623                         }
3624                 }
3625         }
3626
3627         if (!bp) { /* Neither on AGE nor on LRU */
3628                 bp = meta_bp;
3629                 *queue = BQ_META;
3630         }  else if (meta_bp) {
3631                 int             t = buf_timestamp();
3632
3633                 bp_time = t - bp->b_timestamp;
3634                 meta_time = t - meta_bp->b_timestamp;
3635
3636                 if (!(bp_time < 0) && !(meta_time < 0)) {
3637                         /* time not set backwards */
3638                         int bp_is_stale;
3639                         bp_is_stale = (*queue == BQ_LRU) ?
3640                                         lru_is_stale : age_is_stale;
3641
3642                         if ((meta_time >= meta_is_stale) &&
3643                                         (bp_time < bp_is_stale)) {
3644                                 bp = meta_bp;
3645                                 *queue = BQ_META;
3646                         }
3647                 }
3648         }
3649 found:
3650         if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
3651                 panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
3652
3653         /* Clean it */
3654         if (bcleanbuf(bp, FALSE)) {
3655                 /*
3656                  * moved to the laundry thread, buffer not ready
3657                  */
3658                 *queue = req;
3659                 goto start;
3660         }
3661         return (bp);
3662 }
3663
3664
3665 /*
3666  * Clean a buffer.
3667  * Returns 0 if buffer is ready to use,
3668  * Returns 1 if issued a buf_bawrite() to indicate
3669  * that the buffer is not ready.
3670  *
3671  * buf_mtxp is held upon entry
3672  * returns with buf_mtxp locked
3673  */
3674 int
3675 bcleanbuf(buf_t bp, boolean_t discard)
3676 {
3677         /* Remove from the queue */
3678         bremfree_locked(bp);
3679
3680 #ifdef JOE_DEBUG
3681         bp->b_owner = current_thread();
3682         bp->b_tag   = 2;
3683 #endif
3684         /*
3685          * If buffer was a delayed write, start the IO by queuing
3686          * it on the LAUNDRY queue, and return 1
3687          */
3688         if (ISSET(bp->b_flags, B_DELWRI)) {
3689                 if (discard) {
3690                         SET(bp->b_lflags, BL_WANTDEALLOC);
3691                 }
3692
3693                 bmovelaundry(bp);
3694
3695                 lck_mtx_unlock(buf_mtxp);
3696
3697                 wakeup(&bufqueues[BQ_LAUNDRY]);
3698                 /*
3699                  * and give it a chance to run
3700                  */
3701                 (void)thread_block(THREAD_CONTINUE_NULL);
3702
3703                 lck_mtx_lock_spin(buf_mtxp);
3704
3705                 return (1);
3706         }
3707 #ifdef JOE_DEBUG
3708         bp->b_owner = current_thread();
3709         bp->b_tag   = 8;
3710 #endif
3711         /*
3712          * Buffer is no longer on any free list... we own it
3713          */
3714         SET(bp->b_lflags, BL_BUSY);
3715         buf_busycount++;
3716
3717         bremhash(bp);
3718
3719         /*
3720          * disassociate us from our vnode, if we had one...
3721          */
3722         if (bp->b_vp)
3723                 brelvp_locked(bp);
3724
3725         lck_mtx_unlock(buf_mtxp);
3726
3727         BLISTNONE(bp);
3728
3729         if (ISSET(bp->b_flags, B_META))
3730                 buf_free_meta_store(bp);
3731
3732         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
3733
3734         buf_release_credentials(bp);
3735
3736         /* If discarding, just move to the empty queue */
3737         if (discard) {
3738                 lck_mtx_lock_spin(buf_mtxp);
3739                 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
3740                 bp->b_whichq = BQ_EMPTY;
3741                 binshash(bp, &invalhash);
3742                 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3743                 CLR(bp->b_lflags, BL_BUSY);
3744                 buf_busycount--;
3745         } else {
3746                 /* Not discarding: clean up and prepare for reuse */
3747                 bp->b_bufsize = 0;
3748                 bp->b_datap = (uintptr_t)NULL;
3749                 bp->b_upl = (void *)NULL;
3750                 bp->b_fsprivate = (void *)NULL;
3751                 /*
3752                  * preserve the state of whether this buffer
3753                  * was allocated on the fly or not...
3754                  * the only other flag that should be set at
3755                  * this point is BL_BUSY...
3756                  */
3757 #ifdef JOE_DEBUG
3758                 bp->b_owner = current_thread();
3759                 bp->b_tag   = 3;
3760 #endif
3761                 bp->b_lflags = BL_BUSY;
3762                 bp->b_flags = (bp->b_flags & B_HDRALLOC);
3763                 bp->b_redundancy_flags = 0;
3764                 bp->b_dev = NODEV;
3765                 bp->b_blkno = bp->b_lblkno = 0;
3766                 bp->b_iodone = NULL;
3767                 bp->b_error = 0;
3768                 bp->b_resid = 0;
3769                 bp->b_bcount = 0;
3770                 bp->b_dirtyoff = bp->b_dirtyend = 0;
3771                 bp->b_validoff = bp->b_validend = 0;
3772                 bzero(&bp->b_attr, sizeof(struct bufattr));
3773
3774                 lck_mtx_lock_spin(buf_mtxp);
3775         }
3776         return (0);
3777 }
3778
3779
3780
3781 errno_t
3782 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
3783 {
3784         buf_t   bp;
3785         errno_t error;
3786         struct bufhashhdr *dp;
3787
3788         dp = BUFHASH(vp, lblkno);
3789
3790 relook:
3791         lck_mtx_lock_spin(buf_mtxp);
3792
3793         if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
3794                 lck_mtx_unlock(buf_mtxp);
3795                 return (0);
3796         }
3797         if (ISSET(bp->b_lflags, BL_BUSY)) {
3798                 if ( !ISSET(flags, BUF_WAIT)) {
3799                         lck_mtx_unlock(buf_mtxp);
3800                         return (EBUSY);
3801                 }
3802                 SET(bp->b_lflags, BL_WANTED);
3803
3804                 error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
3805
3806                 if (error) {
3807                         return (error);
3808                 }
3809                 goto relook;
3810         }
3811         bremfree_locked(bp);
3812         SET(bp->b_lflags, BL_BUSY);
3813         SET(bp->b_flags, B_INVAL);
3814         buf_busycount++;
3815 #ifdef JOE_DEBUG
3816         bp->b_owner = current_thread();
3817         bp->b_tag   = 4;
3818 #endif
3819         lck_mtx_unlock(buf_mtxp);
3820         buf_brelse(bp);
3821
3822         return (0);
3823 }
3824
3825
3826 void
3827 buf_drop(buf_t bp)
3828 {
3829         int need_wakeup = 0;
3830
3831         lck_mtx_lock_spin(buf_mtxp);
3832
3833         if (ISSET(bp->b_lflags, BL_WANTED)) {
3834                 /*
3835                  * delay the actual wakeup until after we
3836                  * clear BL_BUSY and we've dropped buf_mtxp
3837                  */
3838                 need_wakeup = 1;
3839         }
3840 #ifdef JOE_DEBUG
3841         bp->b_owner = current_thread();
3842         bp->b_tag   = 9;
3843 #endif
3844         /*
3845          * Unlock the buffer.
3846          */
3847         CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
3848         buf_busycount--;
3849
3850         lck_mtx_unlock(buf_mtxp);
3851
3852         if (need_wakeup) {
3853                 /*
3854                  * Wake up any proceeses waiting for _this_ buffer to become free.
3855                  */
3856                 wakeup(bp);
3857         }
3858 }
3859
3860
3861 errno_t
3862 buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
3863         errno_t error;
3864
3865         lck_mtx_lock_spin(buf_mtxp);
3866
3867         error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
3868
3869         lck_mtx_unlock(buf_mtxp);
3870
3871         return (error);
3872 }
3873
3874
3875 static errno_t
3876 buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
3877 {
3878         errno_t error;
3879         struct timespec ts;
3880
3881         if (ISSET(bp->b_flags, B_LOCKED)) {
3882                 if ((flags & BAC_SKIP_LOCKED))
3883                         return (EDEADLK);
3884         } else {
3885                 if ((flags & BAC_SKIP_NONLOCKED))
3886                         return (EDEADLK);
3887         }
3888         if (ISSET(bp->b_lflags, BL_BUSY)) {
3889                 /*
3890                  * since the lck_mtx_lock may block, the buffer
3891                  * may become BUSY, so we need to
3892                  * recheck for a NOWAIT request
3893                  */
3894                 if (flags & BAC_NOWAIT)
3895                         return (EBUSY);
3896                 SET(bp->b_lflags, BL_WANTED);
3897
3898                 /* the hz value is 100; which leads to 10ms */
3899                 ts.tv_sec = (slptimeo/100);
3900                 ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;
3901                 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
3902
3903                 if (error)
3904                         return (error);
3905                 return (EAGAIN);
3906         }
3907         if (flags & BAC_REMOVE)
3908                 bremfree_locked(bp);
3909         SET(bp->b_lflags, BL_BUSY);
3910         buf_busycount++;
3911
3912 #ifdef JOE_DEBUG
3913         bp->b_owner = current_thread();
3914         bp->b_tag   = 5;
3915 #endif
3916         return (0);
3917 }
3918
3919
3920 /*
3921  * Wait for operations on the buffer to complete.
3922  * When they do, extract and return the I/O's error value.
3923  */
3924 errno_t
3925 buf_biowait(buf_t bp)
3926 {
3927         while (!ISSET(bp->b_flags, B_DONE)) {
3928
3929                 lck_mtx_lock_spin(buf_mtxp);
3930
3931                 if (!ISSET(bp->b_flags, B_DONE)) {
3932                         DTRACE_IO1(wait__start, buf_t, bp);
3933                         (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL);
3934                         DTRACE_IO1(wait__done, buf_t, bp);
3935                 } else
3936                         lck_mtx_unlock(buf_mtxp);
3937         }
3938         /* check for interruption of I/O (e.g. via NFS), then errors. */
3939         if (ISSET(bp->b_flags, B_EINTR)) {
3940                 CLR(bp->b_flags, B_EINTR);
3941                 return (EINTR);
3942         } else if (ISSET(bp->b_flags, B_ERROR))
3943                 return (bp->b_error ? bp->b_error : EIO);
3944         else
3945                 return (0);
3946 }
3947
3948
3949 /*
3950  * Mark I/O complete on a buffer.
3951  *
3952  * If a callback has been requested, e.g. the pageout
3953  * daemon, do so. Otherwise, awaken waiting processes.
3954  *
3955  * [ Leffler, et al., says on p.247:
3956  *      "This routine wakes up the blocked process, frees the buffer
3957  *      for an asynchronous write, or, for a request by the pagedaemon
3958  *      process, invokes a procedure specified in the buffer structure" ]
3959  *
3960  * In real life, the pagedaemon (or other system processes) wants
3961  * to do async stuff to, and doesn't want the buffer buf_brelse()'d.
3962  * (for swap pager, that puts swap buffers on the free lists (!!!),
3963  * for the vn device, that puts malloc'd buffers on the free lists!)
3964  */
3965
3966 void
3967 buf_biodone(buf_t bp)
3968 {
3969         mount_t mp;
3970         struct bufattr *bap;
3971
3972         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
3973                      bp, bp->b_datap, bp->b_flags, 0, 0);
3974
3975         if (ISSET(bp->b_flags, B_DONE))
3976                 panic("biodone already");
3977
3978         bap = &bp->b_attr;
3979
3980         if (bp->b_vp && bp->b_vp->v_mount) {
3981                 mp = bp->b_vp->v_mount;
3982         } else {
3983                 mp = NULL;
3984         }
3985
3986         if (ISSET(bp->b_flags, B_ERROR)) {
3987                 if (mp && (MNT_ROOTFS & mp->mnt_flag)) {
3988                         dk_error_description_t desc;
3989                         bzero(&desc, sizeof(desc));
3990                         desc.description      = panic_disk_error_description;
3991                         desc.description_size = panic_disk_error_description_size;
3992                         VNOP_IOCTL(mp->mnt_devvp, DKIOCGETERRORDESCRIPTION, (caddr_t)&desc, 0, vfs_context_kernel());
3993                 }
3994         }
3995
3996         if (mp && (bp->b_flags & B_READ) == 0) {
3997                 update_last_io_time(mp);
3998                 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
3999         } else if (mp) {
4000                 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
4001         }
4002
4003         throttle_info_end_io(bp);
4004
4005         if (kdebug_enable) {
4006                 int code    = DKIO_DONE;
4007                 int io_tier = GET_BUFATTR_IO_TIER(bap);
4008
4009                 if (bp->b_flags & B_READ)
4010                         code |= DKIO_READ;
4011                 if (bp->b_flags & B_ASYNC)
4012                         code |= DKIO_ASYNC;
4013
4014                 if (bp->b_flags & B_META)
4015                         code |= DKIO_META;
4016                 else if (bp->b_flags & B_PAGEIO)
4017                         code |= DKIO_PAGING;
4018
4019                 if (io_tier != 0)
4020                         code |= DKIO_THROTTLE;
4021
4022                 code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
4023
4024                 if (bp->b_flags & B_PASSIVE)
4025                         code |= DKIO_PASSIVE;
4026
4027                 if (bap->ba_flags & BA_NOCACHE)
4028                         code |= DKIO_NOCACHE;
4029
4030                 if (bap->ba_flags & BA_IO_TIER_UPGRADE) {
4031                         code |= DKIO_TIER_UPGRADE;
4032                 }
4033
4034                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
4035                                           buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0);
4036         }
4037
4038         /*
4039          * I/O was done, so don't believe
4040          * the DIRTY state from VM anymore...
4041          * and we need to reset the THROTTLED/PASSIVE
4042          * indicators
4043          */
4044         CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
4045         CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP | BA_IO_TIER_UPGRADE));
4046
4047         SET_BUFATTR_IO_TIER(bap, 0);
4048
4049         DTRACE_IO1(done, buf_t, bp);
4050
4051         if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
4052                 /*
4053                  * wake up any writer's blocked
4054                  * on throttle or waiting for I/O
4055                  * to drain
4056                  */
4057                 vnode_writedone(bp->b_vp);
4058
4059         if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) {  /* if necessary, call out */
4060                 void    (*iodone_func)(struct buf *, void *) = bp->b_iodone;
4061                 void    *arg = bp->b_transaction;
4062                 int     callout = ISSET(bp->b_flags, B_CALL);
4063
4064                 if (iodone_func == NULL)
4065                         panic("biodone: bp @ %p has NULL b_iodone!\n", bp);
4066
4067                 CLR(bp->b_flags, (B_CALL | B_FILTER));  /* filters and callouts are one-shot */
4068                 bp->b_iodone = NULL;
4069                 bp->b_transaction = NULL;
4070
4071                 if (callout)
4072                         SET(bp->b_flags, B_DONE);       /* note that it's done */
4073
4074                 (*iodone_func)(bp, arg);
4075
4076                 if (callout) {
4077                         /*
4078                          * assumes that the callback function takes
4079                          * ownership of the bp and deals with releasing it if necessary
4080                          */
4081                         goto biodone_done;
4082                 }
4083                 /*
4084                  * in this case the call back function is acting
4085                  * strictly as a filter... it does not take
4086                  * ownership of the bp and is expecting us
4087                  * to finish cleaning up... this is currently used
4088                  * by the HFS journaling code
4089                  */
4090         }
4091         if (ISSET(bp->b_flags, B_ASYNC)) {      /* if async, release it */
4092                 SET(bp->b_flags, B_DONE);       /* note that it's done */
4093
4094                 buf_brelse(bp);
4095         } else {                                /* or just wakeup the buffer */
4096                 /*
4097                  * by taking the mutex, we serialize
4098                  * the buf owner calling buf_biowait so that we'll
4099                  * only see him in one of 2 states...
4100                  * state 1: B_DONE wasn't set and he's
4101                  * blocked in msleep
4102                  * state 2: he's blocked trying to take the
4103                  * mutex before looking at B_DONE
4104                  * BL_WANTED is cleared in case anyone else
4105                  * is blocked waiting for the buffer... note
4106                  * that we haven't cleared B_BUSY yet, so if
4107                  * they do get to run, their going to re-set
4108                  * BL_WANTED and go back to sleep
4109                  */
4110                 lck_mtx_lock_spin(buf_mtxp);
4111
4112                 CLR(bp->b_lflags, BL_WANTED);
4113                 SET(bp->b_flags, B_DONE);               /* note that it's done */
4114
4115                 lck_mtx_unlock(buf_mtxp);
4116
4117                 wakeup(bp);
4118         }
4119 biodone_done:
4120         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
4121                  (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0);
4122 }
4123
4124 /*
4125  * Obfuscate buf pointers.
4126  */
4127 vm_offset_t
4128 buf_kernel_addrperm_addr(void * addr)
4129 {
4130         if ((vm_offset_t)addr == 0)
4131                 return 0;
4132         else
4133                 return ((vm_offset_t)addr + buf_kernel_addrperm);
4134 }
4135
4136 /*
4137  * Return a count of buffers on the "locked" queue.
4138  */
4139 int
4140 count_lock_queue(void)
4141 {
4142         buf_t   bp;
4143         int     n = 0;
4144
4145         lck_mtx_lock_spin(buf_mtxp);
4146
4147         for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
4148             bp = bp->b_freelist.tqe_next)
4149                 n++;
4150         lck_mtx_unlock(buf_mtxp);
4151
4152         return (n);
4153 }
4154
4155 /*
4156  * Return a count of 'busy' buffers. Used at the time of shutdown.
4157  * note: This is also called from the mach side in debug context in kdp.c
4158  */
4159 int
4160 count_busy_buffers(void)
4161 {
4162         return buf_busycount + bufstats.bufs_iobufinuse;
4163 }
4164
#if DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 *
 * One line is printed per buffer queue: the total number of buffers on
 * it, followed by a "size-count" pair for every b_bufsize bucket (in
 * CLBYTES steps) that holds at least one buffer.
 */
void
vfs_bufstats()
{
	static char *bname[BQUEUES] =
		{ "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
	int counts[MAXBSIZE/CLBYTES+1];
	int q, slot, total;
	struct buf *bp;
	struct bqueues *qhead;

	for (q = 0; q < BQUEUES; q++) {
		qhead = &bufqueues[q];

		/* start this queue with a clean histogram */
		total = 0;
		for (slot = 0; slot <= MAXBSIZE/CLBYTES; slot++)
			counts[slot] = 0;

		/* the queue is only stable while buf_mtxp is held */
		lck_mtx_lock(buf_mtxp);

		bp = qhead->tqh_first;
		while (bp) {
			counts[bp->b_bufsize/CLBYTES]++;
			total++;
			bp = bp->b_freelist.tqe_next;
		}
		lck_mtx_unlock(buf_mtxp);

		printf("%s: total-%d", bname[q], total);
		for (slot = 0; slot <= MAXBSIZE/CLBYTES; slot++) {
			if (counts[slot] == 0)
				continue;
			printf(", %d-%d", slot * CLBYTES, counts[slot]);
		}
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */
4202
4203 #define NRESERVEDIOBUFS 128
4204
4205 #define MNT_VIRTUALDEV_MAX_IOBUFS 16
4206 #define VIRTUALDEV_MAX_IOBUFS ((40*niobuf_headers)/100)
4207
4208 buf_t
4209 alloc_io_buf(vnode_t vp, int priv)
4210 {
4211         buf_t   bp;
4212         mount_t mp = NULL;
4213         int alloc_for_virtualdev = FALSE;
4214
4215         lck_mtx_lock_spin(iobuffer_mtxp);
4216
4217         /*
4218          * We subject iobuf requests for diskimages to additional restrictions.
4219          *
4220          * a) A single diskimage mount cannot use up more than
4221          * MNT_VIRTUALDEV_MAX_IOBUFS. However,vm privileged (pageout) requests
4222          * are not subject to this restriction.
4223          * b) iobuf headers used by all diskimage headers by all mount
4224          * points cannot exceed  VIRTUALDEV_MAX_IOBUFS.
4225          */
4226         if (vp && ((mp = vp->v_mount)) && mp != dead_mountp &&
4227             mp->mnt_kern_flag & MNTK_VIRTUALDEV) {
4228                 alloc_for_virtualdev = TRUE;
4229                 while ((!priv && mp->mnt_iobufinuse > MNT_VIRTUALDEV_MAX_IOBUFS) ||
4230                     bufstats.bufs_iobufinuse_vdev > VIRTUALDEV_MAX_IOBUFS) {
4231                         bufstats.bufs_iobufsleeps++;
4232
4233                         need_iobuffer = 1;
4234                         (void)msleep(&need_iobuffer, iobuffer_mtxp,
4235                             PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf (1)",
4236                             NULL);
4237                 }
4238         }
4239
4240         while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
4241                (bp = iobufqueue.tqh_first) == NULL) {
4242                 bufstats.bufs_iobufsleeps++;
4243
4244                 need_iobuffer = 1;
4245                 (void)msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1),
4246                     (const char *)"alloc_io_buf (2)", NULL);
4247         }
4248         TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
4249
4250         bufstats.bufs_iobufinuse++;
4251         if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
4252                 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
4253
4254         if (alloc_for_virtualdev) {
4255                 mp->mnt_iobufinuse++;
4256                 bufstats.bufs_iobufinuse_vdev++;
4257         }
4258
4259         lck_mtx_unlock(iobuffer_mtxp);
4260
4261         /*
4262          * initialize various fields
4263          * we don't need to hold the mutex since the buffer
4264          * is now private... the vp should have a reference
4265          * on it and is not protected by this mutex in any event
4266          */
4267         bp->b_timestamp = 0;
4268         bp->b_proc = NULL;
4269
4270         bp->b_datap = 0;
4271         bp->b_flags = 0;
4272         bp->b_lflags = BL_BUSY | BL_IOBUF;
4273         if (alloc_for_virtualdev)
4274                 bp->b_lflags |= BL_IOBUF_VDEV;
4275         bp->b_redundancy_flags = 0;
4276         bp->b_blkno = bp->b_lblkno = 0;
4277 #ifdef JOE_DEBUG
4278         bp->b_owner = current_thread();
4279         bp->b_tag   = 6;
4280 #endif
4281         bp->b_iodone = NULL;
4282         bp->b_error = 0;
4283         bp->b_resid = 0;
4284         bp->b_bcount = 0;
4285         bp->b_bufsize = 0;
4286         bp->b_upl = NULL;
4287         bp->b_fsprivate = (void *)NULL;
4288         bp->b_vp = vp;
4289         bzero(&bp->b_attr, sizeof(struct bufattr));
4290
4291         if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
4292                 bp->b_dev = vp->v_rdev;
4293         else
4294                 bp->b_dev = NODEV;
4295
4296         return (bp);
4297 }
4298
4299
4300 void
4301 free_io_buf(buf_t bp)
4302 {
4303         int need_wakeup = 0;
4304         int free_for_virtualdev = FALSE;
4305         mount_t mp = NULL;
4306
4307         /* Was this iobuf for a diskimage ? */
4308         if (bp->b_lflags & BL_IOBUF_VDEV) {
4309                 free_for_virtualdev = TRUE;
4310                 if (bp->b_vp)
4311                         mp = bp->b_vp->v_mount;
4312         }
4313
4314         /*
4315          * put buffer back on the head of the iobufqueue
4316          */
4317         bp->b_vp = NULL;
4318         bp->b_flags = B_INVAL;
4319
4320         /* Zero out the bufattr and its flags before relinquishing this iobuf */
4321         bzero (&bp->b_attr, sizeof(struct bufattr));
4322
4323         lck_mtx_lock_spin(iobuffer_mtxp);
4324
4325         binsheadfree(bp, &iobufqueue, -1);
4326
4327         if (need_iobuffer) {
4328                 /*
4329                  * Wake up any processes waiting because they need an io buffer
4330                  *
4331                  * do the wakeup after we drop the mutex... it's possible that the
4332                  * wakeup will be superfluous if need_iobuffer gets set again and
4333                  * another thread runs this path, but it's highly unlikely, doesn't
4334                  * hurt, and it means we don't hold up I/O progress if the wakeup blocks
4335                  * trying to grab a task related lock...
4336                  */
4337                 need_iobuffer = 0;
4338                 need_wakeup = 1;
4339         }
4340         if (bufstats.bufs_iobufinuse <= 0)
4341                 panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse < 0", bp);
4342
4343         bufstats.bufs_iobufinuse--;
4344
4345         if (free_for_virtualdev) {
4346                 bufstats.bufs_iobufinuse_vdev--;
4347                 if (mp && mp != dead_mountp)
4348                         mp->mnt_iobufinuse--;
4349         }
4350
4351         lck_mtx_unlock(iobuffer_mtxp);
4352
4353         if (need_wakeup)
4354                 wakeup(&need_iobuffer);
4355 }
4356
4357
4358 void
4359 buf_list_lock(void)
4360 {
4361         lck_mtx_lock_spin(buf_mtxp);
4362 }
4363
4364 void
4365 buf_list_unlock(void)
4366 {
4367         lck_mtx_unlock(buf_mtxp);
4368 }
4369
/*
 * If getnewbuf() calls bcleanbuf() on the same thread
 * there is a potential for stack overrun and deadlocks.
 * So we always hand off the work to a worker thread for completion.
 */
4375
4376
4377 static void
4378 bcleanbuf_thread_init(void)
4379 {
4380         thread_t        thread = THREAD_NULL;
4381
4382         /* create worker thread */
4383         kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread);
4384         thread_deallocate(thread);
4385 }
4386
4387 typedef int (*bcleanbufcontinuation)(int);
4388
4389 __attribute__((noreturn))
4390 static void
4391 bcleanbuf_thread(void)
4392 {
4393         struct buf *bp;
4394         int error = 0;
4395         int loopcnt = 0;
4396
4397         for (;;) {
4398                 lck_mtx_lock_spin(buf_mtxp);
4399
4400                 while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) {
4401                         (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO|PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread);
4402                 }
4403
4404                 /*
4405                  * Remove from the queue
4406                  */
4407                 bremfree_locked(bp);
4408
4409                 /*
4410                  * Buffer is no longer on any free list
4411                  */
4412                 SET(bp->b_lflags, BL_BUSY);
4413                 buf_busycount++;
4414
4415 #ifdef JOE_DEBUG
4416                 bp->b_owner = current_thread();
4417                 bp->b_tag   = 10;
4418 #endif
4419
4420                 lck_mtx_unlock(buf_mtxp);
4421                 /*
4422                  * do the IO
4423                  */
4424                 error = bawrite_internal(bp, 0);
4425
4426                 if (error) {
4427                         bp->b_whichq = BQ_LAUNDRY;
4428                         bp->b_timestamp = buf_timestamp();
4429
4430                         lck_mtx_lock_spin(buf_mtxp);
4431
4432                         binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
4433                         blaundrycnt++;
4434
4435                         /* we never leave a busy page on the laundry queue */
4436                         CLR(bp->b_lflags, BL_BUSY);
4437                         buf_busycount--;
4438 #ifdef JOE_DEBUG
4439                         bp->b_owner = current_thread();
4440                         bp->b_tag   = 11;
4441 #endif
4442
4443                         lck_mtx_unlock(buf_mtxp);
4444
4445                         if (loopcnt > MAXLAUNDRY) {
4446                                 /*
4447                                  * bawrite_internal() can return errors if we're throttled. If we've
4448                                  * done several I/Os and failed, give the system some time to unthrottle
4449                                  * the vnode
4450                                  */
4451                                 (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1);
4452                                 loopcnt = 0;
4453                         } else {
4454                                 /* give other threads a chance to run */
4455                                 (void)thread_block(THREAD_CONTINUE_NULL);
4456                                 loopcnt++;
4457                         }
4458                 }
4459         }
4460 }
4461
4462
4463 static int
4464 brecover_data(buf_t bp)
4465 {
4466         int     upl_offset;
4467         upl_t   upl;
4468         upl_page_info_t *pl;
4469         kern_return_t kret;
4470         vnode_t vp = bp->b_vp;
4471         int upl_flags;
4472
4473
4474         if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
4475                 goto dump_buffer;
4476
4477         upl_flags = UPL_PRECIOUS;
4478         if (! (buf_flags(bp) & B_READ)) {
4479                 /*
4480                  * "write" operation:  let the UPL subsystem know
4481                  * that we intend to modify the buffer cache pages we're
4482                  * gathering.
4483                  */
4484                 upl_flags |= UPL_WILL_MODIFY;
4485         }
4486
4487         kret = ubc_create_upl(vp,
4488                               ubc_blktooff(vp, bp->b_lblkno),
4489                               bp->b_bufsize,
4490                               &upl,
4491                               &pl,
4492                               upl_flags);
4493         if (kret != KERN_SUCCESS)
4494                 panic("Failed to create UPL");
4495
4496         for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
4497
4498                 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
4499                         ubc_upl_abort(upl, 0);
4500                         goto dump_buffer;
4501                 }
4502         }
4503         bp->b_upl = upl;
4504
4505         kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
4506
4507         if (kret != KERN_SUCCESS)
4508                 panic("getblk: ubc_upl_map() failed with (%d)", kret);
4509         return (1);
4510
4511 dump_buffer:
4512         bp->b_bufsize = 0;
4513         SET(bp->b_flags, B_INVAL);
4514         buf_brelse(bp);
4515
4516         return(0);
4517 }
4518
4519 int
4520 fs_buffer_cache_gc_register(void (* callout)(int, void *), void *context)
4521 {
4522         lck_mtx_lock(buf_gc_callout);
4523         for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4524                 if (fs_callouts[i].callout == NULL) {
4525                         fs_callouts[i].callout = callout;
4526                         fs_callouts[i].context = context;
4527                         lck_mtx_unlock(buf_gc_callout);
4528                         return 0;
4529                 }
4530         }
4531
4532         lck_mtx_unlock(buf_gc_callout);
4533         return ENOMEM;
4534 }
4535
4536 int
4537 fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context)
4538 {
4539         lck_mtx_lock(buf_gc_callout);
4540         for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4541                 if (fs_callouts[i].callout == callout &&
4542                     fs_callouts[i].context == context) {
4543                         fs_callouts[i].callout = NULL;
4544                         fs_callouts[i].context = NULL;
4545                 }
4546         }
4547         lck_mtx_unlock(buf_gc_callout);
4548         return 0;
4549 }
4550
4551 static void
4552 fs_buffer_cache_gc_dispatch_callouts(int all)
4553 {
4554         lck_mtx_lock(buf_gc_callout);
4555         for(int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4556                 if (fs_callouts[i].callout != NULL) {
4557                         fs_callouts[i].callout(all, fs_callouts[i].context);
4558                 }
4559         }
4560         lck_mtx_unlock(buf_gc_callout);
4561 }
4562
4563 boolean_t
4564 buffer_cache_gc(int all)
4565 {
4566         buf_t bp;
4567         boolean_t did_large_zfree = FALSE;
4568         boolean_t need_wakeup = FALSE;
4569         int now = buf_timestamp();
4570         uint32_t found = 0;
4571         struct bqueues privq;
4572         int thresh_hold = BUF_STALE_THRESHHOLD;
4573
4574         if (all)
4575                 thresh_hold = 0;
4576         /*
4577          * We only care about metadata (incore storage comes from zalloc()).
4578          * Unless "all" is set (used to evict meta data buffers in preparation
4579          * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers
4580          * that have not been accessed in the last BUF_STALE_THRESHOLD seconds.
4581          * BUF_MAX_GC_BATCH_SIZE controls both the hold time of the global lock
4582          * "buf_mtxp" and the length of time we spend compute bound in the GC
4583          * thread which calls this function
4584          */
4585         lck_mtx_lock(buf_mtxp);
4586
4587         do {
4588                 found = 0;
4589                 TAILQ_INIT(&privq);
4590                 need_wakeup = FALSE;
4591
4592                 while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) &&
4593                                 (now > bp->b_timestamp) &&
4594                                 (now - bp->b_timestamp > thresh_hold) &&
4595                                 (found < BUF_MAX_GC_BATCH_SIZE)) {
4596
4597                         /* Remove from free list */
4598                         bremfree_locked(bp);
4599                         found++;
4600
4601 #ifdef JOE_DEBUG
4602                         bp->b_owner = current_thread();
4603                         bp->b_tag   = 12;
4604 #endif
4605
4606                         /* If dirty, move to laundry queue and remember to do wakeup */
4607                         if (ISSET(bp->b_flags, B_DELWRI)) {
4608                                 SET(bp->b_lflags, BL_WANTDEALLOC);
4609
4610                                 bmovelaundry(bp);
4611                                 need_wakeup = TRUE;
4612
4613                                 continue;
4614                         }
4615
4616                         /*
4617                          * Mark busy and put on private list.  We could technically get
4618                          * away without setting BL_BUSY here.
4619                          */
4620                         SET(bp->b_lflags, BL_BUSY);
4621                         buf_busycount++;
4622
4623                         /*
4624                          * Remove from hash and dissociate from vp.
4625                          */
4626                         bremhash(bp);
4627                         if (bp->b_vp) {
4628                                 brelvp_locked(bp);
4629                         }
4630
4631                         TAILQ_INSERT_TAIL(&privq, bp, b_freelist);
4632                 }
4633
4634                 if (found == 0) {
4635                         break;
4636                 }
4637
4638                 /* Drop lock for batch processing */
4639                 lck_mtx_unlock(buf_mtxp);
4640
4641                 /* Wakeup and yield for laundry if need be */
4642                 if (need_wakeup) {
4643                         wakeup(&bufqueues[BQ_LAUNDRY]);
4644                         (void)thread_block(THREAD_CONTINUE_NULL);
4645                 }
4646
4647                 /* Clean up every buffer on private list */
4648                 TAILQ_FOREACH(bp, &privq, b_freelist) {
4649                         /* Take note if we've definitely freed at least a page to a zone */
4650                         if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) {
4651                                 did_large_zfree = TRUE;
4652                         }
4653
4654                         trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
4655
4656                         /* Free Storage */
4657                         buf_free_meta_store(bp);
4658
4659                         /* Release credentials */
4660                         buf_release_credentials(bp);
4661
4662                         /* Prepare for moving to empty queue */
4663                         CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED
4664                                                 | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
4665                         bp->b_whichq = BQ_EMPTY;
4666                         BLISTNONE(bp);
4667                 }
4668                 lck_mtx_lock(buf_mtxp);
4669
4670                 /* Back under lock, move them all to invalid hash and clear busy */
4671                 TAILQ_FOREACH(bp, &privq, b_freelist) {
4672                         binshash(bp, &invalhash);
4673                         CLR(bp->b_lflags, BL_BUSY);
4674                         buf_busycount--;
4675
4676 #ifdef JOE_DEBUG
4677                         if (bp->b_owner != current_thread()) {
4678                                 panic("Buffer stolen from buffer_cache_gc()");
4679                         }
4680                         bp->b_owner = current_thread();
4681                         bp->b_tag   = 13;
4682 #endif
4683                 }
4684
4685                 /* And do a big bulk move to the empty queue */
4686                 TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist);
4687
4688         } while (all && (found == BUF_MAX_GC_BATCH_SIZE));
4689
4690         lck_mtx_unlock(buf_mtxp);
4691
4692         fs_buffer_cache_gc_dispatch_callouts(all);
4693
4694         return did_large_zfree;
4695 }
4696
4697
4698 /*
4699  * disabled for now
4700  */
4701
4702 #if FLUSH_QUEUES
4703
4704 #define NFLUSH 32
4705
4706 static int
4707 bp_cmp(void *a, void *b)
4708 {
4709     buf_t *bp_a = *(buf_t **)a,
4710           *bp_b = *(buf_t **)b;
4711     daddr64_t res;
4712
4713     // don't have to worry about negative block
4714     // numbers so this is ok to do.
4715     //
4716     res = (bp_a->b_blkno - bp_b->b_blkno);
4717
4718     return (int)res;
4719 }
4720
4721
4722 int
4723 bflushq(int whichq, mount_t mp)
4724 {
4725         buf_t   bp, next;
4726         int     i, buf_count;
4727         int     total_writes = 0;
4728         static buf_t flush_table[NFLUSH];
4729
4730         if (whichq < 0 || whichq >= BQUEUES) {
4731             return (0);
4732         }
4733
4734   restart:
4735         lck_mtx_lock(buf_mtxp);
4736
4737         bp = TAILQ_FIRST(&bufqueues[whichq]);
4738
4739         for (buf_count = 0; bp; bp = next) {
4740             next = bp->b_freelist.tqe_next;
4741
4742             if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
4743                 continue;
4744             }
4745
4746             if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
4747
4748                 bremfree_locked(bp);
4749 #ifdef JOE_DEBUG
4750                 bp->b_owner = current_thread();
4751                 bp->b_tag   = 7;
4752 #endif
4753                 SET(bp->b_lflags, BL_BUSY);
4754                 buf_busycount++;
4755
4756                 flush_table[buf_count] = bp;
4757                 buf_count++;
4758                 total_writes++;
4759
4760                 if (buf_count >= NFLUSH) {
4761                     lck_mtx_unlock(buf_mtxp);
4762
4763                     qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4764
4765                     for (i = 0; i < buf_count; i++) {
4766                         buf_bawrite(flush_table[i]);
4767                     }
4768                     goto restart;
4769                 }
4770             }
4771         }
4772         lck_mtx_unlock(buf_mtxp);
4773
4774         if (buf_count > 0) {
4775             qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4776
4777             for (i = 0; i < buf_count; i++) {
4778                 buf_bawrite(flush_table[i]);
4779             }
4780         }
4781
4782         return (total_writes);
4783 }
4784 #endif