[apple/xnu.git] / bsd / vfs / vfs_bio.c (xnu-3789.70.16)
/*
 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison-Wesley, 1989)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/vnode_internal.h>
#include <sys/mount_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/ubc.h>
#include <sys/kauth.h>
#if DIAGNOSTIC
#include <kern/assert.h>
#endif /* DIAGNOSTIC */
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <kern/thread.h>

#include <sys/fslog.h>		/* fslog_io_error() */
#include <sys/disk.h>		/* dk_error_description_t */

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <kern/sched_prim.h>	/* thread_block() */

#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>

#include <libkern/OSAtomic.h>
#include <libkern/OSDebug.h>
#include <sys/ubc_internal.h>

#include <sys/sdt.h>

int	bcleanbuf(buf_t bp, boolean_t discard);
static int	brecover_data(buf_t bp);
static boolean_t incore(vnode_t vp, daddr64_t blkno);
/* timeout is in msecs */
static buf_t	getnewbuf(int slpflag, int slptimeo, int *queue);
static void	bremfree_locked(buf_t bp);
static void	buf_reassign(buf_t bp, vnode_t newvp);
static errno_t	buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
static int	buf_iterprepare(vnode_t vp, struct buflists *, int flags);
static void	buf_itercomplete(vnode_t vp, struct buflists *, int flags);

static boolean_t buffer_cache_gc(int);
static buf_t	buf_brelse_shadow(buf_t bp);
static void	buf_free_meta_store(buf_t bp);

static buf_t	buf_create_shadow_internal(buf_t bp, boolean_t force_copy,
		    uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);


int	bdwrite_internal(buf_t, int);

/* zone allocated buffer headers */
static void	bufzoneinit(void);
static void	bcleanbuf_thread_init(void);
static void	bcleanbuf_thread(void);

static zone_t	buf_hdr_zone;
static int	buf_hdr_count;


/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long	bufhash;

static buf_t	incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);

/* Definitions for the buffer stats. */
struct bufstats bufstats;

/* Number of delayed write buffers */
long nbdwrite = 0;
int blaundrycnt = 0;
static int boot_nbuf_headers = 0;

static TAILQ_HEAD(delayqueue, buf) delaybufqueue;

static TAILQ_HEAD(ioqueue, buf) iobufqueue;
static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
static int needbuffer;
static int need_iobuffer;

static lck_grp_t	*buf_mtx_grp;
static lck_attr_t	*buf_mtx_attr;
static lck_grp_attr_t	*buf_mtx_grp_attr;
static lck_mtx_t	*iobuffer_mtxp;
static lck_mtx_t	*buf_mtxp;
static lck_mtx_t	*buf_gc_callout;

static int buf_busycount;

#define FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE 16
typedef struct {
	void (* callout)(int, void *);
	void *context;
} fs_buffer_cache_gc_callout_t;

fs_buffer_cache_gc_callout_t fs_callouts[FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE] = { {NULL, NULL} };

static __inline__ int
buf_timestamp(void)
{
	struct	timeval		t;
	microuptime(&t);
	return (t.tv_sec);
}

/*
 * Insq/Remq for the buffer free lists.
 */
#define binsheadfree(bp, dp, whichq)	do { \
	TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
    } while (0)

#define binstailfree(bp, dp, whichq)	do { \
	TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
    } while (0)

#define BHASHENTCHECK(bp)	\
	if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)	\
		panic("%p: b_hash.le_prev is not deadbeef", (bp));

#define BLISTNONE(bp)	\
	(bp)->b_hash.le_next = (struct buf *)0;	\
	(bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {	\
	LIST_REMOVE(bp, b_vnbufs);	\
	(bp)->b_vnbufs.le_next = NOLIST;	\
}

/*
 * Time in seconds before a buffer on a list is
 * considered as a stale buffer
 */
#define LRU_IS_STALE 120 /* default value for the LRU */
#define AGE_IS_STALE 60  /* default value for the AGE */
#define META_IS_STALE 180 /* default value for the BQ_META */

int lru_is_stale = LRU_IS_STALE;
int age_is_stale = AGE_IS_STALE;
int meta_is_stale = META_IS_STALE;

#define MAXLAUNDRY	10

/* LIST_INSERT_HEAD() with assertions */
static __inline__ void
blistenterhead(struct bufhashhdr * head, buf_t bp)
{
	if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
		(head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
	(head)->lh_first = bp;
	bp->b_hash.le_prev = &(head)->lh_first;
	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("blistenterhead: le_prev is deadbeef");
}

static __inline__ void
binshash(buf_t bp, struct bufhashhdr *dp)
{
#if DIAGNOSTIC
	buf_t	nbp;
#endif /* DIAGNOSTIC */

	BHASHENTCHECK(bp);

#if DIAGNOSTIC
	nbp = dp->lh_first;
	for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
		if(nbp == bp)
			panic("buf already in hashlist");
	}
#endif /* DIAGNOSTIC */

	blistenterhead(dp, bp);
}

static __inline__ void
bremhash(buf_t	bp)
{
	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("bremhash le_prev is deadbeef");
	if (bp->b_hash.le_next == bp)
		panic("bremhash: next points to self");

	if (bp->b_hash.le_next != NULL)
		bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
	*bp->b_hash.le_prev = (bp)->b_hash.le_next;
}

/*
 * buf_mtxp held.
 */
static __inline__ void
bmovelaundry(buf_t bp)
{
	bp->b_whichq = BQ_LAUNDRY;
	bp->b_timestamp = buf_timestamp();
	binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
	blaundrycnt++;
}

static __inline__ void
buf_release_credentials(buf_t bp)
{
	if (IS_VALID_CRED(bp->b_rcred)) {
		kauth_cred_unref(&bp->b_rcred);
	}
	if (IS_VALID_CRED(bp->b_wcred)) {
		kauth_cred_unref(&bp->b_wcred);
	}
}


int
buf_valid(buf_t bp) {

        if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
	        return 1;
	return 0;
}

int
buf_fromcache(buf_t bp) {

        if ( (bp->b_flags & B_CACHE) )
	        return 1;
	return 0;
}

void
buf_markinvalid(buf_t bp) {
  
        SET(bp->b_flags, B_INVAL);
}

void
buf_markdelayed(buf_t bp) {
  
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);

		OSAddAtomicLong(1, &nbdwrite);
		buf_reassign(bp, bp->b_vp);
	}
        SET(bp->b_flags, B_DONE);
}

void
buf_markclean(buf_t bp) {

	if (ISSET(bp->b_flags, B_DELWRI)) {
		CLR(bp->b_flags, B_DELWRI);

		OSAddAtomicLong(-1, &nbdwrite);
		buf_reassign(bp, bp->b_vp);
	}
}

void
buf_markeintr(buf_t bp) {
  
        SET(bp->b_flags, B_EINTR);
}


void
buf_markaged(buf_t bp) {
  
        SET(bp->b_flags, B_AGE);
}

int
buf_fua(buf_t bp) {

        if ((bp->b_flags & B_FUA) == B_FUA)
	        return 1;
	return 0;
}

void 
buf_markfua(buf_t bp) {

        SET(bp->b_flags, B_FUA);
}

#if CONFIG_PROTECT
cpx_t bufattr_cpx(bufattr_t bap)
{
	return bap->ba_cpx;
}

void bufattr_setcpx(bufattr_t bap, cpx_t cpx)
{
	bap->ba_cpx = cpx;
}

void
buf_setcpoff (buf_t bp, uint64_t foffset) {
	bp->b_attr.ba_cp_file_off = foffset;
}

uint64_t
bufattr_cpoff(bufattr_t bap) {
	return bap->ba_cp_file_off;
}

void
bufattr_setcpoff(bufattr_t bap, uint64_t foffset) {
	bap->ba_cp_file_off = foffset;
}

#else // !CONFIG_PROTECT

uint64_t
bufattr_cpoff(bufattr_t bap __unused) {
        return 0;
}

void
bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) {
	return;
}

struct cpx *bufattr_cpx(__unused bufattr_t bap)
{
	return NULL;
}

void bufattr_setcpx(__unused bufattr_t bap, __unused struct cpx *cpx)
{
}

#endif /* !CONFIG_PROTECT */

bufattr_t
bufattr_alloc() {
	bufattr_t bap;
	MALLOC(bap, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
	if (bap == NULL)
		return NULL;

	bzero(bap, sizeof(struct bufattr));
	return bap;
}

void
bufattr_free(bufattr_t bap) {
	if (bap)
		FREE(bap, M_TEMP);
}

bufattr_t
bufattr_dup(bufattr_t bap) {
	bufattr_t new_bufattr;
	MALLOC(new_bufattr, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
	if (new_bufattr == NULL)
		return NULL;

	/* Copy the provided one into the new copy */
	memcpy (new_bufattr, bap, sizeof(struct bufattr));
	return new_bufattr;
}

int
bufattr_rawencrypted(bufattr_t bap) {
	if ( (bap->ba_flags & BA_RAW_ENCRYPTED_IO) )
		return 1;
	return 0;
}

int
bufattr_throttled(bufattr_t bap) {
	return (GET_BUFATTR_IO_TIER(bap));
}

int
bufattr_passive(bufattr_t bap) {
	if ( (bap->ba_flags & BA_PASSIVE) )
		return 1;
	return 0;
}

int
bufattr_nocache(bufattr_t bap) {
	if ( (bap->ba_flags & BA_NOCACHE) )
		return 1;
	return 0;
}

int
bufattr_meta(bufattr_t bap) {
	if ( (bap->ba_flags & BA_META) )
		return 1;
	return 0;
}

void
bufattr_markmeta(bufattr_t bap) {
	SET(bap->ba_flags,  BA_META);
}

int
bufattr_delayidlesleep(bufattr_t bap)
{
	if ( (bap->ba_flags & BA_DELAYIDLESLEEP) )
		return 1;
	return 0;
}

bufattr_t
buf_attr(buf_t bp) {
	return &bp->b_attr;
}

void 
buf_markstatic(buf_t bp __unused) {
	SET(bp->b_flags, B_STATICCONTENT);
}

int
buf_static(buf_t bp) {
    if ( (bp->b_flags & B_STATICCONTENT) )
        return 1;
    return 0;
}

void 
bufattr_markgreedymode(bufattr_t bap) {
	SET(bap->ba_flags, BA_GREEDY_MODE);
}

int
bufattr_greedymode(bufattr_t bap) {
    if ( (bap->ba_flags & BA_GREEDY_MODE) )
        return 1;
    return 0;
}

void 
bufattr_markisochronous(bufattr_t bap) {
	SET(bap->ba_flags, BA_ISOCHRONOUS);
}

int
bufattr_isochronous(bufattr_t bap) {
    if ( (bap->ba_flags & BA_ISOCHRONOUS) )
        return 1;
    return 0;
}

void 
bufattr_markquickcomplete(bufattr_t bap) {
	SET(bap->ba_flags, BA_QUICK_COMPLETE);
}

int
bufattr_quickcomplete(bufattr_t bap) {
    if ( (bap->ba_flags & BA_QUICK_COMPLETE) )
        return 1;
    return 0;
}

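/*
 * The routines that follow are thin accessors and mutators for the
 * externally visible buf_t fields (error, flags, byte counts, dirty
 * range, data pointer, vnode, device, UPL and driver/fs private data).
 * They operate on a buffer the caller already holds and do no locking
 * of their own.
 */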
errno_t
buf_error(buf_t bp) {
        
        return (bp->b_error);
}

void
buf_seterror(buf_t bp, errno_t error) {

        if ((bp->b_error = error))
	        SET(bp->b_flags, B_ERROR);
	else
	        CLR(bp->b_flags, B_ERROR);
}

void
buf_setflags(buf_t bp, int32_t flags) {

        SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
}

void
buf_clearflags(buf_t bp, int32_t flags) {

        CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
}

int32_t
buf_flags(buf_t bp) {
        
        return ((bp->b_flags & BUF_X_RDFLAGS));
}

void
buf_reset(buf_t bp, int32_t io_flags) {
        
        CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
	SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));

	bp->b_error = 0;
}

uint32_t
buf_count(buf_t bp) {
        
        return (bp->b_bcount);
}

void
buf_setcount(buf_t bp, uint32_t bcount) {
        
        bp->b_bcount = bcount;
}

uint32_t
buf_size(buf_t bp) {
        
        return (bp->b_bufsize);
}

void
buf_setsize(buf_t bp, uint32_t bufsize) {
        
        bp->b_bufsize = bufsize;
}

uint32_t
buf_resid(buf_t bp) {
        
        return (bp->b_resid);
}

void
buf_setresid(buf_t bp, uint32_t resid) {
        
        bp->b_resid = resid;
}

uint32_t
buf_dirtyoff(buf_t bp) {

        return (bp->b_dirtyoff);
}

uint32_t
buf_dirtyend(buf_t bp) {

        return (bp->b_dirtyend);
}

void
buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
        
        bp->b_dirtyoff = dirtyoff;
}

void
buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
        
        bp->b_dirtyend = dirtyend;
}

uintptr_t
buf_dataptr(buf_t bp) {

        return (bp->b_datap);
}

void
buf_setdataptr(buf_t bp, uintptr_t data) {
        
        bp->b_datap = data;
}

vnode_t
buf_vnode(buf_t bp) {

        return (bp->b_vp);
}

void
buf_setvnode(buf_t bp, vnode_t vp) {
        
        bp->b_vp = vp;
}


void *
buf_callback(buf_t bp)
{
        if ( !(bp->b_flags & B_CALL) )
	        return ((void *) NULL);

	return ((void *)bp->b_iodone);
}


errno_t
buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
{
	assert(!ISSET(bp->b_flags, B_FILTER) && ISSET(bp->b_lflags, BL_BUSY));

	if (callback)
	        bp->b_flags |= (B_CALL | B_ASYNC);
	else
	        bp->b_flags &= ~B_CALL;
	bp->b_transaction = transaction;
	bp->b_iodone = callback;

	return (0);
}

errno_t
buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
{

        if ( !(bp->b_lflags & BL_IOBUF) )
	        return (EINVAL);

	if (upl)
	        bp->b_flags |= B_CLUSTER;
	else
	        bp->b_flags &= ~B_CLUSTER;
	bp->b_upl = upl;
	bp->b_uploffset = offset;

	return (0);
}

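/*
 * buf_clone: build a new iobuf describing the sub-range
 * [io_offset, io_offset + io_size) of 'bp', optionally with its own
 * iodone callback.  For B_CLUSTER (UPL-backed) buffers the carved range
 * must line up with page boundaries (except at the very end of the
 * buffer), otherwise NULL is returned.
 */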
buf_t
buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
{
        buf_t	io_bp;

	if (io_offset < 0 || io_size < 0)
	        return (NULL);

	if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
	        return (NULL);

	if (bp->b_flags & B_CLUSTER) {
	        if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
		        return (NULL);

		if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
		        return (NULL);
	}
	io_bp = alloc_io_buf(bp->b_vp, 0);

	io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);

	if (iodone) {
	        io_bp->b_transaction = arg;
		io_bp->b_iodone = iodone;
		io_bp->b_flags |= B_CALL;
	}
	if (bp->b_flags & B_CLUSTER) {
	        io_bp->b_upl = bp->b_upl;
		io_bp->b_uploffset = bp->b_uploffset + io_offset;
	} else {
	        io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
	}
	io_bp->b_bcount = io_size;

	return (io_bp);
}


int
buf_shadow(buf_t bp)
{
	if (bp->b_lflags & BL_SHADOW)
		return 1;
	return 0;
}

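/*
 * Shadow buffers: buf_create_shadow() and buf_create_shadow_priv() return
 * additional buf_t's that reference the data of an existing B_META buffer
 * (or a private copy of it when force_copy is set, or caller-supplied
 * external storage).  Shadows are linked through b_shadow and accounted
 * for in b_shadow_ref under buf_mtxp.
 */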
buf_t
buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
{
	return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1));
}

buf_t
buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
{
	return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0));
}


static buf_t
buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv)
{
        buf_t	io_bp;

	KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0);

	if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) {

		KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0);
	        return (NULL);
	}
#ifdef BUF_MAKE_PRIVATE
	if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0)
		panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref);
#endif
	io_bp = alloc_io_buf(bp->b_vp, priv);

	io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA);
	io_bp->b_blkno = bp->b_blkno;
	io_bp->b_lblkno = bp->b_lblkno;

	if (iodone) {
	        io_bp->b_transaction = arg;
		io_bp->b_iodone = iodone;
		io_bp->b_flags |= B_CALL;
	}
	if (force_copy == FALSE) {
		io_bp->b_bcount = bp->b_bcount;
		io_bp->b_bufsize = bp->b_bufsize;

		if (external_storage) {
			io_bp->b_datap = external_storage;
#ifdef BUF_MAKE_PRIVATE
			io_bp->b_data_store = NULL;
#endif
		} else {
			io_bp->b_datap = bp->b_datap;
#ifdef BUF_MAKE_PRIVATE
			io_bp->b_data_store = bp;
#endif
		}
		*(buf_t *)(&io_bp->b_orig) = bp;

		lck_mtx_lock_spin(buf_mtxp);

		io_bp->b_lflags |= BL_SHADOW;
		io_bp->b_shadow = bp->b_shadow;
		bp->b_shadow = io_bp;
		bp->b_shadow_ref++;

#ifdef BUF_MAKE_PRIVATE
		if (external_storage)
			io_bp->b_lflags |= BL_EXTERNAL;
		else
			bp->b_data_ref++;
#endif
		lck_mtx_unlock(buf_mtxp);
	} else {
		if (external_storage) {
#ifdef BUF_MAKE_PRIVATE
			io_bp->b_lflags |= BL_EXTERNAL;
#endif
			io_bp->b_bcount = bp->b_bcount;
			io_bp->b_bufsize = bp->b_bufsize;
			io_bp->b_datap = external_storage;
		} else {
			allocbuf(io_bp, bp->b_bcount);

			io_bp->b_lflags |= BL_IOBUF_ALLOC;
		}
		bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount);

#ifdef BUF_MAKE_PRIVATE
		io_bp->b_data_store = NULL;
#endif
	}
	KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0);

	return (io_bp);
}


#ifdef BUF_MAKE_PRIVATE
errno_t
buf_make_private(buf_t bp)
{
        buf_t	ds_bp;
	buf_t	t_bp;
	struct buf my_buf;

	KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0);

	if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) {

		KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
		return (EINVAL);
	}
	my_buf.b_flags = B_META;
	my_buf.b_datap = (uintptr_t)NULL;
	allocbuf(&my_buf, bp->b_bcount);

	bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);

	lck_mtx_lock_spin(buf_mtxp);

	for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
		if ( !ISSET(bp->b_lflags, BL_EXTERNAL))
			break;
	}
	ds_bp = t_bp;

	if (ds_bp == NULL && bp->b_data_ref)
		panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL");

	if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0))
	        panic("buf_make_private: ref_count == 0 && ds_bp != NULL");

	if (ds_bp == NULL) {
		lck_mtx_unlock(buf_mtxp);

		buf_free_meta_store(&my_buf);

		KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
		return (EINVAL);
	}
	for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
		if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
			t_bp->b_data_store = ds_bp;
	}
	ds_bp->b_data_ref = bp->b_data_ref;

	bp->b_data_ref = 0;
	bp->b_datap = my_buf.b_datap;

	lck_mtx_unlock(buf_mtxp);

	KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
	return (0);
}
#endif


void
buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
	      void (**old_iodone)(buf_t, void *), void **old_transaction)
{
	assert(ISSET(bp->b_lflags, BL_BUSY));

	if (old_iodone)
		*old_iodone = bp->b_iodone;
	if (old_transaction)
		*old_transaction = bp->b_transaction;

	bp->b_transaction = transaction;
	bp->b_iodone = filter;
	if (filter)
	        bp->b_flags |= B_FILTER;
	else
	        bp->b_flags &= ~B_FILTER;
}


daddr64_t
buf_blkno(buf_t bp) {

        return (bp->b_blkno);
}

daddr64_t
buf_lblkno(buf_t bp) {

        return (bp->b_lblkno);
}

void
buf_setblkno(buf_t bp, daddr64_t blkno) {

        bp->b_blkno = blkno;
}

void
buf_setlblkno(buf_t bp, daddr64_t lblkno) {

        bp->b_lblkno = lblkno;
}

dev_t
buf_device(buf_t bp) {

        return (bp->b_dev);
}

errno_t
buf_setdevice(buf_t bp, vnode_t vp) {

        if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
	        return EINVAL;
	bp->b_dev = vp->v_rdev;

	return 0;
}


void *
buf_drvdata(buf_t bp) {

        return (bp->b_drvdata);
}

void
buf_setdrvdata(buf_t bp, void *drvdata) {

        bp->b_drvdata = drvdata;
}

void *
buf_fsprivate(buf_t bp) {

        return (bp->b_fsprivate);
}

void
buf_setfsprivate(buf_t bp, void *fsprivate) {

        bp->b_fsprivate = fsprivate;
}

kauth_cred_t
buf_rcred(buf_t bp) {

        return (bp->b_rcred);
}

kauth_cred_t
buf_wcred(buf_t bp) {

        return (bp->b_wcred);
}

void *
buf_upl(buf_t bp) {

        return (bp->b_upl);
}

uint32_t
buf_uploffset(buf_t bp) {

        return ((uint32_t)(bp->b_uploffset));
}

proc_t
buf_proc(buf_t bp) {

        return (bp->b_proc);
}

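/*
 * buf_map / buf_unmap: obtain and release a kernel virtual address for a
 * buffer's data.  Non-cluster buffers simply use b_datap; cluster
 * (UPL-backed) buffers are mapped in with ubc_upl_map() and torn down
 * with ubc_upl_unmap().
 */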
errno_t
buf_map(buf_t bp, caddr_t *io_addr)
{
        buf_t		real_bp;
        vm_offset_t	vaddr;
        kern_return_t	kret;

        if ( !(bp->b_flags & B_CLUSTER)) {
	        *io_addr = (caddr_t)bp->b_datap;
		return (0);
	}
	real_bp = (buf_t)(bp->b_real_bp);

	if (real_bp && real_bp->b_datap) {
	        /*
		 * b_real_bp is only valid if B_CLUSTER is SET
		 * if it's non-zero, then someone did a cluster_bp call
		 * if the backing physical pages were already mapped
		 * in before the call to cluster_bp (non-zero b_datap),
		 * then we just use that mapping
		 */
	        *io_addr = (caddr_t)real_bp->b_datap;
		return (0);
	}
	kret = ubc_upl_map(bp->b_upl, &vaddr);    /* Map it in */

	if (kret != KERN_SUCCESS) {
	        *io_addr = NULL;

	        return(ENOMEM);
	}
	vaddr += bp->b_uploffset;

	*io_addr = (caddr_t)vaddr;

	return (0);
}

errno_t
buf_unmap(buf_t bp)
{
        buf_t		real_bp;
        kern_return_t	kret;

        if ( !(bp->b_flags & B_CLUSTER))
	        return (0);
	/*
	 * see buf_map for the explanation
	 */
	real_bp = (buf_t)(bp->b_real_bp);

	if (real_bp && real_bp->b_datap)
	        return (0);

	if ((bp->b_lflags & BL_IOBUF) &&
	    ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
	        /*
		 * ignore pageins... the 'right' thing will
		 * happen due to the way we handle speculative
		 * clusters...
		 *
		 * when we commit these pages, we'll hit
		 * it with UPL_COMMIT_INACTIVE which
		 * will clear the reference bit that got
		 * turned on when we touched the mapping
		 */
	        bp->b_flags |= B_AGE;
	}
	kret = ubc_upl_unmap(bp->b_upl);

	if (kret != KERN_SUCCESS)
	        return (EINVAL);
	return (0);
}


void
buf_clear(buf_t bp) {
        caddr_t baddr;
  
        if (buf_map(bp, &baddr) == 0) {
	        bzero(baddr, bp->b_bcount);
		buf_unmap(bp);
	}
	bp->b_resid = 0;
}

/*
 * Read or write a buffer that is not contiguous on disk.
 * buffer is marked done/error at the conclusion
 */
static int
buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
{
	vnode_t	vp = buf_vnode(bp);
	buf_t	io_bp;			 /* For reading or writing a single block */
	int	io_direction;
	int	io_resid;
	size_t	io_contig_bytes;
        daddr64_t io_blkno;
	int	error = 0;
	int	bmap_flags;

	/*
	 * save our starting point... the bp was already mapped
	 * in buf_strategy before we got called
	 * no sense doing it again.
	 */
	io_blkno = bp->b_blkno;
	/*
	 * Make sure we redo this mapping for the next I/O
	 * i.e. this can never be a 'permanent' mapping
	 */
	bp->b_blkno = bp->b_lblkno;

	/*
	 * Get an io buffer to do the deblocking
	 */
	io_bp = alloc_io_buf(devvp, 0);

	io_bp->b_lblkno = bp->b_lblkno;
	io_bp->b_datap  = bp->b_datap;
	io_resid	= bp->b_bcount;
        io_direction	= bp->b_flags & B_READ;
	io_contig_bytes = contig_bytes;
	
	if (bp->b_flags & B_READ)
	        bmap_flags = VNODE_READ;
	else
	        bmap_flags = VNODE_WRITE;

	for (;;) {
		if (io_blkno == -1)
		        /*
			 * this is unexpected, but we'll allow for it
			 */
		        bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
		else {
		        io_bp->b_bcount	 = io_contig_bytes;
			io_bp->b_bufsize = io_contig_bytes;
			io_bp->b_resid   = io_contig_bytes;
			io_bp->b_blkno   = io_blkno;

			buf_reset(io_bp, io_direction);

			/*
			 * Call the device to do the I/O and wait for it.  Make sure the appropriate party is charged for write
			 */

			if (!ISSET(bp->b_flags, B_READ))
			        OSAddAtomic(1, &devvp->v_numoutput);

			if ((error = VNOP_STRATEGY(io_bp)))
			        break;
			if ((error = (int)buf_biowait(io_bp)))
			        break;
			if (io_bp->b_resid) {
			        io_resid -= (io_contig_bytes - io_bp->b_resid);
				break;
			}
		}
		if ((io_resid -= io_contig_bytes) == 0)
		        break;
		f_offset       += io_contig_bytes;
		io_bp->b_datap += io_contig_bytes;

		/*
		 * Map the current position to a physical block number
		 */
		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
		        break;
	}
	buf_free(io_bp);

	if (error)
	        buf_seterror(bp, error);
	bp->b_resid = io_resid;
	/*
	 * This I/O is now complete
	 */
	buf_biodone(bp);

	return error;
}


/*
 * struct vnop_strategy_args {
 *      struct buf *a_bp;
 * } *ap;
 */
errno_t
buf_strategy(vnode_t devvp, void *ap)
{
        buf_t	bp = ((struct vnop_strategy_args *)ap)->a_bp;
	vnode_t	vp = bp->b_vp;
	int	bmap_flags;
        errno_t error;
#if CONFIG_DTRACE
	int dtrace_io_start_flag = 0;	 /* We only want to trip the io:::start
					  * probe once, with the true physical
					  * block in place (b_blkno)
					  */
#endif	

        if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
	        panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
	/*
	 * associate the physical device with
	 * this buf_t even if we don't
	 * end up issuing the I/O...
	 */
	bp->b_dev = devvp->v_rdev;

	if (bp->b_flags & B_READ)
	        bmap_flags = VNODE_READ;
	else
	        bmap_flags = VNODE_WRITE;

        if ( !(bp->b_flags & B_CLUSTER)) {

	        if ( (bp->b_upl) ) {
		        /*
			 * we have a UPL associated with this bp
			 * go through cluster_bp which knows how
			 * to deal with filesystem block sizes
			 * that aren't equal to the page size
			 */
			DTRACE_IO1(start, buf_t, bp);
		        return (cluster_bp(bp));
		}
		if (bp->b_blkno == bp->b_lblkno) {
			off_t	f_offset;
			size_t	contig_bytes;
		  
			if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
				DTRACE_IO1(start, buf_t, bp);
			        buf_seterror(bp, error);
				buf_biodone(bp);

				return (error);
			}

			if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
				DTRACE_IO1(start, buf_t, bp);
			        buf_seterror(bp, error);
				buf_biodone(bp);

			        return (error);
			}

			DTRACE_IO1(start, buf_t, bp);
#if CONFIG_DTRACE
			dtrace_io_start_flag = 1;
#endif /* CONFIG_DTRACE */			

			if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
				/* Set block number to force biodone later */
				bp->b_blkno = -1;
			        buf_clear(bp);
			}
			else if ((long)contig_bytes < bp->b_bcount) {
			        return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
			}
		}

#if CONFIG_DTRACE
		if (dtrace_io_start_flag == 0) {
			DTRACE_IO1(start, buf_t, bp);
			dtrace_io_start_flag = 1;
		}
#endif /* CONFIG_DTRACE */
		
		if (bp->b_blkno == -1) {
		        buf_biodone(bp);
			return (0);
		}
	}

#if CONFIG_DTRACE
	if (dtrace_io_start_flag == 0)
		DTRACE_IO1(start, buf_t, bp);
#endif /* CONFIG_DTRACE */
	
#if CONFIG_PROTECT
	/* Capture f_offset in the bufattr*/
	cpx_t cpx = bufattr_cpx(buf_attr(bp));
	if (cpx) {
		/* No need to go here for older EAs */
		if(cpx_use_offset_for_iv(cpx) && !cpx_synthetic_offset_for_iv(cpx)) {
			off_t f_offset;
			if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
				return error;

			/*
			 * Attach the file offset to this buffer. The
			 * bufattr attributes will be passed down the stack
			 * until they reach the storage driver (whether
			 * IOFlashStorage, ASP, or IONVMe). The driver
			 * will retain the offset in a local variable when it
			 * issues its I/Os to the NAND controller.
			 *
			 * Note that LwVM may end up splitting this I/O
			 * into sub-I/Os if it crosses a chunk boundary. In this
			 * case, LwVM will update this field when it dispatches
			 * each I/O to IOFlashStorage.  But from our perspective
			 * we have only issued a single I/O.
			 *
			 * In the case of APFS we do not bounce through another
			 * intermediate layer (such as CoreStorage). APFS will
			 * issue the I/Os directly to the block device / IOMedia
			 * via buf_strategy on the specfs node.
			 */
			buf_setcpoff(bp, f_offset);
			CP_DEBUG((CPDBG_OFFSET_IO | DBG_FUNC_NONE), (uint32_t) f_offset, (uint32_t) bp->b_lblkno, (uint32_t) bp->b_blkno, (uint32_t) bp->b_bcount, 0);
		}
	}
#endif

	/*
	 * we can issue the I/O because...
	 * either B_CLUSTER is set which
	 * means that the I/O is properly set
	 * up to be a multiple of the page size, or
	 * we were able to successfully set up the
	 * physical block mapping
	 */
	error = VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap);
	DTRACE_FSINFO(strategy, vnode_t, vp);
	return (error);
}



buf_t
buf_alloc(vnode_t vp)
{
	return(alloc_io_buf(vp, is_vm_privileged()));
}

void
buf_free(buf_t bp) {

        free_io_buf(bp);
}

/*
 * iterate buffers for the specified vp. 
 *   if BUF_SCAN_DIRTY is set, do the dirty list
 *   if BUF_SCAN_CLEAN is set, do the clean list
 *   if neither flag is set, default to BUF_SCAN_DIRTY
 *   if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
 */

struct buf_iterate_info_t {
	int flag;
	struct buflists *listhead;
};

void
buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
{
	buf_t	bp;
	int	retval;
	struct	buflists local_iterblkhd;
	int	lock_flags = BAC_NOWAIT | BAC_REMOVE;
	int	notify_busy = flags & BUF_NOTIFY_BUSY;
	struct buf_iterate_info_t list[2];
	int	num_lists, i;

	if (flags & BUF_SKIP_LOCKED)
	        lock_flags |= BAC_SKIP_LOCKED;
	if (flags & BUF_SKIP_NONLOCKED)
	        lock_flags |= BAC_SKIP_NONLOCKED;

	if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN)))
	        flags |= BUF_SCAN_DIRTY;

	num_lists = 0;

	if (flags & BUF_SCAN_DIRTY) {
	        list[num_lists].flag = VBI_DIRTY;
		list[num_lists].listhead = &vp->v_dirtyblkhd;
		num_lists++;
	}
	if (flags & BUF_SCAN_CLEAN) {
		list[num_lists].flag = VBI_CLEAN;
		list[num_lists].listhead = &vp->v_cleanblkhd;
		num_lists++;
	}

	for (i = 0; i < num_lists; i++) {
		lck_mtx_lock(buf_mtxp);
	
		if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag))  {
			lck_mtx_unlock(buf_mtxp);
			continue;
		}
		while (!LIST_EMPTY(&local_iterblkhd)) {
			bp = LIST_FIRST(&local_iterblkhd);
			LIST_REMOVE(bp, b_vnbufs);
			LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);

			if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
				if (notify_busy) {
					bp = NULL;
				} else {
					continue;
				}
			}

			lck_mtx_unlock(buf_mtxp);

			retval = callout(bp, arg);

			switch (retval) {
			case BUF_RETURNED:
				if (bp)
					buf_brelse(bp);
				break;
			case BUF_CLAIMED:
				break;
			case BUF_RETURNED_DONE:
				if (bp)
					buf_brelse(bp);
				lck_mtx_lock(buf_mtxp);
				goto out;
			case BUF_CLAIMED_DONE:
				lck_mtx_lock(buf_mtxp);
				goto out;
			}
			lck_mtx_lock(buf_mtxp);
		} /* while list has more nodes */
	  out:
		buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
		lck_mtx_unlock(buf_mtxp);
	} /* for each list */
} /* buf_iterate */


/*
 * Flush out and invalidate all buffers associated with a vnode.
 */
int
buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
{
	buf_t	bp;
	int	aflags;
	int	error = 0;
	int	must_rescan = 1;
	struct	buflists local_iterblkhd;


	if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
		return (0);

	lck_mtx_lock(buf_mtxp);

	for (;;) {
		if (must_rescan == 0)
		        /*
			 * the lists may not be empty, but all that's left at this
			 * point are metadata or B_LOCKED buffers which are being
			 * skipped... we know this because we made it through both
			 * the clean and dirty lists without dropping buf_mtxp...
			 * each time we drop buf_mtxp we bump "must_rescan"
			 */
		        break;
		if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
		        break;
		must_rescan = 0;
		/*
		 * iterate the clean list
		 */
		if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
		        goto try_dirty_list;
		}
		while (!LIST_EMPTY(&local_iterblkhd)) {

			bp = LIST_FIRST(&local_iterblkhd);

			LIST_REMOVE(bp, b_vnbufs);
			LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);

			/*
			 * some filesystems distinguish meta data blocks with a negative logical block #
			 */
			if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
				continue;

			aflags = BAC_REMOVE;

			if ( !(flags & BUF_INVALIDATE_LOCKED) )
				aflags |= BAC_SKIP_LOCKED;

			if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
			        if (error == EDEADLK)
				        /*	
					 * this buffer was marked B_LOCKED... 
					 * we didn't drop buf_mtxp, so we
					 * don't need to rescan
					 */
				        continue;
			        if (error == EAGAIN) {
				        /*
					 * found a busy buffer... we blocked and
					 * dropped buf_mtxp, so we're going to
					 * need to rescan after this pass is completed
					 */
				        must_rescan++;
				        continue;
				}
				/*
				 * got some kind of 'real' error out of the msleep
				 * in buf_acquire_locked, terminate the scan and return the error
				 */
				buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);

				lck_mtx_unlock(buf_mtxp);
				return (error);
			}
			lck_mtx_unlock(buf_mtxp);

			if (bp->b_flags & B_LOCKED)
				KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);

			CLR(bp->b_flags, B_LOCKED);
			SET(bp->b_flags, B_INVAL);
			buf_brelse(bp);

			lck_mtx_lock(buf_mtxp);

			/*
			 * by dropping buf_mtxp, we allow new
			 * buffers to be added to the vnode list(s)
			 * we'll have to rescan at least once more
			 * if the queues aren't empty
			 */
			must_rescan++;
		}
		buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);

try_dirty_list:
		/*
		 * Now iterate on dirty blks
		 */
		if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
			continue;
		}
		while (!LIST_EMPTY(&local_iterblkhd)) {
			bp = LIST_FIRST(&local_iterblkhd);

			LIST_REMOVE(bp, b_vnbufs);
			LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);

			/*
			 * some filesystems distinguish meta data blocks with a negative logical block #
			 */
			if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
				continue;

			aflags = BAC_REMOVE;

			if ( !(flags & BUF_INVALIDATE_LOCKED) )
				aflags |= BAC_SKIP_LOCKED;

			if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
			        if (error == EDEADLK)
				        /*	
					 * this buffer was marked B_LOCKED... 
					 * we didn't drop buf_mtxp, so we
					 * don't need to rescan
					 */
				        continue;
			        if (error == EAGAIN) {
				        /*
					 * found a busy buffer... we blocked and
					 * dropped buf_mtxp, so we're going to
					 * need to rescan after this pass is completed
					 */
				        must_rescan++;
				        continue;
				}
				/*
				 * got some kind of 'real' error out of the msleep
				 * in buf_acquire_locked, terminate the scan and return the error
				 */
				buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);

				lck_mtx_unlock(buf_mtxp);
				return (error);
			}
			lck_mtx_unlock(buf_mtxp);

			if (bp->b_flags & B_LOCKED)
				KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);

			CLR(bp->b_flags, B_LOCKED);
			SET(bp->b_flags, B_INVAL);

			if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
				(void) VNOP_BWRITE(bp);
			else
				buf_brelse(bp);

			lck_mtx_lock(buf_mtxp);
			/*
			 * by dropping buf_mtxp, we allow new
			 * buffers to be added to the vnode list(s)
			 * we'll have to rescan at least once more
			 * if the queues aren't empty
			 */
			must_rescan++;
		}
		buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
	}
	lck_mtx_unlock(buf_mtxp);

	return (0);
}

void
buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {

	(void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg);
	return;
}

int
buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) {
	buf_t	bp;
	int	writes_issued = 0;
	errno_t	error;
	int	busy = 0;
	struct	buflists local_iterblkhd;
	int	lock_flags = BAC_NOWAIT | BAC_REMOVE;
	int any_locked = 0;

	if (flags & BUF_SKIP_LOCKED)
	        lock_flags |= BAC_SKIP_LOCKED;
	if (flags & BUF_SKIP_NONLOCKED)
	        lock_flags |= BAC_SKIP_NONLOCKED;
loop:
	lck_mtx_lock(buf_mtxp);

	if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0)  {
	        while (!LIST_EMPTY(&local_iterblkhd)) {
			bp = LIST_FIRST(&local_iterblkhd);
			LIST_REMOVE(bp, b_vnbufs);
			LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);

			if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY)  {
				busy++;
			}
			if (error) {
				/* 
				 * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED,
				 * we may want to do something differently if a locked or unlocked
				 * buffer was encountered (depending on the arg specified).
				 * In this case, we know that one of those two was set, and the
				 * buf acquisition failed above.  
				 * 
				 * If it failed with EDEADLK, then save state which can be emitted
				 * later on to the caller.  Most callers should not care.
				 */
				if (error == EDEADLK) {
					any_locked++;
				}
				continue;
			}
			lck_mtx_unlock(buf_mtxp);

			bp->b_flags &= ~B_LOCKED;

			/*
			 * Wait for I/O associated with indirect blocks to complete,
			 * since there is no way to quickly wait for them below.
			 */
			if ((bp->b_vp == vp) || (wait == 0))
			        (void) buf_bawrite(bp);
			else
			        (void) VNOP_BWRITE(bp);
			writes_issued++;

			lck_mtx_lock(buf_mtxp);
		}
		buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
	}
	lck_mtx_unlock(buf_mtxp);

	if (wait) {
	        (void)vnode_waitforwrites(vp, 0, 0, 0, msg);

		if (vp->v_dirtyblkhd.lh_first && busy) {
		        /*
			 * we had one or more BUSY buffers on
			 * the dirtyblock list... most likely
			 * these are due to delayed writes that
			 * were moved to the bclean queue but
			 * have not yet been 'written'.
			 * if we issued some writes on the
			 * previous pass, we try again immediately
			 * if we didn't, we'll sleep for some time
			 * to allow the state to change...
			 */
		        if (writes_issued == 0) {
			        (void)tsleep((caddr_t)&vp->v_numoutput,
					     PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
			}
			writes_issued = 0;
			busy = 0;

			goto loop;
		}
	}

	return any_locked;
}


/*
 * called with buf_mtxp held...
 * this lock protects the queue manipulation
 */
static int
buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
{
	struct buflists * listheadp;

	if (flags & VBI_DIRTY)
		listheadp = &vp->v_dirtyblkhd;
	else
		listheadp = &vp->v_cleanblkhd;

	while (vp->v_iterblkflags & VBI_ITER) 	{
	        vp->v_iterblkflags |= VBI_ITERWANT;
		msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);	
	}
	if (LIST_EMPTY(listheadp)) {
	        LIST_INIT(iterheadp);
		return(EINVAL);
	}
	vp->v_iterblkflags |= VBI_ITER;

	iterheadp->lh_first = listheadp->lh_first;
	listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;	
	LIST_INIT(listheadp);

	return(0);
}

/*
 * called with buf_mtxp held...
 * this lock protects the queue manipulation
 */
static void
buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
{
	struct buflists * listheadp;
	buf_t bp;

	if (flags & VBI_DIRTY)
		listheadp = &vp->v_dirtyblkhd;
	else
		listheadp = &vp->v_cleanblkhd;

	while (!LIST_EMPTY(iterheadp)) {
		bp = LIST_FIRST(iterheadp);
		LIST_REMOVE(bp, b_vnbufs);
		LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
	}
	vp->v_iterblkflags &= ~VBI_ITER;

	if  (vp->v_iterblkflags & VBI_ITERWANT) 	{
		vp->v_iterblkflags &= ~VBI_ITERWANT;
		wakeup(&vp->v_iterblkflags);
	}
}

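/*
 * bremfree_locked: pull a buffer off whatever free queue it currently
 * occupies; the caller holds buf_mtxp.  A b_whichq of -1 with live shadow
 * references means the buffer was never put back on a freelist, so there
 * is nothing to remove.
 */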
static void
bremfree_locked(buf_t bp)
{
	struct bqueues *dp = NULL;
	int whichq;

	whichq = bp->b_whichq;

	if (whichq == -1) {
		if (bp->b_shadow_ref == 0)
			panic("bremfree_locked: %p not on freelist", bp);
		/*
		 * there are clones pointing to 'bp'...
		 * therefore, it was not put on a freelist
		 * when buf_brelse was last called on 'bp'
		 */
		return;
	}
	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 *
	 * NB: This makes an assumption about how tailq's are implemented.
	 */
	if (bp->b_freelist.tqe_next == NULL) {
	        dp = &bufqueues[whichq];

		if (dp->tqh_last != &bp->b_freelist.tqe_next)
			panic("bremfree: lost tail");
	}
	TAILQ_REMOVE(dp, bp, b_freelist);

	if (whichq == BQ_LAUNDRY)
	        blaundrycnt--;

	bp->b_whichq = -1;
	bp->b_timestamp = 0; 
	bp->b_shadow = 0;
}

/*
 * Associate a buffer with a vnode.
 * buf_mtxp must be locked on entry
 */
static void
bgetvp_locked(vnode_t vp, buf_t bp)
{

	if (bp->b_vp != vp)
		panic("bgetvp_locked: not free");

	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
}

/*
 * Disassociate a buffer from a vnode.
 * buf_mtxp must be locked on entry
 */
static void
brelvp_locked(buf_t bp)
{
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);

	bp->b_vp = (vnode_t)NULL;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
static void
buf_reassign(buf_t bp, vnode_t newvp)
{
	struct buflists *listheadp;

	if (newvp == NULL) {
		printf("buf_reassign: NULL");
		return;
	}
	lck_mtx_lock_spin(buf_mtxp);

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if (ISSET(bp->b_flags, B_DELWRI))
		listheadp = &newvp->v_dirtyblkhd;
	else
		listheadp = &newvp->v_cleanblkhd;
	bufinsvn(bp, listheadp);

	lck_mtx_unlock(buf_mtxp);
}

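/*
 * bufhdrinit/bufinit: one-time initialization of the buffer cache.  Each
 * buffer header is zeroed and queued (BQ_EMPTY for the regular headers,
 * iobufqueue for the cluster-I/O headers), the hash table, lock group and
 * mutexes are created, the metadata zones are set up, the buffer laundry
 * thread is started and the VM pressure callout is registered.
 */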
1937static __inline__ void
1938bufhdrinit(buf_t bp)
55e303ae 1939{
91447636
A
1940 bzero((char *)bp, sizeof *bp);
1941 bp->b_dev = NODEV;
1942 bp->b_rcred = NOCRED;
1943 bp->b_wcred = NOCRED;
1944 bp->b_vnbufs.le_next = NOLIST;
1945 bp->b_flags = B_INVAL;
1946
1947 return;
55e303ae
A
1948}
1949
1950/*
91447636 1951 * Initialize buffers and hash links for buffers.
55e303ae 1952 */
91447636 1953__private_extern__ void
2d21ac55 1954bufinit(void)
55e303ae 1955{
91447636
A
1956 buf_t bp;
1957 struct bqueues *dp;
1958 int i;
91447636 1959
2d21ac55 1960 nbuf_headers = 0;
91447636
A
1961 /* Initialize the buffer queues ('freelists') and the hash table */
1962 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1963 TAILQ_INIT(dp);
0c530ab8 1964 bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
91447636 1965
b0d623f7
A
1966 buf_busycount = 0;
1967
91447636 1968 /* Initialize the buffer headers */
0c530ab8 1969 for (i = 0; i < max_nbuf_headers; i++) {
2d21ac55
A
1970 nbuf_headers++;
1971 bp = &buf_headers[i];
91447636
A
1972 bufhdrinit(bp);
1973
91447636 1974 BLISTNONE(bp);
2d21ac55
A
1975 dp = &bufqueues[BQ_EMPTY];
1976 bp->b_whichq = BQ_EMPTY;
1977 bp->b_timestamp = buf_timestamp();
1978 binsheadfree(bp, dp, BQ_EMPTY);
91447636
A
1979 binshash(bp, &invalhash);
1980 }
2d21ac55 1981 boot_nbuf_headers = nbuf_headers;
6d2010ae
A
1982
1983 TAILQ_INIT(&iobufqueue);
1984 TAILQ_INIT(&delaybufqueue);
1985
2d21ac55
A
1986 for (; i < nbuf_headers + niobuf_headers; i++) {
1987 bp = &buf_headers[i];
91447636 1988 bufhdrinit(bp);
2d21ac55 1989 bp->b_whichq = -1;
91447636
A
1990 binsheadfree(bp, &iobufqueue, -1);
1991 }
1992
2d21ac55 1993 /*
91447636
A
1994 * allocate lock group attribute and group
1995 */
2d21ac55 1996 buf_mtx_grp_attr = lck_grp_attr_alloc_init();
91447636
A
1997 buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1998
1999 /*
2000 * allocate the lock attribute
2001 */
2002 buf_mtx_attr = lck_attr_alloc_init();
91447636
A
2003
2004 /*
2005 * allocate and initialize mutex's for the buffer and iobuffer pools
2006 */
2007 buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
2008 iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
813fb2f6 2009 buf_gc_callout = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
91447636
A
2010
2011 if (iobuffer_mtxp == NULL)
2012 panic("couldn't create iobuffer mutex");
2013
2014 if (buf_mtxp == NULL)
2015 panic("couldn't create buf mutex");
2016
813fb2f6
A
2017 if (buf_gc_callout == NULL)
2018 panic("couldn't create buf_gc_callout mutex");
2019
91447636
A
2020 /*
2021 * allocate and initialize cluster specific global locks...
2022 */
2023 cluster_init();
2024
2025 printf("using %d buffer headers and %d cluster IO buffer headers\n",
2d21ac55 2026 nbuf_headers, niobuf_headers);
91447636
A
2027
2028 /* Set up zones used by the buffer cache */
2029 bufzoneinit();
2030
2031 /* start the bcleanbuf() thread */
2032 bcleanbuf_thread_init();
2033
b0d623f7
A
2034 /* Register a callout for relieving vm pressure */
2035 if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
2036 panic("Couldn't register buffer cache callout for vm pressure!\n");
2037 }
2038
91447636
A
2039}
2040
2d21ac55
A
2041/*
2042 * Zones for the meta data buffers
2043 */
2044
2045#define MINMETA 512
813fb2f6 2046#define MAXMETA 16384
2d21ac55
A
2047
2048struct meta_zone_entry {
2049 zone_t mz_zone;
2050 vm_size_t mz_size;
2051 vm_size_t mz_max;
2052 const char *mz_name;
2053};
2054
2055struct meta_zone_entry meta_zones[] = {
2056 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2057 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
2058 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
2059 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2060 {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
813fb2f6 2061 {NULL, (MINMETA * 32), 512 * (MINMETA * 32), "buf.16384" },
2d21ac55
A
2062 {NULL, 0, 0, "" } /* End */
2063};
2064
2065/*
2066 * Initialize the meta data zones
2067 */
2068static void
2069bufzoneinit(void)
2070{
2071 int i;
2072
2073 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2074 meta_zones[i].mz_zone =
2075 zinit(meta_zones[i].mz_size,
2076 meta_zones[i].mz_max,
2077 PAGE_SIZE,
2078 meta_zones[i].mz_name);
6d2010ae 2079 zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE);
2d21ac55
A
2080 }
2081 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
6d2010ae 2082 zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE);
2d21ac55
A
2083}
2084
2085static __inline__ zone_t
2086getbufzone(size_t size)
2087{
2088 int i;
2089
2090 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2091 panic("getbufzone: incorect size = %lu", size);
2092
2093 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2094 if (meta_zones[i].mz_size >= size)
2095 break;
2096 }
2097
2098 return (meta_zones[i].mz_zone);
2099}
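
/*
 * Illustrative sketch (not part of the original source): how a metadata
 * allocation maps onto the zones above.  allocbuf() rounds the request up
 * to a MINMETA multiple and getbufzone() returns the first zone whose
 * element size is at least that large, so the element handed back can be
 * bigger than the rounded request.  The example_* name is hypothetical.
 */
#if 0	/* example only */
static void
example_meta_zone_lookup(void)
{
	int	nsize = roundup(1536, MINMETA);	/* 1536 is already a 512-byte multiple */
	zone_t	z = getbufzone(nsize);		/* first zone with mz_size >= 1536 is "buf.2048" */
	void	*elem = zalloc(z);		/* a 2048-byte element backs the 1536-byte request */

	/* ... use elem as metadata storage ... */

	zfree(z, elem);
}
#endif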
2100
2101
2102
91447636 2103static struct buf *
b0d623f7 2104bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
91447636
A
2105{
2106 buf_t bp;
2107
2108 bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
2109
2110 /*
2111 * If buffer does not have data valid, start a read.
2112 * Note that if buffer is B_INVAL, buf_getblk() won't return it.
 2113 * Therefore, it's valid if its I/O has completed or been delayed.
2114 */
2115 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
2116 struct proc *p;
2117
2118 p = current_proc();
2119
2120 /* Start I/O for the buffer (keeping credentials). */
2121 SET(bp->b_flags, B_READ | async);
0c530ab8 2122 if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
91447636
A
2123 kauth_cred_ref(cred);
2124 bp->b_rcred = cred;
2125 }
2126
2127 VNOP_STRATEGY(bp);
2128
2129 trace(TR_BREADMISS, pack(vp, size), blkno);
2130
2131 /* Pay for the read. */
39236c6e 2132 if (p && p->p_stats) {
b0d623f7 2133 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock); /* XXX */
39236c6e 2134 }
91447636
A
2135
2136 if (async) {
2137 /*
2138 * since we asked for an ASYNC I/O
2139 * the biodone will do the brelse
2140 * we don't want to pass back a bp
2141 * that we don't 'own'
2142 */
2143 bp = NULL;
2144 }
2145 } else if (async) {
2146 buf_brelse(bp);
2147 bp = NULL;
2148 }
2149
2150 trace(TR_BREADHIT, pack(vp, size), blkno);
2151
2152 return (bp);
55e303ae
A
2153}
2154
2155/*
91447636 2156 * Perform the reads for buf_breadn() and buf_meta_breadn().
55e303ae
A
2157 * Trivial modification to the breada algorithm presented in Bach (p.55).
2158 */
91447636
A
2159static errno_t
2160do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
b0d623f7 2161 int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
1c79356b 2162{
91447636
A
2163 buf_t bp;
2164 int i;
1c79356b 2165
55e303ae 2166 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
1c79356b
A
2167
2168 /*
2169 * For each of the read-ahead blocks, start a read, if necessary.
2170 */
2171 for (i = 0; i < nrablks; i++) {
2172 /* If it's in the cache, just go on to next one. */
2173 if (incore(vp, rablks[i]))
2174 continue;
2175
2176 /* Get a buffer for the read-ahead block */
55e303ae 2177 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
1c79356b
A
2178 }
2179
2180 /* Otherwise, we had to start a read for it; wait until it's valid. */
91447636 2181 return (buf_biowait(bp));
1c79356b
A
2182}
2183
91447636 2184
1c79356b 2185/*
91447636
A
2186 * Read a disk block.
2187 * This algorithm described in Bach (p.54).
1c79356b 2188 */
91447636 2189errno_t
b0d623f7 2190buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
91447636
A
2191{
2192 buf_t bp;
2193
2194 /* Get buffer for block. */
2195 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
2196
2197 /* Wait for the read to complete, and return result. */
2198 return (buf_biowait(bp));
2199}
2200
2201/*
2202 * Read a disk block. [bread() for meta-data]
2203 * This algorithm described in Bach (p.54).
2204 */
2205errno_t
b0d623f7 2206buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
91447636
A
2207{
2208 buf_t bp;
2209
2210 /* Get buffer for block. */
2211 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
2212
2213 /* Wait for the read to complete, and return result. */
2214 return (buf_biowait(bp));
2215}
2216
2217/*
2218 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2219 */
2220errno_t
b0d623f7 2221buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
1c79356b 2222{
91447636
A
2223 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
2224}
1c79356b 2225
91447636
A
2226/*
2227 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2228 * [buf_breadn() for meta-data]
2229 */
2230errno_t
b0d623f7 2231buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
91447636
A
2232{
2233 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
1c79356b
A
2234}
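
/*
 * Illustrative sketch (not part of the original source): a typical caller
 * pattern for the synchronous read paths above.  The vnode, credential and
 * block geometry are assumed to come from the calling filesystem, the
 * example_* name is hypothetical, and buf_dataptr() refers to the sys/buf.h
 * accessor for the mapped data.
 */
#if 0	/* example only */
static errno_t
example_read_block(vnode_t vp, daddr64_t blkno, int blksize, kauth_cred_t cred)
{
	buf_t		bp;
	errno_t		error;
	daddr64_t	rablks[1]  = { blkno + 1 };	/* hint: read ahead the next logical block */
	int		rasizes[1] = { blksize };

	/* synchronous read of 'blkno', asynchronous read-ahead of 'blkno + 1' */
	error = buf_breadn(vp, blkno, blksize, rablks, rasizes, 1, cred, &bp);
	if (error) {
		buf_brelse(bp);			/* the buffer is returned even on error */
		return (error);
	}
	/* ... inspect the block contents via buf_dataptr(bp) ... */

	buf_brelse(bp);				/* leave it cached for the next reader */
	return (0);
}
#endif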
2235
2236/*
2237 * Block write. Described in Bach (p.56)
2238 */
91447636
A
2239errno_t
2240buf_bwrite(buf_t bp)
1c79356b 2241{
91447636
A
2242 int sync, wasdelayed;
2243 errno_t rv;
2244 proc_t p = current_proc();
2245 vnode_t vp = bp->b_vp;
1c79356b 2246
91447636 2247 if (bp->b_datap == 0) {
55e303ae
A
2248 if (brecover_data(bp) == 0)
2249 return (0);
2250 }
1c79356b
A
2251 /* Remember buffer type, to switch on it later. */
2252 sync = !ISSET(bp->b_flags, B_ASYNC);
2253 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
2254 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
91447636
A
2255
2256 if (wasdelayed)
b0d623f7 2257 OSAddAtomicLong(-1, &nbdwrite);
1c79356b
A
2258
2259 if (!sync) {
2260 /*
2261 * If not synchronous, pay for the I/O operation and make
2262 * sure the buf is on the correct vnode queue. We have
2263 * to do this now, because if we don't, the vnode may not
2264 * be properly notified that its I/O has completed.
2265 */
2266 if (wasdelayed)
91447636 2267 buf_reassign(bp, vp);
39236c6e
A
2268 else
2269 if (p && p->p_stats) {
2270 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
39236c6e 2271 }
1c79356b 2272 }
d52fe63f 2273 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
1c79356b
A
2274
2275 /* Initiate disk write. Make sure the appropriate party is charged. */
91447636
A
2276
2277 OSAddAtomic(1, &vp->v_numoutput);
1c79356b 2278
91447636 2279 VNOP_STRATEGY(bp);
1c79356b
A
2280
2281 if (sync) {
2282 /*
2283 * If I/O was synchronous, wait for it to complete.
2284 */
91447636 2285 rv = buf_biowait(bp);
1c79356b
A
2286
2287 /*
2288 * Pay for the I/O operation, if it's not been paid for, and
 2289 * make sure it's on the correct vnode queue. (async operations
 2290 * were paid for above.)
2291 */
2292 if (wasdelayed)
91447636 2293 buf_reassign(bp, vp);
1c79356b 2294 else
39236c6e
A
2295 if (p && p->p_stats) {
2296 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
39236c6e 2297 }
1c79356b
A
2298
2299 /* Release the buffer. */
39037602 2300 buf_brelse(bp);
1c79356b
A
2301
2302 return (rv);
2303 } else {
2304 return (0);
2305 }
2306}
2307
2308int
2d21ac55 2309vn_bwrite(struct vnop_bwrite_args *ap)
1c79356b 2310{
91447636 2311 return (buf_bwrite(ap->a_bp));
1c79356b
A
2312}
2313
2314/*
2315 * Delayed write.
2316 *
2317 * The buffer is marked dirty, but is not queued for I/O.
2318 * This routine should be used when the buffer is expected
2319 * to be modified again soon, typically a small write that
2320 * partially fills a buffer.
2321 *
2322 * NB: magnetic tapes cannot be delayed; they must be
2323 * written in the order that the writes are requested.
2324 *
2325 * Described in Leffler, et al. (pp. 208-213).
d52fe63f 2326 *
b0d623f7 2327 * Note: With the ability to allocate additional buffer
d52fe63f 2328 * headers, we can get in to the situation where "too" many
91447636
A
 2329 * buf_bdwrite()s can create a situation where the kernel creates
 2330 * buffers faster than the disks can service them. Doing a buf_bawrite() in
6d2010ae 2331 * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
1c79356b 2332 */
39037602 2333int
91447636 2334bdwrite_internal(buf_t bp, int return_error)
1c79356b 2335{
91447636
A
2336 proc_t p = current_proc();
2337 vnode_t vp = bp->b_vp;
1c79356b
A
2338
2339 /*
2340 * If the block hasn't been seen before:
2341 * (1) Mark it as having been seen,
2342 * (2) Charge for the write.
2343 * (3) Make sure it's on its vnode's correct block list,
2344 */
2345 if (!ISSET(bp->b_flags, B_DELWRI)) {
2346 SET(bp->b_flags, B_DELWRI);
39236c6e 2347 if (p && p->p_stats) {
b0d623f7 2348 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
39236c6e 2349 }
b0d623f7 2350 OSAddAtomicLong(1, &nbdwrite);
91447636 2351 buf_reassign(bp, vp);
1c79356b
A
2352 }
2353
d52fe63f 2354 /*
91447636
A
 2355 * if we're not LOCKED, but the total number of delayed writes
 2356 * has climbed above 75% of the total buffers in the system,
 2357 * return an error if the caller has indicated that it can
 2358 * handle one in this case; otherwise schedule the I/O now.
 2359 * This is done to prevent us from allocating tons of extra
 2360 * buffers when dealing with virtual disks (i.e. DiskImages),
 2361 * because additional buffers are dynamically allocated to prevent
 2362 * deadlocks from occurring
2363 *
2364 * however, can't do a buf_bawrite() if the LOCKED bit is set because the
2365 * buffer is part of a transaction and can't go to disk until
2366 * the LOCKED bit is cleared.
d52fe63f 2367 */
2d21ac55 2368 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers/4)*3)) {
9bccf70c
A
2369 if (return_error)
2370 return (EAGAIN);
91447636
A
2371 /*
2372 * If the vnode has "too many" write operations in progress
2373 * wait for them to finish the IO
2374 */
2d21ac55 2375 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");
91447636
A
2376
2377 return (buf_bawrite(bp));
d52fe63f
A
2378 }
2379
1c79356b
A
2380 /* Otherwise, the "write" is done, so mark and release the buffer. */
2381 SET(bp->b_flags, B_DONE);
91447636 2382 buf_brelse(bp);
9bccf70c 2383 return (0);
1c79356b
A
2384}
2385
91447636
A
2386errno_t
2387buf_bdwrite(buf_t bp)
9bccf70c 2388{
91447636 2389 return (bdwrite_internal(bp, 0));
9bccf70c
A
2390}
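
/*
 * Illustrative sketch (not part of the original source): choosing between
 * the write paths.  A small update that will likely be modified again soon
 * goes through buf_bdwrite(); a write that must be on its way to the driver
 * before returning goes through buf_bwrite().  The example_* name and the
 * must_commit parameter are hypothetical.
 */
#if 0	/* example only */
static errno_t
example_update_block(vnode_t vp, daddr64_t blkno, int blksize, kauth_cred_t cred, int must_commit)
{
	buf_t	bp;
	errno_t	error;

	if ((error = buf_bread(vp, blkno, blksize, cred, &bp))) {
		buf_brelse(bp);
		return (error);
	}
	/* ... modify a few bytes of the block via buf_dataptr(bp) ... */

	if (must_commit)
		return (buf_bwrite(bp));	/* synchronous: waits for the I/O and releases bp */

	buf_bdwrite(bp);			/* mark dirty and defer the I/O */
	return (0);
}
#endif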
2391
2392
1c79356b 2393/*
91447636 2394 * Asynchronous block write; just an asynchronous buf_bwrite().
d52fe63f
A
2395 *
 2396 * Note: With the ability to allocate additional buffer
 2397 * headers, we can get into the situation where "too" many
91447636 2398 * buf_bawrite()s can create a situation where the kernel creates
d52fe63f
A
 2399 * buffers faster than the disks can service them.
2400 * We limit the number of "in flight" writes a vnode can have to
2401 * avoid this.
1c79356b 2402 */
9bccf70c 2403static int
91447636 2404bawrite_internal(buf_t bp, int throttle)
1c79356b 2405{
91447636 2406 vnode_t vp = bp->b_vp;
d52fe63f
A
2407
2408 if (vp) {
91447636
A
2409 if (throttle)
2410 /*
2411 * If the vnode has "too many" write operations in progress
2412 * wait for them to finish the IO
2413 */
2414 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
2415 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
2416 /*
2417 * return to the caller and
2418 * let him decide what to do
2419 */
2420 return (EWOULDBLOCK);
d52fe63f 2421 }
1c79356b 2422 SET(bp->b_flags, B_ASYNC);
9bccf70c 2423
91447636 2424 return (VNOP_BWRITE(bp));
9bccf70c
A
2425}
2426
91447636
A
2427errno_t
2428buf_bawrite(buf_t bp)
9bccf70c 2429{
91447636 2430 return (bawrite_internal(bp, 1));
1c79356b
A
2431}
2432
91447636 2433
6d2010ae
A
2434
2435static void
2436buf_free_meta_store(buf_t bp)
2437{
2438 if (bp->b_bufsize) {
2439 if (ISSET(bp->b_flags, B_ZALLOC)) {
2440 zone_t z;
2441
2442 z = getbufzone(bp->b_bufsize);
2443 zfree(z, (void *)bp->b_datap);
2444 } else
2445 kmem_free(kernel_map, bp->b_datap, bp->b_bufsize);
2446
2447 bp->b_datap = (uintptr_t)NULL;
2448 bp->b_bufsize = 0;
2449 }
2450}
2451
2452
2453static buf_t
2454buf_brelse_shadow(buf_t bp)
2455{
2456 buf_t bp_head;
2457 buf_t bp_temp;
2458 buf_t bp_return = NULL;
2459#ifdef BUF_MAKE_PRIVATE
2460 buf_t bp_data;
2461 int data_ref = 0;
2462#endif
316670eb
A
2463 int need_wakeup = 0;
2464
6d2010ae
A
2465 lck_mtx_lock_spin(buf_mtxp);
2466
3e170ce0 2467 __IGNORE_WCASTALIGN(bp_head = (buf_t)bp->b_orig);
6d2010ae
A
2468
2469 if (bp_head->b_whichq != -1)
2470 panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq);
2471
2472#ifdef BUF_MAKE_PRIVATE
2473 if (bp_data = bp->b_data_store) {
2474 bp_data->b_data_ref--;
2475 /*
2476 * snapshot the ref count so that we can check it
2477 * outside of the lock... we only want the guy going
2478 * from 1 -> 0 to try and release the storage
2479 */
2480 data_ref = bp_data->b_data_ref;
2481 }
2482#endif
2483 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0);
2484
2485 bp_head->b_shadow_ref--;
2486
2487 for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow);
2488
2489 if (bp_temp == NULL)
2490 panic("buf_brelse_shadow: bp not on list %p", bp_head);
2491
2492 bp_temp->b_shadow = bp_temp->b_shadow->b_shadow;
2493
2494#ifdef BUF_MAKE_PRIVATE
2495 /*
2496 * we're about to free the current 'owner' of the data buffer and
2497 * there is at least one other shadow buf_t still pointing at it
2498 * so transfer it to the first shadow buf left in the chain
2499 */
2500 if (bp == bp_data && data_ref) {
2501 if ((bp_data = bp_head->b_shadow) == NULL)
2502 panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp);
2503
2504 for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow)
2505 bp_temp->b_data_store = bp_data;
2506 bp_data->b_data_ref = data_ref;
2507 }
2508#endif
2509 if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow)
2510 panic("buf_relse_shadow: b_shadow != NULL && b_shadow_ref == 0 bp(%p)", bp);
2511 if (bp_head->b_shadow_ref && bp_head->b_shadow == 0)
2512 panic("buf_relse_shadow: b_shadow == NULL && b_shadow_ref != 0 bp(%p)", bp);
2513
2514 if (bp_head->b_shadow_ref == 0) {
2515 if (!ISSET(bp_head->b_lflags, BL_BUSY)) {
2516
2517 CLR(bp_head->b_flags, B_AGE);
2518 bp_head->b_timestamp = buf_timestamp();
2519
2520 if (ISSET(bp_head->b_flags, B_LOCKED)) {
2521 bp_head->b_whichq = BQ_LOCKED;
2522 binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED);
2523 } else {
2524 bp_head->b_whichq = BQ_META;
2525 binstailfree(bp_head, &bufqueues[BQ_META], BQ_META);
2526 }
2527 } else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) {
2528 CLR(bp_head->b_lflags, BL_WAITSHADOW);
2529
2530 bp_return = bp_head;
2531 }
316670eb
A
2532 if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) {
2533 CLR(bp_head->b_lflags, BL_WANTED_REF);
2534 need_wakeup = 1;
2535 }
6d2010ae
A
2536 }
2537 lck_mtx_unlock(buf_mtxp);
39236c6e
A
2538
2539 if (need_wakeup)
316670eb 2540 wakeup(bp_head);
316670eb 2541
6d2010ae
A
2542#ifdef BUF_MAKE_PRIVATE
2543 if (bp == bp_data && data_ref == 0)
2544 buf_free_meta_store(bp);
2545
2546 bp->b_data_store = NULL;
2547#endif
2548 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0);
2549
2550 return (bp_return);
2551}
2552
2553
1c79356b
A
2554/*
2555 * Release a buffer on to the free lists.
2556 * Described in Bach (p. 46).
2557 */
2558void
91447636 2559buf_brelse(buf_t bp)
1c79356b
A
2560{
2561 struct bqueues *bufq;
91447636
A
2562 long whichq;
2563 upl_t upl;
2564 int need_wakeup = 0;
2565 int need_bp_wakeup = 0;
2566
2567
2568 if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
2d21ac55 2569 panic("buf_brelse: bad buffer = %p\n", bp);
91447636
A
2570
2571#ifdef JOE_DEBUG
b0d623f7 2572 (void) OSBacktrace(&bp->b_stackbrelse[0], 6);
91447636
A
2573
2574 bp->b_lastbrelse = current_thread();
2575 bp->b_tag = 0;
2576#endif
2577 if (bp->b_lflags & BL_IOBUF) {
6d2010ae
A
2578 buf_t shadow_master_bp = NULL;
2579
2580 if (ISSET(bp->b_lflags, BL_SHADOW))
2581 shadow_master_bp = buf_brelse_shadow(bp);
2582 else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC))
2583 buf_free_meta_store(bp);
91447636 2584 free_io_buf(bp);
6d2010ae
A
2585
2586 if (shadow_master_bp) {
2587 bp = shadow_master_bp;
2588 goto finish_shadow_master;
2589 }
91447636
A
2590 return;
2591 }
1c79356b
A
2592
2593 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
b0d623f7 2594 bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap,
fa4905b1 2595 bp->b_flags, 0);
1c79356b
A
2596
2597 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2598
91447636
A
2599 /*
2600 * if we're invalidating a buffer that has the B_FILTER bit
2601 * set then call the b_iodone function so it gets cleaned
2602 * up properly.
2603 *
2604 * the HFS journal code depends on this
2605 */
b4c24cb9 2606 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
91447636
A
2607 if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */
2608 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
6d2010ae 2609 void *arg = bp->b_transaction;
b4c24cb9 2610
91447636 2611 CLR(bp->b_flags, B_FILTER); /* but note callout done */
b4c24cb9 2612 bp->b_iodone = NULL;
91447636 2613 bp->b_transaction = NULL;
b4c24cb9
A
2614
2615 if (iodone_func == NULL) {
2d21ac55 2616 panic("brelse: bp @ %p has NULL b_iodone!\n", bp);
b4c24cb9 2617 }
91447636 2618 (*iodone_func)(bp, arg);
b4c24cb9
A
2619 }
2620 }
91447636
A
2621 /*
2622 * I/O is done. Cleanup the UPL state
2623 */
2624 upl = bp->b_upl;
2625
2626 if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
1c79356b 2627 kern_return_t kret;
1c79356b
A
2628 int upl_flags;
2629
6d2010ae 2630 if (upl == NULL) {
1c79356b 2631 if ( !ISSET(bp->b_flags, B_INVAL)) {
0b4e3aa0 2632 kret = ubc_create_upl(bp->b_vp,
91447636
A
2633 ubc_blktooff(bp->b_vp, bp->b_lblkno),
2634 bp->b_bufsize,
2635 &upl,
2636 NULL,
2637 UPL_PRECIOUS);
2638
1c79356b 2639 if (kret != KERN_SUCCESS)
91447636 2640 panic("brelse: Failed to create UPL");
b0d623f7
A
2641#if UPL_DEBUG
2642 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5);
91447636
A
2643#endif /* UPL_DEBUG */
2644 }
1c79356b 2645 } else {
91447636 2646 if (bp->b_datap) {
55e303ae
A
2647 kret = ubc_upl_unmap(upl);
2648
2649 if (kret != KERN_SUCCESS)
91447636
A
2650 panic("ubc_upl_unmap failed");
2651 bp->b_datap = (uintptr_t)NULL;
55e303ae 2652 }
1c79356b
A
2653 }
2654 if (upl) {
1c79356b 2655 if (bp->b_flags & (B_ERROR | B_INVAL)) {
91447636 2656 if (bp->b_flags & (B_READ | B_INVAL))
1c79356b
A
2657 upl_flags = UPL_ABORT_DUMP_PAGES;
2658 else
2659 upl_flags = 0;
91447636 2660
0b4e3aa0 2661 ubc_upl_abort(upl, upl_flags);
1c79356b 2662 } else {
91447636
A
2663 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
2664 upl_flags = UPL_COMMIT_SET_DIRTY ;
2665 else
2666 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
2667
0b4e3aa0 2668 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
91447636 2669 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1c79356b 2670 }
91447636 2671 bp->b_upl = NULL;
1c79356b
A
2672 }
2673 } else {
91447636 2674 if ( (upl) )
2d21ac55 2675 panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp);
1c79356b
A
2676 }
2677
1c79356b 2678 /*
91447636 2679 * If it's locked, don't report an error; try again later.
1c79356b 2680 */
1c79356b
A
2681 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
2682 CLR(bp->b_flags, B_ERROR);
91447636
A
2683 /*
2684 * If it's not cacheable, or an error, mark it invalid.
2685 */
1c79356b
A
2686 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
2687 SET(bp->b_flags, B_INVAL);
91447636 2688
b0d623f7
A
2689 if ((bp->b_bufsize <= 0) ||
2690 ISSET(bp->b_flags, B_INVAL) ||
2691 (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) {
6d2010ae
A
2692
2693 boolean_t delayed_buf_free_meta_store = FALSE;
2694
1c79356b 2695 /*
2d21ac55
A
2696 * If it's invalid or empty, dissociate it from its vnode,
2697 * release its storage if B_META, and
2698 * clean it up a bit and put it on the EMPTY queue
1c79356b 2699 */
91447636 2700 if (ISSET(bp->b_flags, B_DELWRI))
b0d623f7 2701 OSAddAtomicLong(-1, &nbdwrite);
91447636 2702
2d21ac55 2703 if (ISSET(bp->b_flags, B_META)) {
6d2010ae
A
2704 if (bp->b_shadow_ref)
2705 delayed_buf_free_meta_store = TRUE;
2706 else
2707 buf_free_meta_store(bp);
2d21ac55 2708 }
91447636 2709 /*
2d21ac55 2710 * nuke any credentials we were holding
91447636 2711 */
6d2010ae
A
2712 buf_release_credentials(bp);
2713
2714 lck_mtx_lock_spin(buf_mtxp);
2715
2716 if (bp->b_shadow_ref) {
2717 SET(bp->b_lflags, BL_WAITSHADOW);
2718
2719 lck_mtx_unlock(buf_mtxp);
2720
2721 return;
2d21ac55 2722 }
6d2010ae 2723 if (delayed_buf_free_meta_store == TRUE) {
91447636 2724
6d2010ae
A
2725 lck_mtx_unlock(buf_mtxp);
2726finish_shadow_master:
2727 buf_free_meta_store(bp);
91447636 2728
6d2010ae
A
2729 lck_mtx_lock_spin(buf_mtxp);
2730 }
2731 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
2d21ac55
A
2732
2733 if (bp->b_vp)
2734 brelvp_locked(bp);
2735
2736 bremhash(bp);
2737 BLISTNONE(bp);
2738 binshash(bp, &invalhash);
2739
6d2010ae
A
2740 bp->b_whichq = BQ_EMPTY;
2741 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1c79356b 2742 } else {
6d2010ae 2743
1c79356b
A
2744 /*
2745 * It has valid data. Put it on the end of the appropriate
2746 * queue, so that it'll stick around for as long as possible.
2747 */
2748 if (ISSET(bp->b_flags, B_LOCKED))
2749 whichq = BQ_LOCKED; /* locked in core */
2750 else if (ISSET(bp->b_flags, B_META))
2751 whichq = BQ_META; /* meta-data */
2752 else if (ISSET(bp->b_flags, B_AGE))
2753 whichq = BQ_AGE; /* stale but valid data */
2754 else
2755 whichq = BQ_LRU; /* valid data */
1c79356b 2756 bufq = &bufqueues[whichq];
91447636 2757
2d21ac55 2758 bp->b_timestamp = buf_timestamp();
91447636 2759
6d2010ae
A
2760 lck_mtx_lock_spin(buf_mtxp);
2761
2762 /*
2763 * the buf_brelse_shadow routine doesn't take 'ownership'
2764 * of the parent buf_t... it updates state that is protected by
2765 * the buf_mtxp, and checks for BL_BUSY to determine whether to
2766 * put the buf_t back on a free list. b_shadow_ref is protected
2767 * by the lock, and since we have not yet cleared B_BUSY, we need
 2768 * to check it while holding the lock to ensure that one of us
2769 * puts this buf_t back on a free list when it is safe to do so
2770 */
2771 if (bp->b_shadow_ref == 0) {
2772 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
2773 bp->b_whichq = whichq;
2774 binstailfree(bp, bufq, whichq);
2775 } else {
2776 /*
2777 * there are still cloned buf_t's pointing
2778 * at this guy... need to keep it off the
2779 * freelists until a buf_brelse is done on
2780 * the last clone
2781 */
2782 CLR(bp->b_flags, (B_ASYNC | B_NOCACHE));
2783 }
1c79356b 2784 }
91447636
A
2785 if (needbuffer) {
2786 /*
2787 * needbuffer is a global
2788 * we're currently using buf_mtxp to protect it
2789 * delay doing the actual wakeup until after
2790 * we drop buf_mtxp
2791 */
2792 needbuffer = 0;
2793 need_wakeup = 1;
2794 }
2795 if (ISSET(bp->b_lflags, BL_WANTED)) {
2796 /*
2797 * delay the actual wakeup until after we
2798 * clear BL_BUSY and we've dropped buf_mtxp
2799 */
2800 need_bp_wakeup = 1;
2801 }
2802 /*
2803 * Unlock the buffer.
2804 */
2805 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
b0d623f7 2806 buf_busycount--;
1c79356b 2807
91447636 2808 lck_mtx_unlock(buf_mtxp);
1c79356b 2809
91447636
A
2810 if (need_wakeup) {
2811 /*
2812 * Wake up any processes waiting for any buffer to become free.
2813 */
2814 wakeup(&needbuffer);
2815 }
2816 if (need_bp_wakeup) {
2817 /*
 2818 * Wake up any processes waiting for _this_ buffer to become free.
2819 */
2820 wakeup(bp);
2821 }
1c79356b 2822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
b0d623f7 2823 bp, bp->b_datap, bp->b_flags, 0, 0);
1c79356b
A
2824}
2825
2826/*
2827 * Determine if a block is in the cache.
 2828 * Just look at what would be its hash chain. If it's there, return
2829 * a pointer to it, unless it's marked invalid. If it's marked invalid,
2830 * we normally don't return the buffer, unless the caller explicitly
2831 * wants us to.
2832 */
91447636
A
2833static boolean_t
2834incore(vnode_t vp, daddr64_t blkno)
2835{
2836 boolean_t retval;
2d21ac55 2837 struct bufhashhdr *dp;
91447636 2838
2d21ac55 2839 dp = BUFHASH(vp, blkno);
91447636 2840
2d21ac55
A
2841 lck_mtx_lock_spin(buf_mtxp);
2842
2843 if (incore_locked(vp, blkno, dp))
91447636
A
2844 retval = TRUE;
2845 else
2846 retval = FALSE;
2847 lck_mtx_unlock(buf_mtxp);
2848
2849 return (retval);
2850}
2851
2852
2853static buf_t
2d21ac55 2854incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp)
1c79356b
A
2855{
2856 struct buf *bp;
1c79356b 2857
1c79356b 2858 /* Search hash chain */
2d21ac55 2859 for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
1c79356b 2860 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
91447636 2861 !ISSET(bp->b_flags, B_INVAL)) {
1c79356b 2862 return (bp);
91447636 2863 }
1c79356b 2864 }
2d21ac55 2865 return (NULL);
1c79356b
A
2866}
2867
39236c6e 2868
316670eb
A
2869void
2870buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno)
2871{
2872 buf_t bp;
2873 struct bufhashhdr *dp;
2874
2875 dp = BUFHASH(vp, blkno);
2876
2877 lck_mtx_lock_spin(buf_mtxp);
2878
2879 for (;;) {
2880 if ((bp = incore_locked(vp, blkno, dp)) == NULL)
2881 break;
2882
2883 if (bp->b_shadow_ref == 0)
2884 break;
2885
2886 SET(bp->b_lflags, BL_WANTED_REF);
fa4905b1 2887
316670eb
A
2888 (void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO+1), "buf_wait_for_shadow", NULL);
2889 }
2890 lck_mtx_unlock(buf_mtxp);
2891}
2892
fa4905b1 2893/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
1c79356b
A
2894/*
2895 * Get a block of requested size that is associated with
2896 * a given vnode and block offset. If it is found in the
2897 * block cache, mark it as having been found, make it busy
2898 * and return it. Otherwise, return an empty block of the
 2900 * correct size. It is up to the caller to ensure that the
 2901 * cached blocks are of the correct size.
2901 */
91447636
A
2902buf_t
2903buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
1c79356b 2904{
91447636
A
2905 buf_t bp;
2906 int err;
1c79356b
A
2907 upl_t upl;
2908 upl_page_info_t *pl;
1c79356b 2909 kern_return_t kret;
91447636
A
2910 int ret_only_valid;
2911 struct timespec ts;
2912 int upl_flags;
2d21ac55 2913 struct bufhashhdr *dp;
1c79356b 2914
1c79356b 2915 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
b0d623f7 2916 (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0);
1c79356b 2917
91447636
A
2918 ret_only_valid = operation & BLK_ONLYVALID;
2919 operation &= ~BLK_ONLYVALID;
2d21ac55 2920 dp = BUFHASH(vp, blkno);
91447636 2921start:
2d21ac55 2922 lck_mtx_lock_spin(buf_mtxp);
b0d623f7 2923
2d21ac55 2924 if ((bp = incore_locked(vp, blkno, dp))) {
91447636
A
2925 /*
2926 * Found in the Buffer Cache
2927 */
2928 if (ISSET(bp->b_lflags, BL_BUSY)) {
2929 /*
2930 * but is busy
2931 */
1c79356b
A
2932 switch (operation) {
2933 case BLK_READ:
2934 case BLK_WRITE:
2935 case BLK_META:
91447636 2936 SET(bp->b_lflags, BL_WANTED);
1c79356b 2937 bufstats.bufs_busyincore++;
91447636
A
2938
2939 /*
2940 * don't retake the mutex after being awakened...
2941 * the time out is in msecs
2942 */
2943 ts.tv_sec = (slptimeo/1000);
2944 ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000;
2945
b0d623f7
A
2946 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE,
2947 (uintptr_t)blkno, size, operation, 0, 0);
2948
91447636
A
2949 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2950
1c79356b
A
2951 /*
2952 * Callers who call with PCATCH or timeout are
2953 * willing to deal with the NULL pointer
2954 */
91447636 2955 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
1c79356b
A
2956 return (NULL);
2957 goto start;
2958 /*NOTREACHED*/
1c79356b 2959
1c79356b 2960 default:
91447636
A
2961 /*
2962 * unknown operation requested
2963 */
2964 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
1c79356b
A
2965 /*NOTREACHED*/
2966 break;
2967 }
2968 } else {
d190cdc3
A
2969 int clear_bdone;
2970
91447636
A
2971 /*
2972 * buffer in core and not busy
2973 */
91447636
A
2974 SET(bp->b_lflags, BL_BUSY);
2975 SET(bp->b_flags, B_CACHE);
b0d623f7 2976 buf_busycount++;
2d21ac55 2977
91447636 2978 bremfree_locked(bp);
1c79356b 2979 bufstats.bufs_incore++;
91447636
A
2980
2981 lck_mtx_unlock(buf_mtxp);
2d21ac55
A
2982#ifdef JOE_DEBUG
2983 bp->b_owner = current_thread();
2984 bp->b_tag = 1;
2985#endif
2986 if ( (bp->b_upl) )
2987 panic("buffer has UPL, but not marked BUSY: %p", bp);
1c79356b 2988
d190cdc3
A
2989 clear_bdone = FALSE;
2990 if (!ret_only_valid) {
2991 /*
2992 * If the number bytes that are valid is going
2993 * to increase (even if we end up not doing a
2994 * reallocation through allocbuf) we have to read
2995 * the new size first.
2996 *
 2997 * This is required in cases where we are doing a read-
 2998 * modify-write of already valid data on disk, but
2999 * in cases where the data on disk beyond (blkno + b_bcount)
3000 * is invalid, we may end up doing extra I/O.
3001 */
3002 if (operation == BLK_META && bp->b_bcount < size) {
3003 /*
3004 * Since we are going to read in the whole size first
3005 * we first have to ensure that any pending delayed write
3006 * is flushed to disk first.
3007 */
3008 if (ISSET(bp->b_flags, B_DELWRI)) {
3009 CLR(bp->b_flags, B_CACHE);
3010 buf_bwrite(bp);
3011 goto start;
3012 }
3013 /*
3014 * clear B_DONE before returning from
3015 * this function so that the caller can
 3016 * issue a read for the new size.
3017 */
3018 clear_bdone = TRUE;
3019 }
3020
3021 if (bp->b_bufsize != size)
3022 allocbuf(bp, size);
3023 }
1c79356b 3024
91447636 3025 upl_flags = 0;
1c79356b 3026 switch (operation) {
1c79356b 3027 case BLK_WRITE:
91447636
A
3028 /*
3029 * "write" operation: let the UPL subsystem
3030 * know that we intend to modify the buffer
3031 * cache pages we're gathering.
3032 */
3033 upl_flags |= UPL_WILL_MODIFY;
3034 case BLK_READ:
3035 upl_flags |= UPL_PRECIOUS;
3036 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
0b4e3aa0 3037 kret = ubc_create_upl(vp,
91447636
A
3038 ubc_blktooff(vp, bp->b_lblkno),
3039 bp->b_bufsize,
3040 &upl,
3041 &pl,
3042 upl_flags);
1c79356b 3043 if (kret != KERN_SUCCESS)
91447636 3044 panic("Failed to create UPL");
1c79356b 3045
91447636 3046 bp->b_upl = upl;
1c79356b 3047
91447636
A
3048 if (upl_valid_page(pl, 0)) {
3049 if (upl_dirty_page(pl, 0))
3050 SET(bp->b_flags, B_WASDIRTY);
3051 else
3052 CLR(bp->b_flags, B_WASDIRTY);
3053 } else
3054 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
1c79356b 3055
b0d623f7 3056 kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap));
1c79356b 3057
9bccf70c 3058 if (kret != KERN_SUCCESS)
91447636 3059 panic("getblk: ubc_upl_map() failed with (%d)", kret);
1c79356b
A
3060 }
3061 break;
3062
3063 case BLK_META:
3064 /*
 3065 * VM is not involved in I/O for the meta data;
 3066 * the buffer already has valid data
3067 */
1c79356b
A
3068 break;
3069
3070 default:
91447636 3071 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
1c79356b
A
3072 /*NOTREACHED*/
3073 break;
3074 }
d190cdc3
A
3075
3076 if (clear_bdone)
3077 CLR(bp->b_flags, B_DONE);
1c79356b
A
3078 }
3079 } else { /* not incore() */
3080 int queue = BQ_EMPTY; /* Start with no preference */
1c79356b 3081
91447636
A
3082 if (ret_only_valid) {
3083 lck_mtx_unlock(buf_mtxp);
3084 return (NULL);
1c79356b 3085 }
2d21ac55 3086 if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/)
91447636
A
3087 operation = BLK_META;
3088
1c79356b 3089 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
b0d623f7 3090 goto start;
91447636
A
3091
3092 /*
3093 * getnewbuf may block for a number of different reasons...
3094 * if it does, it's then possible for someone else to
3095 * create a buffer for the same block and insert it into
3096 * the hash... if we see it incore at this point we dump
3097 * the buffer we were working on and start over
3098 */
2d21ac55 3099 if (incore_locked(vp, blkno, dp)) {
0b4e3aa0
A
3100 SET(bp->b_flags, B_INVAL);
3101 binshash(bp, &invalhash);
91447636
A
3102
3103 lck_mtx_unlock(buf_mtxp);
3104
3105 buf_brelse(bp);
0b4e3aa0
A
3106 goto start;
3107 }
b4c24cb9
A
3108 /*
3109 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
3110 * CALLED! BE CAREFUL.
3111 */
0b4e3aa0 3112
1c79356b 3113 /*
91447636 3114 * mark the buffer as B_META if indicated
1c79356b 3115 * so that when buffer is released it will goto META queue
1c79356b 3116 */
91447636
A
3117 if (operation == BLK_META)
3118 SET(bp->b_flags, B_META);
9bccf70c
A
3119
3120 bp->b_blkno = bp->b_lblkno = blkno;
3121 bp->b_vp = vp;
3122
0b4e3aa0
A
3123 /*
3124 * Insert in the hash so that incore() can find it
3125 */
3126 binshash(bp, BUFHASH(vp, blkno));
3127
2d21ac55 3128 bgetvp_locked(vp, bp);
91447636 3129
2d21ac55 3130 lck_mtx_unlock(buf_mtxp);
9bccf70c 3131
1c79356b
A
3132 allocbuf(bp, size);
3133
91447636 3134 upl_flags = 0;
1c79356b
A
3135 switch (operation) {
3136 case BLK_META:
91447636
A
3137 /*
3138 * buffer data is invalid...
3139 *
3140 * I don't want to have to retake buf_mtxp,
3141 * so the miss and vmhits counters are done
3142 * with Atomic updates... all other counters
3143 * in bufstats are protected with either
3144 * buf_mtxp or iobuffer_mtxp
3145 */
b0d623f7 3146 OSAddAtomicLong(1, &bufstats.bufs_miss);
1c79356b
A
3147 break;
3148
1c79356b 3149 case BLK_WRITE:
91447636
A
3150 /*
3151 * "write" operation: let the UPL subsystem know
3152 * that we intend to modify the buffer cache pages
3153 * we're gathering.
3154 */
3155 upl_flags |= UPL_WILL_MODIFY;
3156 case BLK_READ:
3157 { off_t f_offset;
3158 size_t contig_bytes;
3159 int bmap_flags;
1c79356b 3160
3e170ce0
A
3161#if DEVELOPMENT || DEBUG
3162 /*
 3163 * Apple-implemented file systems use UBC exclusively; they should
 3164 * not call in here.
3165 */
3166 const char* excldfs[] = {"hfs", "afpfs", "smbfs", "acfs",
3167 "exfat", "msdos", "webdav", NULL};
3168
3169 for (int i = 0; excldfs[i] != NULL; i++) {
3170 if (vp->v_mount &&
3171 !strcmp(vp->v_mount->mnt_vfsstat.f_fstypename,
3172 excldfs[i])) {
3173 panic("%s %s calls buf_getblk",
3174 excldfs[i],
3175 operation == BLK_READ ? "BLK_READ" : "BLK_WRITE");
3176 }
3177 }
3178#endif
3179
91447636 3180 if ( (bp->b_upl) )
2d21ac55 3181 panic("bp already has UPL: %p",bp);
1c79356b 3182
91447636
A
3183 f_offset = ubc_blktooff(vp, blkno);
3184
3185 upl_flags |= UPL_PRECIOUS;
0b4e3aa0 3186 kret = ubc_create_upl(vp,
91447636
A
3187 f_offset,
3188 bp->b_bufsize,
3189 &upl,
3190 &pl,
3191 upl_flags);
1c79356b 3192
91447636
A
3193 if (kret != KERN_SUCCESS)
3194 panic("Failed to create UPL");
b0d623f7
A
3195#if UPL_DEBUG
3196 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4);
91447636
A
3197#endif /* UPL_DEBUG */
3198 bp->b_upl = upl;
1c79356b
A
3199
3200 if (upl_valid_page(pl, 0)) {
1c79356b 3201
91447636
A
3202 if (operation == BLK_READ)
3203 bmap_flags = VNODE_READ;
3204 else
3205 bmap_flags = VNODE_WRITE;
1c79356b 3206
91447636 3207 SET(bp->b_flags, B_CACHE | B_DONE);
1c79356b 3208
b0d623f7 3209 OSAddAtomicLong(1, &bufstats.bufs_vmhits);
1c79356b 3210
91447636
A
3211 bp->b_validoff = 0;
3212 bp->b_dirtyoff = 0;
1c79356b 3213
91447636
A
3214 if (upl_dirty_page(pl, 0)) {
3215 /* page is dirty */
3216 SET(bp->b_flags, B_WASDIRTY);
1c79356b 3217
91447636
A
3218 bp->b_validend = bp->b_bcount;
3219 bp->b_dirtyend = bp->b_bcount;
1c79356b 3220 } else {
91447636
A
3221 /* page is clean */
3222 bp->b_validend = bp->b_bcount;
3223 bp->b_dirtyend = 0;
1c79356b 3224 }
91447636
A
3225 /*
3226 * try to recreate the physical block number associated with
3227 * this buffer...
3228 */
3229 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
3230 panic("getblk: VNOP_BLOCKMAP failed");
3231 /*
3232 * if the extent represented by this buffer
3233 * is not completely physically contiguous on
 3234 * disk, then we can't cache the physical mapping
3235 * in the buffer header
3236 */
3237 if ((long)contig_bytes < bp->b_bcount)
3238 bp->b_blkno = bp->b_lblkno;
1c79356b 3239 } else {
b0d623f7 3240 OSAddAtomicLong(1, &bufstats.bufs_miss);
1c79356b 3241 }
b0d623f7 3242 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
1c79356b 3243
91447636
A
3244 if (kret != KERN_SUCCESS)
3245 panic("getblk: ubc_upl_map() failed with (%d)", kret);
1c79356b 3246 break;
91447636 3247 }
1c79356b 3248 default:
91447636 3249 panic("getblk: paging or unknown operation - %x", operation);
1c79356b
A
3250 /*NOTREACHED*/
3251 break;
3252 }
3253 }
1c79356b 3254 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
b0d623f7 3255 bp, bp->b_datap, bp->b_flags, 3, 0);
91447636
A
3256
3257#ifdef JOE_DEBUG
b0d623f7 3258 (void) OSBacktrace(&bp->b_stackgetblk[0], 6);
91447636 3259#endif
1c79356b
A
3260 return (bp);
3261}
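
/*
 * Illustrative sketch (not part of the original source): grabbing a buffer
 * for a full-block overwrite where the old contents never need to be read.
 * With slpflag and slptimeo of 0, buf_getblk() does not return NULL; a
 * caller passing PCATCH or a timeout must be prepared for a NULL return.
 * The example_* name is hypothetical.
 */
#if 0	/* example only */
static errno_t
example_overwrite_block(vnode_t vp, daddr64_t blkno, int blksize)
{
	buf_t	bp;

	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_WRITE);

	/* ... fill the entire block via buf_dataptr(bp) ... */

	return (buf_bawrite(bp));	/* queue the write without waiting for it */
}
#endif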
3262
3263/*
3264 * Get an empty, disassociated buffer of given size.
3265 */
91447636 3266buf_t
2d21ac55 3267buf_geteblk(int size)
1c79356b 3268{
b0d623f7 3269 buf_t bp = NULL;
91447636
A
3270 int queue = BQ_EMPTY;
3271
b0d623f7
A
3272 do {
3273 lck_mtx_lock_spin(buf_mtxp);
3274
3275 bp = getnewbuf(0, 0, &queue);
3276 } while (bp == NULL);
1c79356b 3277
1c79356b 3278 SET(bp->b_flags, (B_META|B_INVAL));
1c79356b
A
3279
3280#if DIAGNOSTIC
3281 assert(queue == BQ_EMPTY);
3282#endif /* DIAGNOSTIC */
3283 /* XXX need to implement logic to deal with other queues */
3284
1c79356b 3285 binshash(bp, &invalhash);
1c79356b
A
3286 bufstats.bufs_eblk++;
3287
91447636
A
3288 lck_mtx_unlock(buf_mtxp);
3289
3290 allocbuf(bp, size);
3291
1c79356b
A
3292 return (bp);
3293}
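
/*
 * Illustrative sketch (not part of the original source): buf_geteblk()
 * returns an anonymous, invalid buffer with no vnode association, which
 * makes it usable as scratch space.  The example_* name is hypothetical.
 */
#if 0	/* example only */
static void
example_scratch_buffer(void)
{
	buf_t	bp = buf_geteblk(8192);	/* loops until a header is available; never NULL */

	/* ... use buf_dataptr(bp) as temporary storage ... */

	buf_brelse(bp);
}
#endif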
3294
6d2010ae
A
3295uint32_t
3296buf_redundancy_flags(buf_t bp)
3297{
3298 return bp->b_redundancy_flags;
3299}
3300
3301void
3302buf_set_redundancy_flags(buf_t bp, uint32_t flags)
3303{
3304 SET(bp->b_redundancy_flags, flags);
3305}
3306
3307void
3308buf_clear_redundancy_flags(buf_t bp, uint32_t flags)
3309{
3310 CLR(bp->b_redundancy_flags, flags);
3311}
1c79356b 3312
fe8ab488
A
3313
3314
3315static void *
3316recycle_buf_from_pool(int nsize)
3317{
3318 buf_t bp;
3319 void *ptr = NULL;
3320
3321 lck_mtx_lock_spin(buf_mtxp);
3322
3323 TAILQ_FOREACH(bp, &bufqueues[BQ_META], b_freelist) {
3324 if (ISSET(bp->b_flags, B_DELWRI) || bp->b_bufsize != nsize)
3325 continue;
3326 ptr = (void *)bp->b_datap;
3327 bp->b_bufsize = 0;
3328
3329 bcleanbuf(bp, TRUE);
3330 break;
3331 }
3332 lck_mtx_unlock(buf_mtxp);
3333
3334 return (ptr);
3335}
3336
3337
3338
3339int zalloc_nopagewait_failed = 0;
3340int recycle_buf_failed = 0;
3341
3342static void *
3343grab_memory_for_meta_buf(int nsize)
3344{
3345 zone_t z;
3346 void *ptr;
3347 boolean_t was_vmpriv;
3348
3349 z = getbufzone(nsize);
3350
3351 /*
 3352 * make sure we're NOT privileged so that
3353 * if a vm_page_grab is needed, it won't
3354 * block if we're out of free pages... if
3355 * it blocks, then we can't honor the
3356 * nopagewait request
3357 */
3358 was_vmpriv = set_vm_privilege(FALSE);
3359
3360 ptr = zalloc_nopagewait(z);
3361
3362 if (was_vmpriv == TRUE)
3363 set_vm_privilege(TRUE);
3364
3365 if (ptr == NULL) {
3366
3367 zalloc_nopagewait_failed++;
3368
3369 ptr = recycle_buf_from_pool(nsize);
3370
3371 if (ptr == NULL) {
3372
3373 recycle_buf_failed++;
3374
3375 if (was_vmpriv == FALSE)
3376 set_vm_privilege(TRUE);
3377
3378 ptr = zalloc(z);
3379
3380 if (was_vmpriv == FALSE)
3381 set_vm_privilege(FALSE);
3382 }
3383 }
3384 return (ptr);
3385}
3386
1c79356b
A
3387/*
3388 * With UBC, there is no need to expand / shrink the file data
3389 * buffer. The VM uses the same pages, hence no waste.
3390 * All the file data buffers can have one size.
3391 * In fact expand / shrink would be an expensive operation.
3392 *
3393 * Only exception to this is meta-data buffers. Most of the
3394 * meta data operations are smaller than PAGE_SIZE. Having the
3395 * meta-data buffers grow and shrink as needed, optimizes use
3396 * of the kernel wired memory.
3397 */
3398
3399int
91447636 3400allocbuf(buf_t bp, int size)
1c79356b
A
3401{
3402 vm_size_t desired_size;
3403
3404 desired_size = roundup(size, CLBYTES);
3405
91447636 3406 if (desired_size < PAGE_SIZE)
1c79356b
A
3407 desired_size = PAGE_SIZE;
3408 if (desired_size > MAXBSIZE)
3409 panic("allocbuf: buffer larger than MAXBSIZE requested");
3410
1c79356b 3411 if (ISSET(bp->b_flags, B_META)) {
91447636
A
3412 int nsize = roundup(size, MINMETA);
3413
3414 if (bp->b_datap) {
3415 vm_offset_t elem = (vm_offset_t)bp->b_datap;
3416
3417 if (ISSET(bp->b_flags, B_ZALLOC)) {
3418 if (bp->b_bufsize < nsize) {
fe8ab488
A
3419 zone_t zprev;
3420
91447636
A
3421 /* reallocate to a bigger size */
3422
3423 zprev = getbufzone(bp->b_bufsize);
3424 if (nsize <= MAXMETA) {
3425 desired_size = nsize;
fe8ab488 3426
2d21ac55 3427 /* b_datap not really a ptr */
fe8ab488 3428 *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
1c79356b 3429 } else {
91447636 3430 bp->b_datap = (uintptr_t)NULL;
3e170ce0 3431 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
91447636 3432 CLR(bp->b_flags, B_ZALLOC);
1c79356b 3433 }
91447636
A
3434 bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3435 zfree(zprev, (void *)elem);
3436 } else {
3437 desired_size = bp->b_bufsize;
3438 }
3439
3440 } else {
3441 if ((vm_size_t)bp->b_bufsize < desired_size) {
1c79356b 3442 /* reallocate to a bigger size */
91447636 3443 bp->b_datap = (uintptr_t)NULL;
3e170ce0 3444 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
91447636 3445 bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
1c79356b
A
3446 kmem_free(kernel_map, elem, bp->b_bufsize);
3447 } else {
3448 desired_size = bp->b_bufsize;
3449 }
91447636 3450 }
1c79356b
A
3451 } else {
3452 /* new allocation */
3453 if (nsize <= MAXMETA) {
3454 desired_size = nsize;
fe8ab488 3455
2d21ac55 3456 /* b_datap not really a ptr */
fe8ab488 3457 *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
1c79356b 3458 SET(bp->b_flags, B_ZALLOC);
91447636 3459 } else
3e170ce0 3460 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
1c79356b 3461 }
2d21ac55
A
3462
3463 if (bp->b_datap == 0)
3464 panic("allocbuf: NULL b_datap");
1c79356b 3465 }
9bccf70c
A
3466 bp->b_bufsize = desired_size;
3467 bp->b_bcount = size;
91447636 3468
9bccf70c 3469 return (0);
1c79356b
A
3470}
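
/*
 * Illustrative sketch (not part of the original source): how the sizing
 * rules above play out, assuming a 4 KB page / CLBYTES configuration.
 * File data buffers are rounded up to at least a page, while B_META
 * buffers are rounded to MINMETA multiples and are backed by the meta
 * zones as long as they fit under MAXMETA.  The example_* name is
 * hypothetical.
 */
#if 0	/* example only */
static void
example_allocbuf_sizing(buf_t data_bp, buf_t meta_bp)
{
	/* non-meta: desired size rounds up to PAGE_SIZE, so b_bufsize = 4096, b_bcount = 3000 */
	allocbuf(data_bp, 3000);

	/* B_META: rounds to 3072 (a MINMETA multiple), backed by a "buf.4096" zone element */
	allocbuf(meta_bp, 3000);
}
#endif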
3471
3472/*
3473 * Get a new buffer from one of the free lists.
3474 *
 3475 * A request for a queue is passed in. The queue from which the buffer was taken
3476 * from is returned. Out of range queue requests get BQ_EMPTY. Request for
3477 * BQUEUE means no preference. Use heuristics in that case.
 3478 * The heuristic is as follows:
3479 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
3480 * If none available block till one is made available.
3481 * If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
3482 * Pick the most stale buffer.
3483 * If found buffer was marked delayed write, start the async. write
3484 * and restart the search.
3485 * Initialize the fields and disassociate the buffer from the vnode.
3486 * Remove the buffer from the hash. Return the buffer and the queue
3487 * on which it was found.
91447636
A
3488 *
3489 * buf_mtxp is held upon entry
b0d623f7
A
3490 * returns with buf_mtxp locked if new buf available
3491 * returns with buf_mtxp UNlocked if new buf NOT available
1c79356b
A
3492 */
3493
91447636
A
3494static buf_t
3495getnewbuf(int slpflag, int slptimeo, int * queue)
1c79356b 3496{
91447636
A
3497 buf_t bp;
3498 buf_t lru_bp;
3499 buf_t age_bp;
3500 buf_t meta_bp;
3501 int age_time, lru_time, bp_time, meta_time;
3502 int req = *queue; /* save it for restarts */
3503 struct timespec ts;
1c79356b
A
3504
3505start:
91447636
A
3506 /*
3507 * invalid request gets empty queue
3508 */
2d21ac55 3509 if ((*queue >= BQUEUES) || (*queue < 0)
765c9de3 3510 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1c79356b 3511 *queue = BQ_EMPTY;
2d21ac55
A
3512
3513
3514 if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first))
3515 goto found;
3516
3517 /*
3518 * need to grow number of bufs, add another one rather than recycling
3519 */
3520 if (nbuf_headers < max_nbuf_headers) {
0c530ab8
A
3521 /*
3522 * Increment count now as lock
3523 * is dropped for allocation.
3524 * That avoids over commits
3525 */
2d21ac55 3526 nbuf_headers++;
0c530ab8
A
3527 goto add_newbufs;
3528 }
2d21ac55
A
3529 /* Try for the requested queue first */
3530 bp = bufqueues[*queue].tqh_first;
3531 if (bp)
3532 goto found;
1c79356b
A
3533
3534 /* Unable to use requested queue */
3535 age_bp = bufqueues[BQ_AGE].tqh_first;
3536 lru_bp = bufqueues[BQ_LRU].tqh_first;
3537 meta_bp = bufqueues[BQ_META].tqh_first;
3538
9bccf70c
A
3539 if (!age_bp && !lru_bp && !meta_bp) {
3540 /*
 3541 * Unavailable on AGE, LRU, or META queues
3542 * Try the empty list first
3543 */
1c79356b
A
3544 bp = bufqueues[BQ_EMPTY].tqh_first;
3545 if (bp) {
3546 *queue = BQ_EMPTY;
3547 goto found;
3548 }
0c530ab8
A
3549 /*
 3550 * We have seen that this is hard to trigger.
 3551 * This is an overcommit of nbufs but is needed
 3552 * in some scenarios with disk images
3553 */
3554
3555add_newbufs:
91447636 3556 lck_mtx_unlock(buf_mtxp);
765c9de3 3557
91447636 3558 /* Create a new temporary buffer header */
765c9de3 3559 bp = (struct buf *)zalloc(buf_hdr_zone);
2d21ac55 3560
765c9de3
A
3561 if (bp) {
3562 bufhdrinit(bp);
2d21ac55
A
3563 bp->b_whichq = BQ_EMPTY;
3564 bp->b_timestamp = buf_timestamp();
765c9de3 3565 BLISTNONE(bp);
765c9de3
A
3566 SET(bp->b_flags, B_HDRALLOC);
3567 *queue = BQ_EMPTY;
2d21ac55 3568 }
b0d623f7 3569 lck_mtx_lock_spin(buf_mtxp);
2d21ac55
A
3570
3571 if (bp) {
3572 binshash(bp, &invalhash);
765c9de3
A
3573 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3574 buf_hdr_count++;
3575 goto found;
3576 }
0c530ab8 3577 /* subtract already accounted bufcount */
2d21ac55 3578 nbuf_headers--;
0c530ab8 3579
91447636 3580 bufstats.bufs_sleeps++;
765c9de3 3581
1c79356b
A
3582 /* wait for a free buffer of any kind */
3583 needbuffer = 1;
91447636
A
3584 /* hz value is 100 */
3585 ts.tv_sec = (slptimeo/1000);
3586 /* the hz value is 100; which leads to 10ms */
3587 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
b0d623f7
A
3588
3589 msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO+1), "getnewbuf", &ts);
2d21ac55 3590 return (NULL);
1c79356b
A
3591 }
3592
3593 /* Buffer available either on AGE or LRU or META */
3594 bp = NULL;
3595 *queue = -1;
3596
3597 /* Buffer available either on AGE or LRU */
3598 if (!age_bp) {
3599 bp = lru_bp;
3600 *queue = BQ_LRU;
3601 } else if (!lru_bp) {
3602 bp = age_bp;
3603 *queue = BQ_AGE;
3604 } else { /* buffer available on both AGE and LRU */
91447636
A
3605 int t = buf_timestamp();
3606
3607 age_time = t - age_bp->b_timestamp;
3608 lru_time = t - lru_bp->b_timestamp;
1c79356b
A
3609 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
3610 bp = age_bp;
3611 *queue = BQ_AGE;
3612 /*
 3613 * we should probably re-timestamp everything in the
3614 * queues at this point with the current time
3615 */
3616 } else {
3617 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
3618 bp = lru_bp;
3619 *queue = BQ_LRU;
3620 } else {
3621 bp = age_bp;
3622 *queue = BQ_AGE;
3623 }
3624 }
3625 }
3626
3627 if (!bp) { /* Neither on AGE nor on LRU */
3628 bp = meta_bp;
3629 *queue = BQ_META;
3630 } else if (meta_bp) {
91447636
A
3631 int t = buf_timestamp();
3632
3633 bp_time = t - bp->b_timestamp;
3634 meta_time = t - meta_bp->b_timestamp;
1c79356b
A
3635
3636 if (!(bp_time < 0) && !(meta_time < 0)) {
3637 /* time not set backwards */
3638 int bp_is_stale;
3639 bp_is_stale = (*queue == BQ_LRU) ?
3640 lru_is_stale : age_is_stale;
3641
3642 if ((meta_time >= meta_is_stale) &&
3643 (bp_time < bp_is_stale)) {
3644 bp = meta_bp;
3645 *queue = BQ_META;
3646 }
3647 }
3648 }
1c79356b 3649found:
91447636 3650 if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
b0d623f7 3651 panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
1c79356b
A
3652
3653 /* Clean it */
b0d623f7 3654 if (bcleanbuf(bp, FALSE)) {
91447636
A
3655 /*
3656 * moved to the laundry thread, buffer not ready
3657 */
1c79356b
A
3658 *queue = req;
3659 goto start;
3660 }
1c79356b
A
3661 return (bp);
3662}
9bccf70c 3663
1c79356b
A
3664
3665/*
3666 * Clean a buffer.
6d2010ae 3667 * Returns 0 if buffer is ready to use,
91447636 3668 * Returns 1 if issued a buf_bawrite() to indicate
1c79356b 3669 * that the buffer is not ready.
91447636
A
3670 *
3671 * buf_mtxp is held upon entry
3672 * returns with buf_mtxp locked
1c79356b 3673 */
6d2010ae 3674int
b0d623f7 3675bcleanbuf(buf_t bp, boolean_t discard)
1c79356b 3676{
1c79356b 3677 /* Remove from the queue */
91447636 3678 bremfree_locked(bp);
1c79356b 3679
91447636
A
3680#ifdef JOE_DEBUG
3681 bp->b_owner = current_thread();
3682 bp->b_tag = 2;
3683#endif
765c9de3
A
3684 /*
3685 * If buffer was a delayed write, start the IO by queuing
3686 * it on the LAUNDRY queue, and return 1
3687 */
1c79356b 3688 if (ISSET(bp->b_flags, B_DELWRI)) {
b0d623f7
A
3689 if (discard) {
3690 SET(bp->b_lflags, BL_WANTDEALLOC);
3691 }
3692
6d2010ae 3693 bmovelaundry(bp);
91447636
A
3694
3695 lck_mtx_unlock(buf_mtxp);
3696
2d21ac55
A
3697 wakeup(&bufqueues[BQ_LAUNDRY]);
3698 /*
3699 * and give it a chance to run
3700 */
9bccf70c 3701 (void)thread_block(THREAD_CONTINUE_NULL);
91447636 3702
b0d623f7 3703 lck_mtx_lock_spin(buf_mtxp);
2d21ac55 3704
1c79356b
A
3705 return (1);
3706 }
2d21ac55
A
3707#ifdef JOE_DEBUG
3708 bp->b_owner = current_thread();
3709 bp->b_tag = 8;
3710#endif
3711 /*
3712 * Buffer is no longer on any free list... we own it
3713 */
3714 SET(bp->b_lflags, BL_BUSY);
b0d623f7
A
3715 buf_busycount++;
3716
2d21ac55 3717 bremhash(bp);
91447636 3718
91447636
A
3719 /*
3720 * disassociate us from our vnode, if we had one...
3721 */
3722 if (bp->b_vp)
2d21ac55
A
3723 brelvp_locked(bp);
3724
3725 lck_mtx_unlock(buf_mtxp);
3726
3727 BLISTNONE(bp);
91447636 3728
6d2010ae
A
3729 if (ISSET(bp->b_flags, B_META))
3730 buf_free_meta_store(bp);
91447636
A
3731
3732 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
3733
6d2010ae 3734 buf_release_credentials(bp);
fe8ab488 3735
b0d623f7
A
3736 /* If discarding, just move to the empty queue */
3737 if (discard) {
3738 lck_mtx_lock_spin(buf_mtxp);
3739 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
3740 bp->b_whichq = BQ_EMPTY;
3741 binshash(bp, &invalhash);
3742 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3743 CLR(bp->b_lflags, BL_BUSY);
3744 buf_busycount--;
3745 } else {
3746 /* Not discarding: clean up and prepare for reuse */
3747 bp->b_bufsize = 0;
3748 bp->b_datap = (uintptr_t)NULL;
3749 bp->b_upl = (void *)NULL;
3e170ce0 3750 bp->b_fsprivate = (void *)NULL;
b0d623f7
A
3751 /*
3752 * preserve the state of whether this buffer
3753 * was allocated on the fly or not...
3754 * the only other flag that should be set at
3755 * this point is BL_BUSY...
3756 */
3757#ifdef JOE_DEBUG
3758 bp->b_owner = current_thread();
3759 bp->b_tag = 3;
3760#endif
3761 bp->b_lflags = BL_BUSY;
3762 bp->b_flags = (bp->b_flags & B_HDRALLOC);
3e170ce0 3763 bp->b_redundancy_flags = 0;
b0d623f7
A
3764 bp->b_dev = NODEV;
3765 bp->b_blkno = bp->b_lblkno = 0;
3766 bp->b_iodone = NULL;
3767 bp->b_error = 0;
3768 bp->b_resid = 0;
3769 bp->b_bcount = 0;
3770 bp->b_dirtyoff = bp->b_dirtyend = 0;
3771 bp->b_validoff = bp->b_validend = 0;
7ddcb079 3772 bzero(&bp->b_attr, sizeof(struct bufattr));
b0d623f7
A
3773
3774 lck_mtx_lock_spin(buf_mtxp);
3775 }
91447636
A
3776 return (0);
3777}
3778
3779
3780
3781errno_t
3782buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
3783{
3784 buf_t bp;
3785 errno_t error;
2d21ac55
A
3786 struct bufhashhdr *dp;
3787
3788 dp = BUFHASH(vp, lblkno);
91447636 3789
91447636 3790relook:
b0d623f7
A
3791 lck_mtx_lock_spin(buf_mtxp);
3792
2d21ac55 3793 if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
91447636
A
3794 lck_mtx_unlock(buf_mtxp);
3795 return (0);
3796 }
3797 if (ISSET(bp->b_lflags, BL_BUSY)) {
3798 if ( !ISSET(flags, BUF_WAIT)) {
3799 lck_mtx_unlock(buf_mtxp);
3800 return (EBUSY);
3801 }
3802 SET(bp->b_lflags, BL_WANTED);
3803
b0d623f7 3804 error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
91447636 3805
2d21ac55 3806 if (error) {
91447636 3807 return (error);
2d21ac55 3808 }
91447636
A
3809 goto relook;
3810 }
3811 bremfree_locked(bp);
3812 SET(bp->b_lflags, BL_BUSY);
3813 SET(bp->b_flags, B_INVAL);
b0d623f7 3814 buf_busycount++;
91447636
A
3815#ifdef JOE_DEBUG
3816 bp->b_owner = current_thread();
3817 bp->b_tag = 4;
3818#endif
3819 lck_mtx_unlock(buf_mtxp);
3820 buf_brelse(bp);
3821
3822 return (0);
3823}
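
/*
 * Illustrative sketch (not part of the original source): discarding any
 * cached copy of a logical block, e.g. after its on-disk extent has been
 * released.  With BUF_WAIT the call sleeps until a busy buffer is freed;
 * without it a busy buffer yields EBUSY so the caller can retry later.
 * The example_* name is hypothetical.
 */
#if 0	/* example only */
static void
example_discard_block(vnode_t vp, daddr64_t lblkno)
{
	errno_t	error;

	error = buf_invalblkno(vp, lblkno, BUF_WAIT);
	/* error == 0 means the block was either not cached or has been invalidated */
}
#endif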
3824
3825
3826void
3827buf_drop(buf_t bp)
3828{
3829 int need_wakeup = 0;
3830
2d21ac55 3831 lck_mtx_lock_spin(buf_mtxp);
91447636
A
3832
3833 if (ISSET(bp->b_lflags, BL_WANTED)) {
3834 /*
3835 * delay the actual wakeup until after we
3836 * clear BL_BUSY and we've dropped buf_mtxp
3837 */
3838 need_wakeup = 1;
3839 }
2d21ac55
A
3840#ifdef JOE_DEBUG
3841 bp->b_owner = current_thread();
3842 bp->b_tag = 9;
3843#endif
91447636
A
3844 /*
3845 * Unlock the buffer.
3846 */
3847 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
b0d623f7 3848 buf_busycount--;
1c79356b 3849
91447636 3850 lck_mtx_unlock(buf_mtxp);
1c79356b 3851
91447636
A
3852 if (need_wakeup) {
3853 /*
 3854 * Wake up any processes waiting for _this_ buffer to become free.
3855 */
3856 wakeup(bp);
3857 }
3858}
1c79356b 3859
1c79356b 3860
91447636
A
3861errno_t
3862buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
3863 errno_t error;
1c79356b 3864
b0d623f7 3865 lck_mtx_lock_spin(buf_mtxp);
1c79356b 3866
91447636 3867 error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
1c79356b 3868
91447636 3869 lck_mtx_unlock(buf_mtxp);
1c79356b 3870
91447636
A
3871 return (error);
3872}
1c79356b 3873
91447636
A
3874
3875static errno_t
3876buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
3877{
3878 errno_t error;
3879 struct timespec ts;
3880
3881 if (ISSET(bp->b_flags, B_LOCKED)) {
3882 if ((flags & BAC_SKIP_LOCKED))
3883 return (EDEADLK);
3884 } else {
3885 if ((flags & BAC_SKIP_NONLOCKED))
3886 return (EDEADLK);
1c79356b 3887 }
91447636
A
3888 if (ISSET(bp->b_lflags, BL_BUSY)) {
3889 /*
b0d623f7 3890 * since the lck_mtx_lock may block, the buffer
91447636
A
3891 * may become BUSY, so we need to
3892 * recheck for a NOWAIT request
3893 */
3894 if (flags & BAC_NOWAIT)
3895 return (EBUSY);
3896 SET(bp->b_lflags, BL_WANTED);
3897
3898 /* the hz value is 100; which leads to 10ms */
3899 ts.tv_sec = (slptimeo/100);
3900 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
2d21ac55 3901 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
91447636
A
3902
3903 if (error)
3904 return (error);
3905 return (EAGAIN);
1c79356b 3906 }
91447636
A
3907 if (flags & BAC_REMOVE)
3908 bremfree_locked(bp);
3909 SET(bp->b_lflags, BL_BUSY);
b0d623f7
A
3910 buf_busycount++;
3911
91447636
A
3912#ifdef JOE_DEBUG
3913 bp->b_owner = current_thread();
3914 bp->b_tag = 5;
3915#endif
1c79356b
A
3916 return (0);
3917}
3918
3919
3920/*
3921 * Wait for operations on the buffer to complete.
3922 * When they do, extract and return the I/O's error value.
3923 */
91447636
A
3924errno_t
3925buf_biowait(buf_t bp)
1c79356b 3926{
b0d623f7 3927 while (!ISSET(bp->b_flags, B_DONE)) {
1c79356b 3928
b0d623f7 3929 lck_mtx_lock_spin(buf_mtxp);
91447636 3930
b0d623f7
A
3931 if (!ISSET(bp->b_flags, B_DONE)) {
3932 DTRACE_IO1(wait__start, buf_t, bp);
3933 (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL);
3934 DTRACE_IO1(wait__done, buf_t, bp);
3935 } else
3936 lck_mtx_unlock(buf_mtxp);
3937 }
1c79356b
A
3938 /* check for interruption of I/O (e.g. via NFS), then errors. */
3939 if (ISSET(bp->b_flags, B_EINTR)) {
3940 CLR(bp->b_flags, B_EINTR);
3941 return (EINTR);
3942 } else if (ISSET(bp->b_flags, B_ERROR))
3943 return (bp->b_error ? bp->b_error : EIO);
3944 else
3945 return (0);
3946}
3947
2d21ac55 3948
1c79356b
A
3949/*
3950 * Mark I/O complete on a buffer.
3951 *
3952 * If a callback has been requested, e.g. the pageout
3953 * daemon, do so. Otherwise, awaken waiting processes.
3954 *
3955 * [ Leffler, et al., says on p.247:
3956 * "This routine wakes up the blocked process, frees the buffer
3957 * for an asynchronous write, or, for a request by the pagedaemon
3958 * process, invokes a procedure specified in the buffer structure" ]
3959 *
3960 * In real life, the pagedaemon (or other system processes) wants
 3961 * to do async stuff too, and doesn't want the buffer buf_brelse()'d.
1c79356b
A
3962 * (for swap pager, that puts swap buffers on the free lists (!!!),
3963 * for the vn device, that puts malloc'd buffers on the free lists!)
3964 */
91447636 3965
1c79356b 3966void
91447636 3967buf_biodone(buf_t bp)
1c79356b 3968{
b0d623f7 3969 mount_t mp;
39236c6e 3970 struct bufattr *bap;
b0d623f7 3971
1c79356b 3972 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
b0d623f7 3973 bp, bp->b_datap, bp->b_flags, 0, 0);
1c79356b
A
3974
3975 if (ISSET(bp->b_flags, B_DONE))
3976 panic("biodone already");
1c79356b 3977
39236c6e
A
3978 bap = &bp->b_attr;
3979
b0d623f7
A
3980 if (bp->b_vp && bp->b_vp->v_mount) {
3981 mp = bp->b_vp->v_mount;
3982 } else {
3983 mp = NULL;
3984 }
3985
d190cdc3
A
3986 if (ISSET(bp->b_flags, B_ERROR)) {
3987 if (mp && (MNT_ROOTFS & mp->mnt_flag)) {
3988 dk_error_description_t desc;
3989 bzero(&desc, sizeof(desc));
3990 desc.description = panic_disk_error_description;
3991 desc.description_size = panic_disk_error_description_size;
3992 VNOP_IOCTL(mp->mnt_devvp, DKIOCGETERRORDESCRIPTION, (caddr_t)&desc, 0, vfs_context_kernel());
3993 }
3994 }
3995
b0d623f7
A
3996 if (mp && (bp->b_flags & B_READ) == 0) {
3997 update_last_io_time(mp);
3998 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
3999 } else if (mp) {
4000 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
e2fac8b1
A
4001 }
4002
39037602
A
4003 throttle_info_end_io(bp);
4004
39236c6e
A
4005 if (kdebug_enable) {
4006 int code = DKIO_DONE;
4007 int io_tier = GET_BUFATTR_IO_TIER(bap);
9bccf70c 4008
91447636
A
4009 if (bp->b_flags & B_READ)
4010 code |= DKIO_READ;
4011 if (bp->b_flags & B_ASYNC)
4012 code |= DKIO_ASYNC;
9bccf70c 4013
91447636
A
4014 if (bp->b_flags & B_META)
4015 code |= DKIO_META;
4016 else if (bp->b_flags & B_PAGEIO)
4017 code |= DKIO_PAGING;
9bccf70c 4018
39236c6e 4019 if (io_tier != 0)
6d2010ae 4020 code |= DKIO_THROTTLE;
39236c6e
A
4021
4022 code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
4023
4024 if (bp->b_flags & B_PASSIVE)
6d2010ae
A
4025 code |= DKIO_PASSIVE;
4026
39236c6e 4027 if (bap->ba_flags & BA_NOCACHE)
316670eb
A
4028 code |= DKIO_NOCACHE;
4029
d190cdc3
A
4030 if (bap->ba_flags & BA_IO_TIER_UPGRADE) {
4031 code |= DKIO_TIER_UPGRADE;
4032 }
4033
316670eb 4034 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
39236c6e 4035 buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0);
9bccf70c 4036 }
6d2010ae 4037
91447636
A
4038 /*
4039 * I/O was done, so don't believe
6d2010ae
A
4040 * the DIRTY state from VM anymore...
4041 * and we need to reset the THROTTLED/PASSIVE
4042 * indicators
91447636 4043 */
39236c6e 4044 CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
d190cdc3 4045 CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP | BA_IO_TIER_UPGRADE));
39236c6e
A
4046
4047 SET_BUFATTR_IO_TIER(bap, 0);
4048
2d21ac55 4049 DTRACE_IO1(done, buf_t, bp);
b4c24cb9 4050
91447636
A
4051 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
4052 /*
 4053 * wake up any writers blocked
4054 * on throttle or waiting for I/O
4055 * to drain
4056 */
4057 vnode_writedone(bp->b_vp);
4058
4059 if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */
4060 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
6d2010ae 4061 void *arg = bp->b_transaction;
91447636
A
4062 int callout = ISSET(bp->b_flags, B_CALL);
4063
6d2010ae
A
4064 if (iodone_func == NULL)
4065 panic("biodone: bp @ %p has NULL b_iodone!\n", bp);
4066
91447636 4067 CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */
b4c24cb9 4068 bp->b_iodone = NULL;
91447636 4069 bp->b_transaction = NULL;
b4c24cb9 4070
6d2010ae
A
4071 if (callout)
4072 SET(bp->b_flags, B_DONE); /* note that it's done */
2d21ac55 4073
6d2010ae
A
4074 (*iodone_func)(bp, arg);
4075
4076 if (callout) {
4077 /*
2d21ac55 4078 * assumes that the callback function takes
91447636
A
4079 * ownership of the bp and deals with releasing it if necessary
4080 */
2d21ac55
A
4081 goto biodone_done;
4082 }
91447636
A
4083 /*
 4084 * in this case the callback function is acting
4085 * strictly as a filter... it does not take
4086 * ownership of the bp and is expecting us
4087 * to finish cleaning up... this is currently used
4088 * by the HFS journaling code
4089 */
1c79356b 4090 }
91447636
A
4091 if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
4092 SET(bp->b_flags, B_DONE); /* note that it's done */
1c79356b 4093
91447636
A
4094 buf_brelse(bp);
4095 } else { /* or just wakeup the buffer */
4096 /*
4097 * by taking the mutex, we serialize
4098 * the buf owner calling buf_biowait so that we'll
4099 * only see him in one of 2 states...
4100 * state 1: B_DONE wasn't set and he's
4101 * blocked in msleep
4102 * state 2: he's blocked trying to take the
4103 * mutex before looking at B_DONE
4104 * BL_WANTED is cleared in case anyone else
4105 * is blocked waiting for the buffer... note
4106 * that we haven't cleared B_BUSY yet, so if
 4107 * they do get to run, they're going to re-set
4108 * BL_WANTED and go back to sleep
4109 */
2d21ac55 4110 lck_mtx_lock_spin(buf_mtxp);
1c79356b 4111
91447636
A
4112 CLR(bp->b_lflags, BL_WANTED);
4113 SET(bp->b_flags, B_DONE); /* note that it's done */
4114
4115 lck_mtx_unlock(buf_mtxp);
4116
4117 wakeup(bp);
4118 }
4119biodone_done:
4120 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
b0d623f7 4121 (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0);
1c79356b
A
4122}
4123
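/*
 * Illustrative sketch (hypothetical functions, not taken from the code
 * above): an asynchronous issuer opts into the B_CALL path via
 * buf_setcallback(); once buf_biodone() runs, the callback owns the
 * buffer and must release it.
 */
static void
example_iodone(buf_t bp, void *transaction)
{
	(void)transaction;
	buf_brelse(bp);		/* B_CALL callbacks take ownership of the bp */
}

static void
example_issue_async(buf_t bp)
{
	(void)buf_setcallback(bp, example_iodone, NULL);	/* arms the B_CALL callout */
	(void)VNOP_STRATEGY(bp);
}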
39236c6e
A
4124/*
4125 * Obfuscate buf pointers.
4126 */
4127vm_offset_t
4128buf_kernel_addrperm_addr(void * addr)
4129{
4130 if ((vm_offset_t)addr == 0)
4131 return 0;
4132 else
4133 return ((vm_offset_t)addr + buf_kernel_addrperm);
4134}
4135
1c79356b
A
4136/*
4137 * Return a count of buffers on the "locked" queue.
4138 */
4139int
91447636 4140count_lock_queue(void)
1c79356b 4141{
91447636
A
4142 buf_t bp;
4143 int n = 0;
4144
b0d623f7 4145 lck_mtx_lock_spin(buf_mtxp);
1c79356b
A
4146
4147 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
4148 bp = bp->b_freelist.tqe_next)
4149 n++;
91447636
A
4150 lck_mtx_unlock(buf_mtxp);
4151
1c79356b
A
4152 return (n);
4153}
4154
4155/*
4156 * Return a count of 'busy' buffers. Used at the time of shutdown.
316670eb 4157 * Note: this is also called from the Mach side, in debug context, from kdp.c
1c79356b
A
4158 */
4159int
91447636 4160count_busy_buffers(void)
1c79356b 4161{
b0d623f7 4162 return buf_busycount + bufstats.bufs_iobufinuse;
1c79356b
A
4163}
4164
9bccf70c 4165#if DIAGNOSTIC
1c79356b
A
4166/*
4167 * Print out statistics on the current allocation of the buffer pool.
4168 * Can be enabled to print out on every ``sync'' by setting "syncprt"
4169 * in vfs_syscalls.c using sysctl.
4170 */
4171void
4172vfs_bufstats()
4173{
91447636 4174 int i, j, count;
6d2010ae
A
4175 struct buf *bp;
4176 struct bqueues *dp;
91447636
A
4177 int counts[MAXBSIZE/CLBYTES+1];
4178 static char *bname[BQUEUES] =
4179 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4180
4181 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
4182 count = 0;
4183 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
4184 counts[j] = 0;
4185
4186 lck_mtx_lock(buf_mtxp);
4187
4188 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
4189 counts[bp->b_bufsize/CLBYTES]++;
4190 count++;
4191 }
4192 lck_mtx_unlock(buf_mtxp);
4193
4194 printf("%s: total-%d", bname[i], count);
4195 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
4196 if (counts[j] != 0)
4197 printf(", %d-%d", j * CLBYTES, counts[j]);
4198 printf("\n");
4199 }
4200}
4201#endif /* DIAGNOSTIC */
4202
6d2010ae 4203#define NRESERVEDIOBUFS 128
91447636 4204
39037602
A
4205#define MNT_VIRTUALDEV_MAX_IOBUFS 16
4206#define VIRTUALDEV_MAX_IOBUFS ((40*niobuf_headers)/100)
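/*
 * Worked example (illustrative numbers only): if niobuf_headers were 2048
 * at boot, VIRTUALDEV_MAX_IOBUFS would evaluate to (40 * 2048) / 100 == 819,
 * i.e. all diskimage mounts together may hold at most 819 iobuf headers,
 * while any single diskimage mount is capped at 16 unless the request is
 * VM-privileged.
 */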
91447636
A
4207
4208buf_t
4209alloc_io_buf(vnode_t vp, int priv)
4210{
4211 buf_t bp;
39037602
A
4212 mount_t mp = NULL;
4213 int alloc_for_virtualdev = FALSE;
91447636 4214
b0d623f7 4215 lck_mtx_lock_spin(iobuffer_mtxp);
91447636 4216
39037602
A
4217 /*
4218 * We subject iobuf requests for diskimages to additional restrictions.
4219 *
4220 * a) A single diskimage mount cannot use up more than
 4221 * MNT_VIRTUALDEV_MAX_IOBUFS. However, VM-privileged (pageout) requests
4222 * are not subject to this restriction.
 4223 * b) The iobuf headers in use by all diskimage mounts combined
 4224 * cannot exceed VIRTUALDEV_MAX_IOBUFS.
4225 */
4226 if (vp && ((mp = vp->v_mount)) && mp != dead_mountp &&
4227 mp->mnt_kern_flag & MNTK_VIRTUALDEV) {
4228 alloc_for_virtualdev = TRUE;
4229 while ((!priv && mp->mnt_iobufinuse > MNT_VIRTUALDEV_MAX_IOBUFS) ||
4230 bufstats.bufs_iobufinuse_vdev > VIRTUALDEV_MAX_IOBUFS) {
4231 bufstats.bufs_iobufsleeps++;
4232
4233 need_iobuffer = 1;
4234 (void)msleep(&need_iobuffer, iobuffer_mtxp,
4235 PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf (1)",
4236 NULL);
4237 }
4238 }
4239
2d21ac55 4240 while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
91447636
A
4241 (bp = iobufqueue.tqh_first) == NULL) {
4242 bufstats.bufs_iobufsleeps++;
4243
4244 need_iobuffer = 1;
39037602
A
4245 (void)msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1),
4246 (const char *)"alloc_io_buf (2)", NULL);
91447636
A
4247 }
4248 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
4249
4250 bufstats.bufs_iobufinuse++;
4251 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
4252 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
4253
39037602
A
4254 if (alloc_for_virtualdev) {
4255 mp->mnt_iobufinuse++;
4256 bufstats.bufs_iobufinuse_vdev++;
4257 }
4258
91447636
A
4259 lck_mtx_unlock(iobuffer_mtxp);
4260
4261 /*
4262 * initialize various fields
4263 * we don't need to hold the mutex since the buffer
4264 * is now private... the vp should have a reference
4265 * on it and is not protected by this mutex in any event
4266 */
4267 bp->b_timestamp = 0;
4268 bp->b_proc = NULL;
4269
4270 bp->b_datap = 0;
4271 bp->b_flags = 0;
4272 bp->b_lflags = BL_BUSY | BL_IOBUF;
39037602
A
4273 if (alloc_for_virtualdev)
4274 bp->b_lflags |= BL_IOBUF_VDEV;
6d2010ae 4275 bp->b_redundancy_flags = 0;
91447636
A
4276 bp->b_blkno = bp->b_lblkno = 0;
4277#ifdef JOE_DEBUG
4278 bp->b_owner = current_thread();
4279 bp->b_tag = 6;
4280#endif
4281 bp->b_iodone = NULL;
4282 bp->b_error = 0;
4283 bp->b_resid = 0;
4284 bp->b_bcount = 0;
4285 bp->b_bufsize = 0;
4286 bp->b_upl = NULL;
3e170ce0 4287 bp->b_fsprivate = (void *)NULL;
91447636 4288 bp->b_vp = vp;
7ddcb079 4289 bzero(&bp->b_attr, sizeof(struct bufattr));
91447636
A
4290
4291 if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
4292 bp->b_dev = vp->v_rdev;
4293 else
4294 bp->b_dev = NODEV;
4295
4296 return (bp);
4297}
4298
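/*
 * Illustrative sketch (hypothetical caller, not taken from the code above):
 * iobuf headers are used in alloc/free pairs.  A non-privileged request
 * (priv == 0) can sleep once fewer than NRESERVEDIOBUFS headers remain, so
 * callers must be prepared to block.
 */
static void
example_io_buf_roundtrip(vnode_t vp)
{
	buf_t bp;

	bp = alloc_io_buf(vp, 0);	/* may sleep in "alloc_io_buf (2)" */
	/* ... set up b_blkno, b_bcount and b_datap, then issue the I/O ... */
	free_io_buf(bp);
}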
4299
4300void
4301free_io_buf(buf_t bp)
4302{
39037602
A
4303 int need_wakeup = 0;
4304 int free_for_virtualdev = FALSE;
4305 mount_t mp = NULL;
4306
4307 /* Was this iobuf for a diskimage ? */
4308 if (bp->b_lflags & BL_IOBUF_VDEV) {
4309 free_for_virtualdev = TRUE;
4310 if (bp->b_vp)
4311 mp = bp->b_vp->v_mount;
4312 }
91447636
A
4313
4314 /*
4315 * put buffer back on the head of the iobufqueue
4316 */
4317 bp->b_vp = NULL;
4318 bp->b_flags = B_INVAL;
4319
fe8ab488
A
4320 /* Zero out the bufattr and its flags before relinquishing this iobuf */
4321 bzero (&bp->b_attr, sizeof(struct bufattr));
4322
2d21ac55 4323 lck_mtx_lock_spin(iobuffer_mtxp);
91447636
A
4324
4325 binsheadfree(bp, &iobufqueue, -1);
4326
4327 if (need_iobuffer) {
4328 /*
4329 * Wake up any processes waiting because they need an io buffer
4330 *
4331 * do the wakeup after we drop the mutex... it's possible that the
4332 * wakeup will be superfluous if need_iobuffer gets set again and
4333 * another thread runs this path, but it's highly unlikely, doesn't
4334 * hurt, and it means we don't hold up I/O progress if the wakeup blocks
 4335 * trying to grab a task-related lock...
4336 */
4337 need_iobuffer = 0;
4338 need_wakeup = 1;
4339 }
b0d623f7
A
4340 if (bufstats.bufs_iobufinuse <= 0)
4341 panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse < 0", bp);
4342
91447636
A
4343 bufstats.bufs_iobufinuse--;
4344
39037602
A
4345 if (free_for_virtualdev) {
4346 bufstats.bufs_iobufinuse_vdev--;
4347 if (mp && mp != dead_mountp)
4348 mp->mnt_iobufinuse--;
4349 }
4350
91447636
A
4351 lck_mtx_unlock(iobuffer_mtxp);
4352
4353 if (need_wakeup)
4354 wakeup(&need_iobuffer);
4355}
4356
4357
2d21ac55
A
4358void
4359buf_list_lock(void)
4360{
b0d623f7 4361 lck_mtx_lock_spin(buf_mtxp);
2d21ac55
A
4362}
4363
4364void
4365buf_list_unlock(void)
4366{
4367 lck_mtx_unlock(buf_mtxp);
4368}
91447636
A
4369
4370/*
4371 * If getnewbuf() calls bcleanbuf() on the same thread
4372 * there is a potential for stack overrun and deadlocks.
 4373 * So we always hand off the work to a worker thread for completion.
4374 */
91447636
A
4375
4376
4377static void
4378bcleanbuf_thread_init(void)
4379{
b0d623f7
A
4380 thread_t thread = THREAD_NULL;
4381
91447636 4382 /* create worker thread */
b0d623f7
A
4383 kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread);
4384 thread_deallocate(thread);
91447636
A
4385}
4386
6d2010ae
A
4387typedef int (*bcleanbufcontinuation)(int);
4388
39037602 4389__attribute__((noreturn))
91447636
A
4390static void
4391bcleanbuf_thread(void)
4392{
4393 struct buf *bp;
4394 int error = 0;
4395 int loopcnt = 0;
4396
4397 for (;;) {
b0d623f7 4398 lck_mtx_lock_spin(buf_mtxp);
91447636 4399
b0d623f7 4400 while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) {
6d2010ae 4401 (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO|PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread);
b0d623f7 4402 }
6d2010ae 4403
91447636
A
4404 /*
4405 * Remove from the queue
4406 */
4407 bremfree_locked(bp);
2d21ac55
A
4408
4409 /*
4410 * Buffer is no longer on any free list
4411 */
4412 SET(bp->b_lflags, BL_BUSY);
b0d623f7 4413 buf_busycount++;
2d21ac55
A
4414
4415#ifdef JOE_DEBUG
4416 bp->b_owner = current_thread();
4417 bp->b_tag = 10;
4418#endif
91447636
A
4419
4420 lck_mtx_unlock(buf_mtxp);
4421 /*
4422 * do the IO
4423 */
4424 error = bawrite_internal(bp, 0);
4425
4426 if (error) {
2d21ac55
A
4427 bp->b_whichq = BQ_LAUNDRY;
4428 bp->b_timestamp = buf_timestamp();
4429
4430 lck_mtx_lock_spin(buf_mtxp);
91447636
A
4431
4432 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
4433 blaundrycnt++;
4434
6d2010ae 4435 /* we never leave a busy buffer on the laundry queue */
2d21ac55 4436 CLR(bp->b_lflags, BL_BUSY);
b0d623f7 4437 buf_busycount--;
2d21ac55
A
4438#ifdef JOE_DEBUG
4439 bp->b_owner = current_thread();
4440 bp->b_tag = 11;
4441#endif
4442
91447636 4443 lck_mtx_unlock(buf_mtxp);
6d2010ae
A
4444
4445 if (loopcnt > MAXLAUNDRY) {
4446 /*
4447 * bawrite_internal() can return errors if we're throttled. If we've
4448 * done several I/Os and failed, give the system some time to unthrottle
4449 * the vnode
4450 */
4451 (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1);
91447636
A
4452 loopcnt = 0;
4453 } else {
6d2010ae
A
4454 /* give other threads a chance to run */
4455 (void)thread_block(THREAD_CONTINUE_NULL);
91447636
A
4456 loopcnt++;
4457 }
4458 }
4459 }
4460}
4461
4462
4463static int
4464brecover_data(buf_t bp)
4465{
4466 int upl_offset;
4467 upl_t upl;
4468 upl_page_info_t *pl;
4469 kern_return_t kret;
4470 vnode_t vp = bp->b_vp;
4471 int upl_flags;
4472
4473
4474 if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
4475 goto dump_buffer;
4476
4477 upl_flags = UPL_PRECIOUS;
4478 if (! (buf_flags(bp) & B_READ)) {
4479 /*
4480 * "write" operation: let the UPL subsystem know
4481 * that we intend to modify the buffer cache pages we're
4482 * gathering.
4483 */
4484 upl_flags |= UPL_WILL_MODIFY;
4485 }
4486
4487 kret = ubc_create_upl(vp,
4488 ubc_blktooff(vp, bp->b_lblkno),
4489 bp->b_bufsize,
4490 &upl,
4491 &pl,
4492 upl_flags);
4493 if (kret != KERN_SUCCESS)
4494 panic("Failed to create UPL");
4495
4496 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
4497
4498 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
4499 ubc_upl_abort(upl, 0);
4500 goto dump_buffer;
4501 }
4502 }
4503 bp->b_upl = upl;
4504
b0d623f7 4505 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
91447636
A
4506
4507 if (kret != KERN_SUCCESS)
4508 panic("getblk: ubc_upl_map() failed with (%d)", kret);
4509 return (1);
4510
4511dump_buffer:
4512 bp->b_bufsize = 0;
4513 SET(bp->b_flags, B_INVAL);
4514 buf_brelse(bp);
4515
4516 return(0);
4517}
4518
813fb2f6
A
4519int
4520fs_buffer_cache_gc_register(void (* callout)(int, void *), void *context)
4521{
4522 lck_mtx_lock(buf_gc_callout);
4523 for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4524 if (fs_callouts[i].callout == NULL) {
4525 fs_callouts[i].callout = callout;
4526 fs_callouts[i].context = context;
4527 lck_mtx_unlock(buf_gc_callout);
4528 return 0;
4529 }
4530 }
4531
4532 lck_mtx_unlock(buf_gc_callout);
4533 return ENOMEM;
4534}
4535
4536int
4537fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context)
4538{
4539 lck_mtx_lock(buf_gc_callout);
4540 for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4541 if (fs_callouts[i].callout == callout &&
4542 fs_callouts[i].context == context) {
4543 fs_callouts[i].callout = NULL;
4544 fs_callouts[i].context = NULL;
4545 }
4546 }
4547 lck_mtx_unlock(buf_gc_callout);
4548 return 0;
4549}
4550
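/*
 * Illustrative sketch (hypothetical filesystem hook, not taken from the
 * code above): a filesystem with private metadata caches can register a
 * GC callout; it later receives the same "all" flag buffer_cache_gc()
 * was invoked with.
 */
static void
example_fs_trim_caches(int all, void *context)
{
	(void)context;
	if (all) {
		/* preparing for deep sleep: release everything that can be rebuilt */
	}
}

static int
example_fs_hook_gc(void *fs_private)
{
	/* ENOMEM means all FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE slots are taken */
	return fs_buffer_cache_gc_register(example_fs_trim_caches, fs_private);
}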
4551static void
4552fs_buffer_cache_gc_dispatch_callouts(int all)
4553{
4554 lck_mtx_lock(buf_gc_callout);
4555 for(int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4556 if (fs_callouts[i].callout != NULL) {
4557 fs_callouts[i].callout(all, fs_callouts[i].context);
4558 }
4559 }
4560 lck_mtx_unlock(buf_gc_callout);
4561}
4562
b7266188 4563boolean_t
0b4c1975 4564buffer_cache_gc(int all)
b0d623f7
A
4565{
4566 buf_t bp;
4567 boolean_t did_large_zfree = FALSE;
6d2010ae 4568 boolean_t need_wakeup = FALSE;
b0d623f7 4569 int now = buf_timestamp();
316670eb 4570 uint32_t found = 0;
6d2010ae 4571 struct bqueues privq;
0b4c1975
A
4572 int thresh_hold = BUF_STALE_THRESHHOLD;
4573
4574 if (all)
4575 thresh_hold = 0;
6d2010ae
A
4576 /*
4577 * We only care about metadata (incore storage comes from zalloc()).
316670eb
A
 4578 * Unless "all" is set (used to evict metadata buffers in preparation
4579 * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers
fe8ab488
A
 4580 * that have not been accessed in the last BUF_STALE_THRESHHOLD seconds.
4581 * BUF_MAX_GC_BATCH_SIZE controls both the hold time of the global lock
 4582 * "buf_mtxp" and the length of time we spend compute-bound in the GC
 4583 * thread which calls this function.
6d2010ae
A
4584 */
4585 lck_mtx_lock(buf_mtxp);
316670eb 4586
6d2010ae
A
4587 do {
4588 found = 0;
4589 TAILQ_INIT(&privq);
4590 need_wakeup = FALSE;
b0d623f7 4591
6d2010ae
A
4592 while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) &&
4593 (now > bp->b_timestamp) &&
4594 (now - bp->b_timestamp > thresh_hold) &&
4595 (found < BUF_MAX_GC_BATCH_SIZE)) {
4596
4597 /* Remove from free list */
4598 bremfree_locked(bp);
4599 found++;
4600
4601#ifdef JOE_DEBUG
4602 bp->b_owner = current_thread();
4603 bp->b_tag = 12;
4604#endif
4605
4606 /* If dirty, move to laundry queue and remember to do wakeup */
4607 if (ISSET(bp->b_flags, B_DELWRI)) {
4608 SET(bp->b_lflags, BL_WANTDEALLOC);
4609
4610 bmovelaundry(bp);
4611 need_wakeup = TRUE;
4612
4613 continue;
4614 }
4615
4616 /*
4617 * Mark busy and put on private list. We could technically get
4618 * away without setting BL_BUSY here.
4619 */
4620 SET(bp->b_lflags, BL_BUSY);
4621 buf_busycount++;
b0d623f7 4622
6d2010ae
A
4623 /*
4624 * Remove from hash and dissociate from vp.
4625 */
4626 bremhash(bp);
4627 if (bp->b_vp) {
4628 brelvp_locked(bp);
4629 }
b0d623f7 4630
6d2010ae
A
4631 TAILQ_INSERT_TAIL(&privq, bp, b_freelist);
4632 }
b0d623f7 4633
6d2010ae
A
4634 if (found == 0) {
4635 break;
4636 }
b0d623f7 4637
6d2010ae
A
4638 /* Drop lock for batch processing */
4639 lck_mtx_unlock(buf_mtxp);
4640
4641 /* Wakeup and yield for laundry if need be */
4642 if (need_wakeup) {
4643 wakeup(&bufqueues[BQ_LAUNDRY]);
4644 (void)thread_block(THREAD_CONTINUE_NULL);
b0d623f7 4645 }
6d2010ae
A
4646
4647 /* Clean up every buffer on private list */
4648 TAILQ_FOREACH(bp, &privq, b_freelist) {
4649 /* Take note if we've definitely freed at least a page to a zone */
4650 if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) {
4651 did_large_zfree = TRUE;
4652 }
4653
4654 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
4655
4656 /* Free Storage */
4657 buf_free_meta_store(bp);
4658
4659 /* Release credentials */
4660 buf_release_credentials(bp);
4661
4662 /* Prepare for moving to empty queue */
4663 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED
4664 | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
4665 bp->b_whichq = BQ_EMPTY;
4666 BLISTNONE(bp);
4667 }
6d2010ae
A
4668 lck_mtx_lock(buf_mtxp);
4669
4670 /* Back under lock, move them all to invalid hash and clear busy */
4671 TAILQ_FOREACH(bp, &privq, b_freelist) {
4672 binshash(bp, &invalhash);
4673 CLR(bp->b_lflags, BL_BUSY);
4674 buf_busycount--;
4675
4676#ifdef JOE_DEBUG
4677 if (bp->b_owner != current_thread()) {
4678 panic("Buffer stolen from buffer_cache_gc()");
4679 }
4680 bp->b_owner = current_thread();
4681 bp->b_tag = 13;
4682#endif
4683 }
4684
4685 /* And do a big bulk move to the empty queue */
4686 TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist);
6d2010ae 4687
316670eb 4688 } while (all && (found == BUF_MAX_GC_BATCH_SIZE));
b0d623f7
A
4689
4690 lck_mtx_unlock(buf_mtxp);
4691
813fb2f6
A
4692 fs_buffer_cache_gc_dispatch_callouts(all);
4693
b0d623f7
A
4694 return did_large_zfree;
4695}
91447636
A
4696
4697
4698/*
4699 * disabled for now
4700 */
4701
4702#if FLUSH_QUEUES
4703
4704#define NFLUSH 32
4705
4706static int
4707bp_cmp(void *a, void *b)
4708{
4709 buf_t *bp_a = *(buf_t **)a,
4710 *bp_b = *(buf_t **)b;
4711 daddr64_t res;
1c79356b 4712
91447636
A
4713 // don't have to worry about negative block
4714 // numbers so this is ok to do.
4715 //
4716 res = (bp_a->b_blkno - bp_b->b_blkno);
4717
4718 return (int)res;
1c79356b 4719}
1c79356b
A
4720
4721
91447636
A
4722int
4723bflushq(int whichq, mount_t mp)
1c79356b 4724{
91447636
A
4725 buf_t bp, next;
4726 int i, buf_count;
4727 int total_writes = 0;
4728 static buf_t flush_table[NFLUSH];
1c79356b 4729
91447636
A
4730 if (whichq < 0 || whichq >= BQUEUES) {
4731 return (0);
0b4e3aa0
A
4732 }
4733
91447636
A
4734 restart:
4735 lck_mtx_lock(buf_mtxp);
0b4e3aa0 4736
91447636 4737 bp = TAILQ_FIRST(&bufqueues[whichq]);
1c79356b 4738
91447636
A
4739 for (buf_count = 0; bp; bp = next) {
4740 next = bp->b_freelist.tqe_next;
4741
4742 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
4743 continue;
4744 }
b4c24cb9 4745
91447636 4746 if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
1c79356b 4747
91447636
A
4748 bremfree_locked(bp);
4749#ifdef JOE_DEBUG
4750 bp->b_owner = current_thread();
4751 bp->b_tag = 7;
4752#endif
4753 SET(bp->b_lflags, BL_BUSY);
b0d623f7
A
4754 buf_busycount++;
4755
91447636
A
4756 flush_table[buf_count] = bp;
4757 buf_count++;
4758 total_writes++;
1c79356b 4759
91447636
A
4760 if (buf_count >= NFLUSH) {
4761 lck_mtx_unlock(buf_mtxp);
1c79356b 4762
91447636 4763 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
1c79356b 4764
91447636
A
4765 for (i = 0; i < buf_count; i++) {
4766 buf_bawrite(flush_table[i]);
4767 }
4768 goto restart;
4769 }
4770 }
4771 }
4772 lck_mtx_unlock(buf_mtxp);
1c79356b 4773
91447636
A
4774 if (buf_count > 0) {
4775 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
1c79356b 4776
91447636
A
4777 for (i = 0; i < buf_count; i++) {
4778 buf_bawrite(flush_table[i]);
4779 }
1c79356b 4780 }
91447636
A
4781
4782 return (total_writes);
1c79356b 4783}
91447636 4784#endif