1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*-
30 * Copyright (c) 1994 Christopher G. Demetriou
31 * Copyright (c) 1982, 1986, 1989, 1993
32 * The Regents of the University of California. All rights reserved.
33 * (c) UNIX System Laboratories, Inc.
34 * All or some portions of this file are derived from material licensed
35 * to the University of California by American Telephone and Telegraph
36 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
37 * the permission of UNIX System Laboratories, Inc.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 * must display the following acknowledgement:
49 * This product includes software developed by the University of
50 * California, Berkeley and its contributors.
51 * 4. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
68 */
69
70 /*
71 * Some references:
72 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
73 * Leffler, et al.: The Design and Implementation of the 4.3BSD
  74  *      UNIX Operating System (Addison-Wesley, 1989)
75 */
76
77 #include <sys/param.h>
78 #include <sys/systm.h>
79 #include <sys/proc_internal.h>
80 #include <sys/buf_internal.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/trace.h>
84 #include <kern/kalloc.h>
85 #include <sys/resourcevar.h>
86 #include <miscfs/specfs/specdev.h>
87 #include <sys/ubc.h>
88 #include <sys/kauth.h>
89 #if DIAGNOSTIC
90 #include <kern/assert.h>
91 #endif /* DIAGNOSTIC */
92 #include <kern/task.h>
93 #include <kern/zalloc.h>
94 #include <kern/locks.h>
95 #include <kern/thread.h>
96
97 #include <sys/fslog.h> /* fslog_io_error() */
98 #include <sys/disk.h> /* dk_error_description_t */
99
100 #include <mach/mach_types.h>
101 #include <mach/memory_object_types.h>
102 #include <kern/sched_prim.h> /* thread_block() */
103
104 #include <vm/vm_kern.h>
105 #include <vm/vm_pageout.h>
106
107 #include <sys/kdebug.h>
108
109 #include <libkern/OSAtomic.h>
110 #include <libkern/OSDebug.h>
111 #include <sys/ubc_internal.h>
112
113 #include <sys/sdt.h>
114
115 int bcleanbuf(buf_t bp, boolean_t discard);
116 static int brecover_data(buf_t bp);
117 static boolean_t incore(vnode_t vp, daddr64_t blkno);
118 /* timeout is in msecs */
119 static buf_t getnewbuf(int slpflag, int slptimeo, int *queue);
120 static void bremfree_locked(buf_t bp);
121 static void buf_reassign(buf_t bp, vnode_t newvp);
122 static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
123 static int buf_iterprepare(vnode_t vp, struct buflists *, int flags);
124 static void buf_itercomplete(vnode_t vp, struct buflists *, int flags);
125 static boolean_t buffer_cache_gc(int);
126 static buf_t buf_brelse_shadow(buf_t bp);
127 static void buf_free_meta_store(buf_t bp);
128
129 static buf_t buf_create_shadow_internal(buf_t bp, boolean_t force_copy,
130 uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);
131
132
133 int bdwrite_internal(buf_t, int);
134
135 extern void disk_conditioner_delay(buf_t, int, int, uint64_t);
136
137 /* zone allocated buffer headers */
138 static void bcleanbuf_thread_init(void);
139 static void bcleanbuf_thread(void);
140
141 static ZONE_DECLARE(buf_hdr_zone, "buf headers", sizeof(struct buf), ZC_NONE);
142 static int buf_hdr_count;
143
144
145 /*
146 * Definitions for the buffer hash lists.
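 *
 * BUFHASH folds the vnode pointer (scaled by sizeof(struct vnode) so that
 * pointer alignment doesn't collapse the distribution) together with the
 * logical block number, then masks the sum with 'bufhash' (the
 * power-of-two-minus-one mask handed back by hashinit()) to select a chain.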
147 */
148 #define BUFHASH(dvp, lbn) \
149 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
150 LIST_HEAD(bufhashhdr, buf) * bufhashtbl, invalhash;
151 u_long bufhash;
152
153 static buf_t incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);
154
155 /* Definitions for the buffer stats. */
156 struct bufstats bufstats;
157
158 /* Number of delayed write buffers */
159 long nbdwrite = 0;
160 int blaundrycnt = 0;
161 static int boot_nbuf_headers = 0;
162
163 static TAILQ_HEAD(delayqueue, buf) delaybufqueue;
164
165 static TAILQ_HEAD(ioqueue, buf) iobufqueue;
166 static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
167 static int needbuffer;
168 static int need_iobuffer;
169
170 static LCK_GRP_DECLARE(buf_mtx_grp, "buffer cache");
171 static LCK_ATTR_DECLARE(buf_mtx_attr, 0, 0);
172 static LCK_MTX_DECLARE_ATTR(iobuffer_mtxp, &buf_mtx_grp, &buf_mtx_attr);
173 static LCK_MTX_DECLARE_ATTR(buf_mtx, &buf_mtx_grp, &buf_mtx_attr);
174 static LCK_MTX_DECLARE_ATTR(buf_gc_callout, &buf_mtx_grp, &buf_mtx_attr);
175
176 static uint32_t buf_busycount;
177
178 #define FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE 16
179 typedef struct {
180 void (* callout)(int, void *);
181 void *context;
182 } fs_buffer_cache_gc_callout_t;
183
184 fs_buffer_cache_gc_callout_t fs_callouts[FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE] = { {NULL, NULL} };
185
186 static __inline__ int
187 buf_timestamp(void)
188 {
189 struct timeval t;
190 microuptime(&t);
191 return (int)t.tv_sec;
192 }
193
194 /*
195 * Insq/Remq for the buffer free lists.
196 */
197 #define binsheadfree(bp, dp, whichq) do { \
198 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
199 } while (0)
200
201 #define binstailfree(bp, dp, whichq) do { \
202 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
203 } while (0)
204
205 #define BHASHENTCHECK(bp) \
206 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
207 panic("%p: b_hash.le_prev is not deadbeef", (bp));
208
209 #define BLISTNONE(bp) \
210 (bp)->b_hash.le_next = (struct buf *)0; \
211 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
212
213 /*
214 * Insq/Remq for the vnode usage lists.
215 */
216 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
217 #define bufremvn(bp) { \
218 LIST_REMOVE(bp, b_vnbufs); \
219 (bp)->b_vnbufs.le_next = NOLIST; \
220 }
221
222 /*
223 * Time in seconds before a buffer on a list is
224 * considered as a stale buffer
225 */
226 #define LRU_IS_STALE 120 /* default value for the LRU */
227 #define AGE_IS_STALE 60 /* default value for the AGE */
228 #define META_IS_STALE 180 /* default value for the BQ_META */
229
230 int lru_is_stale = LRU_IS_STALE;
231 int age_is_stale = AGE_IS_STALE;
232 int meta_is_stale = META_IS_STALE;
233
234 #define MAXLAUNDRY 10
235
236 /* LIST_INSERT_HEAD() with assertions */
237 static __inline__ void
238 blistenterhead(struct bufhashhdr * head, buf_t bp)
239 {
240 if ((bp->b_hash.le_next = (head)->lh_first) != NULL) {
241 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
242 }
243 (head)->lh_first = bp;
244 bp->b_hash.le_prev = &(head)->lh_first;
245 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) {
246 panic("blistenterhead: le_prev is deadbeef");
247 }
248 }
249
250 static __inline__ void
251 binshash(buf_t bp, struct bufhashhdr *dp)
252 {
253 #if DIAGNOSTIC
254 buf_t nbp;
255 #endif /* DIAGNOSTIC */
256
257 BHASHENTCHECK(bp);
258
259 #if DIAGNOSTIC
260 nbp = dp->lh_first;
261 for (; nbp != NULL; nbp = nbp->b_hash.le_next) {
262 if (nbp == bp) {
263 panic("buf already in hashlist");
264 }
265 }
266 #endif /* DIAGNOSTIC */
267
268 blistenterhead(dp, bp);
269 }
270
271 static __inline__ void
272 bremhash(buf_t bp)
273 {
274 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) {
275 panic("bremhash le_prev is deadbeef");
276 }
277 if (bp->b_hash.le_next == bp) {
278 panic("bremhash: next points to self");
279 }
280
281 if (bp->b_hash.le_next != NULL) {
282 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
283 }
284 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
285 }
286
287 /*
288 * buf_mtx held.
289 */
290 static __inline__ void
291 bmovelaundry(buf_t bp)
292 {
293 bp->b_whichq = BQ_LAUNDRY;
294 bp->b_timestamp = buf_timestamp();
295 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
296 blaundrycnt++;
297 }
298
299 static __inline__ void
300 buf_release_credentials(buf_t bp)
301 {
302 if (IS_VALID_CRED(bp->b_rcred)) {
303 kauth_cred_unref(&bp->b_rcred);
304 }
305 if (IS_VALID_CRED(bp->b_wcred)) {
306 kauth_cred_unref(&bp->b_wcred);
307 }
308 }
309
310
311 int
312 buf_valid(buf_t bp)
313 {
314 if ((bp->b_flags & (B_DONE | B_DELWRI))) {
315 return 1;
316 }
317 return 0;
318 }
319
320 int
321 buf_fromcache(buf_t bp)
322 {
323 if ((bp->b_flags & B_CACHE)) {
324 return 1;
325 }
326 return 0;
327 }
328
329 void
330 buf_markinvalid(buf_t bp)
331 {
332 SET(bp->b_flags, B_INVAL);
333 }
334
335 void
336 buf_markdelayed(buf_t bp)
337 {
338 if (!ISSET(bp->b_flags, B_DELWRI)) {
339 SET(bp->b_flags, B_DELWRI);
340
341 OSAddAtomicLong(1, &nbdwrite);
342 buf_reassign(bp, bp->b_vp);
343 }
344 SET(bp->b_flags, B_DONE);
345 }
346
347 void
348 buf_markclean(buf_t bp)
349 {
350 if (ISSET(bp->b_flags, B_DELWRI)) {
351 CLR(bp->b_flags, B_DELWRI);
352
353 OSAddAtomicLong(-1, &nbdwrite);
354 buf_reassign(bp, bp->b_vp);
355 }
356 }
357
358 void
359 buf_markeintr(buf_t bp)
360 {
361 SET(bp->b_flags, B_EINTR);
362 }
363
364
365 void
366 buf_markaged(buf_t bp)
367 {
368 SET(bp->b_flags, B_AGE);
369 }
370
371 int
372 buf_fua(buf_t bp)
373 {
374 if ((bp->b_flags & B_FUA) == B_FUA) {
375 return 1;
376 }
377 return 0;
378 }
379
380 void
381 buf_markfua(buf_t bp)
382 {
383 SET(bp->b_flags, B_FUA);
384 }
385
386 #if CONFIG_PROTECT
387 cpx_t
388 bufattr_cpx(bufattr_t bap)
389 {
390 return bap->ba_cpx;
391 }
392
393 void
394 bufattr_setcpx(bufattr_t bap, cpx_t cpx)
395 {
396 bap->ba_cpx = cpx;
397 }
398
399 void
400 buf_setcpoff(buf_t bp, uint64_t foffset)
401 {
402 bp->b_attr.ba_cp_file_off = foffset;
403 }
404
405 uint64_t
406 bufattr_cpoff(bufattr_t bap)
407 {
408 return bap->ba_cp_file_off;
409 }
410
411 void
412 bufattr_setcpoff(bufattr_t bap, uint64_t foffset)
413 {
414 bap->ba_cp_file_off = foffset;
415 }
416
 417 #else // !CONFIG_PROTECT
418
419 uint64_t
420 bufattr_cpoff(bufattr_t bap __unused)
421 {
422 return 0;
423 }
424
425 void
426 bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset)
427 {
428 return;
429 }
430
431 struct cpx *
432 bufattr_cpx(__unused bufattr_t bap)
433 {
434 return NULL;
435 }
436
437 void
438 bufattr_setcpx(__unused bufattr_t bap, __unused struct cpx *cpx)
439 {
440 }
441
442 #endif /* !CONFIG_PROTECT */
443
444 bufattr_t
445 bufattr_alloc(void)
446 {
447 return kheap_alloc(KHEAP_DEFAULT, sizeof(struct bufattr),
448 Z_WAITOK | Z_ZERO);
449 }
450
451 void
452 bufattr_free(bufattr_t bap)
453 {
454 kheap_free(KHEAP_DEFAULT, bap, sizeof(struct bufattr));
455 }
456
457 bufattr_t
458 bufattr_dup(bufattr_t bap)
459 {
460 bufattr_t new_bufattr;
461 new_bufattr = kheap_alloc(KHEAP_DEFAULT, sizeof(struct bufattr),
462 Z_WAITOK);
463 if (new_bufattr == NULL) {
464 return NULL;
465 }
466
467 /* Copy the provided one into the new copy */
468 memcpy(new_bufattr, bap, sizeof(struct bufattr));
469 return new_bufattr;
470 }
471
472 int
473 bufattr_rawencrypted(bufattr_t bap)
474 {
475 if ((bap->ba_flags & BA_RAW_ENCRYPTED_IO)) {
476 return 1;
477 }
478 return 0;
479 }
480
481 int
482 bufattr_throttled(bufattr_t bap)
483 {
484 return GET_BUFATTR_IO_TIER(bap);
485 }
486
487 int
488 bufattr_passive(bufattr_t bap)
489 {
490 if ((bap->ba_flags & BA_PASSIVE)) {
491 return 1;
492 }
493 return 0;
494 }
495
496 int
497 bufattr_nocache(bufattr_t bap)
498 {
499 if ((bap->ba_flags & BA_NOCACHE)) {
500 return 1;
501 }
502 return 0;
503 }
504
505 int
506 bufattr_meta(bufattr_t bap)
507 {
508 if ((bap->ba_flags & BA_META)) {
509 return 1;
510 }
511 return 0;
512 }
513
514 void
515 bufattr_markmeta(bufattr_t bap)
516 {
517 SET(bap->ba_flags, BA_META);
518 }
519
520 int
521 bufattr_delayidlesleep(bufattr_t bap)
522 {
523 if ((bap->ba_flags & BA_DELAYIDLESLEEP)) {
524 return 1;
525 }
526 return 0;
527 }
528
529 bufattr_t
530 buf_attr(buf_t bp)
531 {
532 return &bp->b_attr;
533 }
534
535 void
536 buf_markstatic(buf_t bp __unused)
537 {
538 SET(bp->b_flags, B_STATICCONTENT);
539 }
540
541 int
542 buf_static(buf_t bp)
543 {
544 if ((bp->b_flags & B_STATICCONTENT)) {
545 return 1;
546 }
547 return 0;
548 }
549
550 void
551 bufattr_markgreedymode(bufattr_t bap)
552 {
553 SET(bap->ba_flags, BA_GREEDY_MODE);
554 }
555
556 int
557 bufattr_greedymode(bufattr_t bap)
558 {
559 if ((bap->ba_flags & BA_GREEDY_MODE)) {
560 return 1;
561 }
562 return 0;
563 }
564
565 void
566 bufattr_markisochronous(bufattr_t bap)
567 {
568 SET(bap->ba_flags, BA_ISOCHRONOUS);
569 }
570
571 int
572 bufattr_isochronous(bufattr_t bap)
573 {
574 if ((bap->ba_flags & BA_ISOCHRONOUS)) {
575 return 1;
576 }
577 return 0;
578 }
579
580 void
581 bufattr_markquickcomplete(bufattr_t bap)
582 {
583 SET(bap->ba_flags, BA_QUICK_COMPLETE);
584 }
585
586 int
587 bufattr_quickcomplete(bufattr_t bap)
588 {
589 if ((bap->ba_flags & BA_QUICK_COMPLETE)) {
590 return 1;
591 }
592 return 0;
593 }
594
595 void
596 bufattr_markioscheduled(bufattr_t bap)
597 {
598 SET(bap->ba_flags, BA_IO_SCHEDULED);
599 }
600
601
602 int
603 bufattr_ioscheduled(bufattr_t bap)
604 {
605 if ((bap->ba_flags & BA_IO_SCHEDULED)) {
606 return 1;
607 }
608 return 0;
609 }
610
611 void
612 bufattr_markexpeditedmeta(bufattr_t bap)
613 {
614 SET(bap->ba_flags, BA_EXPEDITED_META_IO);
615 }
616
617 int
618 bufattr_expeditedmeta(bufattr_t bap)
619 {
620 if ((bap->ba_flags & BA_EXPEDITED_META_IO)) {
621 return 1;
622 }
623 return 0;
624 }
625
626 errno_t
627 buf_error(buf_t bp)
628 {
629 return bp->b_error;
630 }
631
632 void
633 buf_seterror(buf_t bp, errno_t error)
634 {
635 if ((bp->b_error = error)) {
636 SET(bp->b_flags, B_ERROR);
637 } else {
638 CLR(bp->b_flags, B_ERROR);
639 }
640 }
641
642 void
643 buf_setflags(buf_t bp, int32_t flags)
644 {
645 SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
646 }
647
648 void
649 buf_clearflags(buf_t bp, int32_t flags)
650 {
651 CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
652 }
653
654 int32_t
655 buf_flags(buf_t bp)
656 {
657 return bp->b_flags & BUF_X_RDFLAGS;
658 }
659
660 void
661 buf_reset(buf_t bp, int32_t io_flags)
662 {
663 CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
664 SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
665
666 bp->b_error = 0;
667 }
668
669 uint32_t
670 buf_count(buf_t bp)
671 {
672 return bp->b_bcount;
673 }
674
675 void
676 buf_setcount(buf_t bp, uint32_t bcount)
677 {
678 bp->b_bcount = bcount;
679 }
680
681 uint32_t
682 buf_size(buf_t bp)
683 {
684 return bp->b_bufsize;
685 }
686
687 void
688 buf_setsize(buf_t bp, uint32_t bufsize)
689 {
690 bp->b_bufsize = bufsize;
691 }
692
693 uint32_t
694 buf_resid(buf_t bp)
695 {
696 return bp->b_resid;
697 }
698
699 void
700 buf_setresid(buf_t bp, uint32_t resid)
701 {
702 bp->b_resid = resid;
703 }
704
705 uint32_t
706 buf_dirtyoff(buf_t bp)
707 {
708 return bp->b_dirtyoff;
709 }
710
711 uint32_t
712 buf_dirtyend(buf_t bp)
713 {
714 return bp->b_dirtyend;
715 }
716
717 void
718 buf_setdirtyoff(buf_t bp, uint32_t dirtyoff)
719 {
720 bp->b_dirtyoff = dirtyoff;
721 }
722
723 void
724 buf_setdirtyend(buf_t bp, uint32_t dirtyend)
725 {
726 bp->b_dirtyend = dirtyend;
727 }
728
729 uintptr_t
730 buf_dataptr(buf_t bp)
731 {
732 return bp->b_datap;
733 }
734
735 void
736 buf_setdataptr(buf_t bp, uintptr_t data)
737 {
738 bp->b_datap = data;
739 }
740
741 vnode_t
742 buf_vnode(buf_t bp)
743 {
744 return bp->b_vp;
745 }
746
747 void
748 buf_setvnode(buf_t bp, vnode_t vp)
749 {
750 bp->b_vp = vp;
751 }
752
753
754 void *
755 buf_callback(buf_t bp)
756 {
757 if (!(bp->b_flags & B_CALL)) {
758 return (void *) NULL;
759 }
760
761 return (void *)bp->b_iodone;
762 }
763
764
765 errno_t
766 buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
767 {
768 assert(!ISSET(bp->b_flags, B_FILTER) && ISSET(bp->b_lflags, BL_BUSY));
769
770 if (callback) {
771 bp->b_flags |= (B_CALL | B_ASYNC);
772 } else {
773 bp->b_flags &= ~B_CALL;
774 }
775 bp->b_transaction = transaction;
776 bp->b_iodone = callback;
777
778 return 0;
779 }
780
781 errno_t
782 buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
783 {
784 if (!(bp->b_lflags & BL_IOBUF)) {
785 return EINVAL;
786 }
787
788 if (upl) {
789 bp->b_flags |= B_CLUSTER;
790 } else {
791 bp->b_flags &= ~B_CLUSTER;
792 }
793 bp->b_upl = upl;
794 bp->b_uploffset = offset;
795
796 return 0;
797 }
798
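/*
 * Clone a sub-range of an existing buffer so a partial I/O can be issued
 * against it.  The clone shares the parent's UPL (B_CLUSTER case) or points
 * into the parent's data at 'io_offset', inherits the parent's I/O related
 * flags, and is sized to 'io_size'.  NULL is returned if the range falls
 * outside the parent's b_bcount, or if a clustered sub-range would break
 * the required page alignment.  An optional iodone callback and argument
 * may be attached to the clone.
 */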
799 buf_t
800 buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
801 {
802 buf_t io_bp;
803 int add1, add2;
804
805 if (io_offset < 0 || io_size < 0) {
806 return NULL;
807 }
808
809 if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount) {
810 return NULL;
811 }
812
813 if (bp->b_flags & B_CLUSTER) {
814 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK)) {
815 return NULL;
816 }
817
818 if (os_add_overflow(io_offset, io_size, &add1) || os_add_overflow(add1, bp->b_uploffset, &add2)) {
819 return NULL;
820 }
821 if ((add2 & PAGE_MASK) && ((uint32_t)add1 < (uint32_t)bp->b_bcount)) {
822 return NULL;
823 }
824 }
825 io_bp = alloc_io_buf(bp->b_vp, 0);
826
827 io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);
828
829 if (iodone) {
830 io_bp->b_transaction = arg;
831 io_bp->b_iodone = iodone;
832 io_bp->b_flags |= B_CALL;
833 }
834 if (bp->b_flags & B_CLUSTER) {
835 io_bp->b_upl = bp->b_upl;
836 io_bp->b_uploffset = bp->b_uploffset + io_offset;
837 } else {
838 io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
839 }
840 io_bp->b_bcount = io_size;
841
842 return io_bp;
843 }
844
845
846 int
847 buf_shadow(buf_t bp)
848 {
849 if (bp->b_lflags & BL_SHADOW) {
850 return 1;
851 }
852 return 0;
853 }
854
855
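/*
 * Shadow buffers: a shadow is an additional buf header layered over a
 * meta-data buffer so that another I/O can be issued against the same
 * (or caller-supplied) storage.  Unless force_copy is requested, the
 * shadow references the parent's data directly; the parent tracks its
 * shadows through the b_shadow chain and b_shadow_ref count and is kept
 * off the freelists while shadows are outstanding (see bremfree_locked
 * and buf_brelse_shadow).  buf_create_shadow_priv() differs from
 * buf_create_shadow() only in passing a non-zero 'priv' to alloc_io_buf().
 */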
856 buf_t
857 buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
858 {
859 return buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1);
860 }
861
862 buf_t
863 buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
864 {
865 return buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0);
866 }
867
868
869 static buf_t
870 buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv)
871 {
872 buf_t io_bp;
873
874 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0);
875
876 if (!(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) {
877 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0);
878 return NULL;
879 }
880 #ifdef BUF_MAKE_PRIVATE
881 if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0) {
882 panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref);
883 }
884 #endif
885 io_bp = alloc_io_buf(bp->b_vp, priv);
886
887 io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA);
888 io_bp->b_blkno = bp->b_blkno;
889 io_bp->b_lblkno = bp->b_lblkno;
890
891 if (iodone) {
892 io_bp->b_transaction = arg;
893 io_bp->b_iodone = iodone;
894 io_bp->b_flags |= B_CALL;
895 }
896 if (force_copy == FALSE) {
897 io_bp->b_bcount = bp->b_bcount;
898 io_bp->b_bufsize = bp->b_bufsize;
899
900 if (external_storage) {
901 io_bp->b_datap = external_storage;
902 #ifdef BUF_MAKE_PRIVATE
903 io_bp->b_data_store = NULL;
904 #endif
905 } else {
906 io_bp->b_datap = bp->b_datap;
907 #ifdef BUF_MAKE_PRIVATE
908 io_bp->b_data_store = bp;
909 #endif
910 }
911 *(buf_t *)(&io_bp->b_orig) = bp;
912
913 lck_mtx_lock_spin(&buf_mtx);
914
915 io_bp->b_lflags |= BL_SHADOW;
916 io_bp->b_shadow = bp->b_shadow;
917 bp->b_shadow = io_bp;
918 bp->b_shadow_ref++;
919
920 #ifdef BUF_MAKE_PRIVATE
921 if (external_storage) {
922 io_bp->b_lflags |= BL_EXTERNAL;
923 } else {
924 bp->b_data_ref++;
925 }
926 #endif
927 lck_mtx_unlock(&buf_mtx);
928 } else {
929 if (external_storage) {
930 #ifdef BUF_MAKE_PRIVATE
931 io_bp->b_lflags |= BL_EXTERNAL;
932 #endif
933 io_bp->b_bcount = bp->b_bcount;
934 io_bp->b_bufsize = bp->b_bufsize;
935 io_bp->b_datap = external_storage;
936 } else {
937 allocbuf(io_bp, bp->b_bcount);
938
939 io_bp->b_lflags |= BL_IOBUF_ALLOC;
940 }
941 bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount);
942
943 #ifdef BUF_MAKE_PRIVATE
944 io_bp->b_data_store = NULL;
945 #endif
946 }
947 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0);
948
949 return io_bp;
950 }
951
952
953 #ifdef BUF_MAKE_PRIVATE
954 errno_t
955 buf_make_private(buf_t bp)
956 {
957 buf_t ds_bp;
958 buf_t t_bp;
959 struct buf my_buf;
960
961 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0);
962
963 if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) {
964 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
965 return EINVAL;
966 }
967 my_buf.b_flags = B_META;
968 my_buf.b_datap = (uintptr_t)NULL;
969 allocbuf(&my_buf, bp->b_bcount);
970
971 bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);
972
973 lck_mtx_lock_spin(&buf_mtx);
974
975 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
976 if (!ISSET(bp->b_lflags, BL_EXTERNAL)) {
977 break;
978 }
979 }
980 ds_bp = t_bp;
981
982 if (ds_bp == NULL && bp->b_data_ref) {
983 panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL");
984 }
985
986 if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0)) {
987 panic("buf_make_private: ref_count == 0 && ds_bp != NULL");
988 }
989
990 if (ds_bp == NULL) {
991 lck_mtx_unlock(&buf_mtx);
992
993 buf_free_meta_store(&my_buf);
994
995 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
996 return EINVAL;
997 }
998 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
999 if (!ISSET(t_bp->b_lflags, BL_EXTERNAL)) {
1000 t_bp->b_data_store = ds_bp;
1001 }
1002 }
1003 ds_bp->b_data_ref = bp->b_data_ref;
1004
1005 bp->b_data_ref = 0;
1006 bp->b_datap = my_buf.b_datap;
1007
1008 lck_mtx_unlock(&buf_mtx);
1009
1010 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
1011 return 0;
1012 }
1013 #endif
1014
1015
1016 void
1017 buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
1018 void(**old_iodone)(buf_t, void *), void **old_transaction)
1019 {
1020 assert(ISSET(bp->b_lflags, BL_BUSY));
1021
1022 if (old_iodone) {
1023 *old_iodone = bp->b_iodone;
1024 }
1025 if (old_transaction) {
1026 *old_transaction = bp->b_transaction;
1027 }
1028
1029 bp->b_transaction = transaction;
1030 bp->b_iodone = filter;
1031 if (filter) {
1032 bp->b_flags |= B_FILTER;
1033 } else {
1034 bp->b_flags &= ~B_FILTER;
1035 }
1036 }
1037
1038
1039 daddr64_t
1040 buf_blkno(buf_t bp)
1041 {
1042 return bp->b_blkno;
1043 }
1044
1045 daddr64_t
1046 buf_lblkno(buf_t bp)
1047 {
1048 return bp->b_lblkno;
1049 }
1050
1051 void
1052 buf_setblkno(buf_t bp, daddr64_t blkno)
1053 {
1054 bp->b_blkno = blkno;
1055 }
1056
1057 void
1058 buf_setlblkno(buf_t bp, daddr64_t lblkno)
1059 {
1060 bp->b_lblkno = lblkno;
1061 }
1062
1063 dev_t
1064 buf_device(buf_t bp)
1065 {
1066 return bp->b_dev;
1067 }
1068
1069 errno_t
1070 buf_setdevice(buf_t bp, vnode_t vp)
1071 {
1072 if ((vp->v_type != VBLK) && (vp->v_type != VCHR)) {
1073 return EINVAL;
1074 }
1075 bp->b_dev = vp->v_rdev;
1076
1077 return 0;
1078 }
1079
1080
1081 void *
1082 buf_drvdata(buf_t bp)
1083 {
1084 return bp->b_drvdata;
1085 }
1086
1087 void
1088 buf_setdrvdata(buf_t bp, void *drvdata)
1089 {
1090 bp->b_drvdata = drvdata;
1091 }
1092
1093 void *
1094 buf_fsprivate(buf_t bp)
1095 {
1096 return bp->b_fsprivate;
1097 }
1098
1099 void
1100 buf_setfsprivate(buf_t bp, void *fsprivate)
1101 {
1102 bp->b_fsprivate = fsprivate;
1103 }
1104
1105 kauth_cred_t
1106 buf_rcred(buf_t bp)
1107 {
1108 return bp->b_rcred;
1109 }
1110
1111 kauth_cred_t
1112 buf_wcred(buf_t bp)
1113 {
1114 return bp->b_wcred;
1115 }
1116
1117 void *
1118 buf_upl(buf_t bp)
1119 {
1120 return bp->b_upl;
1121 }
1122
1123 uint32_t
1124 buf_uploffset(buf_t bp)
1125 {
1126 return (uint32_t)(bp->b_uploffset);
1127 }
1128
1129 proc_t
1130 buf_proc(buf_t bp)
1131 {
1132 return bp->b_proc;
1133 }
1134
1135
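/*
 * Map a buffer's data into kernel virtual memory so it can be addressed
 * directly.  For ordinary buffers this simply returns b_datap; for
 * B_CLUSTER buffers the underlying UPL is mapped with ubc_upl_map() and
 * the returned address is offset by b_uploffset.  A successful buf_map()
 * of a clustered buffer should be balanced by buf_unmap() (see buf_clear()
 * below for the typical pattern).
 */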
1136 errno_t
1137 buf_map(buf_t bp, caddr_t *io_addr)
1138 {
1139 buf_t real_bp;
1140 vm_offset_t vaddr;
1141 kern_return_t kret;
1142
1143 if (!(bp->b_flags & B_CLUSTER)) {
1144 *io_addr = (caddr_t)bp->b_datap;
1145 return 0;
1146 }
1147 real_bp = (buf_t)(bp->b_real_bp);
1148
1149 if (real_bp && real_bp->b_datap) {
1150 /*
1151 * b_real_bp is only valid if B_CLUSTER is SET
1152                  * if it's non-zero, then someone did a cluster_bp call
1153 * if the backing physical pages were already mapped
1154 * in before the call to cluster_bp (non-zero b_datap),
1155                  * then we just use that mapping
1156 */
1157 *io_addr = (caddr_t)real_bp->b_datap;
1158 return 0;
1159 }
1160 kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */
1161
1162 if (kret != KERN_SUCCESS) {
1163 *io_addr = NULL;
1164
1165 return ENOMEM;
1166 }
1167 vaddr += bp->b_uploffset;
1168
1169 *io_addr = (caddr_t)vaddr;
1170
1171 return 0;
1172 }
1173
1174 errno_t
1175 buf_unmap(buf_t bp)
1176 {
1177 buf_t real_bp;
1178 kern_return_t kret;
1179
1180 if (!(bp->b_flags & B_CLUSTER)) {
1181 return 0;
1182 }
1183 /*
1184 * see buf_map for the explanation
1185 */
1186 real_bp = (buf_t)(bp->b_real_bp);
1187
1188 if (real_bp && real_bp->b_datap) {
1189 return 0;
1190 }
1191
1192 if ((bp->b_lflags & BL_IOBUF) &&
1193 ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
1194 /*
1195 * ignore pageins... the 'right' thing will
1196 * happen due to the way we handle speculative
1197 * clusters...
1198 *
1199 * when we commit these pages, we'll hit
1200 * it with UPL_COMMIT_INACTIVE which
1201 * will clear the reference bit that got
1202 * turned on when we touched the mapping
1203 */
1204 bp->b_flags |= B_AGE;
1205 }
1206 kret = ubc_upl_unmap(bp->b_upl);
1207
1208 if (kret != KERN_SUCCESS) {
1209 return EINVAL;
1210 }
1211 return 0;
1212 }
1213
1214
1215 void
1216 buf_clear(buf_t bp)
1217 {
1218 caddr_t baddr;
1219
1220 if (buf_map(bp, &baddr) == 0) {
1221 bzero(baddr, bp->b_bcount);
1222 buf_unmap(bp);
1223 }
1224 bp->b_resid = 0;
1225 }
1226
1227 /*
1228 * Read or write a buffer that is not contiguous on disk.
1229 * buffer is marked done/error at the conclusion
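 *
 * The transfer is broken up into physically contiguous chunks: each pass
 * issues a VNOP_STRATEGY against the device vnode for the current chunk,
 * waits for it with buf_biowait(), then translates the next file offset
 * into a new physical block number and contiguous length via
 * VNOP_BLOCKMAP().  A chunk that maps to block -1 is simply zero-filled
 * rather than transferred.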
1230 */
1231 static int
1232 buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
1233 {
1234 vnode_t vp = buf_vnode(bp);
1235 buf_t io_bp; /* For reading or writing a single block */
1236 int io_direction;
1237 int io_resid;
1238 size_t io_contig_bytes;
1239 daddr64_t io_blkno;
1240 int error = 0;
1241 int bmap_flags;
1242
1243 /*
1244 * save our starting point... the bp was already mapped
1245 * in buf_strategy before we got called
1246 * no sense doing it again.
1247 */
1248 io_blkno = bp->b_blkno;
1249 /*
1250 * Make sure we redo this mapping for the next I/O
1251 * i.e. this can never be a 'permanent' mapping
1252 */
1253 bp->b_blkno = bp->b_lblkno;
1254
1255 /*
1256 * Get an io buffer to do the deblocking
1257 */
1258 io_bp = alloc_io_buf(devvp, 0);
1259
1260 io_bp->b_lblkno = bp->b_lblkno;
1261 io_bp->b_datap = bp->b_datap;
1262 io_resid = bp->b_bcount;
1263 io_direction = bp->b_flags & B_READ;
1264 io_contig_bytes = contig_bytes;
1265
1266 if (bp->b_flags & B_READ) {
1267 bmap_flags = VNODE_READ;
1268 } else {
1269 bmap_flags = VNODE_WRITE;
1270 }
1271
1272 for (;;) {
1273 if (io_blkno == -1) {
1274 /*
1275                          * this is unexpected, but we'll allow for it
1276 */
1277 bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
1278 } else {
1279 io_bp->b_bcount = (uint32_t)io_contig_bytes;
1280 io_bp->b_bufsize = (uint32_t)io_contig_bytes;
1281 io_bp->b_resid = (uint32_t)io_contig_bytes;
1282 io_bp->b_blkno = io_blkno;
1283
1284 buf_reset(io_bp, io_direction);
1285
1286 /*
1287                          * Call the device to do the I/O and wait for it. Make sure the appropriate party is charged for the write
1288 */
1289
1290 if (!ISSET(bp->b_flags, B_READ)) {
1291 OSAddAtomic(1, &devvp->v_numoutput);
1292 }
1293
1294 if ((error = VNOP_STRATEGY(io_bp))) {
1295 break;
1296 }
1297 if ((error = (int)buf_biowait(io_bp))) {
1298 break;
1299 }
1300 if (io_bp->b_resid) {
1301 io_resid -= (io_contig_bytes - io_bp->b_resid);
1302 break;
1303 }
1304 }
1305 if ((io_resid -= io_contig_bytes) == 0) {
1306 break;
1307 }
1308 f_offset += io_contig_bytes;
1309 io_bp->b_datap += io_contig_bytes;
1310
1311 /*
1312 * Map the current position to a physical block number
1313 */
1314 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL))) {
1315 break;
1316 }
1317 }
1318 buf_free(io_bp);
1319
1320 if (error) {
1321 buf_seterror(bp, error);
1322 }
1323 bp->b_resid = io_resid;
1324 /*
1325 * This I/O is now complete
1326 */
1327 buf_biodone(bp);
1328
1329 return error;
1330 }
1331
1332
1333 /*
1334 * struct vnop_strategy_args {
1335 * struct buf *a_bp;
1336 * } *ap;
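 *
 * Typically called from a filesystem's own strategy routine with the
 * backing device vnode.  The buf is tagged with the device; for
 * non-clustered buffers whose b_blkno still equals b_lblkno the logical
 * block is resolved to a physical one via VNOP_BLKTOOFF()/VNOP_BLOCKMAP().
 * UPL-backed buffers are redirected through cluster_bp(), non-contiguous
 * extents through buf_strategy_fragmented(), and everything else is
 * passed down to the device's vnop_strategy.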
1337 */
1338 errno_t
1339 buf_strategy(vnode_t devvp, void *ap)
1340 {
1341 buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp;
1342 vnode_t vp = bp->b_vp;
1343 int bmap_flags;
1344 errno_t error;
1345 #if CONFIG_DTRACE
1346 int dtrace_io_start_flag = 0; /* We only want to trip the io:::start
1347 * probe once, with the true physical
1348 * block in place (b_blkno)
1349 */
1350
1351 #endif
1352
1353 if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK) {
1354 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
1355 }
1356 /*
1357 * associate the physical device with
1358 * with this buf_t even if we don't
1359 * end up issuing the I/O...
1360 */
1361 bp->b_dev = devvp->v_rdev;
1362
1363 if (bp->b_flags & B_READ) {
1364 bmap_flags = VNODE_READ;
1365 } else {
1366 bmap_flags = VNODE_WRITE;
1367 }
1368
1369 if (!(bp->b_flags & B_CLUSTER)) {
1370 if ((bp->b_upl)) {
1371 /*
1372 * we have a UPL associated with this bp
1373 * go through cluster_bp which knows how
1374 * to deal with filesystem block sizes
1375 * that aren't equal to the page size
1376 */
1377 DTRACE_IO1(start, buf_t, bp);
1378 return cluster_bp(bp);
1379 }
1380 if (bp->b_blkno == bp->b_lblkno) {
1381 off_t f_offset;
1382 size_t contig_bytes;
1383
1384 if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
1385 DTRACE_IO1(start, buf_t, bp);
1386 buf_seterror(bp, error);
1387 buf_biodone(bp);
1388
1389 return error;
1390 }
1391
1392 if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
1393 DTRACE_IO1(start, buf_t, bp);
1394 buf_seterror(bp, error);
1395 buf_biodone(bp);
1396
1397 return error;
1398 }
1399
1400 DTRACE_IO1(start, buf_t, bp);
1401 #if CONFIG_DTRACE
1402 dtrace_io_start_flag = 1;
1403 #endif /* CONFIG_DTRACE */
1404
1405 if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
1406 /* Set block number to force biodone later */
1407 bp->b_blkno = -1;
1408 buf_clear(bp);
1409 } else if (contig_bytes < (size_t)bp->b_bcount) {
1410 return buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes);
1411 }
1412 }
1413
1414 #if CONFIG_DTRACE
1415 if (dtrace_io_start_flag == 0) {
1416 DTRACE_IO1(start, buf_t, bp);
1417 dtrace_io_start_flag = 1;
1418 }
1419 #endif /* CONFIG_DTRACE */
1420
1421 if (bp->b_blkno == -1) {
1422 buf_biodone(bp);
1423 return 0;
1424 }
1425 }
1426
1427 #if CONFIG_DTRACE
1428 if (dtrace_io_start_flag == 0) {
1429 DTRACE_IO1(start, buf_t, bp);
1430 }
1431 #endif /* CONFIG_DTRACE */
1432
1433 #if CONFIG_PROTECT
1434 /* Capture f_offset in the bufattr*/
1435 cpx_t cpx = bufattr_cpx(buf_attr(bp));
1436 if (cpx) {
1437 /* No need to go here for older EAs */
1438 if (cpx_use_offset_for_iv(cpx) && !cpx_synthetic_offset_for_iv(cpx)) {
1439 off_t f_offset;
1440 if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset))) {
1441 return error;
1442 }
1443
1444 /*
1445 * Attach the file offset to this buffer. The
1446 * bufattr attributes will be passed down the stack
1447 * until they reach the storage driver (whether
1448 * IOFlashStorage, ASP, or IONVMe). The driver
1449 * will retain the offset in a local variable when it
1450 * issues its I/Os to the NAND controller.
1451 *
1452 * Note that LwVM may end up splitting this I/O
1453 * into sub-I/Os if it crosses a chunk boundary. In this
1454 * case, LwVM will update this field when it dispatches
1455 * each I/O to IOFlashStorage. But from our perspective
1456 * we have only issued a single I/O.
1457 *
1458 * In the case of APFS we do not bounce through another
1459 * intermediate layer (such as CoreStorage). APFS will
1460 * issue the I/Os directly to the block device / IOMedia
1461 * via buf_strategy on the specfs node.
1462 */
1463 buf_setcpoff(bp, f_offset);
1464 CP_DEBUG((CPDBG_OFFSET_IO | DBG_FUNC_NONE), (uint32_t) f_offset, (uint32_t) bp->b_lblkno, (uint32_t) bp->b_blkno, (uint32_t) bp->b_bcount, 0);
1465 }
1466 }
1467 #endif
1468
1469 /*
1470 * we can issue the I/O because...
1471 * either B_CLUSTER is set which
1472 * means that the I/O is properly set
1473 * up to be a multiple of the page size, or
1474 * we were able to successfully set up the
1475 * physical block mapping
1476 */
1477 error = VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap);
1478 DTRACE_FSINFO(strategy, vnode_t, vp);
1479 return error;
1480 }
1481
1482
1483
1484 buf_t
1485 buf_alloc(vnode_t vp)
1486 {
1487 return alloc_io_buf(vp, is_vm_privileged());
1488 }
1489
1490 void
1491 buf_free(buf_t bp)
1492 {
1493 free_io_buf(bp);
1494 }
1495
1496
1497 /*
1498 * iterate buffers for the specified vp.
1499 * if BUF_SCAN_DIRTY is set, do the dirty list
1500 * if BUF_SCAN_CLEAN is set, do the clean list
1501 * if neither flag is set, default to BUF_SCAN_DIRTY
1502  * if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy buffers
1503 */
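/*
 * Minimal usage sketch (the callout below is hypothetical and only
 * illustrates the return-code contract):
 *
 *	static int
 *	my_fs_sync_callout(buf_t bp, void *arg)
 *	{
 *		if (bp == NULL)
 *			return BUF_CLAIMED;	(busy buffer, BUF_NOTIFY_BUSY case)
 *		if (buf_flags(bp) & B_DELWRI) {
 *			(void) buf_bawrite(bp);	(the write consumes the buffer)
 *			return BUF_CLAIMED;
 *		}
 *		return BUF_RETURNED;		(buf_iterate will buf_brelse it)
 *	}
 *
 *	buf_iterate(vp, my_fs_sync_callout, BUF_SCAN_DIRTY, NULL);
 */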
1504
1505 struct buf_iterate_info_t {
1506 int flag;
1507 struct buflists *listhead;
1508 };
1509
1510 void
1511 buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
1512 {
1513 buf_t bp;
1514 int retval;
1515 struct buflists local_iterblkhd;
1516 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1517 int notify_busy = flags & BUF_NOTIFY_BUSY;
1518 struct buf_iterate_info_t list[2];
1519 int num_lists, i;
1520
1521 if (flags & BUF_SKIP_LOCKED) {
1522 lock_flags |= BAC_SKIP_LOCKED;
1523 }
1524 if (flags & BUF_SKIP_NONLOCKED) {
1525 lock_flags |= BAC_SKIP_NONLOCKED;
1526 }
1527
1528 if (!(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN))) {
1529 flags |= BUF_SCAN_DIRTY;
1530 }
1531
1532 num_lists = 0;
1533
1534 if (flags & BUF_SCAN_DIRTY) {
1535 list[num_lists].flag = VBI_DIRTY;
1536 list[num_lists].listhead = &vp->v_dirtyblkhd;
1537 num_lists++;
1538 }
1539 if (flags & BUF_SCAN_CLEAN) {
1540 list[num_lists].flag = VBI_CLEAN;
1541 list[num_lists].listhead = &vp->v_cleanblkhd;
1542 num_lists++;
1543 }
1544
1545 for (i = 0; i < num_lists; i++) {
1546 lck_mtx_lock(&buf_mtx);
1547
1548 if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) {
1549 lck_mtx_unlock(&buf_mtx);
1550 continue;
1551 }
1552 while (!LIST_EMPTY(&local_iterblkhd)) {
1553 bp = LIST_FIRST(&local_iterblkhd);
1554 LIST_REMOVE(bp, b_vnbufs);
1555 LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
1556
1557 if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
1558 if (notify_busy) {
1559 bp = NULL;
1560 } else {
1561 continue;
1562 }
1563 }
1564
1565 lck_mtx_unlock(&buf_mtx);
1566
1567 retval = callout(bp, arg);
1568
1569 switch (retval) {
1570 case BUF_RETURNED:
1571 if (bp) {
1572 buf_brelse(bp);
1573 }
1574 break;
1575 case BUF_CLAIMED:
1576 break;
1577 case BUF_RETURNED_DONE:
1578 if (bp) {
1579 buf_brelse(bp);
1580 }
1581 lck_mtx_lock(&buf_mtx);
1582 goto out;
1583 case BUF_CLAIMED_DONE:
1584 lck_mtx_lock(&buf_mtx);
1585 goto out;
1586 }
1587 lck_mtx_lock(&buf_mtx);
1588 } /* while list has more nodes */
1589 out:
1590 buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
1591 lck_mtx_unlock(&buf_mtx);
1592 } /* for each list */
1593 } /* buf_iterate */
1594
1595
1596 /*
1597 * Flush out and invalidate all buffers associated with a vnode.
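 *
 * 'flags' tunes the behaviour: BUF_WRITE_DATA pushes dirty (B_DELWRI)
 * buffers out through VNOP_BWRITE instead of discarding them,
 * BUF_SKIP_META leaves meta-data buffers (negative lblkno or B_META)
 * alone, and BUF_INVALIDATE_LOCKED allows B_LOCKED buffers to be
 * reclaimed as well.  Returns 0, or the first 'real' error reported by
 * buf_acquire_locked().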
1598 */
1599 int
1600 buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
1601 {
1602 buf_t bp;
1603 int aflags;
1604 int error = 0;
1605 int must_rescan = 1;
1606 struct buflists local_iterblkhd;
1607
1608
1609 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd)) {
1610 return 0;
1611 }
1612
1613 lck_mtx_lock(&buf_mtx);
1614
1615 for (;;) {
1616 if (must_rescan == 0) {
1617 /*
1618 * the lists may not be empty, but all that's left at this
1619 * point are metadata or B_LOCKED buffers which are being
1620 * skipped... we know this because we made it through both
1621 * the clean and dirty lists without dropping buf_mtx...
1622 * each time we drop buf_mtx we bump "must_rescan"
1623 */
1624 break;
1625 }
1626 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd)) {
1627 break;
1628 }
1629 must_rescan = 0;
1630 /*
1631 * iterate the clean list
1632 */
1633 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
1634 goto try_dirty_list;
1635 }
1636 while (!LIST_EMPTY(&local_iterblkhd)) {
1637 bp = LIST_FIRST(&local_iterblkhd);
1638
1639 LIST_REMOVE(bp, b_vnbufs);
1640 LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
1641
1642 /*
1643 * some filesystems distinguish meta data blocks with a negative logical block #
1644 */
1645 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) {
1646 continue;
1647 }
1648
1649 aflags = BAC_REMOVE;
1650
1651 if (!(flags & BUF_INVALIDATE_LOCKED)) {
1652 aflags |= BAC_SKIP_LOCKED;
1653 }
1654
1655 if ((error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo))) {
1656 if (error == EDEADLK) {
1657 /*
1658 * this buffer was marked B_LOCKED...
1659                                  * we didn't drop buf_mtx, so
1660 * we don't need to rescan
1661 */
1662 continue;
1663 }
1664 if (error == EAGAIN) {
1665 /*
1666 * found a busy buffer... we blocked and
1667 * dropped buf_mtx, so we're going to
1668 * need to rescan after this pass is completed
1669 */
1670 must_rescan++;
1671 continue;
1672 }
1673 /*
1674 * got some kind of 'real' error out of the msleep
1675 * in buf_acquire_locked, terminate the scan and return the error
1676 */
1677 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1678
1679 lck_mtx_unlock(&buf_mtx);
1680 return error;
1681 }
1682 lck_mtx_unlock(&buf_mtx);
1683
1684 if (bp->b_flags & B_LOCKED) {
1685 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);
1686 }
1687
1688 CLR(bp->b_flags, B_LOCKED);
1689 SET(bp->b_flags, B_INVAL);
1690 buf_brelse(bp);
1691
1692 lck_mtx_lock(&buf_mtx);
1693
1694 /*
1695 * by dropping buf_mtx, we allow new
1696 * buffers to be added to the vnode list(s)
1697 * we'll have to rescan at least once more
1698 * if the queues aren't empty
1699 */
1700 must_rescan++;
1701 }
1702 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1703
1704 try_dirty_list:
1705 /*
1706 * Now iterate on dirty blks
1707 */
1708 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1709 continue;
1710 }
1711 while (!LIST_EMPTY(&local_iterblkhd)) {
1712 bp = LIST_FIRST(&local_iterblkhd);
1713
1714 LIST_REMOVE(bp, b_vnbufs);
1715 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1716
1717 /*
1718 * some filesystems distinguish meta data blocks with a negative logical block #
1719 */
1720 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) {
1721 continue;
1722 }
1723
1724 aflags = BAC_REMOVE;
1725
1726 if (!(flags & BUF_INVALIDATE_LOCKED)) {
1727 aflags |= BAC_SKIP_LOCKED;
1728 }
1729
1730 if ((error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo))) {
1731 if (error == EDEADLK) {
1732 /*
1733 * this buffer was marked B_LOCKED...
1734                                  * we didn't drop buf_mtx, so
1735 * we don't need to rescan
1736 */
1737 continue;
1738 }
1739 if (error == EAGAIN) {
1740 /*
1741 * found a busy buffer... we blocked and
1742 * dropped buf_mtx, so we're going to
1743 * need to rescan after this pass is completed
1744 */
1745 must_rescan++;
1746 continue;
1747 }
1748 /*
1749 * got some kind of 'real' error out of the msleep
1750 * in buf_acquire_locked, terminate the scan and return the error
1751 */
1752 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1753
1754 lck_mtx_unlock(&buf_mtx);
1755 return error;
1756 }
1757 lck_mtx_unlock(&buf_mtx);
1758
1759 if (bp->b_flags & B_LOCKED) {
1760 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);
1761 }
1762
1763 CLR(bp->b_flags, B_LOCKED);
1764 SET(bp->b_flags, B_INVAL);
1765
1766 if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA)) {
1767 (void) VNOP_BWRITE(bp);
1768 } else {
1769 buf_brelse(bp);
1770 }
1771
1772 lck_mtx_lock(&buf_mtx);
1773 /*
1774 * by dropping buf_mtx, we allow new
1775 * buffers to be added to the vnode list(s)
1776 * we'll have to rescan at least once more
1777 * if the queues aren't empty
1778 */
1779 must_rescan++;
1780 }
1781 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1782 }
1783 lck_mtx_unlock(&buf_mtx);
1784
1785 return 0;
1786 }
1787
1788 void
1789 buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg)
1790 {
1791 (void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg);
1792 return;
1793 }
1794
1795 int
1796 buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg)
1797 {
1798 buf_t bp;
1799 int writes_issued = 0;
1800 errno_t error;
1801 int busy = 0;
1802 struct buflists local_iterblkhd;
1803 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1804 int any_locked = 0;
1805
1806 if (flags & BUF_SKIP_LOCKED) {
1807 lock_flags |= BAC_SKIP_LOCKED;
1808 }
1809 if (flags & BUF_SKIP_NONLOCKED) {
1810 lock_flags |= BAC_SKIP_NONLOCKED;
1811 }
1812 loop:
1813 lck_mtx_lock(&buf_mtx);
1814
1815 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
1816 while (!LIST_EMPTY(&local_iterblkhd)) {
1817 bp = LIST_FIRST(&local_iterblkhd);
1818 LIST_REMOVE(bp, b_vnbufs);
1819 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1820
1821 if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) {
1822 busy++;
1823 }
1824 if (error) {
1825 /*
1826 * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED,
1827                          * we may want to do something differently if a locked or unlocked
1828 * buffer was encountered (depending on the arg specified).
1829 * In this case, we know that one of those two was set, and the
1830 * buf acquisition failed above.
1831 *
1832 * If it failed with EDEADLK, then save state which can be emitted
1833 * later on to the caller. Most callers should not care.
1834 */
1835 if (error == EDEADLK) {
1836 any_locked++;
1837 }
1838 continue;
1839 }
1840 lck_mtx_unlock(&buf_mtx);
1841
1842 bp->b_flags &= ~B_LOCKED;
1843
1844 /*
1845 * Wait for I/O associated with indirect blocks to complete,
1846 * since there is no way to quickly wait for them below.
1847 */
1848 if ((bp->b_vp == vp) || (wait == 0)) {
1849 (void) buf_bawrite(bp);
1850 } else {
1851 (void) VNOP_BWRITE(bp);
1852 }
1853 writes_issued++;
1854
1855 lck_mtx_lock(&buf_mtx);
1856 }
1857 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1858 }
1859 lck_mtx_unlock(&buf_mtx);
1860
1861 if (wait) {
1862 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1863
1864 if (vp->v_dirtyblkhd.lh_first && busy) {
1865 /*
1866 * we had one or more BUSY buffers on
1867 * the dirtyblock list... most likely
1868 * these are due to delayed writes that
1869 * were moved to the bclean queue but
1870 * have not yet been 'written'.
1871 * if we issued some writes on the
1872                  * previous pass, we try again immediately...
1873 * if we didn't, we'll sleep for some time
1874 * to allow the state to change...
1875 */
1876 if (writes_issued == 0) {
1877 (void)tsleep((caddr_t)&vp->v_numoutput,
1878 PRIBIO + 1, "vnode_flushdirtyblks", hz / 20);
1879 }
1880 writes_issued = 0;
1881 busy = 0;
1882
1883 goto loop;
1884 }
1885 }
1886
1887 return any_locked;
1888 }
1889
1890
1891 /*
1892 * called with buf_mtx held...
1893 * this lock protects the queue manipulation
1894 */
1895 static int
1896 buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1897 {
1898 struct buflists * listheadp;
1899
1900 if (flags & VBI_DIRTY) {
1901 listheadp = &vp->v_dirtyblkhd;
1902 } else {
1903 listheadp = &vp->v_cleanblkhd;
1904 }
1905
1906 while (vp->v_iterblkflags & VBI_ITER) {
1907 vp->v_iterblkflags |= VBI_ITERWANT;
1908 msleep(&vp->v_iterblkflags, &buf_mtx, 0, "buf_iterprepare", NULL);
1909 }
1910 if (LIST_EMPTY(listheadp)) {
1911 LIST_INIT(iterheadp);
1912 return EINVAL;
1913 }
1914 vp->v_iterblkflags |= VBI_ITER;
1915
1916 iterheadp->lh_first = listheadp->lh_first;
1917 listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1918 LIST_INIT(listheadp);
1919
1920 return 0;
1921 }
1922
1923 /*
1924 * called with buf_mtx held...
1925 * this lock protects the queue manipulation
1926 */
1927 static void
1928 buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1929 {
1930 struct buflists * listheadp;
1931 buf_t bp;
1932
1933 if (flags & VBI_DIRTY) {
1934 listheadp = &vp->v_dirtyblkhd;
1935 } else {
1936 listheadp = &vp->v_cleanblkhd;
1937 }
1938
1939 while (!LIST_EMPTY(iterheadp)) {
1940 bp = LIST_FIRST(iterheadp);
1941 LIST_REMOVE(bp, b_vnbufs);
1942 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1943 }
1944 vp->v_iterblkflags &= ~VBI_ITER;
1945
1946 if (vp->v_iterblkflags & VBI_ITERWANT) {
1947 vp->v_iterblkflags &= ~VBI_ITERWANT;
1948 wakeup(&vp->v_iterblkflags);
1949 }
1950 }
1951
1952
1953 static void
1954 bremfree_locked(buf_t bp)
1955 {
1956 struct bqueues *dp = NULL;
1957 int whichq;
1958
1959 whichq = bp->b_whichq;
1960
1961 if (whichq == -1) {
1962 if (bp->b_shadow_ref == 0) {
1963 panic("bremfree_locked: %p not on freelist", bp);
1964 }
1965 /*
1966 * there are clones pointing to 'bp'...
1967 * therefore, it was not put on a freelist
1968 * when buf_brelse was last called on 'bp'
1969 */
1970 return;
1971 }
1972 /*
1973 * We only calculate the head of the freelist when removing
1974 * the last element of the list as that is the only time that
1975 * it is needed (e.g. to reset the tail pointer).
1976 *
1977 * NB: This makes an assumption about how tailq's are implemented.
1978 */
1979 if (bp->b_freelist.tqe_next == NULL) {
1980 dp = &bufqueues[whichq];
1981
1982 if (dp->tqh_last != &bp->b_freelist.tqe_next) {
1983 panic("bremfree: lost tail");
1984 }
1985 }
1986 TAILQ_REMOVE(dp, bp, b_freelist);
1987
1988 if (whichq == BQ_LAUNDRY) {
1989 blaundrycnt--;
1990 }
1991
1992 bp->b_whichq = -1;
1993 bp->b_timestamp = 0;
1994 bp->b_shadow = 0;
1995 }
1996
1997 /*
1998 * Associate a buffer with a vnode.
1999 * buf_mtx must be locked on entry
2000 */
2001 static void
2002 bgetvp_locked(vnode_t vp, buf_t bp)
2003 {
2004 if (bp->b_vp != vp) {
2005 panic("bgetvp_locked: not free");
2006 }
2007
2008 if (vp->v_type == VBLK || vp->v_type == VCHR) {
2009 bp->b_dev = vp->v_rdev;
2010 } else {
2011 bp->b_dev = NODEV;
2012 }
2013 /*
2014 * Insert onto list for new vnode.
2015 */
2016 bufinsvn(bp, &vp->v_cleanblkhd);
2017 }
2018
2019 /*
2020 * Disassociate a buffer from a vnode.
2021 * buf_mtx must be locked on entry
2022 */
2023 static void
2024 brelvp_locked(buf_t bp)
2025 {
2026 /*
2027 * Delete from old vnode list, if on one.
2028 */
2029 if (bp->b_vnbufs.le_next != NOLIST) {
2030 bufremvn(bp);
2031 }
2032
2033 bp->b_vp = (vnode_t)NULL;
2034 }
2035
2036 /*
2037 * Reassign a buffer from one vnode to another.
2038 * Used to assign file specific control information
2039 * (indirect blocks) to the vnode to which they belong.
2040 */
2041 static void
2042 buf_reassign(buf_t bp, vnode_t newvp)
2043 {
2044 struct buflists *listheadp;
2045
2046 if (newvp == NULL) {
2047 printf("buf_reassign: NULL");
2048 return;
2049 }
2050 lck_mtx_lock_spin(&buf_mtx);
2051
2052 /*
2053 * Delete from old vnode list, if on one.
2054 */
2055 if (bp->b_vnbufs.le_next != NOLIST) {
2056 bufremvn(bp);
2057 }
2058 /*
2059 * If dirty, put on list of dirty buffers;
2060 * otherwise insert onto list of clean buffers.
2061 */
2062 if (ISSET(bp->b_flags, B_DELWRI)) {
2063 listheadp = &newvp->v_dirtyblkhd;
2064 } else {
2065 listheadp = &newvp->v_cleanblkhd;
2066 }
2067 bufinsvn(bp, listheadp);
2068
2069 lck_mtx_unlock(&buf_mtx);
2070 }
2071
2072 static __inline__ void
2073 bufhdrinit(buf_t bp)
2074 {
2075 bzero((char *)bp, sizeof *bp);
2076 bp->b_dev = NODEV;
2077 bp->b_rcred = NOCRED;
2078 bp->b_wcred = NOCRED;
2079 bp->b_vnbufs.le_next = NOLIST;
2080 bp->b_flags = B_INVAL;
2081
2082 return;
2083 }
2084
2085 /*
2086 * Initialize buffers and hash links for buffers.
2087 */
2088 __private_extern__ void
2089 bufinit(void)
2090 {
2091 buf_t bp;
2092 struct bqueues *dp;
2093 int i;
2094
2095 nbuf_headers = 0;
2096 /* Initialize the buffer queues ('freelists') and the hash table */
2097 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
2098 TAILQ_INIT(dp);
2099 }
2100 bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
2101
2102 buf_busycount = 0;
2103
2104 /* Initialize the buffer headers */
2105 for (i = 0; i < max_nbuf_headers; i++) {
2106 nbuf_headers++;
2107 bp = &buf_headers[i];
2108 bufhdrinit(bp);
2109
2110 BLISTNONE(bp);
2111 dp = &bufqueues[BQ_EMPTY];
2112 bp->b_whichq = BQ_EMPTY;
2113 bp->b_timestamp = buf_timestamp();
2114 binsheadfree(bp, dp, BQ_EMPTY);
2115 binshash(bp, &invalhash);
2116 }
2117 boot_nbuf_headers = nbuf_headers;
2118
2119 TAILQ_INIT(&iobufqueue);
2120 TAILQ_INIT(&delaybufqueue);
2121
2122 for (; i < nbuf_headers + niobuf_headers; i++) {
2123 bp = &buf_headers[i];
2124 bufhdrinit(bp);
2125 bp->b_whichq = -1;
2126 binsheadfree(bp, &iobufqueue, -1);
2127 }
2128
2129 /*
2130 * allocate and initialize cluster specific global locks...
2131 */
2132 cluster_init();
2133
2134 printf("using %d buffer headers and %d cluster IO buffer headers\n",
2135 nbuf_headers, niobuf_headers);
2136
2137 /* start the bcleanbuf() thread */
2138 bcleanbuf_thread_init();
2139
2140 /* Register a callout for relieving vm pressure */
2141 if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
2142 panic("Couldn't register buffer cache callout for vm pressure!\n");
2143 }
2144 }
2145
2146 /*
2147 * Zones for the meta data buffers
2148 */
2149
2150 #define MINMETA 512
2151 #define MAXMETA 16384
2152
2153 KALLOC_HEAP_DEFINE(KHEAP_VFS_BIO, "vfs_bio", KHEAP_ID_DATA_BUFFERS);
2154
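/*
 * Common helper for buf_bread/buf_breadn and their meta-data variants:
 * look the block up with buf_getblk() and, if the buffer doesn't already
 * hold valid data (B_DONE or B_DELWRI), start a read via VNOP_STRATEGY.
 * For async requests NULL is handed back, since the buffer is released
 * either by buf_biodone (read issued) or by buf_brelse here (cache hit).
 */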
2155 static struct buf *
2156 bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
2157 {
2158 buf_t bp;
2159
2160 bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
2161
2162 /*
2163          * If the buffer does not have valid data, start a read.
2164          * Note that if the buffer is B_INVAL, buf_getblk() won't return it.
2165          * Therefore, it's valid if its I/O has completed or been delayed.
2166 */
2167 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
2168 struct proc *p;
2169
2170 p = current_proc();
2171
2172 /* Start I/O for the buffer (keeping credentials). */
2173 SET(bp->b_flags, B_READ | async);
2174 if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
2175 kauth_cred_ref(cred);
2176 bp->b_rcred = cred;
2177 }
2178
2179 VNOP_STRATEGY(bp);
2180
2181 trace(TR_BREADMISS, pack(vp, size), blkno);
2182
2183 /* Pay for the read. */
2184 if (p && p->p_stats) {
2185 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock); /* XXX */
2186 }
2187
2188 if (async) {
2189 /*
2190 * since we asked for an ASYNC I/O
2191 * the biodone will do the brelse
2192 * we don't want to pass back a bp
2193 * that we don't 'own'
2194 */
2195 bp = NULL;
2196 }
2197 } else if (async) {
2198 buf_brelse(bp);
2199 bp = NULL;
2200 }
2201
2202 trace(TR_BREADHIT, pack(vp, size), blkno);
2203
2204 return bp;
2205 }
2206
2207 /*
2208 * Perform the reads for buf_breadn() and buf_meta_breadn().
2209 * Trivial modification to the breada algorithm presented in Bach (p.55).
2210 */
2211 static errno_t
2212 do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
2213 int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
2214 {
2215 buf_t bp;
2216 int i;
2217
2218 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
2219
2220 /*
2221 * For each of the read-ahead blocks, start a read, if necessary.
2222 */
2223 for (i = 0; i < nrablks; i++) {
2224 /* If it's in the cache, just go on to next one. */
2225 if (incore(vp, rablks[i])) {
2226 continue;
2227 }
2228
2229 /* Get a buffer for the read-ahead block */
2230 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
2231 }
2232
2233 /* Otherwise, we had to start a read for it; wait until it's valid. */
2234 return buf_biowait(bp);
2235 }
2236
2237
2238 /*
2239 * Read a disk block.
2240  * This algorithm is described in Bach (p.54).
2241 */
2242 errno_t
2243 buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2244 {
2245 buf_t bp;
2246
2247 /* Get buffer for block. */
2248 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
2249
2250 /* Wait for the read to complete, and return result. */
2251 return buf_biowait(bp);
2252 }
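/*
 * Typical calling pattern (sketch only; 'blkno' and 'blksize' stand in for
 * whatever the filesystem computed, and error handling is abbreviated):
 *
 *	buf_t bp = NULL;
 *	errno_t error = buf_bread(vp, blkno, blksize, NOCRED, &bp);
 *	if (error) {
 *		if (bp)
 *			buf_brelse(bp);
 *		return error;
 *	}
 *	... use (caddr_t)buf_dataptr(bp) ...
 *	buf_brelse(bp);
 */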
2253
2254 /*
2255 * Read a disk block. [bread() for meta-data]
2256  * This algorithm is described in Bach (p.54).
2257 */
2258 errno_t
2259 buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2260 {
2261 buf_t bp;
2262
2263 /* Get buffer for block. */
2264 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
2265
2266 /* Wait for the read to complete, and return result. */
2267 return buf_biowait(bp);
2268 }
2269
2270 /*
2271 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2272 */
2273 errno_t
2274 buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2275 {
2276 return do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ);
2277 }
2278
2279 /*
2280 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2281 * [buf_breadn() for meta-data]
2282 */
2283 errno_t
2284 buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2285 {
2286 return do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META);
2287 }
2288
2289 /*
2290 * Block write. Described in Bach (p.56)
2291 */
2292 errno_t
2293 buf_bwrite(buf_t bp)
2294 {
2295 int sync, wasdelayed;
2296 errno_t rv;
2297 proc_t p = current_proc();
2298 vnode_t vp = bp->b_vp;
2299
2300 if (bp->b_datap == 0) {
2301 if (brecover_data(bp) == 0) {
2302 return 0;
2303 }
2304 }
2305 /* Remember buffer type, to switch on it later. */
2306 sync = !ISSET(bp->b_flags, B_ASYNC);
2307 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
2308 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
2309
2310 if (wasdelayed) {
2311 OSAddAtomicLong(-1, &nbdwrite);
2312 }
2313
2314 if (!sync) {
2315 /*
2316 * If not synchronous, pay for the I/O operation and make
2317 * sure the buf is on the correct vnode queue. We have
2318 * to do this now, because if we don't, the vnode may not
2319 * be properly notified that its I/O has completed.
2320 */
2321 if (wasdelayed) {
2322 buf_reassign(bp, vp);
2323 } else if (p && p->p_stats) {
2324 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2325 }
2326 }
2327 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
2328
2329 /* Initiate disk write. Make sure the appropriate party is charged. */
2330
2331 OSAddAtomic(1, &vp->v_numoutput);
2332
2333 VNOP_STRATEGY(bp);
2334
2335 if (sync) {
2336 /*
2337 * If I/O was synchronous, wait for it to complete.
2338 */
2339 rv = buf_biowait(bp);
2340
2341 /*
2342 * Pay for the I/O operation, if it hasn't been paid for, and
2343 * make sure it's on the correct vnode queue. (async operations
2344 * were paid for above.)
2345 */
2346 if (wasdelayed) {
2347 buf_reassign(bp, vp);
2348 } else if (p && p->p_stats) {
2349 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2350 }
2351
2352 /* Release the buffer. */
2353 buf_brelse(bp);
2354
2355 return rv;
2356 } else {
2357 return 0;
2358 }
2359 }
2360
2361 int
2362 vn_bwrite(struct vnop_bwrite_args *ap)
2363 {
2364 return buf_bwrite(ap->a_bp);
2365 }
2366
2367 /*
2368 * Delayed write.
2369 *
2370 * The buffer is marked dirty, but is not queued for I/O.
2371 * This routine should be used when the buffer is expected
2372 * to be modified again soon, typically a small write that
2373 * partially fills a buffer.
2374 *
2375 * NB: magnetic tapes cannot be delayed; they must be
2376 * written in the order that the writes are requested.
2377 *
2378 * Described in Leffler, et al. (pp. 208-213).
2379 *
2380 * Note: With the ability to allocate additional buffer
2381 * headers, we can get into a situation where too many
2382 * buf_bdwrite()s allow the kernel to create buffers faster
2383 * than the disks can service them. Doing a buf_bawrite() in
2384 * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
2385 */
2386 int
2387 bdwrite_internal(buf_t bp, int return_error)
2388 {
2389 proc_t p = current_proc();
2390 vnode_t vp = bp->b_vp;
2391
2392 /*
2393 * If the block hasn't been seen before:
2394 * (1) Mark it as having been seen,
2395 * (2) Charge for the write.
2396 * (3) Make sure it's on its vnode's correct block list,
2397 */
2398 if (!ISSET(bp->b_flags, B_DELWRI)) {
2399 SET(bp->b_flags, B_DELWRI);
2400 if (p && p->p_stats) {
2401 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2402 }
2403 OSAddAtomicLong(1, &nbdwrite);
2404 buf_reassign(bp, vp);
2405 }
2406
2407 /*
2408 * If we're not LOCKED, but the total number of delayed writes
2409 * has climbed above 75% of the total buffers in the system,
2410 * return an error if the caller has indicated that it can
2411 * handle one; otherwise, schedule the I/O now. This is done
2412 * to prevent us from allocating tons of extra buffers when
2413 * dealing with virtual disks (i.e. DiskImages), because
2414 * additional buffers are dynamically allocated to prevent
2415 * deadlocks from occurring.
2416 *
2417 * However, we can't do a buf_bawrite() if the LOCKED bit is set, because the
2418 * buffer is part of a transaction and can't go to disk until
2419 * the LOCKED bit is cleared.
2420 */
2421 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers / 4) * 3)) {
2422 if (return_error) {
2423 return EAGAIN;
2424 }
2425 /*
2426 * If the vnode has "too many" write operations in progress,
2427 * wait for them to finish the I/O.
2428 */
2429 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");
2430
2431 return buf_bawrite(bp);
2432 }
2433
2434 /* Otherwise, the "write" is done, so mark and release the buffer. */
2435 SET(bp->b_flags, B_DONE);
2436 buf_brelse(bp);
2437 return 0;
2438 }
2439
2440 errno_t
2441 buf_bdwrite(buf_t bp)
2442 {
2443 return bdwrite_internal(bp, 0);
2444 }
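
/*
 * Minimal sketch of the delayed-write path as a hypothetical caller might
 * use it: a metadata block that will likely be modified again soon is
 * marked dirty with buf_bdwrite() rather than written immediately with
 * buf_bwrite().  update_block() is a hypothetical helper, not a real KPI:
 *
 *	error = buf_meta_bread(vp, blkno, blksize, NOCRED, &bp);
 *	if (error == 0) {
 *		update_block((void *)buf_dataptr(bp));	// hypothetical
 *		error = buf_bdwrite(bp);	// sets B_DELWRI; may fall back to buf_bawrite()
 *	}
 */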
2445
2446
2447 /*
2448 * Asynchronous block write; just an asynchronous buf_bwrite().
2449 *
2450 * Note: With the ability to allocate additional buffer
2451 * headers, we can get into a situation where too many
2452 * buf_bawrite()s allow the kernel to create buffers faster
2453 * than the disks can service them.
2454 * We limit the number of "in flight" writes a vnode can have to
2455 * avoid this.
2456 */
2457 static int
2458 bawrite_internal(buf_t bp, int throttle)
2459 {
2460 vnode_t vp = bp->b_vp;
2461
2462 if (vp) {
2463 if (throttle) {
2464 /*
2465 * If the vnode has "too many" write operations in progress,
2466 * wait for them to finish the I/O.
2467 */
2468 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
2469 } else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE) {
2470 /*
2471 * return to the caller and
2472 * let him decide what to do
2473 */
2474 return EWOULDBLOCK;
2475 }
2476 }
2477 SET(bp->b_flags, B_ASYNC);
2478
2479 return VNOP_BWRITE(bp);
2480 }
2481
2482 errno_t
2483 buf_bawrite(buf_t bp)
2484 {
2485 return bawrite_internal(bp, 1);
2486 }
2487
2488
2489
2490 static void
2491 buf_free_meta_store(buf_t bp)
2492 {
2493 if (bp->b_bufsize) {
2494 uintptr_t datap = bp->b_datap;
2495 int bufsize = bp->b_bufsize;
2496
2497 bp->b_datap = (uintptr_t)NULL;
2498 bp->b_bufsize = 0;
2499
2500 /*
2501 * Ensure the assignment of b_datap has global visibility
2502 * before we free the region.
2503 */
2504 OSMemoryBarrier();
2505
2506 if (ISSET(bp->b_flags, B_ZALLOC)) {
2507 kheap_free(KHEAP_VFS_BIO, datap, bufsize);
2508 } else {
2509 kmem_free(kernel_map, datap, bufsize);
2510 }
2511 }
2512 }
2513
2514
2515 static buf_t
2516 buf_brelse_shadow(buf_t bp)
2517 {
2518 buf_t bp_head;
2519 buf_t bp_temp;
2520 buf_t bp_return = NULL;
2521 #ifdef BUF_MAKE_PRIVATE
2522 buf_t bp_data;
2523 int data_ref = 0;
2524 #endif
2525 int need_wakeup = 0;
2526
2527 lck_mtx_lock_spin(&buf_mtx);
2528
2529 __IGNORE_WCASTALIGN(bp_head = (buf_t)bp->b_orig);
2530
2531 if (bp_head->b_whichq != -1) {
2532 panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq);
2533 }
2534
2535 #ifdef BUF_MAKE_PRIVATE
2536 if ((bp_data = bp->b_data_store)) {
2537 bp_data->b_data_ref--;
2538 /*
2539 * snapshot the ref count so that we can check it
2540 * outside of the lock... we only want the guy going
2541 * from 1 -> 0 to try and release the storage
2542 */
2543 data_ref = bp_data->b_data_ref;
2544 }
2545 #endif
2546 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0);
2547
2548 bp_head->b_shadow_ref--;
2549
2550 for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow) {
2551 ;
2552 }
2553
2554 if (bp_temp == NULL) {
2555 panic("buf_brelse_shadow: bp not on list %p", bp_head);
2556 }
2557
2558 bp_temp->b_shadow = bp_temp->b_shadow->b_shadow;
2559
2560 #ifdef BUF_MAKE_PRIVATE
2561 /*
2562 * we're about to free the current 'owner' of the data buffer and
2563 * there is at least one other shadow buf_t still pointing at it
2564 * so transfer it to the first shadow buf left in the chain
2565 */
2566 if (bp == bp_data && data_ref) {
2567 if ((bp_data = bp_head->b_shadow) == NULL) {
2568 panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp);
2569 }
2570
2571 for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow) {
2572 bp_temp->b_data_store = bp_data;
2573 }
2574 bp_data->b_data_ref = data_ref;
2575 }
2576 #endif
2577 if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow) {
2578 panic("buf_relse_shadow: b_shadow != NULL && b_shadow_ref == 0 bp(%p)", bp);
2579 }
2580 if (bp_head->b_shadow_ref && bp_head->b_shadow == 0) {
2581 panic("buf_relse_shadow: b_shadow == NULL && b_shadow_ref != 0 bp(%p)", bp);
2582 }
2583
2584 if (bp_head->b_shadow_ref == 0) {
2585 if (!ISSET(bp_head->b_lflags, BL_BUSY)) {
2586 CLR(bp_head->b_flags, B_AGE);
2587 bp_head->b_timestamp = buf_timestamp();
2588
2589 if (ISSET(bp_head->b_flags, B_LOCKED)) {
2590 bp_head->b_whichq = BQ_LOCKED;
2591 binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED);
2592 } else {
2593 bp_head->b_whichq = BQ_META;
2594 binstailfree(bp_head, &bufqueues[BQ_META], BQ_META);
2595 }
2596 } else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) {
2597 CLR(bp_head->b_lflags, BL_WAITSHADOW);
2598
2599 bp_return = bp_head;
2600 }
2601 if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) {
2602 CLR(bp_head->b_lflags, BL_WANTED_REF);
2603 need_wakeup = 1;
2604 }
2605 }
2606 lck_mtx_unlock(&buf_mtx);
2607
2608 if (need_wakeup) {
2609 wakeup(bp_head);
2610 }
2611
2612 #ifdef BUF_MAKE_PRIVATE
2613 if (bp == bp_data && data_ref == 0) {
2614 buf_free_meta_store(bp);
2615 }
2616
2617 bp->b_data_store = NULL;
2618 #endif
2619 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0);
2620
2621 return bp_return;
2622 }
2623
2624
2625 /*
2626 * Release a buffer on to the free lists.
2627 * Described in Bach (p. 46).
2628 */
2629 void
2630 buf_brelse(buf_t bp)
2631 {
2632 struct bqueues *bufq;
2633 int whichq;
2634 upl_t upl;
2635 int need_wakeup = 0;
2636 int need_bp_wakeup = 0;
2637
2638
2639 if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY)) {
2640 panic("buf_brelse: bad buffer = %p\n", bp);
2641 }
2642
2643 #ifdef JOE_DEBUG
2644 (void) OSBacktrace(&bp->b_stackbrelse[0], 6);
2645
2646 bp->b_lastbrelse = current_thread();
2647 bp->b_tag = 0;
2648 #endif
2649 if (bp->b_lflags & BL_IOBUF) {
2650 buf_t shadow_master_bp = NULL;
2651
2652 if (ISSET(bp->b_lflags, BL_SHADOW)) {
2653 shadow_master_bp = buf_brelse_shadow(bp);
2654 } else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC)) {
2655 buf_free_meta_store(bp);
2656 }
2657 free_io_buf(bp);
2658
2659 if (shadow_master_bp) {
2660 bp = shadow_master_bp;
2661 goto finish_shadow_master;
2662 }
2663 return;
2664 }
2665
2666 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
2667 bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap,
2668 bp->b_flags, 0);
2669
2670 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2671
2672 /*
2673 * if we're invalidating a buffer that has the B_FILTER bit
2674 * set then call the b_iodone function so it gets cleaned
2675 * up properly.
2676 *
2677 * the HFS journal code depends on this
2678 */
2679 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
2680 if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */
2681 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
2682 void *arg = bp->b_transaction;
2683
2684 CLR(bp->b_flags, B_FILTER); /* but note callout done */
2685 bp->b_iodone = NULL;
2686 bp->b_transaction = NULL;
2687
2688 if (iodone_func == NULL) {
2689 panic("brelse: bp @ %p has NULL b_iodone!\n", bp);
2690 }
2691 (*iodone_func)(bp, arg);
2692 }
2693 }
2694 /*
2695 * I/O is done. Cleanup the UPL state
2696 */
2697 upl = bp->b_upl;
2698
2699 if (!ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2700 kern_return_t kret;
2701 int upl_flags;
2702
2703 if (upl == NULL) {
2704 if (!ISSET(bp->b_flags, B_INVAL)) {
2705 kret = ubc_create_upl_kernel(bp->b_vp,
2706 ubc_blktooff(bp->b_vp, bp->b_lblkno),
2707 bp->b_bufsize,
2708 &upl,
2709 NULL,
2710 UPL_PRECIOUS,
2711 VM_KERN_MEMORY_FILE);
2712
2713 if (kret != KERN_SUCCESS) {
2714 panic("brelse: Failed to create UPL");
2715 }
2716 #if UPL_DEBUG
2717 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5);
2718 #endif /* UPL_DEBUG */
2719 }
2720 } else {
2721 if (bp->b_datap) {
2722 kret = ubc_upl_unmap(upl);
2723
2724 if (kret != KERN_SUCCESS) {
2725 panic("ubc_upl_unmap failed");
2726 }
2727 bp->b_datap = (uintptr_t)NULL;
2728 }
2729 }
2730 if (upl) {
2731 if (bp->b_flags & (B_ERROR | B_INVAL)) {
2732 if (bp->b_flags & (B_READ | B_INVAL)) {
2733 upl_flags = UPL_ABORT_DUMP_PAGES;
2734 } else {
2735 upl_flags = 0;
2736 }
2737
2738 ubc_upl_abort(upl, upl_flags);
2739 } else {
2740 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY)) {
2741 upl_flags = UPL_COMMIT_SET_DIRTY;
2742 } else {
2743 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
2744 }
2745
2746 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
2747 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2748 }
2749 bp->b_upl = NULL;
2750 }
2751 } else {
2752 if ((upl)) {
2753 panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp);
2754 }
2755 }
2756
2757 /*
2758 * If it's locked, don't report an error; try again later.
2759 */
2760 if (ISSET(bp->b_flags, (B_LOCKED | B_ERROR)) == (B_LOCKED | B_ERROR)) {
2761 CLR(bp->b_flags, B_ERROR);
2762 }
2763 /*
2764 * If it's not cacheable, or an error, mark it invalid.
2765 */
2766 if (ISSET(bp->b_flags, (B_NOCACHE | B_ERROR))) {
2767 SET(bp->b_flags, B_INVAL);
2768 }
2769
2770 if ((bp->b_bufsize <= 0) ||
2771 ISSET(bp->b_flags, B_INVAL) ||
2772 (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) {
2773 boolean_t delayed_buf_free_meta_store = FALSE;
2774
2775 /*
2776 * If it's invalid or empty, dissociate it from its vnode,
2777 * release its storage if B_META, and
2778 * clean it up a bit and put it on the EMPTY queue
2779 */
2780 if (ISSET(bp->b_flags, B_DELWRI)) {
2781 OSAddAtomicLong(-1, &nbdwrite);
2782 }
2783
2784 if (ISSET(bp->b_flags, B_META)) {
2785 if (bp->b_shadow_ref) {
2786 delayed_buf_free_meta_store = TRUE;
2787 } else {
2788 buf_free_meta_store(bp);
2789 }
2790 }
2791 /*
2792 * nuke any credentials we were holding
2793 */
2794 buf_release_credentials(bp);
2795
2796 lck_mtx_lock_spin(&buf_mtx);
2797
2798 if (bp->b_shadow_ref) {
2799 SET(bp->b_lflags, BL_WAITSHADOW);
2800
2801 lck_mtx_unlock(&buf_mtx);
2802
2803 return;
2804 }
2805 if (delayed_buf_free_meta_store == TRUE) {
2806 lck_mtx_unlock(&buf_mtx);
2807 finish_shadow_master:
2808 buf_free_meta_store(bp);
2809
2810 lck_mtx_lock_spin(&buf_mtx);
2811 }
2812 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
2813
2814 if (bp->b_vp) {
2815 brelvp_locked(bp);
2816 }
2817
2818 bremhash(bp);
2819 BLISTNONE(bp);
2820 binshash(bp, &invalhash);
2821
2822 bp->b_whichq = BQ_EMPTY;
2823 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2824 } else {
2825 /*
2826 * It has valid data. Put it on the end of the appropriate
2827 * queue, so that it'll stick around for as long as possible.
2828 */
2829 if (ISSET(bp->b_flags, B_LOCKED)) {
2830 whichq = BQ_LOCKED; /* locked in core */
2831 } else if (ISSET(bp->b_flags, B_META)) {
2832 whichq = BQ_META; /* meta-data */
2833 } else if (ISSET(bp->b_flags, B_AGE)) {
2834 whichq = BQ_AGE; /* stale but valid data */
2835 } else {
2836 whichq = BQ_LRU; /* valid data */
2837 }
2838 bufq = &bufqueues[whichq];
2839
2840 bp->b_timestamp = buf_timestamp();
2841
2842 lck_mtx_lock_spin(&buf_mtx);
2843
2844 /*
2845 * the buf_brelse_shadow routine doesn't take 'ownership'
2846 * of the parent buf_t... it updates state that is protected by
2847 * the buf_mtx, and checks for BL_BUSY to determine whether to
2848 * put the buf_t back on a free list. b_shadow_ref is protected
2849 * by the lock, and since we have not yet cleared B_BUSY, we need
2850 * to check it while holding the lock to ensure that one of us
2851 * puts this buf_t back on a free list when it is safe to do so
2852 */
2853 if (bp->b_shadow_ref == 0) {
2854 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
2855 bp->b_whichq = whichq;
2856 binstailfree(bp, bufq, whichq);
2857 } else {
2858 /*
2859 * there are still cloned buf_t's pointing
2860 * at this guy... need to keep it off the
2861 * freelists until a buf_brelse is done on
2862 * the last clone
2863 */
2864 CLR(bp->b_flags, (B_ASYNC | B_NOCACHE));
2865 }
2866 }
2867 if (needbuffer) {
2868 /*
2869 * needbuffer is a global
2870 * we're currently using buf_mtx to protect it
2871 * delay doing the actual wakeup until after
2872 * we drop buf_mtx
2873 */
2874 needbuffer = 0;
2875 need_wakeup = 1;
2876 }
2877 if (ISSET(bp->b_lflags, BL_WANTED)) {
2878 /*
2879 * delay the actual wakeup until after we
2880 * clear BL_BUSY and we've dropped buf_mtx
2881 */
2882 need_bp_wakeup = 1;
2883 }
2884 /*
2885 * Unlock the buffer.
2886 */
2887 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2888 buf_busycount--;
2889
2890 lck_mtx_unlock(&buf_mtx);
2891
2892 if (need_wakeup) {
2893 /*
2894 * Wake up any processes waiting for any buffer to become free.
2895 */
2896 wakeup(&needbuffer);
2897 }
2898 if (need_bp_wakeup) {
2899 /*
2900 * Wake up any processes waiting for _this_ buffer to become free.
2901 */
2902 wakeup(bp);
2903 }
2904 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2905 bp, bp->b_datap, bp->b_flags, 0, 0);
2906 }
2907
2908 /*
2909 * Determine if a block is in the cache.
2910 * Just look at what would be its hash chain. If it's there, return
2911 * a pointer to it, unless it's marked invalid. If it's marked invalid,
2912 * we normally don't return the buffer, unless the caller explicitly
2913 * wants us to.
2914 */
2915 static boolean_t
2916 incore(vnode_t vp, daddr64_t blkno)
2917 {
2918 boolean_t retval;
2919 struct bufhashhdr *dp;
2920
2921 dp = BUFHASH(vp, blkno);
2922
2923 lck_mtx_lock_spin(&buf_mtx);
2924
2925 if (incore_locked(vp, blkno, dp)) {
2926 retval = TRUE;
2927 } else {
2928 retval = FALSE;
2929 }
2930 lck_mtx_unlock(&buf_mtx);
2931
2932 return retval;
2933 }
2934
2935
2936 static buf_t
2937 incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp)
2938 {
2939 struct buf *bp;
2940
2941 /* Search hash chain */
2942 for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
2943 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2944 !ISSET(bp->b_flags, B_INVAL)) {
2945 return bp;
2946 }
2947 }
2948 return NULL;
2949 }
2950
2951
2952 void
2953 buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno)
2954 {
2955 buf_t bp;
2956 struct bufhashhdr *dp;
2957
2958 dp = BUFHASH(vp, blkno);
2959
2960 lck_mtx_lock_spin(&buf_mtx);
2961
2962 for (;;) {
2963 if ((bp = incore_locked(vp, blkno, dp)) == NULL) {
2964 break;
2965 }
2966
2967 if (bp->b_shadow_ref == 0) {
2968 break;
2969 }
2970
2971 SET(bp->b_lflags, BL_WANTED_REF);
2972
2973 (void) msleep(bp, &buf_mtx, PSPIN | (PRIBIO + 1), "buf_wait_for_shadow", NULL);
2974 }
2975 lck_mtx_unlock(&buf_mtx);
2976 }
2977
2978 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2979 /*
2980 * Get a block of requested size that is associated with
2981 * a given vnode and block offset. If it is found in the
2982 * block cache, mark it as having been found, make it busy
2983 * and return it. Otherwise, return an empty block of the
2984 * correct size. It is up to the caller to ensure that the
2985 * cached blocks are of the correct size.
2986 */
2987 buf_t
2988 buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2989 {
2990 buf_t bp;
2991 int err;
2992 upl_t upl;
2993 upl_page_info_t *pl;
2994 kern_return_t kret;
2995 int ret_only_valid;
2996 struct timespec ts;
2997 int upl_flags;
2998 struct bufhashhdr *dp;
2999
3000 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
3001 (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0);
3002
3003 ret_only_valid = operation & BLK_ONLYVALID;
3004 operation &= ~BLK_ONLYVALID;
3005 dp = BUFHASH(vp, blkno);
3006 start:
3007 lck_mtx_lock_spin(&buf_mtx);
3008
3009 if ((bp = incore_locked(vp, blkno, dp))) {
3010 /*
3011 * Found in the Buffer Cache
3012 */
3013 if (ISSET(bp->b_lflags, BL_BUSY)) {
3014 /*
3015 * but is busy
3016 */
3017 switch (operation) {
3018 case BLK_READ:
3019 case BLK_WRITE:
3020 case BLK_META:
3021 SET(bp->b_lflags, BL_WANTED);
3022 bufstats.bufs_busyincore++;
3023
3024 /*
3025 * don't retake the mutex after being awakened...
3026 * the time out is in msecs
3027 */
3028 ts.tv_sec = (slptimeo / 1000);
3029 ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000;
3030
3031 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE,
3032 (uintptr_t)blkno, size, operation, 0, 0);
3033
3034 err = msleep(bp, &buf_mtx, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
3035
3036 /*
3037 * Callers who call with PCATCH or timeout are
3038 * willing to deal with the NULL pointer
3039 */
3040 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo))) {
3041 return NULL;
3042 }
3043 goto start;
3044 /*NOTREACHED*/
3045
3046 default:
3047 /*
3048 * unknown operation requested
3049 */
3050 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
3051 /*NOTREACHED*/
3052 break;
3053 }
3054 } else {
3055 int clear_bdone;
3056
3057 /*
3058 * buffer in core and not busy
3059 */
3060 SET(bp->b_lflags, BL_BUSY);
3061 SET(bp->b_flags, B_CACHE);
3062 buf_busycount++;
3063
3064 bremfree_locked(bp);
3065 bufstats.bufs_incore++;
3066
3067 lck_mtx_unlock(&buf_mtx);
3068 #ifdef JOE_DEBUG
3069 bp->b_owner = current_thread();
3070 bp->b_tag = 1;
3071 #endif
3072 if ((bp->b_upl)) {
3073 panic("buffer has UPL, but not marked BUSY: %p", bp);
3074 }
3075
3076 clear_bdone = FALSE;
3077 if (!ret_only_valid) {
3078 /*
3079 * If the number of bytes that are valid is going
3080 * to increase (even if we end up not doing a
3081 * reallocation through allocbuf) we have to read
3082 * the new size first.
3083 *
3084 * This is required in cases where we are doing a
3085 * read-modify-write of already valid data on disk, but
3086 * in cases where the data on disk beyond (blkno + b_bcount)
3087 * is invalid, we may end up doing extra I/O.
3088 */
3089 if (operation == BLK_META && bp->b_bcount < (uint32_t)size) {
3090 /*
3091 * Since we are going to read in the whole size first
3092 * we first have to ensure that any pending delayed write
3093 * is flushed to disk first.
3094 */
3095 if (ISSET(bp->b_flags, B_DELWRI)) {
3096 CLR(bp->b_flags, B_CACHE);
3097 buf_bwrite(bp);
3098 goto start;
3099 }
3100 /*
3101 * clear B_DONE before returning from
3102 * this function so that the caller can
3103 * issue a read for the new size.
3104 */
3105 clear_bdone = TRUE;
3106 }
3107
3108 if (bp->b_bufsize != (uint32_t)size) {
3109 allocbuf(bp, size);
3110 }
3111 }
3112
3113 upl_flags = 0;
3114 switch (operation) {
3115 case BLK_WRITE:
3116 /*
3117 * "write" operation: let the UPL subsystem
3118 * know that we intend to modify the buffer
3119 * cache pages we're gathering.
3120 */
3121 upl_flags |= UPL_WILL_MODIFY;
3122 OS_FALLTHROUGH;
3123 case BLK_READ:
3124 upl_flags |= UPL_PRECIOUS;
3125 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
3126 kret = ubc_create_upl_kernel(vp,
3127 ubc_blktooff(vp, bp->b_lblkno),
3128 bp->b_bufsize,
3129 &upl,
3130 &pl,
3131 upl_flags,
3132 VM_KERN_MEMORY_FILE);
3133 if (kret != KERN_SUCCESS) {
3134 panic("Failed to create UPL");
3135 }
3136
3137 bp->b_upl = upl;
3138
3139 if (upl_valid_page(pl, 0)) {
3140 if (upl_dirty_page(pl, 0)) {
3141 SET(bp->b_flags, B_WASDIRTY);
3142 } else {
3143 CLR(bp->b_flags, B_WASDIRTY);
3144 }
3145 } else {
3146 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
3147 }
3148
3149 kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap));
3150
3151 if (kret != KERN_SUCCESS) {
3152 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3153 }
3154 }
3155 break;
3156
3157 case BLK_META:
3158 /*
3159 * VM is not involved in I/O for the meta data;
3160 * the buffer already has valid data
3161 */
3162 break;
3163
3164 default:
3165 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
3166 /*NOTREACHED*/
3167 break;
3168 }
3169
3170 if (clear_bdone) {
3171 CLR(bp->b_flags, B_DONE);
3172 }
3173 }
3174 } else { /* not incore() */
3175 int queue = BQ_EMPTY; /* Start with no preference */
3176
3177 if (ret_only_valid) {
3178 lck_mtx_unlock(&buf_mtx);
3179 return NULL;
3180 }
3181 if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/) {
3182 operation = BLK_META;
3183 }
3184
3185 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL) {
3186 goto start;
3187 }
3188
3189 /*
3190 * getnewbuf may block for a number of different reasons...
3191 * if it does, it's then possible for someone else to
3192 * create a buffer for the same block and insert it into
3193 * the hash... if we see it incore at this point we dump
3194 * the buffer we were working on and start over
3195 */
3196 if (incore_locked(vp, blkno, dp)) {
3197 SET(bp->b_flags, B_INVAL);
3198 binshash(bp, &invalhash);
3199
3200 lck_mtx_unlock(&buf_mtx);
3201
3202 buf_brelse(bp);
3203 goto start;
3204 }
3205 /*
3206 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
3207 * CALLED! BE CAREFUL.
3208 */
3209
3210 /*
3211 * mark the buffer as B_META if indicated
3212 * so that when the buffer is released it will go to the META queue
3213 */
3214 if (operation == BLK_META) {
3215 SET(bp->b_flags, B_META);
3216 }
3217
3218 bp->b_blkno = bp->b_lblkno = blkno;
3219 bp->b_vp = vp;
3220
3221 /*
3222 * Insert in the hash so that incore() can find it
3223 */
3224 binshash(bp, BUFHASH(vp, blkno));
3225
3226 bgetvp_locked(vp, bp);
3227
3228 lck_mtx_unlock(&buf_mtx);
3229
3230 allocbuf(bp, size);
3231
3232 upl_flags = 0;
3233 switch (operation) {
3234 case BLK_META:
3235 /*
3236 * buffer data is invalid...
3237 *
3238 * I don't want to have to retake buf_mtx,
3239 * so the miss and vmhits counters are done
3240 * with Atomic updates... all other counters
3241 * in bufstats are protected with either
3242 * buf_mtx or iobuffer_mtxp
3243 */
3244 OSAddAtomicLong(1, &bufstats.bufs_miss);
3245 break;
3246
3247 case BLK_WRITE:
3248 /*
3249 * "write" operation: let the UPL subsystem know
3250 * that we intend to modify the buffer cache pages
3251 * we're gathering.
3252 */
3253 upl_flags |= UPL_WILL_MODIFY;
3254 OS_FALLTHROUGH;
3255 case BLK_READ:
3256 { off_t f_offset;
3257 size_t contig_bytes;
3258 int bmap_flags;
3259
3260 #if DEVELOPMENT || DEBUG
3261 /*
3262 * Apple-implemented file systems use UBC exclusively; they should
3263 * not call in here.
3264 */
3265 const char* excldfs[] = {"hfs", "afpfs", "smbfs", "acfs",
3266 "exfat", "msdos", "webdav", NULL};
3267
3268 for (int i = 0; excldfs[i] != NULL; i++) {
3269 if (vp->v_mount &&
3270 !strcmp(vp->v_mount->mnt_vfsstat.f_fstypename,
3271 excldfs[i])) {
3272 panic("%s %s calls buf_getblk",
3273 excldfs[i],
3274 operation == BLK_READ ? "BLK_READ" : "BLK_WRITE");
3275 }
3276 }
3277 #endif
3278
3279 if ((bp->b_upl)) {
3280 panic("bp already has UPL: %p", bp);
3281 }
3282
3283 f_offset = ubc_blktooff(vp, blkno);
3284
3285 upl_flags |= UPL_PRECIOUS;
3286 kret = ubc_create_upl_kernel(vp,
3287 f_offset,
3288 bp->b_bufsize,
3289 &upl,
3290 &pl,
3291 upl_flags,
3292 VM_KERN_MEMORY_FILE);
3293
3294 if (kret != KERN_SUCCESS) {
3295 panic("Failed to create UPL");
3296 }
3297 #if UPL_DEBUG
3298 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4);
3299 #endif /* UPL_DEBUG */
3300 bp->b_upl = upl;
3301
3302 if (upl_valid_page(pl, 0)) {
3303 if (operation == BLK_READ) {
3304 bmap_flags = VNODE_READ;
3305 } else {
3306 bmap_flags = VNODE_WRITE;
3307 }
3308
3309 SET(bp->b_flags, B_CACHE | B_DONE);
3310
3311 OSAddAtomicLong(1, &bufstats.bufs_vmhits);
3312
3313 bp->b_validoff = 0;
3314 bp->b_dirtyoff = 0;
3315
3316 if (upl_dirty_page(pl, 0)) {
3317 /* page is dirty */
3318 SET(bp->b_flags, B_WASDIRTY);
3319
3320 bp->b_validend = bp->b_bcount;
3321 bp->b_dirtyend = bp->b_bcount;
3322 } else {
3323 /* page is clean */
3324 bp->b_validend = bp->b_bcount;
3325 bp->b_dirtyend = 0;
3326 }
3327 /*
3328 * try to recreate the physical block number associated with
3329 * this buffer...
3330 */
3331 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL)) {
3332 panic("getblk: VNOP_BLOCKMAP failed");
3333 }
3334 /*
3335 * if the extent represented by this buffer
3336 * is not completely physically contiguous on
3337 * disk, then we can't cache the physical mapping
3338 * in the buffer header
3339 */
3340 if ((uint32_t)contig_bytes < bp->b_bcount) {
3341 bp->b_blkno = bp->b_lblkno;
3342 }
3343 } else {
3344 OSAddAtomicLong(1, &bufstats.bufs_miss);
3345 }
3346 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
3347
3348 if (kret != KERN_SUCCESS) {
3349 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3350 }
3351 break;} // end BLK_READ
3352 default:
3353 panic("getblk: paging or unknown operation - %x", operation);
3354 /*NOTREACHED*/
3355 break;
3356 } // end switch
3357 } //end buf_t !incore
3358
3359 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
3360 bp, bp->b_datap, bp->b_flags, 3, 0);
3361
3362 #ifdef JOE_DEBUG
3363 (void) OSBacktrace(&bp->b_stackgetblk[0], 6);
3364 #endif
3365 return bp;
3366 }
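
/*
 * Sketch of how a hypothetical caller could use buf_getblk() directly; most
 * callers go through buf_bread()/buf_meta_bread(), which wrap this same
 * pattern (see bio_doread() above):
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_META);
 *	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
 *		// block was not valid in the cache: the caller owns the busy
 *		// buffer and must issue the read (SET B_READ, VNOP_STRATEGY)
 *		// and buf_biowait() for it, or buf_brelse() the buffer.
 *	}
 */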
3367
3368 /*
3369 * Get an empty, disassociated buffer of given size.
3370 */
3371 buf_t
3372 buf_geteblk(int size)
3373 {
3374 buf_t bp = NULL;
3375 int queue = BQ_EMPTY;
3376
3377 do {
3378 lck_mtx_lock_spin(&buf_mtx);
3379
3380 bp = getnewbuf(0, 0, &queue);
3381 } while (bp == NULL);
3382
3383 SET(bp->b_flags, (B_META | B_INVAL));
3384
3385 #if DIAGNOSTIC
3386 assert(queue == BQ_EMPTY);
3387 #endif /* DIAGNOSTIC */
3388 /* XXX need to implement logic to deal with other queues */
3389
3390 binshash(bp, &invalhash);
3391 bufstats.bufs_eblk++;
3392
3393 lck_mtx_unlock(&buf_mtx);
3394
3395 allocbuf(bp, size);
3396
3397 return bp;
3398 }
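
/*
 * A buf_geteblk() buffer is anonymous scratch storage with no vnode behind
 * it.  A minimal sketch of a hypothetical user (the 8K size is an
 * assumption; allocbuf() rounds the request up to at least PAGE_SIZE):
 *
 *	buf_t ebp = buf_geteblk(8192);
 *	bzero((void *)buf_dataptr(ebp), 8192);
 *	// ... stage data in the buffer ...
 *	buf_brelse(ebp);	// B_INVAL is set, so it returns to the empty queue
 */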
3399
3400 uint32_t
3401 buf_redundancy_flags(buf_t bp)
3402 {
3403 return bp->b_redundancy_flags;
3404 }
3405
3406 void
3407 buf_set_redundancy_flags(buf_t bp, uint32_t flags)
3408 {
3409 SET(bp->b_redundancy_flags, flags);
3410 }
3411
3412 void
3413 buf_clear_redundancy_flags(buf_t bp, uint32_t flags)
3414 {
3415 CLR(bp->b_redundancy_flags, flags);
3416 }
3417
3418
3419
3420 static void *
3421 recycle_buf_from_pool(int nsize)
3422 {
3423 buf_t bp;
3424 void *ptr = NULL;
3425
3426 lck_mtx_lock_spin(&buf_mtx);
3427
3428 TAILQ_FOREACH(bp, &bufqueues[BQ_META], b_freelist) {
3429 if (ISSET(bp->b_flags, B_DELWRI) || bp->b_bufsize != (uint32_t)nsize) {
3430 continue;
3431 }
3432 ptr = (void *)bp->b_datap;
3433 bp->b_bufsize = 0;
3434
3435 bcleanbuf(bp, TRUE);
3436 break;
3437 }
3438 lck_mtx_unlock(&buf_mtx);
3439
3440 return ptr;
3441 }
3442
3443
3444
3445 int zalloc_nopagewait_failed = 0;
3446 int recycle_buf_failed = 0;
3447
3448 static void *
3449 grab_memory_for_meta_buf(int nsize)
3450 {
3451 void *ptr;
3452 boolean_t was_vmpriv;
3453
3454
3455 /*
3456 * make sure we're NOT privileged so that
3457 * if a vm_page_grab is needed, it won't
3458 * block if we're out of free pages... if
3459 * it blocks, then we can't honor the
3460 * nopagewait request
3461 */
3462 was_vmpriv = set_vm_privilege(FALSE);
3463
3464 ptr = kheap_alloc(KHEAP_VFS_BIO, nsize, Z_NOPAGEWAIT);
3465
3466 if (was_vmpriv == TRUE) {
3467 set_vm_privilege(TRUE);
3468 }
3469
3470 if (ptr == NULL) {
3471 zalloc_nopagewait_failed++;
3472
3473 ptr = recycle_buf_from_pool(nsize);
3474
3475 if (ptr == NULL) {
3476 recycle_buf_failed++;
3477
3478 if (was_vmpriv == FALSE) {
3479 set_vm_privilege(TRUE);
3480 }
3481
3482 ptr = kheap_alloc(KHEAP_VFS_BIO, nsize, Z_WAITOK);
3483
3484 if (was_vmpriv == FALSE) {
3485 set_vm_privilege(FALSE);
3486 }
3487 }
3488 }
3489 return ptr;
3490 }
3491
3492 /*
3493 * With UBC, there is no need to expand / shrink the file data
3494 * buffer. The VM uses the same pages, hence no waste.
3495 * All the file data buffers can have one size.
3496 * In fact expand / shrink would be an expensive operation.
3497 *
3498 * Only exception to this is meta-data buffers. Most of the
3499 * meta data operations are smaller than PAGE_SIZE. Having the
3500 * meta-data buffers grow and shrink as needed, optimizes use
3501 * of the kernel wired memory.
3502 */
3503
3504 int
3505 allocbuf(buf_t bp, int size)
3506 {
3507 vm_size_t desired_size;
3508
3509 desired_size = roundup(size, CLBYTES);
3510
3511 if (desired_size < PAGE_SIZE) {
3512 desired_size = PAGE_SIZE;
3513 }
3514 if (desired_size > MAXBSIZE) {
3515 panic("allocbuf: buffer larger than MAXBSIZE requested");
3516 }
3517
3518 if (ISSET(bp->b_flags, B_META)) {
3519 int nsize = roundup(size, MINMETA);
3520
3521 if (bp->b_datap) {
3522 void *elem = (void *)bp->b_datap;
3523
3524 if (ISSET(bp->b_flags, B_ZALLOC)) {
3525 if (bp->b_bufsize < (uint32_t)nsize) {
3526 /* reallocate to a bigger size */
3527
3528 if (nsize <= MAXMETA) {
3529 desired_size = nsize;
3530
3531 /* b_datap not really a ptr */
3532 *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
3533 } else {
3534 bp->b_datap = (uintptr_t)NULL;
3535 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3536 CLR(bp->b_flags, B_ZALLOC);
3537 }
3538 bcopy(elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3539 kheap_free(KHEAP_VFS_BIO, elem, bp->b_bufsize);
3540 } else {
3541 desired_size = bp->b_bufsize;
3542 }
3543 } else {
3544 if ((vm_size_t)bp->b_bufsize < desired_size) {
3545 /* reallocate to a bigger size */
3546 bp->b_datap = (uintptr_t)NULL;
3547 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3548 bcopy(elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3549 kmem_free(kernel_map, (vm_offset_t)elem, bp->b_bufsize);
3550 } else {
3551 desired_size = bp->b_bufsize;
3552 }
3553 }
3554 } else {
3555 /* new allocation */
3556 if (nsize <= MAXMETA) {
3557 desired_size = nsize;
3558
3559 /* b_datap not really a ptr */
3560 *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
3561 SET(bp->b_flags, B_ZALLOC);
3562 } else {
3563 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3564 }
3565 }
3566
3567 if (bp->b_datap == 0) {
3568 panic("allocbuf: NULL b_datap");
3569 }
3570 }
3571 bp->b_bufsize = (uint32_t)desired_size;
3572 bp->b_bcount = size;
3573
3574 return 0;
3575 }
3576
3577 /*
3578 * Get a new buffer from one of the free lists.
3579 *
3580 * A request for a queue is passed in. The queue from which the buffer
3581 * was taken is returned. Out of range queue requests get BQ_EMPTY. A request for
3582 * BQUEUE means no preference. Use heuristics in that case.
3583 * The heuristic is as follows:
3584 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
3585 * If none available block till one is made available.
3586 * If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
3587 * Pick the most stale buffer.
3588 * If found buffer was marked delayed write, start the async. write
3589 * and restart the search.
3590 * Initialize the fields and disassociate the buffer from the vnode.
3591 * Remove the buffer from the hash. Return the buffer and the queue
3592 * on which it was found.
3593 *
3594 * buf_mtx is held upon entry
3595 * returns with buf_mtx locked if new buf available
3596 * returns with buf_mtx UNlocked if new buf NOT available
3597 */
3598
3599 static buf_t
3600 getnewbuf(int slpflag, int slptimeo, int * queue)
3601 {
3602 buf_t bp;
3603 buf_t lru_bp;
3604 buf_t age_bp;
3605 buf_t meta_bp;
3606 int age_time, lru_time, bp_time, meta_time;
3607 int req = *queue; /* save it for restarts */
3608 struct timespec ts;
3609
3610 start:
3611 /*
3612 * invalid request gets empty queue
3613 */
3614 if ((*queue >= BQUEUES) || (*queue < 0)
3615 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED)) {
3616 *queue = BQ_EMPTY;
3617 }
3618
3619
3620 if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first)) {
3621 goto found;
3622 }
3623
3624 /*
3625 * need to grow number of bufs, add another one rather than recycling
3626 */
3627 if (nbuf_headers < max_nbuf_headers) {
3628 /*
3629 * Increment count now as lock
3630 * is dropped for allocation.
3631 * That avoids over commits
3632 */
3633 nbuf_headers++;
3634 goto add_newbufs;
3635 }
3636 /* Try for the requested queue first */
3637 bp = bufqueues[*queue].tqh_first;
3638 if (bp) {
3639 goto found;
3640 }
3641
3642 /* Unable to use requested queue */
3643 age_bp = bufqueues[BQ_AGE].tqh_first;
3644 lru_bp = bufqueues[BQ_LRU].tqh_first;
3645 meta_bp = bufqueues[BQ_META].tqh_first;
3646
3647 if (!age_bp && !lru_bp && !meta_bp) {
3648 /*
3649 * Unavailable on the AGE, LRU, or META queues
3650 * Try the empty list first
3651 */
3652 bp = bufqueues[BQ_EMPTY].tqh_first;
3653 if (bp) {
3654 *queue = BQ_EMPTY;
3655 goto found;
3656 }
3657 /*
3658 * We have seen that this is hard to trigger.
3659 * This is an overcommit of nbufs, but it is needed
3660 * in some scenarios with diskimages.
3661 */
3662
3663 add_newbufs:
3664 lck_mtx_unlock(&buf_mtx);
3665
3666 /* Create a new temporary buffer header */
3667 bp = (struct buf *)zalloc(buf_hdr_zone);
3668
3669 if (bp) {
3670 bufhdrinit(bp);
3671 bp->b_whichq = BQ_EMPTY;
3672 bp->b_timestamp = buf_timestamp();
3673 BLISTNONE(bp);
3674 SET(bp->b_flags, B_HDRALLOC);
3675 *queue = BQ_EMPTY;
3676 }
3677 lck_mtx_lock_spin(&buf_mtx);
3678
3679 if (bp) {
3680 binshash(bp, &invalhash);
3681 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3682 buf_hdr_count++;
3683 goto found;
3684 }
3685 /* subtract already accounted bufcount */
3686 nbuf_headers--;
3687
3688 bufstats.bufs_sleeps++;
3689
3690 /* wait for a free buffer of any kind */
3691 needbuffer = 1;
3692 /* hz value is 100 */
3693 ts.tv_sec = (slptimeo / 1000);
3694 /* the hz value is 100, which leads to 10ms */
3695 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
3696
3697 msleep(&needbuffer, &buf_mtx, slpflag | PDROP | (PRIBIO + 1), "getnewbuf", &ts);
3698 return NULL;
3699 }
3700
3701 /* Buffer available either on AGE or LRU or META */
3702 bp = NULL;
3703 *queue = -1;
3704
3705 /* Buffer available either on AGE or LRU */
3706 if (!age_bp) {
3707 bp = lru_bp;
3708 *queue = BQ_LRU;
3709 } else if (!lru_bp) {
3710 bp = age_bp;
3711 *queue = BQ_AGE;
3712 } else { /* buffer available on both AGE and LRU */
3713 int t = buf_timestamp();
3714
3715 age_time = t - age_bp->b_timestamp;
3716 lru_time = t - lru_bp->b_timestamp;
3717 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
3718 bp = age_bp;
3719 *queue = BQ_AGE;
3720 /*
3721 * we should probably re-timestamp everything in the
3722 * queues at this point with the current time
3723 */
3724 } else {
3725 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
3726 bp = lru_bp;
3727 *queue = BQ_LRU;
3728 } else {
3729 bp = age_bp;
3730 *queue = BQ_AGE;
3731 }
3732 }
3733 }
3734
3735 if (!bp) { /* Neither on AGE nor on LRU */
3736 bp = meta_bp;
3737 *queue = BQ_META;
3738 } else if (meta_bp) {
3739 int t = buf_timestamp();
3740
3741 bp_time = t - bp->b_timestamp;
3742 meta_time = t - meta_bp->b_timestamp;
3743
3744 if (!(bp_time < 0) && !(meta_time < 0)) {
3745 /* time not set backwards */
3746 int bp_is_stale;
3747 bp_is_stale = (*queue == BQ_LRU) ?
3748 lru_is_stale : age_is_stale;
3749
3750 if ((meta_time >= meta_is_stale) &&
3751 (bp_time < bp_is_stale)) {
3752 bp = meta_bp;
3753 *queue = BQ_META;
3754 }
3755 }
3756 }
3757 found:
3758 if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY)) {
3759 panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
3760 }
3761
3762 /* Clean it */
3763 if (bcleanbuf(bp, FALSE)) {
3764 /*
3765 * moved to the laundry thread, buffer not ready
3766 */
3767 *queue = req;
3768 goto start;
3769 }
3770 return bp;
3771 }
3772
3773
3774 /*
3775 * Clean a buffer.
3776 * Returns 0 if buffer is ready to use,
3777 * Returns 1 if issued a buf_bawrite() to indicate
3778 * that the buffer is not ready.
3779 *
3780 * buf_mtx is held upon entry
3781 * returns with buf_mtx locked
3782 */
3783 int
3784 bcleanbuf(buf_t bp, boolean_t discard)
3785 {
3786 /* Remove from the queue */
3787 bremfree_locked(bp);
3788
3789 #ifdef JOE_DEBUG
3790 bp->b_owner = current_thread();
3791 bp->b_tag = 2;
3792 #endif
3793 /*
3794 * If the buffer was a delayed write, start the I/O by queuing
3795 * it on the LAUNDRY queue, and return 1
3796 */
3797 if (ISSET(bp->b_flags, B_DELWRI)) {
3798 if (discard) {
3799 SET(bp->b_lflags, BL_WANTDEALLOC);
3800 }
3801
3802 bmovelaundry(bp);
3803
3804 lck_mtx_unlock(&buf_mtx);
3805
3806 wakeup(&bufqueues[BQ_LAUNDRY]);
3807 /*
3808 * and give it a chance to run
3809 */
3810 (void)thread_block(THREAD_CONTINUE_NULL);
3811
3812 lck_mtx_lock_spin(&buf_mtx);
3813
3814 return 1;
3815 }
3816 #ifdef JOE_DEBUG
3817 bp->b_owner = current_thread();
3818 bp->b_tag = 8;
3819 #endif
3820 /*
3821 * Buffer is no longer on any free list... we own it
3822 */
3823 SET(bp->b_lflags, BL_BUSY);
3824 buf_busycount++;
3825
3826 bremhash(bp);
3827
3828 /*
3829 * disassociate us from our vnode, if we had one...
3830 */
3831 if (bp->b_vp) {
3832 brelvp_locked(bp);
3833 }
3834
3835 lck_mtx_unlock(&buf_mtx);
3836
3837 BLISTNONE(bp);
3838
3839 if (ISSET(bp->b_flags, B_META)) {
3840 buf_free_meta_store(bp);
3841 }
3842
3843 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
3844
3845 buf_release_credentials(bp);
3846
3847 /* If discarding, just move to the empty queue */
3848 if (discard) {
3849 lck_mtx_lock_spin(&buf_mtx);
3850 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
3851 bp->b_whichq = BQ_EMPTY;
3852 binshash(bp, &invalhash);
3853 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3854 CLR(bp->b_lflags, BL_BUSY);
3855 buf_busycount--;
3856 } else {
3857 /* Not discarding: clean up and prepare for reuse */
3858 bp->b_bufsize = 0;
3859 bp->b_datap = (uintptr_t)NULL;
3860 bp->b_upl = (void *)NULL;
3861 bp->b_fsprivate = (void *)NULL;
3862 /*
3863 * preserve the state of whether this buffer
3864 * was allocated on the fly or not...
3865 * the only other flag that should be set at
3866 * this point is BL_BUSY...
3867 */
3868 #ifdef JOE_DEBUG
3869 bp->b_owner = current_thread();
3870 bp->b_tag = 3;
3871 #endif
3872 bp->b_lflags = BL_BUSY;
3873 bp->b_flags = (bp->b_flags & B_HDRALLOC);
3874 bp->b_redundancy_flags = 0;
3875 bp->b_dev = NODEV;
3876 bp->b_blkno = bp->b_lblkno = 0;
3877 bp->b_iodone = NULL;
3878 bp->b_error = 0;
3879 bp->b_resid = 0;
3880 bp->b_bcount = 0;
3881 bp->b_dirtyoff = bp->b_dirtyend = 0;
3882 bp->b_validoff = bp->b_validend = 0;
3883 bzero(&bp->b_attr, sizeof(struct bufattr));
3884
3885 lck_mtx_lock_spin(&buf_mtx);
3886 }
3887 return 0;
3888 }
3889
3890
3891
3892 errno_t
3893 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
3894 {
3895 buf_t bp;
3896 errno_t error;
3897 struct bufhashhdr *dp;
3898
3899 dp = BUFHASH(vp, lblkno);
3900
3901 relook:
3902 lck_mtx_lock_spin(&buf_mtx);
3903
3904 if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
3905 lck_mtx_unlock(&buf_mtx);
3906 return 0;
3907 }
3908 if (ISSET(bp->b_lflags, BL_BUSY)) {
3909 if (!ISSET(flags, BUF_WAIT)) {
3910 lck_mtx_unlock(&buf_mtx);
3911 return EBUSY;
3912 }
3913 SET(bp->b_lflags, BL_WANTED);
3914
3915 error = msleep((caddr_t)bp, &buf_mtx, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
3916
3917 if (error) {
3918 return error;
3919 }
3920 goto relook;
3921 }
3922 bremfree_locked(bp);
3923 SET(bp->b_lflags, BL_BUSY);
3924 SET(bp->b_flags, B_INVAL);
3925 buf_busycount++;
3926 #ifdef JOE_DEBUG
3927 bp->b_owner = current_thread();
3928 bp->b_tag = 4;
3929 #endif
3930 lck_mtx_unlock(&buf_mtx);
3931 buf_brelse(bp);
3932
3933 return 0;
3934 }
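
/*
 * Hypothetical sketch: a file system that has just freed a block can discard
 * any cached copy of it; BUF_WAIT makes the call sleep if the buffer is busy
 * instead of returning EBUSY:
 *
 *	(void) buf_invalblkno(vp, lblkno, BUF_WAIT);
 */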
3935
3936
3937 void
3938 buf_drop(buf_t bp)
3939 {
3940 int need_wakeup = 0;
3941
3942 lck_mtx_lock_spin(&buf_mtx);
3943
3944 if (ISSET(bp->b_lflags, BL_WANTED)) {
3945 /*
3946 * delay the actual wakeup until after we
3947 * clear BL_BUSY and we've dropped buf_mtx
3948 */
3949 need_wakeup = 1;
3950 }
3951 #ifdef JOE_DEBUG
3952 bp->b_owner = current_thread();
3953 bp->b_tag = 9;
3954 #endif
3955 /*
3956 * Unlock the buffer.
3957 */
3958 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
3959 buf_busycount--;
3960
3961 lck_mtx_unlock(&buf_mtx);
3962
3963 if (need_wakeup) {
3964 /*
3965 * Wake up any processes waiting for _this_ buffer to become free.
3966 */
3967 wakeup(bp);
3968 }
3969 }
3970
3971
3972 errno_t
3973 buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo)
3974 {
3975 errno_t error;
3976
3977 lck_mtx_lock_spin(&buf_mtx);
3978
3979 error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
3980
3981 lck_mtx_unlock(&buf_mtx);
3982
3983 return error;
3984 }
3985
3986
3987 static errno_t
3988 buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
3989 {
3990 errno_t error;
3991 struct timespec ts;
3992
3993 if (ISSET(bp->b_flags, B_LOCKED)) {
3994 if ((flags & BAC_SKIP_LOCKED)) {
3995 return EDEADLK;
3996 }
3997 } else {
3998 if ((flags & BAC_SKIP_NONLOCKED)) {
3999 return EDEADLK;
4000 }
4001 }
4002 if (ISSET(bp->b_lflags, BL_BUSY)) {
4003 /*
4004 * since the lck_mtx_lock may block, the buffer
4005 * may become BUSY, so we need to
4006 * recheck for a NOWAIT request
4007 */
4008 if (flags & BAC_NOWAIT) {
4009 return EBUSY;
4010 }
4011 SET(bp->b_lflags, BL_WANTED);
4012
4013 /* the hz value is 100, which leads to 10ms */
4014 ts.tv_sec = (slptimeo / 100);
4015 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
4016 error = msleep((caddr_t)bp, &buf_mtx, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
4017
4018 if (error) {
4019 return error;
4020 }
4021 return EAGAIN;
4022 }
4023 if (flags & BAC_REMOVE) {
4024 bremfree_locked(bp);
4025 }
4026 SET(bp->b_lflags, BL_BUSY);
4027 buf_busycount++;
4028
4029 #ifdef JOE_DEBUG
4030 bp->b_owner = current_thread();
4031 bp->b_tag = 5;
4032 #endif
4033 return 0;
4034 }
4035
4036
4037 /*
4038 * Wait for operations on the buffer to complete.
4039 * When they do, extract and return the I/O's error value.
4040 */
4041 errno_t
4042 buf_biowait(buf_t bp)
4043 {
4044 while (!ISSET(bp->b_flags, B_DONE)) {
4045 lck_mtx_lock_spin(&buf_mtx);
4046
4047 if (!ISSET(bp->b_flags, B_DONE)) {
4048 DTRACE_IO1(wait__start, buf_t, bp);
4049 (void) msleep(bp, &buf_mtx, PDROP | (PRIBIO + 1), "buf_biowait", NULL);
4050 DTRACE_IO1(wait__done, buf_t, bp);
4051 } else {
4052 lck_mtx_unlock(&buf_mtx);
4053 }
4054 }
4055 /* check for interruption of I/O (e.g. via NFS), then errors. */
4056 if (ISSET(bp->b_flags, B_EINTR)) {
4057 CLR(bp->b_flags, B_EINTR);
4058 return EINTR;
4059 } else if (ISSET(bp->b_flags, B_ERROR)) {
4060 return bp->b_error ? bp->b_error : EIO;
4061 } else {
4062 return 0;
4063 }
4064 }
4065
4066
4067 /*
4068 * Mark I/O complete on a buffer.
4069 *
4070 * If a callback has been requested, e.g. the pageout
4071 * daemon, do so. Otherwise, awaken waiting processes.
4072 *
4073 * [ Leffler, et al., says on p.247:
4074 * "This routine wakes up the blocked process, frees the buffer
4075 * for an asynchronous write, or, for a request by the pagedaemon
4076 * process, invokes a procedure specified in the buffer structure" ]
4077 *
4078 * In real life, the pagedaemon (or other system processes) wants
4079 * to do async stuff too, and doesn't want the buffer buf_brelse()'d.
4080 * (for swap pager, that puts swap buffers on the free lists (!!!),
4081 * for the vn device, that puts malloc'd buffers on the free lists!)
4082 */
4083
4084 void
4085 buf_biodone(buf_t bp)
4086 {
4087 mount_t mp;
4088 struct bufattr *bap;
4089 struct timeval real_elapsed;
4090 uint64_t real_elapsed_usec = 0;
4091
4092 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
4093 bp, bp->b_datap, bp->b_flags, 0, 0);
4094
4095 if (ISSET(bp->b_flags, B_DONE)) {
4096 panic("biodone already");
4097 }
4098
4099 bap = &bp->b_attr;
4100
4101 if (bp->b_vp && bp->b_vp->v_mount) {
4102 mp = bp->b_vp->v_mount;
4103 } else {
4104 mp = NULL;
4105 }
4106
4107 if (ISSET(bp->b_flags, B_ERROR)) {
4108 if (mp && (MNT_ROOTFS & mp->mnt_flag)) {
4109 dk_error_description_t desc;
4110 bzero(&desc, sizeof(desc));
4111 desc.description = panic_disk_error_description;
4112 desc.description_size = panic_disk_error_description_size;
4113 VNOP_IOCTL(mp->mnt_devvp, DKIOCGETERRORDESCRIPTION, (caddr_t)&desc, 0, vfs_context_kernel());
4114 }
4115 }
4116
4117 if (mp && (bp->b_flags & B_READ) == 0) {
4118 update_last_io_time(mp);
4119 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
4120 } else if (mp) {
4121 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
4122 }
4123
4124 throttle_info_end_io(bp);
4125
4126 if (kdebug_enable) {
4127 int code = DKIO_DONE;
4128 int io_tier = GET_BUFATTR_IO_TIER(bap);
4129
4130 if (bp->b_flags & B_READ) {
4131 code |= DKIO_READ;
4132 }
4133 if (bp->b_flags & B_ASYNC) {
4134 code |= DKIO_ASYNC;
4135 }
4136
4137 if (bp->b_flags & B_META) {
4138 code |= DKIO_META;
4139 } else if (bp->b_flags & B_PAGEIO) {
4140 code |= DKIO_PAGING;
4141 }
4142
4143 if (io_tier != 0) {
4144 code |= DKIO_THROTTLE;
4145 }
4146
4147 code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
4148
4149 if (bp->b_flags & B_PASSIVE) {
4150 code |= DKIO_PASSIVE;
4151 }
4152
4153 if (bap->ba_flags & BA_NOCACHE) {
4154 code |= DKIO_NOCACHE;
4155 }
4156
4157 if (bap->ba_flags & BA_IO_TIER_UPGRADE) {
4158 code |= DKIO_TIER_UPGRADE;
4159 }
4160
4161 KDBG_RELEASE_NOPROCFILT(FSDBG_CODE(DBG_DKRW, code),
4162 buf_kernel_addrperm_addr(bp),
4163 (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid,
4164 bp->b_error);
4165 }
4166
4167 microuptime(&real_elapsed);
4168 timevalsub(&real_elapsed, &bp->b_timestamp_tv);
4169 real_elapsed_usec = real_elapsed.tv_sec * USEC_PER_SEC + real_elapsed.tv_usec;
4170 disk_conditioner_delay(bp, 1, bp->b_bcount, real_elapsed_usec);
4171
4172 /*
4173 * I/O was done, so don't believe
4174 * the DIRTY state from VM anymore...
4175 * and we need to reset the THROTTLED/PASSIVE
4176 * indicators
4177 */
4178 CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
4179 CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP | BA_IO_TIER_UPGRADE));
4180
4181 SET_BUFATTR_IO_TIER(bap, 0);
4182
4183 DTRACE_IO1(done, buf_t, bp);
4184
4185 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW)) {
4186 /*
4187 * wake up any writer's blocked
4188 * on throttle or waiting for I/O
4189 * to drain
4190 */
4191 vnode_writedone(bp->b_vp);
4192 }
4193
4194 if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */
4195 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
4196 void *arg = bp->b_transaction;
4197 int callout = ISSET(bp->b_flags, B_CALL);
4198
4199 if (iodone_func == NULL) {
4200 panic("biodone: bp @ %p has NULL b_iodone!\n", bp);
4201 }
4202
4203 CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */
4204 bp->b_iodone = NULL;
4205 bp->b_transaction = NULL;
4206
4207 if (callout) {
4208 SET(bp->b_flags, B_DONE); /* note that it's done */
4209 }
4210 (*iodone_func)(bp, arg);
4211
4212 if (callout) {
4213 /*
4214 * assumes that the callback function takes
4215 * ownership of the bp and deals with releasing it if necessary
4216 */
4217 goto biodone_done;
4218 }
4219 /*
4220 * in this case the call back function is acting
4221 * strictly as a filter... it does not take
4222 * ownership of the bp and is expecting us
4223 * to finish cleaning up... this is currently used
4224 * by the HFS journaling code
4225 */
4226 }
4227 if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
4228 SET(bp->b_flags, B_DONE); /* note that it's done */
4229
4230 buf_brelse(bp);
4231 } else { /* or just wakeup the buffer */
4232 /*
4233 * by taking the mutex, we serialize
4234 * the buf owner calling buf_biowait so that we'll
4235 * only see him in one of 2 states...
4236 * state 1: B_DONE wasn't set and he's
4237 * blocked in msleep
4238 * state 2: he's blocked trying to take the
4239 * mutex before looking at B_DONE
4240 * BL_WANTED is cleared in case anyone else
4241 * is blocked waiting for the buffer... note
4242 * that we haven't cleared B_BUSY yet, so if
4243 * they do get to run, they're going to re-set
4244 * BL_WANTED and go back to sleep
4245 */
4246 lck_mtx_lock_spin(&buf_mtx);
4247
4248 CLR(bp->b_lflags, BL_WANTED);
4249 SET(bp->b_flags, B_DONE); /* note that it's done */
4250
4251 lck_mtx_unlock(&buf_mtx);
4252
4253 wakeup(bp);
4254 }
4255 biodone_done:
4256 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
4257 (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0);
4258 }
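
/*
 * Sketch of the B_CALL path a hypothetical caller might set up: register a
 * completion callback before issuing the I/O and let buf_biodone() make the
 * callout instead of waking a waiter.  The callback then owns the bp and is
 * responsible for releasing it.  my_iodone/my_arg are assumed names:
 *
 *	static void
 *	my_iodone(buf_t bp, void *arg)
 *	{
 *		// consume buf_error(bp) / buf_resid(bp), then release
 *		buf_brelse(bp);
 *	}
 *	...
 *	buf_setcallback(bp, my_iodone, my_arg);	// arranges the B_CALL callout
 *	VNOP_STRATEGY(bp);
 */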
4259
4260 /*
4261 * Obfuscate buf pointers.
4262 */
4263 vm_offset_t
4264 buf_kernel_addrperm_addr(void * addr)
4265 {
4266 if ((vm_offset_t)addr == 0) {
4267 return 0;
4268 } else {
4269 return (vm_offset_t)addr + buf_kernel_addrperm;
4270 }
4271 }
4272
4273 /*
4274 * Return a count of buffers on the "locked" queue.
4275 */
4276 int
4277 count_lock_queue(void)
4278 {
4279 buf_t bp;
4280 int n = 0;
4281
4282 lck_mtx_lock_spin(&buf_mtx);
4283
4284 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
4285 bp = bp->b_freelist.tqe_next) {
4286 n++;
4287 }
4288 lck_mtx_unlock(&buf_mtx);
4289
4290 return n;
4291 }
4292
4293 /*
4294 * Return a count of 'busy' buffers. Used at the time of shutdown.
4295 * note: This is also called from the mach side in debug context in kdp.c
4296 */
4297 uint32_t
4298 count_busy_buffers(void)
4299 {
4300 return buf_busycount + bufstats.bufs_iobufinuse;
4301 }
4302
4303 #if DIAGNOSTIC
4304 /*
4305 * Print out statistics on the current allocation of the buffer pool.
4306 * Can be enabled to print out on every ``sync'' by setting "syncprt"
4307 * in vfs_syscalls.c using sysctl.
4308 */
4309 void
4310 vfs_bufstats()
4311 {
4312 int i, j, count;
4313 struct buf *bp;
4314 struct bqueues *dp;
4315 int counts[MAXBSIZE / CLBYTES + 1];
4316 static char *bname[BQUEUES] =
4317 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4318
4319 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
4320 count = 0;
4321 for (j = 0; j <= MAXBSIZE / CLBYTES; j++) {
4322 counts[j] = 0;
4323 }
4324
4325 lck_mtx_lock(&buf_mtx);
4326
4327 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
4328 counts[bp->b_bufsize / CLBYTES]++;
4329 count++;
4330 }
4331 lck_mtx_unlock(&buf_mtx);
4332
4333 printf("%s: total-%d", bname[i], count);
4334 for (j = 0; j <= MAXBSIZE / CLBYTES; j++) {
4335 if (counts[j] != 0) {
4336 printf(", %d-%d", j * CLBYTES, counts[j]);
4337 }
4338 }
4339 printf("\n");
4340 }
4341 }
4342 #endif /* DIAGNOSTIC */
4343
4344 #define NRESERVEDIOBUFS 128
4345
4346 #define MNT_VIRTUALDEV_MAX_IOBUFS 16
4347 #define VIRTUALDEV_MAX_IOBUFS ((40*niobuf_headers)/100)
4348
4349 buf_t
4350 alloc_io_buf(vnode_t vp, int priv)
4351 {
4352 buf_t bp;
4353 mount_t mp = NULL;
4354 int alloc_for_virtualdev = FALSE;
4355
4356 lck_mtx_lock_spin(&iobuffer_mtxp);
4357
4358 /*
4359 * We subject iobuf requests for diskimages to additional restrictions.
4360 *
4361 * a) A single diskimage mount cannot use up more than
4362 * MNT_VIRTUALDEV_MAX_IOBUFS. However, vm privileged (pageout) requests
4363 * are not subject to this restriction.
4364 * b) iobuf headers used by all diskimage mounts combined
4365 * cannot exceed VIRTUALDEV_MAX_IOBUFS.
4366 */
4367 if (vp && ((mp = vp->v_mount)) && mp != dead_mountp &&
4368 mp->mnt_kern_flag & MNTK_VIRTUALDEV) {
4369 alloc_for_virtualdev = TRUE;
4370 while ((!priv && mp->mnt_iobufinuse > MNT_VIRTUALDEV_MAX_IOBUFS) ||
4371 bufstats.bufs_iobufinuse_vdev > VIRTUALDEV_MAX_IOBUFS) {
4372 bufstats.bufs_iobufsleeps++;
4373
4374 need_iobuffer = 1;
4375 (void)msleep(&need_iobuffer, &iobuffer_mtxp,
4376 PSPIN | (PRIBIO + 1), (const char *)"alloc_io_buf (1)",
4377 NULL);
4378 }
4379 }
4380
4381 while ((((uint32_t)(niobuf_headers - NRESERVEDIOBUFS) < bufstats.bufs_iobufinuse) && !priv) ||
4382 (bp = iobufqueue.tqh_first) == NULL) {
4383 bufstats.bufs_iobufsleeps++;
4384
4385 need_iobuffer = 1;
4386 (void)msleep(&need_iobuffer, &iobuffer_mtxp, PSPIN | (PRIBIO + 1),
4387 (const char *)"alloc_io_buf (2)", NULL);
4388 }
4389 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
4390
4391 bufstats.bufs_iobufinuse++;
4392 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax) {
4393 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
4394 }
4395
4396 if (alloc_for_virtualdev) {
4397 mp->mnt_iobufinuse++;
4398 bufstats.bufs_iobufinuse_vdev++;
4399 }
4400
4401 lck_mtx_unlock(&iobuffer_mtxp);
4402
4403 /*
4404 * initialize various fields
4405 * we don't need to hold the mutex since the buffer
4406 * is now private... the vp should have a reference
4407 * on it and is not protected by this mutex in any event
4408 */
4409 bp->b_timestamp = 0;
4410 bp->b_proc = NULL;
4411
4412 bp->b_datap = 0;
4413 bp->b_flags = 0;
4414 bp->b_lflags = BL_BUSY | BL_IOBUF;
4415 if (alloc_for_virtualdev) {
4416 bp->b_lflags |= BL_IOBUF_VDEV;
4417 }
4418 bp->b_redundancy_flags = 0;
4419 bp->b_blkno = bp->b_lblkno = 0;
4420 #ifdef JOE_DEBUG
4421 bp->b_owner = current_thread();
4422 bp->b_tag = 6;
4423 #endif
4424 bp->b_iodone = NULL;
4425 bp->b_error = 0;
4426 bp->b_resid = 0;
4427 bp->b_bcount = 0;
4428 bp->b_bufsize = 0;
4429 bp->b_upl = NULL;
4430 bp->b_fsprivate = (void *)NULL;
4431 bp->b_vp = vp;
4432 bzero(&bp->b_attr, sizeof(struct bufattr));
4433
4434 if (vp && (vp->v_type == VBLK || vp->v_type == VCHR)) {
4435 bp->b_dev = vp->v_rdev;
4436 } else {
4437 bp->b_dev = NODEV;
4438 }
4439
4440 return bp;
4441 }
4442
4443
4444 void
4445 free_io_buf(buf_t bp)
4446 {
4447 int need_wakeup = 0;
4448 int free_for_virtualdev = FALSE;
4449 mount_t mp = NULL;
4450
4451 /* Was this iobuf for a diskimage? */
4452 if (bp->b_lflags & BL_IOBUF_VDEV) {
4453 free_for_virtualdev = TRUE;
4454 if (bp->b_vp) {
4455 mp = bp->b_vp->v_mount;
4456 }
4457 }
4458
4459 /*
4460 * put buffer back on the head of the iobufqueue
4461 */
4462 bp->b_vp = NULL;
4463 bp->b_flags = B_INVAL;
4464
4465 /* Zero out the bufattr and its flags before relinquishing this iobuf */
4466 bzero(&bp->b_attr, sizeof(struct bufattr));
4467
4468 lck_mtx_lock_spin(&iobuffer_mtxp);
4469
4470 binsheadfree(bp, &iobufqueue, -1);
4471
4472 if (need_iobuffer) {
4473 /*
4474 * Wake up any processes waiting because they need an io buffer
4475 *
4476 * do the wakeup after we drop the mutex... it's possible that the
4477 * wakeup will be superfluous if need_iobuffer gets set again and
4478 * another thread runs this path, but it's highly unlikely, doesn't
4479 * hurt, and it means we don't hold up I/O progress if the wakeup blocks
4480 * trying to grab a task related lock...
4481 */
4482 need_iobuffer = 0;
4483 need_wakeup = 1;
4484 }
4485 if (bufstats.bufs_iobufinuse <= 0) {
4486 panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse <= 0", bp);
4487 }
4488
4489 bufstats.bufs_iobufinuse--;
4490
4491 if (free_for_virtualdev) {
4492 bufstats.bufs_iobufinuse_vdev--;
4493 if (mp && mp != dead_mountp) {
4494 mp->mnt_iobufinuse--;
4495 }
4496 }
4497
4498 lck_mtx_unlock(&iobuffer_mtxp);
4499
4500 if (need_wakeup) {
4501 wakeup(&need_iobuffer);
4502 }
4503 }
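
/*
 * Usage sketch (illustrative only, compiled out): callers inside the kernel
 * pair alloc_io_buf() with free_io_buf() around a single transfer.  The
 * helper below is hypothetical; it assumes the exported buf_set*() and
 * buf_biowait() accessors plus VNOP_STRATEGY(), and is not part of this
 * file's API.
 */
#if 0
static int
example_io_buf_read(vnode_t devvp, daddr64_t blkno, caddr_t data, int size)
{
	buf_t   bp;
	int     error;

	/* grab a private iobuf header; may block until one becomes available */
	bp = alloc_io_buf(devvp, 0);

	/* describe the transfer */
	buf_setflags(bp, B_READ);
	buf_setblkno(bp, blkno);
	buf_setlblkno(bp, blkno);
	buf_setcount(bp, (uint32_t)size);
	buf_setdataptr(bp, (uintptr_t)data);

	/* hand the buffer to the driver and wait for biodone() */
	(void)VNOP_STRATEGY(bp);
	error = buf_biowait(bp);

	/* always return the header to the iobuf pool */
	free_io_buf(bp);

	return error;
}
#endif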
4504
4505
4506 void
4507 buf_list_lock(void)
4508 {
4509 lck_mtx_lock_spin(&buf_mtx);
4510 }
4511
4512 void
4513 buf_list_unlock(void)
4514 {
4515 lck_mtx_unlock(&buf_mtx);
4516 }
4517
4518 /*
4519 * If getnewbuf() calls bcleanbuf() on the same thread
4520 * there is a potential for stack overrun and deadlocks.
4521 * So we always hand off the work to a worker thread for completion.
4522 */
4523
4524
4525 static void
4526 bcleanbuf_thread_init(void)
4527 {
4528 thread_t thread = THREAD_NULL;
4529
4530 /* create worker thread */
4531 kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread);
4532 thread_deallocate(thread);
4533 }
4534
4535 typedef int (*bcleanbufcontinuation)(int);
4536
4537 __attribute__((noreturn))
4538 static void
4539 bcleanbuf_thread(void)
4540 {
4541 struct buf *bp;
4542 int error = 0;
4543 int loopcnt = 0;
4544
4545 for (;;) {
4546 lck_mtx_lock_spin(&buf_mtx);
4547
4548 while ((bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) {
4549 (void)msleep0(&bufqueues[BQ_LAUNDRY], &buf_mtx, PRIBIO | PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread);
4550 }
4551
4552 /*
4553 * Remove from the queue
4554 */
4555 bremfree_locked(bp);
4556
4557 /*
4558 * Buffer is no longer on any free list
4559 */
4560 SET(bp->b_lflags, BL_BUSY);
4561 buf_busycount++;
4562
4563 #ifdef JOE_DEBUG
4564 bp->b_owner = current_thread();
4565 bp->b_tag = 10;
4566 #endif
4567
4568 lck_mtx_unlock(&buf_mtx);
4569 /*
4570 * do the IO
4571 */
4572 error = bawrite_internal(bp, 0);
4573
4574 if (error) {
4575 bp->b_whichq = BQ_LAUNDRY;
4576 bp->b_timestamp = buf_timestamp();
4577
4578 lck_mtx_lock_spin(&buf_mtx);
4579
4580 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
4581 blaundrycnt++;
4582
4583 /* we never leave a busy buffer on the laundry queue */
4584 CLR(bp->b_lflags, BL_BUSY);
4585 buf_busycount--;
4586 #ifdef JOE_DEBUG
4587 bp->b_owner = current_thread();
4588 bp->b_tag = 11;
4589 #endif
4590
4591 lck_mtx_unlock(&buf_mtx);
4592
4593 if (loopcnt > MAXLAUNDRY) {
4594 /*
4595 * bawrite_internal() can return errors if we're throttled. If we've
4596 * done several I/Os and failed, give the system some time to unthrottle
4597 * the vnode
4598 */
4599 (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1);
4600 loopcnt = 0;
4601 } else {
4602 /* give other threads a chance to run */
4603 (void)thread_block(THREAD_CONTINUE_NULL);
4604 loopcnt++;
4605 }
4606 }
4607 }
4608 }
4609
4610
4611 static int
4612 brecover_data(buf_t bp)
4613 {
4614 int upl_offset;
4615 upl_t upl;
4616 upl_page_info_t *pl;
4617 kern_return_t kret;
4618 vnode_t vp = bp->b_vp;
4619 int upl_flags;
4620
4621
4622 if (!UBCINFOEXISTS(vp) || bp->b_bufsize == 0) {
4623 goto dump_buffer;
4624 }
4625
4626 upl_flags = UPL_PRECIOUS;
4627 if (!(buf_flags(bp) & B_READ)) {
4628 /*
4629 * "write" operation: let the UPL subsystem know
4630 * that we intend to modify the buffer cache pages we're
4631 * gathering.
4632 */
4633 upl_flags |= UPL_WILL_MODIFY;
4634 }
4635
4636 kret = ubc_create_upl_kernel(vp,
4637 ubc_blktooff(vp, bp->b_lblkno),
4638 bp->b_bufsize,
4639 &upl,
4640 &pl,
4641 upl_flags,
4642 VM_KERN_MEMORY_FILE);
4643 if (kret != KERN_SUCCESS) {
4644 panic("Failed to create UPL");
4645 }
4646
4647 for (upl_offset = 0; (uint32_t)upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
4648 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
4649 ubc_upl_abort(upl, 0);
4650 goto dump_buffer;
4651 }
4652 }
4653 bp->b_upl = upl;
4654
4655 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
4656
4657 if (kret != KERN_SUCCESS) {
4658 panic("getblk: ubc_upl_map() failed with (%d)", kret);
4659 }
4660 return 1;
4661
4662 dump_buffer:
4663 bp->b_bufsize = 0;
4664 SET(bp->b_flags, B_INVAL);
4665 buf_brelse(bp);
4666
4667 return 0;
4668 }
4669
4670 int
4671 fs_buffer_cache_gc_register(void (* callout)(int, void *), void *context)
4672 {
4673 lck_mtx_lock(&buf_gc_callout);
4674 for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4675 if (fs_callouts[i].callout == NULL) {
4676 fs_callouts[i].callout = callout;
4677 fs_callouts[i].context = context;
4678 lck_mtx_unlock(&buf_gc_callout);
4679 return 0;
4680 }
4681 }
4682
4683 lck_mtx_unlock(&buf_gc_callout);
4684 return ENOMEM;
4685 }
4686
4687 int
4688 fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context)
4689 {
4690 lck_mtx_lock(&buf_gc_callout);
4691 for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4692 if (fs_callouts[i].callout == callout &&
4693 fs_callouts[i].context == context) {
4694 fs_callouts[i].callout = NULL;
4695 fs_callouts[i].context = NULL;
4696 }
4697 }
4698 lck_mtx_unlock(&buf_gc_callout);
4699 return 0;
4700 }
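
/*
 * Usage sketch (illustrative only, compiled out): a filesystem that keeps
 * private metadata caches can register a callout so it is asked to shrink
 * them whenever the buffer cache GC runs.  Every name prefixed "example_"
 * below is hypothetical.
 */
#if 0
static void
example_fs_gc_callout(int all, void *context)
{
	struct example_fs_mount *fsmp = context;

	/*
	 * "all" is non-zero when the GC wants everything released (for
	 * instance ahead of deep sleep); otherwise trim opportunistically.
	 */
	example_fs_trim_caches(fsmp, all);
}

static int
example_fs_mount_hook(struct example_fs_mount *fsmp)
{
	/* returns ENOMEM once all FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE slots are taken */
	return fs_buffer_cache_gc_register(example_fs_gc_callout, fsmp);
}

static void
example_fs_unmount_hook(struct example_fs_mount *fsmp)
{
	/* unregister with the same callout/context pair used to register */
	(void)fs_buffer_cache_gc_unregister(example_fs_gc_callout, fsmp);
}
#endif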
4701
4702 static void
4703 fs_buffer_cache_gc_dispatch_callouts(int all)
4704 {
4705 lck_mtx_lock(&buf_gc_callout);
4706 for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4707 if (fs_callouts[i].callout != NULL) {
4708 fs_callouts[i].callout(all, fs_callouts[i].context);
4709 }
4710 }
4711 lck_mtx_unlock(&buf_gc_callout);
4712 }
4713
4714 static boolean_t
4715 buffer_cache_gc(int all)
4716 {
4717 buf_t bp;
4718 boolean_t did_large_zfree = FALSE;
4719 boolean_t need_wakeup = FALSE;
4720 int now = buf_timestamp();
4721 uint32_t found = 0;
4722 struct bqueues privq;
4723 int thresh_hold = BUF_STALE_THRESHHOLD;
4724
4725 if (all) {
4726 thresh_hold = 0;
4727 }
4728 /*
4729 * We only care about metadata (incore storage comes from zalloc()).
4730 * Unless "all" is set (used to evict metadata buffers in preparation
4731 * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers
4732 * that have not been accessed in the last BUF_STALE_THRESHHOLD seconds.
4733 * BUF_MAX_GC_BATCH_SIZE controls both the hold time of the global lock
4734 * "buf_mtx" and the length of time we spend compute-bound in the GC
4735 * thread which calls this function.
4736 */
4737 lck_mtx_lock(&buf_mtx);
4738
4739 do {
4740 found = 0;
4741 TAILQ_INIT(&privq);
4742 need_wakeup = FALSE;
4743
4744 while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) &&
4745 (now > bp->b_timestamp) &&
4746 (now - bp->b_timestamp > thresh_hold) &&
4747 (found < BUF_MAX_GC_BATCH_SIZE)) {
4748 /* Remove from free list */
4749 bremfree_locked(bp);
4750 found++;
4751
4752 #ifdef JOE_DEBUG
4753 bp->b_owner = current_thread();
4754 bp->b_tag = 12;
4755 #endif
4756
4757 /* If dirty, move to laundry queue and remember to do wakeup */
4758 if (ISSET(bp->b_flags, B_DELWRI)) {
4759 SET(bp->b_lflags, BL_WANTDEALLOC);
4760
4761 bmovelaundry(bp);
4762 need_wakeup = TRUE;
4763
4764 continue;
4765 }
4766
4767 /*
4768 * Mark busy and put on private list. We could technically get
4769 * away without setting BL_BUSY here.
4770 */
4771 SET(bp->b_lflags, BL_BUSY);
4772 buf_busycount++;
4773
4774 /*
4775 * Remove from hash and dissociate from vp.
4776 */
4777 bremhash(bp);
4778 if (bp->b_vp) {
4779 brelvp_locked(bp);
4780 }
4781
4782 TAILQ_INSERT_TAIL(&privq, bp, b_freelist);
4783 }
4784
4785 if (found == 0) {
4786 break;
4787 }
4788
4789 /* Drop lock for batch processing */
4790 lck_mtx_unlock(&buf_mtx);
4791
4792 /* Wakeup and yield for laundry if need be */
4793 if (need_wakeup) {
4794 wakeup(&bufqueues[BQ_LAUNDRY]);
4795 (void)thread_block(THREAD_CONTINUE_NULL);
4796 }
4797
4798 /* Clean up every buffer on private list */
4799 TAILQ_FOREACH(bp, &privq, b_freelist) {
4800 /* Take note if we've definitely freed at least a page to a zone */
4801 if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) {
4802 did_large_zfree = TRUE;
4803 }
4804
4805 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
4806
4807 /* Free Storage */
4808 buf_free_meta_store(bp);
4809
4810 /* Release credentials */
4811 buf_release_credentials(bp);
4812
4813 /* Prepare for moving to empty queue */
4814 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED
4815 | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
4816 bp->b_whichq = BQ_EMPTY;
4817 BLISTNONE(bp);
4818 }
4819 lck_mtx_lock(&buf_mtx);
4820
4821 /* Back under lock, move them all to invalid hash and clear busy */
4822 TAILQ_FOREACH(bp, &privq, b_freelist) {
4823 binshash(bp, &invalhash);
4824 CLR(bp->b_lflags, BL_BUSY);
4825 buf_busycount--;
4826
4827 #ifdef JOE_DEBUG
4828 if (bp->b_owner != current_thread()) {
4829 panic("Buffer stolen from buffer_cache_gc()");
4830 }
4831 bp->b_owner = current_thread();
4832 bp->b_tag = 13;
4833 #endif
4834 }
4835
4836 /* And do a big bulk move to the empty queue */
4837 TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist);
4838 } while (all && (found == BUF_MAX_GC_BATCH_SIZE));
4839
4840 lck_mtx_unlock(&buf_mtx);
4841
4842 fs_buffer_cache_gc_dispatch_callouts(all);
4843
4844 return did_large_zfree;
4845 }
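
/*
 * Pattern sketch (illustrative only, compiled out): buffer_cache_gc() above
 * collects a bounded batch of buffers while holding "buf_mtx", then drops
 * the lock to do the expensive per-buffer work, and repeats until the queue
 * is drained.  The helper below restates that shape with hypothetical
 * "example_" names; it is not used anywhere in this file.
 */
#if 0
#define EXAMPLE_GC_BATCH_SIZE 128

struct example_item {
	TAILQ_ENTRY(example_item) link;
};
TAILQ_HEAD(example_queue, example_item);

static void
example_batched_drain(struct example_queue *q, lck_mtx_t *lock)
{
	struct example_item *item;
	struct example_queue privq;
	uint32_t found;

	do {
		found = 0;
		TAILQ_INIT(&privq);

		lck_mtx_lock(lock);
		/* bound how long the shared lock is held per pass */
		while (found < EXAMPLE_GC_BATCH_SIZE &&
		    (item = TAILQ_FIRST(q)) != NULL) {
			TAILQ_REMOVE(q, item, link);
			TAILQ_INSERT_TAIL(&privq, item, link);
			found++;
		}
		lck_mtx_unlock(lock);

		/* per-item work runs with the lock dropped */
		while ((item = TAILQ_FIRST(&privq)) != NULL) {
			TAILQ_REMOVE(&privq, item, link);
			example_release(item);
		}
	} while (found == EXAMPLE_GC_BATCH_SIZE);
}
#endif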
4846
4847
4848 /*
4849 * disabled for now
4850 */
4851
4852 #if FLUSH_QUEUES
4853
4854 #define NFLUSH 32
4855
4856 static int
4857 bp_cmp(void *a, void *b)
4858 {
4859 buf_t *bp_a = *(buf_t **)a,
4860 *bp_b = *(buf_t **)b;
4861 daddr64_t res;
4862
4863 // block numbers are never negative, so the
4864 // subtraction cannot underflow; clamp the 64-bit
4865 // difference to -1/0/1 before returning an int.
4866 res = (bp_a->b_blkno - bp_b->b_blkno);
4867
4868 return (res < 0) ? -1 : ((res > 0) ? 1 : 0);
4869 }
4870
4871
4872 int
4873 bflushq(int whichq, mount_t mp)
4874 {
4875 buf_t bp, next;
4876 int i, buf_count;
4877 int total_writes = 0;
4878 static buf_t flush_table[NFLUSH];
4879
4880 if (whichq < 0 || whichq >= BQUEUES) {
4881 return 0;
4882 }
4883
4884 restart:
4885 lck_mtx_lock(&buf_mtx);
4886
4887 bp = TAILQ_FIRST(&bufqueues[whichq]);
4888
4889 for (buf_count = 0; bp; bp = next) {
4890 next = bp->b_freelist.tqe_next;
4891
4892 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
4893 continue;
4894 }
4895
4896 if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
4897 bremfree_locked(bp);
4898 #ifdef JOE_DEBUG
4899 bp->b_owner = current_thread();
4900 bp->b_tag = 7;
4901 #endif
4902 SET(bp->b_lflags, BL_BUSY);
4903 buf_busycount++;
4904
4905 flush_table[buf_count] = bp;
4906 buf_count++;
4907 total_writes++;
4908
4909 if (buf_count >= NFLUSH) {
4910 lck_mtx_unlock(&buf_mtx);
4911
4912 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4913
4914 for (i = 0; i < buf_count; i++) {
4915 buf_bawrite(flush_table[i]);
4916 }
4917 goto restart;
4918 }
4919 }
4920 }
4921 lck_mtx_unlock(&buf_mtx);
4922
4923 if (buf_count > 0) {
4924 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4925
4926 for (i = 0; i < buf_count; i++) {
4927 buf_bawrite(flush_table[i]);
4928 }
4929 }
4930
4931 return total_writes;
4932 }
4933 #endif