bsd/vfs/vfs_bio.c (apple/xnu, xnu-4903.270.47)
1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*-
30 * Copyright (c) 1994 Christopher G. Demetriou
31 * Copyright (c) 1982, 1986, 1989, 1993
32 * The Regents of the University of California. All rights reserved.
33 * (c) UNIX System Laboratories, Inc.
34 * All or some portions of this file are derived from material licensed
35 * to the University of California by American Telephone and Telegraph
36 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
37 * the permission of UNIX System Laboratories, Inc.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 * must display the following acknowledgement:
49 * This product includes software developed by the University of
50 * California, Berkeley and its contributors.
51 * 4. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
68 */
69
70 /*
71 * Some references:
72 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
73 * Leffler, et al.: The Design and Implementation of the 4.3BSD
74 * UNIX Operating System (Addison-Wesley, 1989)
75 */
76
77 #include <sys/param.h>
78 #include <sys/systm.h>
79 #include <sys/proc_internal.h>
80 #include <sys/buf_internal.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/trace.h>
84 #include <sys/malloc.h>
85 #include <sys/resourcevar.h>
86 #include <miscfs/specfs/specdev.h>
87 #include <sys/ubc.h>
88 #include <sys/kauth.h>
89 #if DIAGNOSTIC
90 #include <kern/assert.h>
91 #endif /* DIAGNOSTIC */
92 #include <kern/task.h>
93 #include <kern/zalloc.h>
94 #include <kern/locks.h>
95 #include <kern/thread.h>
96
97 #include <sys/fslog.h> /* fslog_io_error() */
98 #include <sys/disk.h> /* dk_error_description_t */
99
100 #include <mach/mach_types.h>
101 #include <mach/memory_object_types.h>
102 #include <kern/sched_prim.h> /* thread_block() */
103
104 #include <vm/vm_kern.h>
105 #include <vm/vm_pageout.h>
106
107 #include <sys/kdebug.h>
108
109 #include <libkern/OSAtomic.h>
110 #include <libkern/OSDebug.h>
111 #include <sys/ubc_internal.h>
112
113 #include <sys/sdt.h>
114
115 int bcleanbuf(buf_t bp, boolean_t discard);
116 static int brecover_data(buf_t bp);
117 static boolean_t incore(vnode_t vp, daddr64_t blkno);
118 /* timeout is in msecs */
119 static buf_t getnewbuf(int slpflag, int slptimeo, int *queue);
120 static void bremfree_locked(buf_t bp);
121 static void buf_reassign(buf_t bp, vnode_t newvp);
122 static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
123 static int buf_iterprepare(vnode_t vp, struct buflists *, int flags);
124 static void buf_itercomplete(vnode_t vp, struct buflists *, int flags);
125 static boolean_t buffer_cache_gc(int);
126 static buf_t buf_brelse_shadow(buf_t bp);
127 static void buf_free_meta_store(buf_t bp);
128
129 static buf_t buf_create_shadow_internal(buf_t bp, boolean_t force_copy,
130 uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);
131
132
133 int bdwrite_internal(buf_t, int);
134
135 extern void disk_conditioner_delay(buf_t, int, int, uint64_t);
136
137 /* zone allocated buffer headers */
138 static void bufzoneinit(void);
139 static void bcleanbuf_thread_init(void);
140 static void bcleanbuf_thread(void);
141
142 static zone_t buf_hdr_zone;
143 static int buf_hdr_count;
144
145
146 /*
147 * Definitions for the buffer hash lists.
148 */
149 #define BUFHASH(dvp, lbn) \
150 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
151 LIST_HEAD(bufhashhdr, buf) * bufhashtbl, invalhash;
152 u_long bufhash;
153
154 static buf_t incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);
155
156 /* Definitions for the buffer stats. */
157 struct bufstats bufstats;
158
159 /* Number of delayed write buffers */
160 long nbdwrite = 0;
161 int blaundrycnt = 0;
162 static int boot_nbuf_headers = 0;
163
164 static TAILQ_HEAD(delayqueue, buf) delaybufqueue;
165
166 static TAILQ_HEAD(ioqueue, buf) iobufqueue;
167 static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
168 static int needbuffer;
169 static int need_iobuffer;
170
171 static lck_grp_t *buf_mtx_grp;
172 static lck_attr_t *buf_mtx_attr;
173 static lck_grp_attr_t *buf_mtx_grp_attr;
174 static lck_mtx_t *iobuffer_mtxp;
175 static lck_mtx_t *buf_mtxp;
176 static lck_mtx_t *buf_gc_callout;
177
178 static int buf_busycount;
179
180 #define FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE 16
181 typedef struct {
182 void (* callout)(int, void *);
183 void *context;
184 } fs_buffer_cache_gc_callout_t;
185
186 fs_buffer_cache_gc_callout_t fs_callouts[FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE] = { {NULL, NULL} };
187
188 static __inline__ int
189 buf_timestamp(void)
190 {
191 struct timeval t;
192 microuptime(&t);
193 return t.tv_sec;
194 }
195
196 /*
197 * Insq/Remq for the buffer free lists.
198 */
199 #define binsheadfree(bp, dp, whichq) do { \
200 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
201 } while (0)
202
203 #define binstailfree(bp, dp, whichq) do { \
204 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
205 } while (0)
206
207 #define BHASHENTCHECK(bp) \
208 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
209 panic("%p: b_hash.le_prev is not deadbeef", (bp));
210
211 #define BLISTNONE(bp) \
212 (bp)->b_hash.le_next = (struct buf *)0; \
213 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
214
215 /*
216 * Insq/Remq for the vnode usage lists.
217 */
218 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
219 #define bufremvn(bp) { \
220 LIST_REMOVE(bp, b_vnbufs); \
221 (bp)->b_vnbufs.le_next = NOLIST; \
222 }
223
224 /*
225 * Time in seconds before a buffer on a list is
226 * considered as a stale buffer
227 */
228 #define LRU_IS_STALE 120 /* default value for the LRU */
229 #define AGE_IS_STALE 60 /* default value for the AGE */
230 #define META_IS_STALE 180 /* default value for the BQ_META */
231
232 int lru_is_stale = LRU_IS_STALE;
233 int age_is_stale = AGE_IS_STALE;
234 int meta_is_stale = META_IS_STALE;
235
236 #define MAXLAUNDRY 10
237
238 /* LIST_INSERT_HEAD() with assertions */
239 static __inline__ void
240 blistenterhead(struct bufhashhdr * head, buf_t bp)
241 {
242 if ((bp->b_hash.le_next = (head)->lh_first) != NULL) {
243 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
244 }
245 (head)->lh_first = bp;
246 bp->b_hash.le_prev = &(head)->lh_first;
247 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) {
248 panic("blistenterhead: le_prev is deadbeef");
249 }
250 }
251
252 static __inline__ void
253 binshash(buf_t bp, struct bufhashhdr *dp)
254 {
255 #if DIAGNOSTIC
256 buf_t nbp;
257 #endif /* DIAGNOSTIC */
258
259 BHASHENTCHECK(bp);
260
261 #if DIAGNOSTIC
262 nbp = dp->lh_first;
263 for (; nbp != NULL; nbp = nbp->b_hash.le_next) {
264 if (nbp == bp) {
265 panic("buf already in hashlist");
266 }
267 }
268 #endif /* DIAGNOSTIC */
269
270 blistenterhead(dp, bp);
271 }
272
273 static __inline__ void
274 bremhash(buf_t bp)
275 {
276 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) {
277 panic("bremhash le_prev is deadbeef");
278 }
279 if (bp->b_hash.le_next == bp) {
280 panic("bremhash: next points to self");
281 }
282
283 if (bp->b_hash.le_next != NULL) {
284 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
285 }
286 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
287 }
288
289 /*
290 * buf_mtxp held.
291 */
292 static __inline__ void
293 bmovelaundry(buf_t bp)
294 {
295 bp->b_whichq = BQ_LAUNDRY;
296 bp->b_timestamp = buf_timestamp();
297 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
298 blaundrycnt++;
299 }
300
301 static __inline__ void
302 buf_release_credentials(buf_t bp)
303 {
304 if (IS_VALID_CRED(bp->b_rcred)) {
305 kauth_cred_unref(&bp->b_rcred);
306 }
307 if (IS_VALID_CRED(bp->b_wcred)) {
308 kauth_cred_unref(&bp->b_wcred);
309 }
310 }
311
312
313 int
314 buf_valid(buf_t bp)
315 {
316 if ((bp->b_flags & (B_DONE | B_DELWRI))) {
317 return 1;
318 }
319 return 0;
320 }
321
322 int
323 buf_fromcache(buf_t bp)
324 {
325 if ((bp->b_flags & B_CACHE)) {
326 return 1;
327 }
328 return 0;
329 }
330
331 void
332 buf_markinvalid(buf_t bp)
333 {
334 SET(bp->b_flags, B_INVAL);
335 }
336
337 void
338 buf_markdelayed(buf_t bp)
339 {
340 if (!ISSET(bp->b_flags, B_DELWRI)) {
341 SET(bp->b_flags, B_DELWRI);
342
343 OSAddAtomicLong(1, &nbdwrite);
344 buf_reassign(bp, bp->b_vp);
345 }
346 SET(bp->b_flags, B_DONE);
347 }
348
349 void
350 buf_markclean(buf_t bp)
351 {
352 if (ISSET(bp->b_flags, B_DELWRI)) {
353 CLR(bp->b_flags, B_DELWRI);
354
355 OSAddAtomicLong(-1, &nbdwrite);
356 buf_reassign(bp, bp->b_vp);
357 }
358 }
359
360 void
361 buf_markeintr(buf_t bp)
362 {
363 SET(bp->b_flags, B_EINTR);
364 }
365
366
367 void
368 buf_markaged(buf_t bp)
369 {
370 SET(bp->b_flags, B_AGE);
371 }
372
373 int
374 buf_fua(buf_t bp)
375 {
376 if ((bp->b_flags & B_FUA) == B_FUA) {
377 return 1;
378 }
379 return 0;
380 }
381
382 void
383 buf_markfua(buf_t bp)
384 {
385 SET(bp->b_flags, B_FUA);
386 }
387
388 #if CONFIG_PROTECT
389 cpx_t
390 bufattr_cpx(bufattr_t bap)
391 {
392 return bap->ba_cpx;
393 }
394
395 void
396 bufattr_setcpx(bufattr_t bap, cpx_t cpx)
397 {
398 bap->ba_cpx = cpx;
399 }
400
401 void
402 buf_setcpoff(buf_t bp, uint64_t foffset)
403 {
404 bp->b_attr.ba_cp_file_off = foffset;
405 }
406
407 uint64_t
408 bufattr_cpoff(bufattr_t bap)
409 {
410 return bap->ba_cp_file_off;
411 }
412
413 void
414 bufattr_setcpoff(bufattr_t bap, uint64_t foffset)
415 {
416 bap->ba_cp_file_off = foffset;
417 }
418
419 #else // !CONFIG_PROTECT
420
421 uint64_t
422 bufattr_cpoff(bufattr_t bap __unused)
423 {
424 return 0;
425 }
426
427 void
428 bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset)
429 {
430 return;
431 }
432
433 struct cpx *
434 bufattr_cpx(__unused bufattr_t bap)
435 {
436 return NULL;
437 }
438
439 void
440 bufattr_setcpx(__unused bufattr_t bap, __unused struct cpx *cpx)
441 {
442 }
443
444 #endif /* !CONFIG_PROTECT */
445
446 bufattr_t
447 bufattr_alloc()
448 {
449 bufattr_t bap;
450 MALLOC(bap, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
451 if (bap == NULL) {
452 return NULL;
453 }
454
455 bzero(bap, sizeof(struct bufattr));
456 return bap;
457 }
458
459 void
460 bufattr_free(bufattr_t bap)
461 {
462 if (bap) {
463 FREE(bap, M_TEMP);
464 }
465 }
466
467 bufattr_t
468 bufattr_dup(bufattr_t bap)
469 {
470 bufattr_t new_bufattr;
471 MALLOC(new_bufattr, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
472 if (new_bufattr == NULL) {
473 return NULL;
474 }
475
476 /* Copy the provided one into the new copy */
477 memcpy(new_bufattr, bap, sizeof(struct bufattr));
478 return new_bufattr;
479 }
480
481 int
482 bufattr_rawencrypted(bufattr_t bap)
483 {
484 if ((bap->ba_flags & BA_RAW_ENCRYPTED_IO)) {
485 return 1;
486 }
487 return 0;
488 }
489
490 int
491 bufattr_throttled(bufattr_t bap)
492 {
493 return GET_BUFATTR_IO_TIER(bap);
494 }
495
496 int
497 bufattr_passive(bufattr_t bap)
498 {
499 if ((bap->ba_flags & BA_PASSIVE)) {
500 return 1;
501 }
502 return 0;
503 }
504
505 int
506 bufattr_nocache(bufattr_t bap)
507 {
508 if ((bap->ba_flags & BA_NOCACHE)) {
509 return 1;
510 }
511 return 0;
512 }
513
514 int
515 bufattr_meta(bufattr_t bap)
516 {
517 if ((bap->ba_flags & BA_META)) {
518 return 1;
519 }
520 return 0;
521 }
522
523 void
524 bufattr_markmeta(bufattr_t bap)
525 {
526 SET(bap->ba_flags, BA_META);
527 }
528
529 int
530 #if !CONFIG_EMBEDDED
531 bufattr_delayidlesleep(bufattr_t bap)
532 #else /* !CONFIG_EMBEDDED */
533 bufattr_delayidlesleep(__unused bufattr_t bap)
534 #endif /* !CONFIG_EMBEDDED */
535 {
536 #if !CONFIG_EMBEDDED
537 if ((bap->ba_flags & BA_DELAYIDLESLEEP)) {
538 return 1;
539 }
540 #endif /* !CONFIG_EMBEDDED */
541 return 0;
542 }
543
544 bufattr_t
545 buf_attr(buf_t bp)
546 {
547 return &bp->b_attr;
548 }
549
550 void
551 buf_markstatic(buf_t bp __unused)
552 {
553 SET(bp->b_flags, B_STATICCONTENT);
554 }
555
556 int
557 buf_static(buf_t bp)
558 {
559 if ((bp->b_flags & B_STATICCONTENT)) {
560 return 1;
561 }
562 return 0;
563 }
564
565 void
566 bufattr_markgreedymode(bufattr_t bap)
567 {
568 SET(bap->ba_flags, BA_GREEDY_MODE);
569 }
570
571 int
572 bufattr_greedymode(bufattr_t bap)
573 {
574 if ((bap->ba_flags & BA_GREEDY_MODE)) {
575 return 1;
576 }
577 return 0;
578 }
579
580 void
581 bufattr_markisochronous(bufattr_t bap)
582 {
583 SET(bap->ba_flags, BA_ISOCHRONOUS);
584 }
585
586 int
587 bufattr_isochronous(bufattr_t bap)
588 {
589 if ((bap->ba_flags & BA_ISOCHRONOUS)) {
590 return 1;
591 }
592 return 0;
593 }
594
595 void
596 bufattr_markquickcomplete(bufattr_t bap)
597 {
598 SET(bap->ba_flags, BA_QUICK_COMPLETE);
599 }
600
601 int
602 bufattr_quickcomplete(bufattr_t bap)
603 {
604 if ((bap->ba_flags & BA_QUICK_COMPLETE)) {
605 return 1;
606 }
607 return 0;
608 }
609
610 errno_t
611 buf_error(buf_t bp)
612 {
613 return bp->b_error;
614 }
615
616 void
617 buf_seterror(buf_t bp, errno_t error)
618 {
619 if ((bp->b_error = error)) {
620 SET(bp->b_flags, B_ERROR);
621 } else {
622 CLR(bp->b_flags, B_ERROR);
623 }
624 }
625
626 void
627 buf_setflags(buf_t bp, int32_t flags)
628 {
629 SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
630 }
631
632 void
633 buf_clearflags(buf_t bp, int32_t flags)
634 {
635 CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
636 }
637
638 int32_t
639 buf_flags(buf_t bp)
640 {
641 return bp->b_flags & BUF_X_RDFLAGS;
642 }
643
644 void
645 buf_reset(buf_t bp, int32_t io_flags)
646 {
647 CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
648 SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
649
650 bp->b_error = 0;
651 }
652
653 uint32_t
654 buf_count(buf_t bp)
655 {
656 return bp->b_bcount;
657 }
658
659 void
660 buf_setcount(buf_t bp, uint32_t bcount)
661 {
662 bp->b_bcount = bcount;
663 }
664
665 uint32_t
666 buf_size(buf_t bp)
667 {
668 return bp->b_bufsize;
669 }
670
671 void
672 buf_setsize(buf_t bp, uint32_t bufsize)
673 {
674 bp->b_bufsize = bufsize;
675 }
676
677 uint32_t
678 buf_resid(buf_t bp)
679 {
680 return bp->b_resid;
681 }
682
683 void
684 buf_setresid(buf_t bp, uint32_t resid)
685 {
686 bp->b_resid = resid;
687 }
688
689 uint32_t
690 buf_dirtyoff(buf_t bp)
691 {
692 return bp->b_dirtyoff;
693 }
694
695 uint32_t
696 buf_dirtyend(buf_t bp)
697 {
698 return bp->b_dirtyend;
699 }
700
701 void
702 buf_setdirtyoff(buf_t bp, uint32_t dirtyoff)
703 {
704 bp->b_dirtyoff = dirtyoff;
705 }
706
707 void
708 buf_setdirtyend(buf_t bp, uint32_t dirtyend)
709 {
710 bp->b_dirtyend = dirtyend;
711 }
712
713 uintptr_t
714 buf_dataptr(buf_t bp)
715 {
716 return bp->b_datap;
717 }
718
719 void
720 buf_setdataptr(buf_t bp, uintptr_t data)
721 {
722 bp->b_datap = data;
723 }
724
725 vnode_t
726 buf_vnode(buf_t bp)
727 {
728 return bp->b_vp;
729 }
730
731 void
732 buf_setvnode(buf_t bp, vnode_t vp)
733 {
734 bp->b_vp = vp;
735 }
736
737
738 void *
739 buf_callback(buf_t bp)
740 {
741 if (!(bp->b_flags & B_CALL)) {
742 return (void *) NULL;
743 }
744
745 return (void *)bp->b_iodone;
746 }
747
748
749 errno_t
750 buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
751 {
752 assert(!ISSET(bp->b_flags, B_FILTER) && ISSET(bp->b_lflags, BL_BUSY));
753
754 if (callback) {
755 bp->b_flags |= (B_CALL | B_ASYNC);
756 } else {
757 bp->b_flags &= ~B_CALL;
758 }
759 bp->b_transaction = transaction;
760 bp->b_iodone = callback;
761
762 return 0;
763 }
764
765 errno_t
766 buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
767 {
768 if (!(bp->b_lflags & BL_IOBUF)) {
769 return EINVAL;
770 }
771
772 if (upl) {
773 bp->b_flags |= B_CLUSTER;
774 } else {
775 bp->b_flags &= ~B_CLUSTER;
776 }
777 bp->b_upl = upl;
778 bp->b_uploffset = offset;
779
780 return 0;
781 }
782
783 buf_t
784 buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
785 {
786 buf_t io_bp;
787
788 if (io_offset < 0 || io_size < 0) {
789 return NULL;
790 }
791
792 if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount) {
793 return NULL;
794 }
795
796 if (bp->b_flags & B_CLUSTER) {
797 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK)) {
798 return NULL;
799 }
800
801 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount)) {
802 return NULL;
803 }
804 }
805 io_bp = alloc_io_buf(bp->b_vp, 0);
806
807 io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);
808
809 if (iodone) {
810 io_bp->b_transaction = arg;
811 io_bp->b_iodone = iodone;
812 io_bp->b_flags |= B_CALL;
813 }
814 if (bp->b_flags & B_CLUSTER) {
815 io_bp->b_upl = bp->b_upl;
816 io_bp->b_uploffset = bp->b_uploffset + io_offset;
817 } else {
818 io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
819 }
820 io_bp->b_bcount = io_size;
821
822 return io_bp;
823 }
824
825
826 int
827 buf_shadow(buf_t bp)
828 {
829 if (bp->b_lflags & BL_SHADOW) {
830 return 1;
831 }
832 return 0;
833 }
834
835
836 buf_t
837 buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
838 {
839 return buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1);
840 }
841
842 buf_t
843 buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
844 {
845 return buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0);
846 }
847
848
849 static buf_t
850 buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv)
851 {
852 buf_t io_bp;
853
854 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0);
855
856 if (!(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) {
857 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0);
858 return NULL;
859 }
860 #ifdef BUF_MAKE_PRIVATE
861 if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0) {
862 panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref);
863 }
864 #endif
865 io_bp = alloc_io_buf(bp->b_vp, priv);
866
867 io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA);
868 io_bp->b_blkno = bp->b_blkno;
869 io_bp->b_lblkno = bp->b_lblkno;
870
871 if (iodone) {
872 io_bp->b_transaction = arg;
873 io_bp->b_iodone = iodone;
874 io_bp->b_flags |= B_CALL;
875 }
876 if (force_copy == FALSE) {
877 io_bp->b_bcount = bp->b_bcount;
878 io_bp->b_bufsize = bp->b_bufsize;
879
880 if (external_storage) {
881 io_bp->b_datap = external_storage;
882 #ifdef BUF_MAKE_PRIVATE
883 io_bp->b_data_store = NULL;
884 #endif
885 } else {
886 io_bp->b_datap = bp->b_datap;
887 #ifdef BUF_MAKE_PRIVATE
888 io_bp->b_data_store = bp;
889 #endif
890 }
891 *(buf_t *)(&io_bp->b_orig) = bp;
892
893 lck_mtx_lock_spin(buf_mtxp);
894
895 io_bp->b_lflags |= BL_SHADOW;
896 io_bp->b_shadow = bp->b_shadow;
897 bp->b_shadow = io_bp;
898 bp->b_shadow_ref++;
899
900 #ifdef BUF_MAKE_PRIVATE
901 if (external_storage) {
902 io_bp->b_lflags |= BL_EXTERNAL;
903 } else {
904 bp->b_data_ref++;
905 }
906 #endif
907 lck_mtx_unlock(buf_mtxp);
908 } else {
909 if (external_storage) {
910 #ifdef BUF_MAKE_PRIVATE
911 io_bp->b_lflags |= BL_EXTERNAL;
912 #endif
913 io_bp->b_bcount = bp->b_bcount;
914 io_bp->b_bufsize = bp->b_bufsize;
915 io_bp->b_datap = external_storage;
916 } else {
917 allocbuf(io_bp, bp->b_bcount);
918
919 io_bp->b_lflags |= BL_IOBUF_ALLOC;
920 }
921 bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount);
922
923 #ifdef BUF_MAKE_PRIVATE
924 io_bp->b_data_store = NULL;
925 #endif
926 }
927 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0);
928
929 return io_bp;
930 }
931
932
933 #ifdef BUF_MAKE_PRIVATE
934 errno_t
935 buf_make_private(buf_t bp)
936 {
937 buf_t ds_bp;
938 buf_t t_bp;
939 struct buf my_buf;
940
941 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0);
942
943 if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) {
944 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
945 return EINVAL;
946 }
947 my_buf.b_flags = B_META;
948 my_buf.b_datap = (uintptr_t)NULL;
949 allocbuf(&my_buf, bp->b_bcount);
950
951 bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);
952
953 lck_mtx_lock_spin(buf_mtxp);
954
955 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
956 if (!ISSET(bp->b_lflags, BL_EXTERNAL)) {
957 break;
958 }
959 }
960 ds_bp = t_bp;
961
962 if (ds_bp == NULL && bp->b_data_ref) {
963 panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL");
964 }
965
966 if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0)) {
967 panic("buf_make_private: ref_count == 0 && ds_bp != NULL");
968 }
969
970 if (ds_bp == NULL) {
971 lck_mtx_unlock(buf_mtxp);
972
973 buf_free_meta_store(&my_buf);
974
975 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
976 return EINVAL;
977 }
978 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
979 if (!ISSET(t_bp->b_lflags, BL_EXTERNAL)) {
980 t_bp->b_data_store = ds_bp;
981 }
982 }
983 ds_bp->b_data_ref = bp->b_data_ref;
984
985 bp->b_data_ref = 0;
986 bp->b_datap = my_buf.b_datap;
987
988 lck_mtx_unlock(buf_mtxp);
989
990 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
991 return 0;
992 }
993 #endif
994
995
996 void
997 buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
998 void(**old_iodone)(buf_t, void *), void **old_transaction)
999 {
1000 assert(ISSET(bp->b_lflags, BL_BUSY));
1001
1002 if (old_iodone) {
1003 *old_iodone = bp->b_iodone;
1004 }
1005 if (old_transaction) {
1006 *old_transaction = bp->b_transaction;
1007 }
1008
1009 bp->b_transaction = transaction;
1010 bp->b_iodone = filter;
1011 if (filter) {
1012 bp->b_flags |= B_FILTER;
1013 } else {
1014 bp->b_flags &= ~B_FILTER;
1015 }
1016 }
1017
1018
1019 daddr64_t
1020 buf_blkno(buf_t bp)
1021 {
1022 return bp->b_blkno;
1023 }
1024
1025 daddr64_t
1026 buf_lblkno(buf_t bp)
1027 {
1028 return bp->b_lblkno;
1029 }
1030
1031 void
1032 buf_setblkno(buf_t bp, daddr64_t blkno)
1033 {
1034 bp->b_blkno = blkno;
1035 }
1036
1037 void
1038 buf_setlblkno(buf_t bp, daddr64_t lblkno)
1039 {
1040 bp->b_lblkno = lblkno;
1041 }
1042
1043 dev_t
1044 buf_device(buf_t bp)
1045 {
1046 return bp->b_dev;
1047 }
1048
1049 errno_t
1050 buf_setdevice(buf_t bp, vnode_t vp)
1051 {
1052 if ((vp->v_type != VBLK) && (vp->v_type != VCHR)) {
1053 return EINVAL;
1054 }
1055 bp->b_dev = vp->v_rdev;
1056
1057 return 0;
1058 }
1059
1060
1061 void *
1062 buf_drvdata(buf_t bp)
1063 {
1064 return bp->b_drvdata;
1065 }
1066
1067 void
1068 buf_setdrvdata(buf_t bp, void *drvdata)
1069 {
1070 bp->b_drvdata = drvdata;
1071 }
1072
1073 void *
1074 buf_fsprivate(buf_t bp)
1075 {
1076 return bp->b_fsprivate;
1077 }
1078
1079 void
1080 buf_setfsprivate(buf_t bp, void *fsprivate)
1081 {
1082 bp->b_fsprivate = fsprivate;
1083 }
1084
1085 kauth_cred_t
1086 buf_rcred(buf_t bp)
1087 {
1088 return bp->b_rcred;
1089 }
1090
1091 kauth_cred_t
1092 buf_wcred(buf_t bp)
1093 {
1094 return bp->b_wcred;
1095 }
1096
1097 void *
1098 buf_upl(buf_t bp)
1099 {
1100 return bp->b_upl;
1101 }
1102
1103 uint32_t
1104 buf_uploffset(buf_t bp)
1105 {
1106 return (uint32_t)(bp->b_uploffset);
1107 }
1108
1109 proc_t
1110 buf_proc(buf_t bp)
1111 {
1112 return bp->b_proc;
1113 }
1114
1115
1116 errno_t
1117 buf_map(buf_t bp, caddr_t *io_addr)
1118 {
1119 buf_t real_bp;
1120 vm_offset_t vaddr;
1121 kern_return_t kret;
1122
1123 if (!(bp->b_flags & B_CLUSTER)) {
1124 *io_addr = (caddr_t)bp->b_datap;
1125 return 0;
1126 }
1127 real_bp = (buf_t)(bp->b_real_bp);
1128
1129 if (real_bp && real_bp->b_datap) {
1130 /*
1131 * b_real_bp is only valid if B_CLUSTER is SET
1132 * if it's non-zero, then someone did a cluster_bp call
1133 * if the backing physical pages were already mapped
1134 * in before the call to cluster_bp (non-zero b_datap),
1135 * then we just use that mapping
1136 */
1137 *io_addr = (caddr_t)real_bp->b_datap;
1138 return 0;
1139 }
1140 kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */
1141
1142 if (kret != KERN_SUCCESS) {
1143 *io_addr = NULL;
1144
1145 return ENOMEM;
1146 }
1147 vaddr += bp->b_uploffset;
1148
1149 *io_addr = (caddr_t)vaddr;
1150
1151 return 0;
1152 }
1153
1154 errno_t
1155 buf_unmap(buf_t bp)
1156 {
1157 buf_t real_bp;
1158 kern_return_t kret;
1159
1160 if (!(bp->b_flags & B_CLUSTER)) {
1161 return 0;
1162 }
1163 /*
1164 * see buf_map for the explanation
1165 */
1166 real_bp = (buf_t)(bp->b_real_bp);
1167
1168 if (real_bp && real_bp->b_datap) {
1169 return 0;
1170 }
1171
1172 if ((bp->b_lflags & BL_IOBUF) &&
1173 ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
1174 /*
1175 * ignore pageins... the 'right' thing will
1176 * happen due to the way we handle speculative
1177 * clusters...
1178 *
1179 * when we commit these pages, we'll hit
1180 * it with UPL_COMMIT_INACTIVE which
1181 * will clear the reference bit that got
1182 * turned on when we touched the mapping
1183 */
1184 bp->b_flags |= B_AGE;
1185 }
1186 kret = ubc_upl_unmap(bp->b_upl);
1187
1188 if (kret != KERN_SUCCESS) {
1189 return EINVAL;
1190 }
1191 return 0;
1192 }
1193
1194
1195 void
1196 buf_clear(buf_t bp)
1197 {
1198 caddr_t baddr;
1199
1200 if (buf_map(bp, &baddr) == 0) {
1201 bzero(baddr, bp->b_bcount);
1202 buf_unmap(bp);
1203 }
1204 bp->b_resid = 0;
1205 }
1206
1207 /*
1208 * Read or write a buffer that is not contiguous on disk.
1209 * buffer is marked done/error at the conclusion
1210 */
1211 static int
1212 buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
1213 {
1214 vnode_t vp = buf_vnode(bp);
1215 buf_t io_bp; /* For reading or writing a single block */
1216 int io_direction;
1217 int io_resid;
1218 size_t io_contig_bytes;
1219 daddr64_t io_blkno;
1220 int error = 0;
1221 int bmap_flags;
1222
1223 /*
1224 * save our starting point... the bp was already mapped
1225 * in buf_strategy before we got called
1226 * no sense doing it again.
1227 */
1228 io_blkno = bp->b_blkno;
1229 /*
1230 * Make sure we redo this mapping for the next I/O
1231 * i.e. this can never be a 'permanent' mapping
1232 */
1233 bp->b_blkno = bp->b_lblkno;
1234
1235 /*
1236 * Get an io buffer to do the deblocking
1237 */
1238 io_bp = alloc_io_buf(devvp, 0);
1239
1240 io_bp->b_lblkno = bp->b_lblkno;
1241 io_bp->b_datap = bp->b_datap;
1242 io_resid = bp->b_bcount;
1243 io_direction = bp->b_flags & B_READ;
1244 io_contig_bytes = contig_bytes;
1245
1246 if (bp->b_flags & B_READ) {
1247 bmap_flags = VNODE_READ;
1248 } else {
1249 bmap_flags = VNODE_WRITE;
1250 }
1251
1252 for (;;) {
1253 if (io_blkno == -1) {
1254 /*
1255 * this is unexpected, but we'll allow for it
1256 */
1257 bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
1258 } else {
1259 io_bp->b_bcount = io_contig_bytes;
1260 io_bp->b_bufsize = io_contig_bytes;
1261 io_bp->b_resid = io_contig_bytes;
1262 io_bp->b_blkno = io_blkno;
1263
1264 buf_reset(io_bp, io_direction);
1265
1266 /*
1267 * Call the device to do the I/O and wait for it. Make sure the appropriate party is charged for the write
1268 */
1269
1270 if (!ISSET(bp->b_flags, B_READ)) {
1271 OSAddAtomic(1, &devvp->v_numoutput);
1272 }
1273
1274 if ((error = VNOP_STRATEGY(io_bp))) {
1275 break;
1276 }
1277 if ((error = (int)buf_biowait(io_bp))) {
1278 break;
1279 }
1280 if (io_bp->b_resid) {
1281 io_resid -= (io_contig_bytes - io_bp->b_resid);
1282 break;
1283 }
1284 }
1285 if ((io_resid -= io_contig_bytes) == 0) {
1286 break;
1287 }
1288 f_offset += io_contig_bytes;
1289 io_bp->b_datap += io_contig_bytes;
1290
1291 /*
1292 * Map the current position to a physical block number
1293 */
1294 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL))) {
1295 break;
1296 }
1297 }
1298 buf_free(io_bp);
1299
1300 if (error) {
1301 buf_seterror(bp, error);
1302 }
1303 bp->b_resid = io_resid;
1304 /*
1305 * This I/O is now complete
1306 */
1307 buf_biodone(bp);
1308
1309 return error;
1310 }
1311
1312
1313 /*
1314 * struct vnop_strategy_args {
1315 * struct buf *a_bp;
1316 * } *ap;
1317 */
1318 errno_t
1319 buf_strategy(vnode_t devvp, void *ap)
1320 {
1321 buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp;
1322 vnode_t vp = bp->b_vp;
1323 int bmap_flags;
1324 errno_t error;
1325 #if CONFIG_DTRACE
1326 int dtrace_io_start_flag = 0; /* We only want to trip the io:::start
1327 * probe once, with the true physical
1328 * block in place (b_blkno)
1329 */
1330
1331 #endif
1332
1333 if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK) {
1334 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
1335 }
1336 /*
1337 * associate the physical device
1338 * with this buf_t even if we don't
1339 * end up issuing the I/O...
1340 */
1341 bp->b_dev = devvp->v_rdev;
1342
1343 if (bp->b_flags & B_READ) {
1344 bmap_flags = VNODE_READ;
1345 } else {
1346 bmap_flags = VNODE_WRITE;
1347 }
1348
1349 if (!(bp->b_flags & B_CLUSTER)) {
1350 if ((bp->b_upl)) {
1351 /*
1352 * we have a UPL associated with this bp
1353 * go through cluster_bp which knows how
1354 * to deal with filesystem block sizes
1355 * that aren't equal to the page size
1356 */
1357 DTRACE_IO1(start, buf_t, bp);
1358 return cluster_bp(bp);
1359 }
1360 if (bp->b_blkno == bp->b_lblkno) {
1361 off_t f_offset;
1362 size_t contig_bytes;
1363
1364 if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
1365 DTRACE_IO1(start, buf_t, bp);
1366 buf_seterror(bp, error);
1367 buf_biodone(bp);
1368
1369 return error;
1370 }
1371
1372 if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
1373 DTRACE_IO1(start, buf_t, bp);
1374 buf_seterror(bp, error);
1375 buf_biodone(bp);
1376
1377 return error;
1378 }
1379
1380 DTRACE_IO1(start, buf_t, bp);
1381 #if CONFIG_DTRACE
1382 dtrace_io_start_flag = 1;
1383 #endif /* CONFIG_DTRACE */
1384
1385 if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
1386 /* Set block number to force biodone later */
1387 bp->b_blkno = -1;
1388 buf_clear(bp);
1389 } else if ((long)contig_bytes < bp->b_bcount) {
1390 return buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes);
1391 }
1392 }
1393
1394 #if CONFIG_DTRACE
1395 if (dtrace_io_start_flag == 0) {
1396 DTRACE_IO1(start, buf_t, bp);
1397 dtrace_io_start_flag = 1;
1398 }
1399 #endif /* CONFIG_DTRACE */
1400
1401 if (bp->b_blkno == -1) {
1402 buf_biodone(bp);
1403 return 0;
1404 }
1405 }
1406
1407 #if CONFIG_DTRACE
1408 if (dtrace_io_start_flag == 0) {
1409 DTRACE_IO1(start, buf_t, bp);
1410 }
1411 #endif /* CONFIG_DTRACE */
1412
1413 #if CONFIG_PROTECT
1414 /* Capture f_offset in the bufattr */
1415 cpx_t cpx = bufattr_cpx(buf_attr(bp));
1416 if (cpx) {
1417 /* No need to go here for older EAs */
1418 if (cpx_use_offset_for_iv(cpx) && !cpx_synthetic_offset_for_iv(cpx)) {
1419 off_t f_offset;
1420 if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset))) {
1421 return error;
1422 }
1423
1424 /*
1425 * Attach the file offset to this buffer. The
1426 * bufattr attributes will be passed down the stack
1427 * until they reach the storage driver (whether
1428 * IOFlashStorage, ASP, or IONVMe). The driver
1429 * will retain the offset in a local variable when it
1430 * issues its I/Os to the NAND controller.
1431 *
1432 * Note that LwVM may end up splitting this I/O
1433 * into sub-I/Os if it crosses a chunk boundary. In this
1434 * case, LwVM will update this field when it dispatches
1435 * each I/O to IOFlashStorage. But from our perspective
1436 * we have only issued a single I/O.
1437 *
1438 * In the case of APFS we do not bounce through another
1439 * intermediate layer (such as CoreStorage). APFS will
1440 * issue the I/Os directly to the block device / IOMedia
1441 * via buf_strategy on the specfs node.
1442 */
1443 buf_setcpoff(bp, f_offset);
1444 CP_DEBUG((CPDBG_OFFSET_IO | DBG_FUNC_NONE), (uint32_t) f_offset, (uint32_t) bp->b_lblkno, (uint32_t) bp->b_blkno, (uint32_t) bp->b_bcount, 0);
1445 }
1446 }
1447 #endif
1448
1449 /*
1450 * we can issue the I/O because...
1451 * either B_CLUSTER is set which
1452 * means that the I/O is properly set
1453 * up to be a multiple of the page size, or
1454 * we were able to successfully set up the
1455 * physical block mapping
1456 */
1457 error = VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap);
1458 DTRACE_FSINFO(strategy, vnode_t, vp);
1459 return error;
1460 }
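/*
 * Illustrative sketch (not part of the original source): a filesystem's
 * vnop_strategy entry point typically just forwards to buf_strategy() on
 * the backing device vnode and lets the code above resolve the logical
 * to physical block mapping.  All names below are hypothetical:
 *
 *	int
 *	myfs_vnop_strategy(struct vnop_strategy_args *ap)
 *	{
 *		vnode_t devvp = MYFS_MOUNT(buf_vnode(ap->a_bp))->devvp;
 *
 *		return buf_strategy(devvp, ap);
 *	}
 */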
1461
1462
1463
1464 buf_t
1465 buf_alloc(vnode_t vp)
1466 {
1467 return alloc_io_buf(vp, is_vm_privileged());
1468 }
1469
1470 void
1471 buf_free(buf_t bp)
1472 {
1473 free_io_buf(bp);
1474 }
1475
1476
1477 /*
1478 * iterate buffers for the specified vp.
1479 * if BUF_SCAN_DIRTY is set, do the dirty list
1480 * if BUF_SCAN_CLEAN is set, do the clean list
1481 * if neither flag is set, default to BUF_SCAN_DIRTY
1482 * if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
1483 */
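/*
 * Illustrative sketch (not part of the original source): a minimal callout
 * as it might be passed to buf_iterate().  The names are hypothetical; the
 * return codes are the BUF_* values handled in the switch below.
 *
 *	static int
 *	myfs_count_dirty(buf_t bp, void *arg)
 *	{
 *		if (bp == NULL)			// only if BUF_NOTIFY_BUSY was passed
 *			return BUF_CLAIMED;
 *		(*(int *)arg)++;
 *		return BUF_RETURNED;		// let buf_iterate brelse it
 *	}
 *
 *	int count = 0;
 *	buf_iterate(vp, myfs_count_dirty, BUF_SCAN_DIRTY, &count);
 */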
1484
1485 struct buf_iterate_info_t {
1486 int flag;
1487 struct buflists *listhead;
1488 };
1489
1490 void
1491 buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
1492 {
1493 buf_t bp;
1494 int retval;
1495 struct buflists local_iterblkhd;
1496 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1497 int notify_busy = flags & BUF_NOTIFY_BUSY;
1498 struct buf_iterate_info_t list[2];
1499 int num_lists, i;
1500
1501 if (flags & BUF_SKIP_LOCKED) {
1502 lock_flags |= BAC_SKIP_LOCKED;
1503 }
1504 if (flags & BUF_SKIP_NONLOCKED) {
1505 lock_flags |= BAC_SKIP_NONLOCKED;
1506 }
1507
1508 if (!(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN))) {
1509 flags |= BUF_SCAN_DIRTY;
1510 }
1511
1512 num_lists = 0;
1513
1514 if (flags & BUF_SCAN_DIRTY) {
1515 list[num_lists].flag = VBI_DIRTY;
1516 list[num_lists].listhead = &vp->v_dirtyblkhd;
1517 num_lists++;
1518 }
1519 if (flags & BUF_SCAN_CLEAN) {
1520 list[num_lists].flag = VBI_CLEAN;
1521 list[num_lists].listhead = &vp->v_cleanblkhd;
1522 num_lists++;
1523 }
1524
1525 for (i = 0; i < num_lists; i++) {
1526 lck_mtx_lock(buf_mtxp);
1527
1528 if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) {
1529 lck_mtx_unlock(buf_mtxp);
1530 continue;
1531 }
1532 while (!LIST_EMPTY(&local_iterblkhd)) {
1533 bp = LIST_FIRST(&local_iterblkhd);
1534 LIST_REMOVE(bp, b_vnbufs);
1535 LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
1536
1537 if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
1538 if (notify_busy) {
1539 bp = NULL;
1540 } else {
1541 continue;
1542 }
1543 }
1544
1545 lck_mtx_unlock(buf_mtxp);
1546
1547 retval = callout(bp, arg);
1548
1549 switch (retval) {
1550 case BUF_RETURNED:
1551 if (bp) {
1552 buf_brelse(bp);
1553 }
1554 break;
1555 case BUF_CLAIMED:
1556 break;
1557 case BUF_RETURNED_DONE:
1558 if (bp) {
1559 buf_brelse(bp);
1560 }
1561 lck_mtx_lock(buf_mtxp);
1562 goto out;
1563 case BUF_CLAIMED_DONE:
1564 lck_mtx_lock(buf_mtxp);
1565 goto out;
1566 }
1567 lck_mtx_lock(buf_mtxp);
1568 } /* while list has more nodes */
1569 out:
1570 buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
1571 lck_mtx_unlock(buf_mtxp);
1572 } /* for each list */
1573 } /* buf_iterate */
1574
1575
1576 /*
1577 * Flush out and invalidate all buffers associated with a vnode.
1578 */
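/*
 * Illustrative sketch (not part of the original source): a typical caller
 * pushes any delayed writes and then drops everything, e.g. when a vnode
 * is being reclaimed or its backing store truncated:
 *
 *	error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
 *
 * Passing BUF_WRITE_DATA causes dirty (B_DELWRI) buffers to be written
 * via VNOP_BWRITE before invalidation; the two zeroes are the slpflag and
 * slptimeo values forwarded to buf_acquire_locked().
 */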
1579 int
1580 buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
1581 {
1582 buf_t bp;
1583 int aflags;
1584 int error = 0;
1585 int must_rescan = 1;
1586 struct buflists local_iterblkhd;
1587
1588
1589 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd)) {
1590 return 0;
1591 }
1592
1593 lck_mtx_lock(buf_mtxp);
1594
1595 for (;;) {
1596 if (must_rescan == 0) {
1597 /*
1598 * the lists may not be empty, but all that's left at this
1599 * point are metadata or B_LOCKED buffers which are being
1600 * skipped... we know this because we made it through both
1601 * the clean and dirty lists without dropping buf_mtxp...
1602 * each time we drop buf_mtxp we bump "must_rescan"
1603 */
1604 break;
1605 }
1606 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd)) {
1607 break;
1608 }
1609 must_rescan = 0;
1610 /*
1611 * iterate the clean list
1612 */
1613 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
1614 goto try_dirty_list;
1615 }
1616 while (!LIST_EMPTY(&local_iterblkhd)) {
1617 bp = LIST_FIRST(&local_iterblkhd);
1618
1619 LIST_REMOVE(bp, b_vnbufs);
1620 LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
1621
1622 /*
1623 * some filesystems distinguish meta data blocks with a negative logical block #
1624 */
1625 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) {
1626 continue;
1627 }
1628
1629 aflags = BAC_REMOVE;
1630
1631 if (!(flags & BUF_INVALIDATE_LOCKED)) {
1632 aflags |= BAC_SKIP_LOCKED;
1633 }
1634
1635 if ((error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo))) {
1636 if (error == EDEADLK) {
1637 /*
1638 * this buffer was marked B_LOCKED...
1639 * we didn't drop buf_mtxp, so
1640 * we don't need to rescan
1641 */
1642 continue;
1643 }
1644 if (error == EAGAIN) {
1645 /*
1646 * found a busy buffer... we blocked and
1647 * dropped buf_mtxp, so we're going to
1648 * need to rescan after this pass is completed
1649 */
1650 must_rescan++;
1651 continue;
1652 }
1653 /*
1654 * got some kind of 'real' error out of the msleep
1655 * in buf_acquire_locked, terminate the scan and return the error
1656 */
1657 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1658
1659 lck_mtx_unlock(buf_mtxp);
1660 return error;
1661 }
1662 lck_mtx_unlock(buf_mtxp);
1663
1664 if (bp->b_flags & B_LOCKED) {
1665 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);
1666 }
1667
1668 CLR(bp->b_flags, B_LOCKED);
1669 SET(bp->b_flags, B_INVAL);
1670 buf_brelse(bp);
1671
1672 lck_mtx_lock(buf_mtxp);
1673
1674 /*
1675 * by dropping buf_mtxp, we allow new
1676 * buffers to be added to the vnode list(s)
1677 * we'll have to rescan at least once more
1678 * if the queues aren't empty
1679 */
1680 must_rescan++;
1681 }
1682 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1683
1684 try_dirty_list:
1685 /*
1686 * Now iterate on dirty blks
1687 */
1688 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1689 continue;
1690 }
1691 while (!LIST_EMPTY(&local_iterblkhd)) {
1692 bp = LIST_FIRST(&local_iterblkhd);
1693
1694 LIST_REMOVE(bp, b_vnbufs);
1695 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1696
1697 /*
1698 * some filesystems distinguish meta data blocks with a negative logical block #
1699 */
1700 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) {
1701 continue;
1702 }
1703
1704 aflags = BAC_REMOVE;
1705
1706 if (!(flags & BUF_INVALIDATE_LOCKED)) {
1707 aflags |= BAC_SKIP_LOCKED;
1708 }
1709
1710 if ((error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo))) {
1711 if (error == EDEADLK) {
1712 /*
1713 * this buffer was marked B_LOCKED...
1714 * we didn't drop buf_mtxp, so
1715 * we don't need to rescan
1716 */
1717 continue;
1718 }
1719 if (error == EAGAIN) {
1720 /*
1721 * found a busy buffer... we blocked and
1722 * dropped buf_mtxp, so we're going to
1723 * need to rescan after this pass is completed
1724 */
1725 must_rescan++;
1726 continue;
1727 }
1728 /*
1729 * got some kind of 'real' error out of the msleep
1730 * in buf_acquire_locked, terminate the scan and return the error
1731 */
1732 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1733
1734 lck_mtx_unlock(buf_mtxp);
1735 return error;
1736 }
1737 lck_mtx_unlock(buf_mtxp);
1738
1739 if (bp->b_flags & B_LOCKED) {
1740 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);
1741 }
1742
1743 CLR(bp->b_flags, B_LOCKED);
1744 SET(bp->b_flags, B_INVAL);
1745
1746 if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA)) {
1747 (void) VNOP_BWRITE(bp);
1748 } else {
1749 buf_brelse(bp);
1750 }
1751
1752 lck_mtx_lock(buf_mtxp);
1753 /*
1754 * by dropping buf_mtxp, we allow new
1755 * buffers to be added to the vnode list(s)
1756 * we'll have to rescan at least once more
1757 * if the queues aren't empty
1758 */
1759 must_rescan++;
1760 }
1761 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1762 }
1763 lck_mtx_unlock(buf_mtxp);
1764
1765 return 0;
1766 }
1767
1768 void
1769 buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg)
1770 {
1771 (void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg);
1772 return;
1773 }
1774
1775 int
1776 buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg)
1777 {
1778 buf_t bp;
1779 int writes_issued = 0;
1780 errno_t error;
1781 int busy = 0;
1782 struct buflists local_iterblkhd;
1783 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1784 int any_locked = 0;
1785
1786 if (flags & BUF_SKIP_LOCKED) {
1787 lock_flags |= BAC_SKIP_LOCKED;
1788 }
1789 if (flags & BUF_SKIP_NONLOCKED) {
1790 lock_flags |= BAC_SKIP_NONLOCKED;
1791 }
1792 loop:
1793 lck_mtx_lock(buf_mtxp);
1794
1795 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
1796 while (!LIST_EMPTY(&local_iterblkhd)) {
1797 bp = LIST_FIRST(&local_iterblkhd);
1798 LIST_REMOVE(bp, b_vnbufs);
1799 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1800
1801 if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) {
1802 busy++;
1803 }
1804 if (error) {
1805 /*
1806 * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED,
1807 * we may want to do something differently if a locked or unlocked
1808 * buffer was encountered (depending on the arg specified).
1809 * In this case, we know that one of those two was set, and the
1810 * buf acquisition failed above.
1811 *
1812 * If it failed with EDEADLK, then save state which can be emitted
1813 * later on to the caller. Most callers should not care.
1814 */
1815 if (error == EDEADLK) {
1816 any_locked++;
1817 }
1818 continue;
1819 }
1820 lck_mtx_unlock(buf_mtxp);
1821
1822 bp->b_flags &= ~B_LOCKED;
1823
1824 /*
1825 * Wait for I/O associated with indirect blocks to complete,
1826 * since there is no way to quickly wait for them below.
1827 */
1828 if ((bp->b_vp == vp) || (wait == 0)) {
1829 (void) buf_bawrite(bp);
1830 } else {
1831 (void) VNOP_BWRITE(bp);
1832 }
1833 writes_issued++;
1834
1835 lck_mtx_lock(buf_mtxp);
1836 }
1837 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1838 }
1839 lck_mtx_unlock(buf_mtxp);
1840
1841 if (wait) {
1842 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1843
1844 if (vp->v_dirtyblkhd.lh_first && busy) {
1845 /*
1846 * we had one or more BUSY buffers on
1847 * the dirtyblock list... most likely
1848 * these are due to delayed writes that
1849 * were moved to the bclean queue but
1850 * have not yet been 'written'.
1851 * if we issued some writes on the
1852 * previous pass, we try again immediately
1853 * if we didn't, we'll sleep for some time
1854 * to allow the state to change...
1855 */
1856 if (writes_issued == 0) {
1857 (void)tsleep((caddr_t)&vp->v_numoutput,
1858 PRIBIO + 1, "vnode_flushdirtyblks", hz / 20);
1859 }
1860 writes_issued = 0;
1861 busy = 0;
1862
1863 goto loop;
1864 }
1865 }
1866
1867 return any_locked;
1868 }
1869
1870
1871 /*
1872 * called with buf_mtxp held...
1873 * this lock protects the queue manipulation
1874 */
1875 static int
1876 buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1877 {
1878 struct buflists * listheadp;
1879
1880 if (flags & VBI_DIRTY) {
1881 listheadp = &vp->v_dirtyblkhd;
1882 } else {
1883 listheadp = &vp->v_cleanblkhd;
1884 }
1885
1886 while (vp->v_iterblkflags & VBI_ITER) {
1887 vp->v_iterblkflags |= VBI_ITERWANT;
1888 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
1889 }
1890 if (LIST_EMPTY(listheadp)) {
1891 LIST_INIT(iterheadp);
1892 return EINVAL;
1893 }
1894 vp->v_iterblkflags |= VBI_ITER;
1895
1896 iterheadp->lh_first = listheadp->lh_first;
1897 listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1898 LIST_INIT(listheadp);
1899
1900 return 0;
1901 }
1902
1903 /*
1904 * called with buf_mtxp held...
1905 * this lock protects the queue manipulation
1906 */
1907 static void
1908 buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1909 {
1910 struct buflists * listheadp;
1911 buf_t bp;
1912
1913 if (flags & VBI_DIRTY) {
1914 listheadp = &vp->v_dirtyblkhd;
1915 } else {
1916 listheadp = &vp->v_cleanblkhd;
1917 }
1918
1919 while (!LIST_EMPTY(iterheadp)) {
1920 bp = LIST_FIRST(iterheadp);
1921 LIST_REMOVE(bp, b_vnbufs);
1922 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1923 }
1924 vp->v_iterblkflags &= ~VBI_ITER;
1925
1926 if (vp->v_iterblkflags & VBI_ITERWANT) {
1927 vp->v_iterblkflags &= ~VBI_ITERWANT;
1928 wakeup(&vp->v_iterblkflags);
1929 }
1930 }
1931
1932
1933 static void
1934 bremfree_locked(buf_t bp)
1935 {
1936 struct bqueues *dp = NULL;
1937 int whichq;
1938
1939 whichq = bp->b_whichq;
1940
1941 if (whichq == -1) {
1942 if (bp->b_shadow_ref == 0) {
1943 panic("bremfree_locked: %p not on freelist", bp);
1944 }
1945 /*
1946 * there are clones pointing to 'bp'...
1947 * therefore, it was not put on a freelist
1948 * when buf_brelse was last called on 'bp'
1949 */
1950 return;
1951 }
1952 /*
1953 * We only calculate the head of the freelist when removing
1954 * the last element of the list as that is the only time that
1955 * it is needed (e.g. to reset the tail pointer).
1956 *
1957 * NB: This makes an assumption about how tailq's are implemented.
1958 */
1959 if (bp->b_freelist.tqe_next == NULL) {
1960 dp = &bufqueues[whichq];
1961
1962 if (dp->tqh_last != &bp->b_freelist.tqe_next) {
1963 panic("bremfree: lost tail");
1964 }
1965 }
1966 TAILQ_REMOVE(dp, bp, b_freelist);
1967
1968 if (whichq == BQ_LAUNDRY) {
1969 blaundrycnt--;
1970 }
1971
1972 bp->b_whichq = -1;
1973 bp->b_timestamp = 0;
1974 bp->b_shadow = 0;
1975 }
1976
1977 /*
1978 * Associate a buffer with a vnode.
1979 * buf_mtxp must be locked on entry
1980 */
1981 static void
1982 bgetvp_locked(vnode_t vp, buf_t bp)
1983 {
1984 if (bp->b_vp != vp) {
1985 panic("bgetvp_locked: not free");
1986 }
1987
1988 if (vp->v_type == VBLK || vp->v_type == VCHR) {
1989 bp->b_dev = vp->v_rdev;
1990 } else {
1991 bp->b_dev = NODEV;
1992 }
1993 /*
1994 * Insert onto list for new vnode.
1995 */
1996 bufinsvn(bp, &vp->v_cleanblkhd);
1997 }
1998
1999 /*
2000 * Disassociate a buffer from a vnode.
2001 * buf_mtxp must be locked on entry
2002 */
2003 static void
2004 brelvp_locked(buf_t bp)
2005 {
2006 /*
2007 * Delete from old vnode list, if on one.
2008 */
2009 if (bp->b_vnbufs.le_next != NOLIST) {
2010 bufremvn(bp);
2011 }
2012
2013 bp->b_vp = (vnode_t)NULL;
2014 }
2015
2016 /*
2017 * Reassign a buffer from one vnode to another.
2018 * Used to assign file specific control information
2019 * (indirect blocks) to the vnode to which they belong.
2020 */
2021 static void
2022 buf_reassign(buf_t bp, vnode_t newvp)
2023 {
2024 struct buflists *listheadp;
2025
2026 if (newvp == NULL) {
2027 printf("buf_reassign: NULL");
2028 return;
2029 }
2030 lck_mtx_lock_spin(buf_mtxp);
2031
2032 /*
2033 * Delete from old vnode list, if on one.
2034 */
2035 if (bp->b_vnbufs.le_next != NOLIST) {
2036 bufremvn(bp);
2037 }
2038 /*
2039 * If dirty, put on list of dirty buffers;
2040 * otherwise insert onto list of clean buffers.
2041 */
2042 if (ISSET(bp->b_flags, B_DELWRI)) {
2043 listheadp = &newvp->v_dirtyblkhd;
2044 } else {
2045 listheadp = &newvp->v_cleanblkhd;
2046 }
2047 bufinsvn(bp, listheadp);
2048
2049 lck_mtx_unlock(buf_mtxp);
2050 }
2051
2052 static __inline__ void
2053 bufhdrinit(buf_t bp)
2054 {
2055 bzero((char *)bp, sizeof *bp);
2056 bp->b_dev = NODEV;
2057 bp->b_rcred = NOCRED;
2058 bp->b_wcred = NOCRED;
2059 bp->b_vnbufs.le_next = NOLIST;
2060 bp->b_flags = B_INVAL;
2061
2062 return;
2063 }
2064
2065 /*
2066 * Initialize buffers and hash links for buffers.
2067 */
2068 __private_extern__ void
2069 bufinit(void)
2070 {
2071 buf_t bp;
2072 struct bqueues *dp;
2073 int i;
2074
2075 nbuf_headers = 0;
2076 /* Initialize the buffer queues ('freelists') and the hash table */
2077 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
2078 TAILQ_INIT(dp);
2079 }
2080 bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
2081
2082 buf_busycount = 0;
2083
2084 /* Initialize the buffer headers */
2085 for (i = 0; i < max_nbuf_headers; i++) {
2086 nbuf_headers++;
2087 bp = &buf_headers[i];
2088 bufhdrinit(bp);
2089
2090 BLISTNONE(bp);
2091 dp = &bufqueues[BQ_EMPTY];
2092 bp->b_whichq = BQ_EMPTY;
2093 bp->b_timestamp = buf_timestamp();
2094 binsheadfree(bp, dp, BQ_EMPTY);
2095 binshash(bp, &invalhash);
2096 }
2097 boot_nbuf_headers = nbuf_headers;
2098
2099 TAILQ_INIT(&iobufqueue);
2100 TAILQ_INIT(&delaybufqueue);
2101
2102 for (; i < nbuf_headers + niobuf_headers; i++) {
2103 bp = &buf_headers[i];
2104 bufhdrinit(bp);
2105 bp->b_whichq = -1;
2106 binsheadfree(bp, &iobufqueue, -1);
2107 }
2108
2109 /*
2110 * allocate lock group attribute and group
2111 */
2112 buf_mtx_grp_attr = lck_grp_attr_alloc_init();
2113 buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
2114
2115 /*
2116 * allocate the lock attribute
2117 */
2118 buf_mtx_attr = lck_attr_alloc_init();
2119
2120 /*
2121 * allocate and initialize mutex's for the buffer and iobuffer pools
2122 */
2123 buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
2124 iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
2125 buf_gc_callout = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
2126
2127 if (iobuffer_mtxp == NULL) {
2128 panic("couldn't create iobuffer mutex");
2129 }
2130
2131 if (buf_mtxp == NULL) {
2132 panic("couldn't create buf mutex");
2133 }
2134
2135 if (buf_gc_callout == NULL) {
2136 panic("couldn't create buf_gc_callout mutex");
2137 }
2138
2139 /*
2140 * allocate and initialize cluster specific global locks...
2141 */
2142 cluster_init();
2143
2144 printf("using %d buffer headers and %d cluster IO buffer headers\n",
2145 nbuf_headers, niobuf_headers);
2146
2147 /* Set up zones used by the buffer cache */
2148 bufzoneinit();
2149
2150 /* start the bcleanbuf() thread */
2151 bcleanbuf_thread_init();
2152
2153 /* Register a callout for relieving vm pressure */
2154 if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
2155 panic("Couldn't register buffer cache callout for vm pressure!\n");
2156 }
2157 }
2158
2159 /*
2160 * Zones for the meta data buffers
2161 */
2162
2163 #define MINMETA 512
2164 #define MAXMETA 16384
2165
2166 struct meta_zone_entry {
2167 zone_t mz_zone;
2168 vm_size_t mz_size;
2169 vm_size_t mz_max;
2170 const char *mz_name;
2171 };
2172
2173 struct meta_zone_entry meta_zones[] = {
2174 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2175 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
2176 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
2177 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2178 {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
2179 {NULL, (MINMETA * 32), 512 * (MINMETA * 32), "buf.16384" },
2180 {NULL, 0, 0, "" } /* End */
2181 };
2182
2183 /*
2184 * Initialize the meta data zones
2185 */
2186 static void
2187 bufzoneinit(void)
2188 {
2189 int i;
2190
2191 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2192 meta_zones[i].mz_zone =
2193 zinit(meta_zones[i].mz_size,
2194 meta_zones[i].mz_max,
2195 PAGE_SIZE,
2196 meta_zones[i].mz_name);
2197 zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE);
2198 }
2199 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2200 zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE);
2201 }
2202
2203 static __inline__ zone_t
2204 getbufzone(size_t size)
2205 {
2206 int i;
2207
2208 if ((size % 512) || (size < MINMETA) || (size > MAXMETA)) {
2209 panic("getbufzone: incorect size = %lu", size);
2210 }
2211
2212 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2213 if (meta_zones[i].mz_size >= size) {
2214 break;
2215 }
2216 }
2217
2218 return meta_zones[i].mz_zone;
2219 }
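/*
 * Illustrative note (not part of the original source): getbufzone() picks
 * the first zone whose element size is >= the requested size, so a
 * 3072-byte metadata buffer, for example, is served out of the "buf.4096"
 * zone, while an exact 512-byte request maps to "buf.512".  Sizes must be
 * a multiple of 512 and fall within [MINMETA, MAXMETA].
 */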
2220
2221
2222
2223 static struct buf *
2224 bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
2225 {
2226 buf_t bp;
2227
2228 bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
2229
2230 /*
2231 * If the buffer does not have valid data, start a read.
2232 * Note that if the buffer is B_INVAL, buf_getblk() won't return it.
2233 * Therefore, it's valid if its I/O has completed or been delayed.
2234 */
2235 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
2236 struct proc *p;
2237
2238 p = current_proc();
2239
2240 /* Start I/O for the buffer (keeping credentials). */
2241 SET(bp->b_flags, B_READ | async);
2242 if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
2243 kauth_cred_ref(cred);
2244 bp->b_rcred = cred;
2245 }
2246
2247 VNOP_STRATEGY(bp);
2248
2249 trace(TR_BREADMISS, pack(vp, size), blkno);
2250
2251 /* Pay for the read. */
2252 if (p && p->p_stats) {
2253 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock); /* XXX */
2254 }
2255
2256 if (async) {
2257 /*
2258 * since we asked for an ASYNC I/O
2259 * the biodone will do the brelse
2260 * we don't want to pass back a bp
2261 * that we don't 'own'
2262 */
2263 bp = NULL;
2264 }
2265 } else if (async) {
2266 buf_brelse(bp);
2267 bp = NULL;
2268 }
2269
2270 trace(TR_BREADHIT, pack(vp, size), blkno);
2271
2272 return bp;
2273 }
2274
2275 /*
2276 * Perform the reads for buf_breadn() and buf_meta_breadn().
2277 * Trivial modification to the breada algorithm presented in Bach (p.55).
2278 */
2279 static errno_t
2280 do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
2281 int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
2282 {
2283 buf_t bp;
2284 int i;
2285
2286 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
2287
2288 /*
2289 * For each of the read-ahead blocks, start a read, if necessary.
2290 */
2291 for (i = 0; i < nrablks; i++) {
2292 /* If it's in the cache, just go on to next one. */
2293 if (incore(vp, rablks[i])) {
2294 continue;
2295 }
2296
2297 /* Get a buffer for the read-ahead block */
2298 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
2299 }
2300
2301 /* Otherwise, we had to start a read for it; wait until it's valid. */
2302 return buf_biowait(bp);
2303 }
2304
2305
2306 /*
2307 * Read a disk block.
2308 * This algorithm is described in Bach (p.54).
2309 */
2310 errno_t
2311 buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2312 {
2313 buf_t bp;
2314
2315 /* Get buffer for block. */
2316 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
2317
2318 /* Wait for the read to complete, and return result. */
2319 return buf_biowait(bp);
2320 }
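/*
 * A minimal usage sketch, assuming a hypothetical file-system client;
 * 'my_vp', 'lbn' and 'MYFS_BSIZE' are placeholder names:
 *
 *	buf_t	bp;
 *	errno_t	error;
 *
 *	error = buf_bread(my_vp, (daddr64_t)lbn, MYFS_BSIZE, NOCRED, &bp);
 *	if (error == 0) {
 *		caddr_t data = (caddr_t)buf_dataptr(bp);
 *		... consume up to b_bcount bytes at 'data' ...
 *		buf_brelse(bp);
 *	}
 */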
2321
2322 /*
2323 * Read a disk block. [bread() for meta-data]
2324 * This algorithm is described in Bach (p.54).
2325 */
2326 errno_t
2327 buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2328 {
2329 buf_t bp;
2330
2331 /* Get buffer for block. */
2332 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
2333
2334 /* Wait for the read to complete, and return result. */
2335 return buf_biowait(bp);
2336 }
2337
2338 /*
2339 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2340 */
2341 errno_t
2342 buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2343 {
2344 return do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ);
2345 }
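/*
 * A short sketch of the read-ahead interface, assuming the same
 * hypothetical 'my_vp', 'lbn' and 'MYFS_BSIZE' names as above; only the
 * first block is waited on, the read-ahead blocks are issued async:
 *
 *	daddr64_t ra_blks[2]  = { lbn + 1, lbn + 2 };
 *	int       ra_sizes[2] = { MYFS_BSIZE, MYFS_BSIZE };
 *	buf_t     bp;
 *
 *	error = buf_breadn(my_vp, lbn, MYFS_BSIZE,
 *	    ra_blks, ra_sizes, 2, NOCRED, &bp);
 */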
2346
2347 /*
2348 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2349 * [buf_breadn() for meta-data]
2350 */
2351 errno_t
2352 buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2353 {
2354 return do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META);
2355 }
2356
2357 /*
2358 * Block write. Described in Bach (p.56)
2359 */
2360 errno_t
2361 buf_bwrite(buf_t bp)
2362 {
2363 int sync, wasdelayed;
2364 errno_t rv;
2365 proc_t p = current_proc();
2366 vnode_t vp = bp->b_vp;
2367
2368 if (bp->b_datap == 0) {
2369 if (brecover_data(bp) == 0) {
2370 return 0;
2371 }
2372 }
2373 /* Remember buffer type, to switch on it later. */
2374 sync = !ISSET(bp->b_flags, B_ASYNC);
2375 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
2376 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
2377
2378 if (wasdelayed) {
2379 OSAddAtomicLong(-1, &nbdwrite);
2380 }
2381
2382 if (!sync) {
2383 /*
2384 * If not synchronous, pay for the I/O operation and make
2385 * sure the buf is on the correct vnode queue. We have
2386 * to do this now, because if we don't, the vnode may not
2387 * be properly notified that its I/O has completed.
2388 */
2389 if (wasdelayed) {
2390 buf_reassign(bp, vp);
2391 } else if (p && p->p_stats) {
2392 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2393 }
2394 }
2395 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
2396
2397 /* Initiate disk write. Make sure the appropriate party is charged. */
2398
2399 OSAddAtomic(1, &vp->v_numoutput);
2400
2401 VNOP_STRATEGY(bp);
2402
2403 if (sync) {
2404 /*
2405 * If I/O was synchronous, wait for it to complete.
2406 */
2407 rv = buf_biowait(bp);
2408
2409 /*
2410 * Pay for the I/O operation, if it's not been paid for, and
2411 * make sure it's on the correct vnode queue. (async operations
2412 * were paid for above.)
2413 */
2414 if (wasdelayed) {
2415 buf_reassign(bp, vp);
2416 } else if (p && p->p_stats) {
2417 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2418 }
2419
2420 /* Release the buffer. */
2421 buf_brelse(bp);
2422
2423 return rv;
2424 } else {
2425 return 0;
2426 }
2427 }
2428
2429 int
2430 vn_bwrite(struct vnop_bwrite_args *ap)
2431 {
2432 return buf_bwrite(ap->a_bp);
2433 }
2434
2435 /*
2436 * Delayed write.
2437 *
2438 * The buffer is marked dirty, but is not queued for I/O.
2439 * This routine should be used when the buffer is expected
2440 * to be modified again soon, typically a small write that
2441 * partially fills a buffer.
2442 *
2443 * NB: magnetic tapes cannot be delayed; they must be
2444 * written in the order that the writes are requested.
2445 *
2446 * Described in Leffler, et al. (pp. 208-213).
2447 *
2448 * Note: With the ability to allocate additional buffer
2449 * headers, we can get into a situation where "too" many
2450 * buf_bdwrite()s let the kernel create buffers faster than
2451 * the disks can service them. Doing a buf_bawrite() in
2452 * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
2453 */
2454 int
2455 bdwrite_internal(buf_t bp, int return_error)
2456 {
2457 proc_t p = current_proc();
2458 vnode_t vp = bp->b_vp;
2459
2460 /*
2461 * If the block hasn't been seen before:
2462 * (1) Mark it as having been seen,
2463 * (2) Charge for the write.
2464 * (3) Make sure it's on its vnode's correct block list,
2465 */
2466 if (!ISSET(bp->b_flags, B_DELWRI)) {
2467 SET(bp->b_flags, B_DELWRI);
2468 if (p && p->p_stats) {
2469 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2470 }
2471 OSAddAtomicLong(1, &nbdwrite);
2472 buf_reassign(bp, vp);
2473 }
2474
2475 /*
2476 * if we're not LOCKED, but the total number of delayed writes
2477 * has climbed above 75% of the total buffers in the system
2478 * return an error if the caller has indicated that it can
2479 * handle one in this case, otherwise schedule the I/O now
2480 * this is done to prevent us from allocating tons of extra
2481 * buffers when dealing with virtual disks (i.e. DiskImages),
2482 * because additional buffers are dynamically allocated to prevent
2483 * deadlocks from occurring
2484 *
2485 * however, can't do a buf_bawrite() if the LOCKED bit is set because the
2486 * buffer is part of a transaction and can't go to disk until
2487 * the LOCKED bit is cleared.
2488 */
2489 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers / 4) * 3)) {
2490 if (return_error) {
2491 return EAGAIN;
2492 }
2493 /*
2494 * If the vnode has "too many" write operations in progress
2495 * wait for them to finish the IO
2496 */
2497 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");
2498
2499 return buf_bawrite(bp);
2500 }
2501
2502 /* Otherwise, the "write" is done, so mark and release the buffer. */
2503 SET(bp->b_flags, B_DONE);
2504 buf_brelse(bp);
2505 return 0;
2506 }
2507
2508 errno_t
2509 buf_bdwrite(buf_t bp)
2510 {
2511 return bdwrite_internal(bp, 0);
2512 }
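/*
 * A minimal sketch contrasting the two write paths, assuming 'bp' is a
 * busy buffer the caller has just modified (hypothetical usage):
 *
 *	buf_bdwrite(bp);	small update, likely dirtied again soon:
 *				mark B_DELWRI and release, no I/O yet
 *
 *	error = buf_bwrite(bp);	must reach the disk now: issue the write
 *				and wait for completion (unless B_ASYNC)
 */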
2513
2514
2515 /*
2516 * Asynchronous block write; just an asynchronous buf_bwrite().
2517 *
2518 * Note: With the ability to allocate additional buffer
2519 * headers, we can get into a situation where "too" many
2520 * buf_bawrite()s let the kernel create buffers faster than
2521 * the disks can service them.
2522 * We limit the number of "in flight" writes a vnode can have to
2523 * avoid this.
2524 */
2525 static int
2526 bawrite_internal(buf_t bp, int throttle)
2527 {
2528 vnode_t vp = bp->b_vp;
2529
2530 if (vp) {
2531 if (throttle) {
2532 /*
2533 * If the vnode has "too many" write operations in progress
2534 * wait for them to finish the IO
2535 */
2536 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
2537 } else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE) {
2538 /*
2539 * return to the caller and
2540 * let him decide what to do
2541 */
2542 return EWOULDBLOCK;
2543 }
2544 }
2545 SET(bp->b_flags, B_ASYNC);
2546
2547 return VNOP_BWRITE(bp);
2548 }
2549
2550 errno_t
2551 buf_bawrite(buf_t bp)
2552 {
2553 return bawrite_internal(bp, 1);
2554 }
2555
2556
2557
2558 static void
2559 buf_free_meta_store(buf_t bp)
2560 {
2561 if (bp->b_bufsize) {
2562 if (ISSET(bp->b_flags, B_ZALLOC)) {
2563 zone_t z;
2564
2565 z = getbufzone(bp->b_bufsize);
2566 zfree(z, bp->b_datap);
2567 } else {
2568 kmem_free(kernel_map, bp->b_datap, bp->b_bufsize);
2569 }
2570
2571 bp->b_datap = (uintptr_t)NULL;
2572 bp->b_bufsize = 0;
2573 }
2574 }
2575
2576
2577 static buf_t
2578 buf_brelse_shadow(buf_t bp)
2579 {
2580 buf_t bp_head;
2581 buf_t bp_temp;
2582 buf_t bp_return = NULL;
2583 #ifdef BUF_MAKE_PRIVATE
2584 buf_t bp_data;
2585 int data_ref = 0;
2586 #endif
2587 int need_wakeup = 0;
2588
2589 lck_mtx_lock_spin(buf_mtxp);
2590
2591 __IGNORE_WCASTALIGN(bp_head = (buf_t)bp->b_orig);
2592
2593 if (bp_head->b_whichq != -1) {
2594 panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq);
2595 }
2596
2597 #ifdef BUF_MAKE_PRIVATE
2598 if ((bp_data = bp->b_data_store)) {
2599 bp_data->b_data_ref--;
2600 /*
2601 * snapshot the ref count so that we can check it
2602 * outside of the lock... we only want the guy going
2603 * from 1 -> 0 to try and release the storage
2604 */
2605 data_ref = bp_data->b_data_ref;
2606 }
2607 #endif
2608 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0);
2609
2610 bp_head->b_shadow_ref--;
2611
2612 for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow) {
2613 ;
2614 }
2615
2616 if (bp_temp == NULL) {
2617 panic("buf_brelse_shadow: bp not on list %p", bp_head);
2618 }
2619
2620 bp_temp->b_shadow = bp_temp->b_shadow->b_shadow;
2621
2622 #ifdef BUF_MAKE_PRIVATE
2623 /*
2624 * we're about to free the current 'owner' of the data buffer and
2625 * there is at least one other shadow buf_t still pointing at it
2626 * so transfer it to the first shadow buf left in the chain
2627 */
2628 if (bp == bp_data && data_ref) {
2629 if ((bp_data = bp_head->b_shadow) == NULL) {
2630 panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp);
2631 }
2632
2633 for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow) {
2634 bp_temp->b_data_store = bp_data;
2635 }
2636 bp_data->b_data_ref = data_ref;
2637 }
2638 #endif
2639 if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow) {
2640 panic("buf_relse_shadow: b_shadow != NULL && b_shadow_ref == 0 bp(%p)", bp);
2641 }
2642 if (bp_head->b_shadow_ref && bp_head->b_shadow == 0) {
2643 panic("buf_relse_shadow: b_shadow == NULL && b_shadow_ref != 0 bp(%p)", bp);
2644 }
2645
2646 if (bp_head->b_shadow_ref == 0) {
2647 if (!ISSET(bp_head->b_lflags, BL_BUSY)) {
2648 CLR(bp_head->b_flags, B_AGE);
2649 bp_head->b_timestamp = buf_timestamp();
2650
2651 if (ISSET(bp_head->b_flags, B_LOCKED)) {
2652 bp_head->b_whichq = BQ_LOCKED;
2653 binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED);
2654 } else {
2655 bp_head->b_whichq = BQ_META;
2656 binstailfree(bp_head, &bufqueues[BQ_META], BQ_META);
2657 }
2658 } else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) {
2659 CLR(bp_head->b_lflags, BL_WAITSHADOW);
2660
2661 bp_return = bp_head;
2662 }
2663 if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) {
2664 CLR(bp_head->b_lflags, BL_WANTED_REF);
2665 need_wakeup = 1;
2666 }
2667 }
2668 lck_mtx_unlock(buf_mtxp);
2669
2670 if (need_wakeup) {
2671 wakeup(bp_head);
2672 }
2673
2674 #ifdef BUF_MAKE_PRIVATE
2675 if (bp == bp_data && data_ref == 0) {
2676 buf_free_meta_store(bp);
2677 }
2678
2679 bp->b_data_store = NULL;
2680 #endif
2681 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0);
2682
2683 return bp_return;
2684 }
2685
2686
2687 /*
2688 * Release a buffer on to the free lists.
2689 * Described in Bach (p. 46).
2690 */
2691 void
2692 buf_brelse(buf_t bp)
2693 {
2694 struct bqueues *bufq;
2695 long whichq;
2696 upl_t upl;
2697 int need_wakeup = 0;
2698 int need_bp_wakeup = 0;
2699
2700
2701 if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY)) {
2702 panic("buf_brelse: bad buffer = %p\n", bp);
2703 }
2704
2705 #ifdef JOE_DEBUG
2706 (void) OSBacktrace(&bp->b_stackbrelse[0], 6);
2707
2708 bp->b_lastbrelse = current_thread();
2709 bp->b_tag = 0;
2710 #endif
2711 if (bp->b_lflags & BL_IOBUF) {
2712 buf_t shadow_master_bp = NULL;
2713
2714 if (ISSET(bp->b_lflags, BL_SHADOW)) {
2715 shadow_master_bp = buf_brelse_shadow(bp);
2716 } else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC)) {
2717 buf_free_meta_store(bp);
2718 }
2719 free_io_buf(bp);
2720
2721 if (shadow_master_bp) {
2722 bp = shadow_master_bp;
2723 goto finish_shadow_master;
2724 }
2725 return;
2726 }
2727
2728 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
2729 bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap,
2730 bp->b_flags, 0);
2731
2732 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2733
2734 /*
2735 * if we're invalidating a buffer that has the B_FILTER bit
2736 * set then call the b_iodone function so it gets cleaned
2737 * up properly.
2738 *
2739 * the HFS journal code depends on this
2740 */
2741 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
2742 if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */
2743 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
2744 void *arg = bp->b_transaction;
2745
2746 CLR(bp->b_flags, B_FILTER); /* but note callout done */
2747 bp->b_iodone = NULL;
2748 bp->b_transaction = NULL;
2749
2750 if (iodone_func == NULL) {
2751 panic("brelse: bp @ %p has NULL b_iodone!\n", bp);
2752 }
2753 (*iodone_func)(bp, arg);
2754 }
2755 }
2756 /*
2757 * I/O is done. Cleanup the UPL state
2758 */
2759 upl = bp->b_upl;
2760
2761 if (!ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2762 kern_return_t kret;
2763 int upl_flags;
2764
2765 if (upl == NULL) {
2766 if (!ISSET(bp->b_flags, B_INVAL)) {
2767 kret = ubc_create_upl_kernel(bp->b_vp,
2768 ubc_blktooff(bp->b_vp, bp->b_lblkno),
2769 bp->b_bufsize,
2770 &upl,
2771 NULL,
2772 UPL_PRECIOUS,
2773 VM_KERN_MEMORY_FILE);
2774
2775 if (kret != KERN_SUCCESS) {
2776 panic("brelse: Failed to create UPL");
2777 }
2778 #if UPL_DEBUG
2779 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5);
2780 #endif /* UPL_DEBUG */
2781 }
2782 } else {
2783 if (bp->b_datap) {
2784 kret = ubc_upl_unmap(upl);
2785
2786 if (kret != KERN_SUCCESS) {
2787 panic("ubc_upl_unmap failed");
2788 }
2789 bp->b_datap = (uintptr_t)NULL;
2790 }
2791 }
2792 if (upl) {
2793 if (bp->b_flags & (B_ERROR | B_INVAL)) {
2794 if (bp->b_flags & (B_READ | B_INVAL)) {
2795 upl_flags = UPL_ABORT_DUMP_PAGES;
2796 } else {
2797 upl_flags = 0;
2798 }
2799
2800 ubc_upl_abort(upl, upl_flags);
2801 } else {
2802 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY)) {
2803 upl_flags = UPL_COMMIT_SET_DIRTY;
2804 } else {
2805 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
2806 }
2807
2808 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
2809 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2810 }
2811 bp->b_upl = NULL;
2812 }
2813 } else {
2814 if ((upl)) {
2815 panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp);
2816 }
2817 }
2818
2819 /*
2820 * If it's locked, don't report an error; try again later.
2821 */
2822 if (ISSET(bp->b_flags, (B_LOCKED | B_ERROR)) == (B_LOCKED | B_ERROR)) {
2823 CLR(bp->b_flags, B_ERROR);
2824 }
2825 /*
2826 * If it's not cacheable, or an error, mark it invalid.
2827 */
2828 if (ISSET(bp->b_flags, (B_NOCACHE | B_ERROR))) {
2829 SET(bp->b_flags, B_INVAL);
2830 }
2831
2832 if ((bp->b_bufsize <= 0) ||
2833 ISSET(bp->b_flags, B_INVAL) ||
2834 (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) {
2835 boolean_t delayed_buf_free_meta_store = FALSE;
2836
2837 /*
2838 * If it's invalid or empty, dissociate it from its vnode,
2839 * release its storage if B_META, and
2840 * clean it up a bit and put it on the EMPTY queue
2841 */
2842 if (ISSET(bp->b_flags, B_DELWRI)) {
2843 OSAddAtomicLong(-1, &nbdwrite);
2844 }
2845
2846 if (ISSET(bp->b_flags, B_META)) {
2847 if (bp->b_shadow_ref) {
2848 delayed_buf_free_meta_store = TRUE;
2849 } else {
2850 buf_free_meta_store(bp);
2851 }
2852 }
2853 /*
2854 * nuke any credentials we were holding
2855 */
2856 buf_release_credentials(bp);
2857
2858 lck_mtx_lock_spin(buf_mtxp);
2859
2860 if (bp->b_shadow_ref) {
2861 SET(bp->b_lflags, BL_WAITSHADOW);
2862
2863 lck_mtx_unlock(buf_mtxp);
2864
2865 return;
2866 }
2867 if (delayed_buf_free_meta_store == TRUE) {
2868 lck_mtx_unlock(buf_mtxp);
2869 finish_shadow_master:
2870 buf_free_meta_store(bp);
2871
2872 lck_mtx_lock_spin(buf_mtxp);
2873 }
2874 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
2875
2876 if (bp->b_vp) {
2877 brelvp_locked(bp);
2878 }
2879
2880 bremhash(bp);
2881 BLISTNONE(bp);
2882 binshash(bp, &invalhash);
2883
2884 bp->b_whichq = BQ_EMPTY;
2885 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2886 } else {
2887 /*
2888 * It has valid data. Put it on the end of the appropriate
2889 * queue, so that it'll stick around for as long as possible.
2890 */
2891 if (ISSET(bp->b_flags, B_LOCKED)) {
2892 whichq = BQ_LOCKED; /* locked in core */
2893 } else if (ISSET(bp->b_flags, B_META)) {
2894 whichq = BQ_META; /* meta-data */
2895 } else if (ISSET(bp->b_flags, B_AGE)) {
2896 whichq = BQ_AGE; /* stale but valid data */
2897 } else {
2898 whichq = BQ_LRU; /* valid data */
2899 }
2900 bufq = &bufqueues[whichq];
2901
2902 bp->b_timestamp = buf_timestamp();
2903
2904 lck_mtx_lock_spin(buf_mtxp);
2905
2906 /*
2907 * the buf_brelse_shadow routine doesn't take 'ownership'
2908 * of the parent buf_t... it updates state that is protected by
2909 * the buf_mtxp, and checks for BL_BUSY to determine whether to
2910 * put the buf_t back on a free list. b_shadow_ref is protected
2911 * by the lock, and since we have not yet cleared B_BUSY, we need
2912 * to check it while holding the lock to ensure that one of us
2913 * puts this buf_t back on a free list when it is safe to do so
2914 */
2915 if (bp->b_shadow_ref == 0) {
2916 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
2917 bp->b_whichq = whichq;
2918 binstailfree(bp, bufq, whichq);
2919 } else {
2920 /*
2921 * there are still cloned buf_t's pointing
2922 * at this guy... need to keep it off the
2923 * freelists until a buf_brelse is done on
2924 * the last clone
2925 */
2926 CLR(bp->b_flags, (B_ASYNC | B_NOCACHE));
2927 }
2928 }
2929 if (needbuffer) {
2930 /*
2931 * needbuffer is a global
2932 * we're currently using buf_mtxp to protect it
2933 * delay doing the actual wakeup until after
2934 * we drop buf_mtxp
2935 */
2936 needbuffer = 0;
2937 need_wakeup = 1;
2938 }
2939 if (ISSET(bp->b_lflags, BL_WANTED)) {
2940 /*
2941 * delay the actual wakeup until after we
2942 * clear BL_BUSY and we've dropped buf_mtxp
2943 */
2944 need_bp_wakeup = 1;
2945 }
2946 /*
2947 * Unlock the buffer.
2948 */
2949 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2950 buf_busycount--;
2951
2952 lck_mtx_unlock(buf_mtxp);
2953
2954 if (need_wakeup) {
2955 /*
2956 * Wake up any processes waiting for any buffer to become free.
2957 */
2958 wakeup(&needbuffer);
2959 }
2960 if (need_bp_wakeup) {
2961 /*
2962 * Wake up any processes waiting for _this_ buffer to become free.
2963 */
2964 wakeup(bp);
2965 }
2966 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2967 bp, bp->b_datap, bp->b_flags, 0, 0);
2968 }
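/*
 * A small sketch of how the flags above drive the release policy,
 * assuming 'bp' is a busy buffer the caller no longer wants cached
 * (hypothetical usage):
 *
 *	buf_setflags(bp, B_NOCACHE);
 *	buf_brelse(bp);
 *
 * B_NOCACHE (like B_ERROR) causes the buffer to be marked B_INVAL and
 * placed on the EMPTY queue instead of the LRU/AGE/META queues.
 */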
2969
2970 /*
2971 * Determine if a block is in the cache.
2972 * Just look on what would be its hash chain. If it's there, return
2973 * a pointer to it, unless it's marked invalid. If it's marked invalid,
2974 * we normally don't return the buffer, unless the caller explicitly
2975 * wants us to.
2976 */
2977 static boolean_t
2978 incore(vnode_t vp, daddr64_t blkno)
2979 {
2980 boolean_t retval;
2981 struct bufhashhdr *dp;
2982
2983 dp = BUFHASH(vp, blkno);
2984
2985 lck_mtx_lock_spin(buf_mtxp);
2986
2987 if (incore_locked(vp, blkno, dp)) {
2988 retval = TRUE;
2989 } else {
2990 retval = FALSE;
2991 }
2992 lck_mtx_unlock(buf_mtxp);
2993
2994 return retval;
2995 }
2996
2997
2998 static buf_t
2999 incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp)
3000 {
3001 struct buf *bp;
3002
3003 /* Search hash chain */
3004 for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
3005 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
3006 !ISSET(bp->b_flags, B_INVAL)) {
3007 return bp;
3008 }
3009 }
3010 return NULL;
3011 }
3012
3013
3014 void
3015 buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno)
3016 {
3017 buf_t bp;
3018 struct bufhashhdr *dp;
3019
3020 dp = BUFHASH(vp, blkno);
3021
3022 lck_mtx_lock_spin(buf_mtxp);
3023
3024 for (;;) {
3025 if ((bp = incore_locked(vp, blkno, dp)) == NULL) {
3026 break;
3027 }
3028
3029 if (bp->b_shadow_ref == 0) {
3030 break;
3031 }
3032
3033 SET(bp->b_lflags, BL_WANTED_REF);
3034
3035 (void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO + 1), "buf_wait_for_shadow", NULL);
3036 }
3037 lck_mtx_unlock(buf_mtxp);
3038 }
3039
3040 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
3041 /*
3042 * Get a block of requested size that is associated with
3043 * a given vnode and block offset. If it is found in the
3044 * block cache, mark it as having been found, make it busy
3045 * and return it. Otherwise, return an empty block of the
3046 * correct size. It is up to the caller to ensure that the
3047 * cached blocks are of the correct size.
3048 */
3049 buf_t
3050 buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
3051 {
3052 buf_t bp;
3053 int err;
3054 upl_t upl;
3055 upl_page_info_t *pl;
3056 kern_return_t kret;
3057 int ret_only_valid;
3058 struct timespec ts;
3059 int upl_flags;
3060 struct bufhashhdr *dp;
3061
3062 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
3063 (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0);
3064
3065 ret_only_valid = operation & BLK_ONLYVALID;
3066 operation &= ~BLK_ONLYVALID;
3067 dp = BUFHASH(vp, blkno);
3068 start:
3069 lck_mtx_lock_spin(buf_mtxp);
3070
3071 if ((bp = incore_locked(vp, blkno, dp))) {
3072 /*
3073 * Found in the Buffer Cache
3074 */
3075 if (ISSET(bp->b_lflags, BL_BUSY)) {
3076 /*
3077 * but is busy
3078 */
3079 switch (operation) {
3080 case BLK_READ:
3081 case BLK_WRITE:
3082 case BLK_META:
3083 SET(bp->b_lflags, BL_WANTED);
3084 bufstats.bufs_busyincore++;
3085
3086 /*
3087 * don't retake the mutex after being awakened...
3088 * the time out is in msecs
3089 */
3090 ts.tv_sec = (slptimeo / 1000);
3091 ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000;
3092
3093 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE,
3094 (uintptr_t)blkno, size, operation, 0, 0);
3095
3096 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
3097
3098 /*
3099 * Callers who call with PCATCH or timeout are
3100 * willing to deal with the NULL pointer
3101 */
3102 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo))) {
3103 return NULL;
3104 }
3105 goto start;
3106 /*NOTREACHED*/
3107
3108 default:
3109 /*
3110 * unknown operation requested
3111 */
3112 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
3113 /*NOTREACHED*/
3114 break;
3115 }
3116 } else {
3117 int clear_bdone;
3118
3119 /*
3120 * buffer in core and not busy
3121 */
3122 SET(bp->b_lflags, BL_BUSY);
3123 SET(bp->b_flags, B_CACHE);
3124 buf_busycount++;
3125
3126 bremfree_locked(bp);
3127 bufstats.bufs_incore++;
3128
3129 lck_mtx_unlock(buf_mtxp);
3130 #ifdef JOE_DEBUG
3131 bp->b_owner = current_thread();
3132 bp->b_tag = 1;
3133 #endif
3134 if ((bp->b_upl)) {
3135 panic("buffer has UPL, but not marked BUSY: %p", bp);
3136 }
3137
3138 clear_bdone = FALSE;
3139 if (!ret_only_valid) {
3140 /*
3141 * If the number of bytes that are valid is going
3142 * to increase (even if we end up not doing a
3143 * reallocation through allocbuf) we have to read
3144 * the new size first.
3145 *
3146 * This is required in cases where we are doing a
3147 * read-modify-write of data already valid on disk, but
3148 * in cases where the data on disk beyond (blkno + b_bcount)
3149 * is invalid, we may end up doing extra I/O.
3150 */
3151 if (operation == BLK_META && bp->b_bcount < size) {
3152 /*
3153 * Since we are going to read in the whole size first
3154 * we first have to ensure that any pending delayed write
3155 * is flushed to disk first.
3156 */
3157 if (ISSET(bp->b_flags, B_DELWRI)) {
3158 CLR(bp->b_flags, B_CACHE);
3159 buf_bwrite(bp);
3160 goto start;
3161 }
3162 /*
3163 * clear B_DONE before returning from
3164 * this function so that the caller
3165 * can issue a read for the new size.
3166 */
3167 clear_bdone = TRUE;
3168 }
3169
3170 if (bp->b_bufsize != size) {
3171 allocbuf(bp, size);
3172 }
3173 }
3174
3175 upl_flags = 0;
3176 switch (operation) {
3177 case BLK_WRITE:
3178 /*
3179 * "write" operation: let the UPL subsystem
3180 * know that we intend to modify the buffer
3181 * cache pages we're gathering.
3182 */
3183 upl_flags |= UPL_WILL_MODIFY;
3184 case BLK_READ:
3185 upl_flags |= UPL_PRECIOUS;
3186 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
3187 kret = ubc_create_upl_kernel(vp,
3188 ubc_blktooff(vp, bp->b_lblkno),
3189 bp->b_bufsize,
3190 &upl,
3191 &pl,
3192 upl_flags,
3193 VM_KERN_MEMORY_FILE);
3194 if (kret != KERN_SUCCESS) {
3195 panic("Failed to create UPL");
3196 }
3197
3198 bp->b_upl = upl;
3199
3200 if (upl_valid_page(pl, 0)) {
3201 if (upl_dirty_page(pl, 0)) {
3202 SET(bp->b_flags, B_WASDIRTY);
3203 } else {
3204 CLR(bp->b_flags, B_WASDIRTY);
3205 }
3206 } else {
3207 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
3208 }
3209
3210 kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap));
3211
3212 if (kret != KERN_SUCCESS) {
3213 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3214 }
3215 }
3216 break;
3217
3218 case BLK_META:
3219 /*
3220 * VM is not involved in I/O for the meta data;
3221 * the buffer already has valid data
3222 */
3223 break;
3224
3225 default:
3226 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
3227 /*NOTREACHED*/
3228 break;
3229 }
3230
3231 if (clear_bdone) {
3232 CLR(bp->b_flags, B_DONE);
3233 }
3234 }
3235 } else { /* not incore() */
3236 int queue = BQ_EMPTY; /* Start with no preference */
3237
3238 if (ret_only_valid) {
3239 lck_mtx_unlock(buf_mtxp);
3240 return NULL;
3241 }
3242 if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/) {
3243 operation = BLK_META;
3244 }
3245
3246 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL) {
3247 goto start;
3248 }
3249
3250 /*
3251 * getnewbuf may block for a number of different reasons...
3252 * if it does, it's then possible for someone else to
3253 * create a buffer for the same block and insert it into
3254 * the hash... if we see it incore at this point we dump
3255 * the buffer we were working on and start over
3256 */
3257 if (incore_locked(vp, blkno, dp)) {
3258 SET(bp->b_flags, B_INVAL);
3259 binshash(bp, &invalhash);
3260
3261 lck_mtx_unlock(buf_mtxp);
3262
3263 buf_brelse(bp);
3264 goto start;
3265 }
3266 /*
3267 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
3268 * CALLED! BE CAREFUL.
3269 */
3270
3271 /*
3272 * mark the buffer as B_META if indicated
3273 * so that when buffer is released it will goto META queue
3274 */
3275 if (operation == BLK_META) {
3276 SET(bp->b_flags, B_META);
3277 }
3278
3279 bp->b_blkno = bp->b_lblkno = blkno;
3280 bp->b_vp = vp;
3281
3282 /*
3283 * Insert in the hash so that incore() can find it
3284 */
3285 binshash(bp, BUFHASH(vp, blkno));
3286
3287 bgetvp_locked(vp, bp);
3288
3289 lck_mtx_unlock(buf_mtxp);
3290
3291 allocbuf(bp, size);
3292
3293 upl_flags = 0;
3294 switch (operation) {
3295 case BLK_META:
3296 /*
3297 * buffer data is invalid...
3298 *
3299 * I don't want to have to retake buf_mtxp,
3300 * so the miss and vmhits counters are done
3301 * with Atomic updates... all other counters
3302 * in bufstats are protected with either
3303 * buf_mtxp or iobuffer_mtxp
3304 */
3305 OSAddAtomicLong(1, &bufstats.bufs_miss);
3306 break;
3307
3308 case BLK_WRITE:
3309 /*
3310 * "write" operation: let the UPL subsystem know
3311 * that we intend to modify the buffer cache pages
3312 * we're gathering.
3313 */
3314 upl_flags |= UPL_WILL_MODIFY;
3315 case BLK_READ:
3316 { off_t f_offset;
3317 size_t contig_bytes;
3318 int bmap_flags;
3319
3320 #if DEVELOPMENT || DEBUG
3321 /*
3322 * Apple-implemented file systems use UBC exclusively; they should
3323 * not call in here.
3324 */
3325 const char* excldfs[] = {"hfs", "afpfs", "smbfs", "acfs",
3326 "exfat", "msdos", "webdav", NULL};
3327
3328 for (int i = 0; excldfs[i] != NULL; i++) {
3329 if (vp->v_mount &&
3330 !strcmp(vp->v_mount->mnt_vfsstat.f_fstypename,
3331 excldfs[i])) {
3332 panic("%s %s calls buf_getblk",
3333 excldfs[i],
3334 operation == BLK_READ ? "BLK_READ" : "BLK_WRITE");
3335 }
3336 }
3337 #endif
3338
3339 if ((bp->b_upl)) {
3340 panic("bp already has UPL: %p", bp);
3341 }
3342
3343 f_offset = ubc_blktooff(vp, blkno);
3344
3345 upl_flags |= UPL_PRECIOUS;
3346 kret = ubc_create_upl_kernel(vp,
3347 f_offset,
3348 bp->b_bufsize,
3349 &upl,
3350 &pl,
3351 upl_flags,
3352 VM_KERN_MEMORY_FILE);
3353
3354 if (kret != KERN_SUCCESS) {
3355 panic("Failed to create UPL");
3356 }
3357 #if UPL_DEBUG
3358 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4);
3359 #endif /* UPL_DEBUG */
3360 bp->b_upl = upl;
3361
3362 if (upl_valid_page(pl, 0)) {
3363 if (operation == BLK_READ) {
3364 bmap_flags = VNODE_READ;
3365 } else {
3366 bmap_flags = VNODE_WRITE;
3367 }
3368
3369 SET(bp->b_flags, B_CACHE | B_DONE);
3370
3371 OSAddAtomicLong(1, &bufstats.bufs_vmhits);
3372
3373 bp->b_validoff = 0;
3374 bp->b_dirtyoff = 0;
3375
3376 if (upl_dirty_page(pl, 0)) {
3377 /* page is dirty */
3378 SET(bp->b_flags, B_WASDIRTY);
3379
3380 bp->b_validend = bp->b_bcount;
3381 bp->b_dirtyend = bp->b_bcount;
3382 } else {
3383 /* page is clean */
3384 bp->b_validend = bp->b_bcount;
3385 bp->b_dirtyend = 0;
3386 }
3387 /*
3388 * try to recreate the physical block number associated with
3389 * this buffer...
3390 */
3391 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL)) {
3392 panic("getblk: VNOP_BLOCKMAP failed");
3393 }
3394 /*
3395 * if the extent represented by this buffer
3396 * is not completely physically contiguous on
3397 * disk, then we can't cache the physical mapping
3398 * in the buffer header
3399 */
3400 if ((long)contig_bytes < bp->b_bcount) {
3401 bp->b_blkno = bp->b_lblkno;
3402 }
3403 } else {
3404 OSAddAtomicLong(1, &bufstats.bufs_miss);
3405 }
3406 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
3407
3408 if (kret != KERN_SUCCESS) {
3409 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3410 }
3411 break;} // end BLK_READ
3412 default:
3413 panic("getblk: paging or unknown operation - %x", operation);
3414 /*NOTREACHED*/
3415 break;
3416 } // end switch
3417 } //end buf_t !incore
3418
3419 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
3420 bp, bp->b_datap, bp->b_flags, 3, 0);
3421
3422 #ifdef JOE_DEBUG
3423 (void) OSBacktrace(&bp->b_stackgetblk[0], 6);
3424 #endif
3425 return bp;
3426 }
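/*
 * A small sketch of the BLK_ONLYVALID convention, assuming the same
 * hypothetical 'my_vp'/'lbn'/'MYFS_BSIZE' names as elsewhere: the flag
 * turns buf_getblk() into a cache probe that never allocates a new
 * buffer, returning NULL when the block is not already incore:
 *
 *	bp = buf_getblk(my_vp, lbn, MYFS_BSIZE, 0, 0, BLK_META | BLK_ONLYVALID);
 *	if (bp != NULL) {
 *		... cached copy found; the caller now owns it (BL_BUSY) ...
 *		buf_brelse(bp);
 *	}
 */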
3427
3428 /*
3429 * Get an empty, disassociated buffer of given size.
3430 */
3431 buf_t
3432 buf_geteblk(int size)
3433 {
3434 buf_t bp = NULL;
3435 int queue = BQ_EMPTY;
3436
3437 do {
3438 lck_mtx_lock_spin(buf_mtxp);
3439
3440 bp = getnewbuf(0, 0, &queue);
3441 } while (bp == NULL);
3442
3443 SET(bp->b_flags, (B_META | B_INVAL));
3444
3445 #if DIAGNOSTIC
3446 assert(queue == BQ_EMPTY);
3447 #endif /* DIAGNOSTIC */
3448 /* XXX need to implement logic to deal with other queues */
3449
3450 binshash(bp, &invalhash);
3451 bufstats.bufs_eblk++;
3452
3453 lck_mtx_unlock(buf_mtxp);
3454
3455 allocbuf(bp, size);
3456
3457 return bp;
3458 }
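/*
 * A minimal sketch, assuming a caller that needs a scratch buffer not
 * associated with any vnode or disk block:
 *
 *	buf_t bp = buf_geteblk(8192);
 *	... use (void *)buf_dataptr(bp) as 8192 bytes of wired scratch ...
 *	buf_brelse(bp);
 *
 * Because the buffer is marked B_INVAL, buf_brelse() sends it straight
 * back to the EMPTY queue rather than caching it.
 */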
3459
3460 uint32_t
3461 buf_redundancy_flags(buf_t bp)
3462 {
3463 return bp->b_redundancy_flags;
3464 }
3465
3466 void
3467 buf_set_redundancy_flags(buf_t bp, uint32_t flags)
3468 {
3469 SET(bp->b_redundancy_flags, flags);
3470 }
3471
3472 void
3473 buf_clear_redundancy_flags(buf_t bp, uint32_t flags)
3474 {
3475 CLR(bp->b_redundancy_flags, flags);
3476 }
3477
3478
3479
3480 static void *
3481 recycle_buf_from_pool(int nsize)
3482 {
3483 buf_t bp;
3484 void *ptr = NULL;
3485
3486 lck_mtx_lock_spin(buf_mtxp);
3487
3488 TAILQ_FOREACH(bp, &bufqueues[BQ_META], b_freelist) {
3489 if (ISSET(bp->b_flags, B_DELWRI) || bp->b_bufsize != nsize) {
3490 continue;
3491 }
3492 ptr = (void *)bp->b_datap;
3493 bp->b_bufsize = 0;
3494
3495 bcleanbuf(bp, TRUE);
3496 break;
3497 }
3498 lck_mtx_unlock(buf_mtxp);
3499
3500 return ptr;
3501 }
3502
3503
3504
3505 int zalloc_nopagewait_failed = 0;
3506 int recycle_buf_failed = 0;
3507
3508 static void *
3509 grab_memory_for_meta_buf(int nsize)
3510 {
3511 zone_t z;
3512 void *ptr;
3513 boolean_t was_vmpriv;
3514
3515 z = getbufzone(nsize);
3516
3517 /*
3518 * make sure we're NOT privileged so that
3519 * if a vm_page_grab is needed, it won't
3520 * block if we're out of free pages... if
3521 * it blocks, then we can't honor the
3522 * nopagewait request
3523 */
3524 was_vmpriv = set_vm_privilege(FALSE);
3525
3526 ptr = zalloc_nopagewait(z);
3527
3528 if (was_vmpriv == TRUE) {
3529 set_vm_privilege(TRUE);
3530 }
3531
3532 if (ptr == NULL) {
3533 zalloc_nopagewait_failed++;
3534
3535 ptr = recycle_buf_from_pool(nsize);
3536
3537 if (ptr == NULL) {
3538 recycle_buf_failed++;
3539
3540 if (was_vmpriv == FALSE) {
3541 set_vm_privilege(TRUE);
3542 }
3543
3544 ptr = zalloc(z);
3545
3546 if (was_vmpriv == FALSE) {
3547 set_vm_privilege(FALSE);
3548 }
3549 }
3550 }
3551 return ptr;
3552 }
3553
3554 /*
3555 * With UBC, there is no need to expand / shrink the file data
3556 * buffer. The VM uses the same pages, hence no waste.
3557 * All the file data buffers can have one size.
3558 * In fact expand / shrink would be an expensive operation.
3559 *
3560 * Only exception to this is meta-data buffers. Most of the
3561 * meta data operations are smaller than PAGE_SIZE. Having the
3562 * meta-data buffers grow and shrink as needed, optimizes use
3563 * of the kernel wired memory.
3564 */
3565
3566 int
3567 allocbuf(buf_t bp, int size)
3568 {
3569 vm_size_t desired_size;
3570
3571 desired_size = roundup(size, CLBYTES);
3572
3573 if (desired_size < PAGE_SIZE) {
3574 desired_size = PAGE_SIZE;
3575 }
3576 if (desired_size > MAXBSIZE) {
3577 panic("allocbuf: buffer larger than MAXBSIZE requested");
3578 }
3579
3580 if (ISSET(bp->b_flags, B_META)) {
3581 int nsize = roundup(size, MINMETA);
3582
3583 if (bp->b_datap) {
3584 vm_offset_t elem = (vm_offset_t)bp->b_datap;
3585
3586 if (ISSET(bp->b_flags, B_ZALLOC)) {
3587 if (bp->b_bufsize < nsize) {
3588 zone_t zprev;
3589
3590 /* reallocate to a bigger size */
3591
3592 zprev = getbufzone(bp->b_bufsize);
3593 if (nsize <= MAXMETA) {
3594 desired_size = nsize;
3595
3596 /* b_datap not really a ptr */
3597 *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
3598 } else {
3599 bp->b_datap = (uintptr_t)NULL;
3600 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3601 CLR(bp->b_flags, B_ZALLOC);
3602 }
3603 bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3604 zfree(zprev, elem);
3605 } else {
3606 desired_size = bp->b_bufsize;
3607 }
3608 } else {
3609 if ((vm_size_t)bp->b_bufsize < desired_size) {
3610 /* reallocate to a bigger size */
3611 bp->b_datap = (uintptr_t)NULL;
3612 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3613 bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3614 kmem_free(kernel_map, elem, bp->b_bufsize);
3615 } else {
3616 desired_size = bp->b_bufsize;
3617 }
3618 }
3619 } else {
3620 /* new allocation */
3621 if (nsize <= MAXMETA) {
3622 desired_size = nsize;
3623
3624 /* b_datap not really a ptr */
3625 *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
3626 SET(bp->b_flags, B_ZALLOC);
3627 } else {
3628 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3629 }
3630 }
3631
3632 if (bp->b_datap == 0) {
3633 panic("allocbuf: NULL b_datap");
3634 }
3635 }
3636 bp->b_bufsize = desired_size;
3637 bp->b_bcount = size;
3638
3639 return 0;
3640 }
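/*
 * Worked example of the meta-data sizing above (hypothetical values):
 * allocbuf(bp, 2560) on a B_META buffer with no storage yet keeps
 * nsize = roundup(2560, MINMETA) = 2560, which is <= MAXMETA, so
 * desired_size stays 2560 and grab_memory_for_meta_buf(2560) draws a
 * 4096-byte element from the "buf.4096" zone (the first zone whose
 * element size is >= 2560); b_bufsize and b_bcount are then set to 2560.
 */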
3641
3642 /*
3643 * Get a new buffer from one of the free lists.
3644 *
3645 * A request for a queue is passed in. The queue from which the buffer
3646 * was taken is returned. Out-of-range queue requests get BQ_EMPTY. A request
3647 * for BQUEUE means no preference; use heuristics in that case.
3648 * The heuristics are as follows:
3649 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
3650 * If none available block till one is made available.
3651 * If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
3652 * Pick the most stale buffer.
3653 * If found buffer was marked delayed write, start the async. write
3654 * and restart the search.
3655 * Initialize the fields and disassociate the buffer from the vnode.
3656 * Remove the buffer from the hash. Return the buffer and the queue
3657 * on which it was found.
3658 *
3659 * buf_mtxp is held upon entry
3660 * returns with buf_mtxp locked if new buf available
3661 * returns with buf_mtxp UNlocked if new buf NOT available
3662 */
3663
3664 static buf_t
3665 getnewbuf(int slpflag, int slptimeo, int * queue)
3666 {
3667 buf_t bp;
3668 buf_t lru_bp;
3669 buf_t age_bp;
3670 buf_t meta_bp;
3671 int age_time, lru_time, bp_time, meta_time;
3672 int req = *queue; /* save it for restarts */
3673 struct timespec ts;
3674
3675 start:
3676 /*
3677 * invalid request gets empty queue
3678 */
3679 if ((*queue >= BQUEUES) || (*queue < 0)
3680 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED)) {
3681 *queue = BQ_EMPTY;
3682 }
3683
3684
3685 if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first)) {
3686 goto found;
3687 }
3688
3689 /*
3690 * need to grow number of bufs, add another one rather than recycling
3691 */
3692 if (nbuf_headers < max_nbuf_headers) {
3693 /*
3694 * Increment count now as lock
3695 * is dropped for allocation.
3696 * That avoids over commits
3697 */
3698 nbuf_headers++;
3699 goto add_newbufs;
3700 }
3701 /* Try for the requested queue first */
3702 bp = bufqueues[*queue].tqh_first;
3703 if (bp) {
3704 goto found;
3705 }
3706
3707 /* Unable to use requested queue */
3708 age_bp = bufqueues[BQ_AGE].tqh_first;
3709 lru_bp = bufqueues[BQ_LRU].tqh_first;
3710 meta_bp = bufqueues[BQ_META].tqh_first;
3711
3712 if (!age_bp && !lru_bp && !meta_bp) {
3713 /*
3714 * Unavailable on AGE or LRU or META queues
3715 * Try the empty list first
3716 */
3717 bp = bufqueues[BQ_EMPTY].tqh_first;
3718 if (bp) {
3719 *queue = BQ_EMPTY;
3720 goto found;
3721 }
3722 /*
3723 * We have seen that this is hard to trigger.
3724 * This is an overcommit of nbufs but needed
3725 * in some scenarios with diskimages
3726 */
3727
3728 add_newbufs:
3729 lck_mtx_unlock(buf_mtxp);
3730
3731 /* Create a new temporary buffer header */
3732 bp = (struct buf *)zalloc(buf_hdr_zone);
3733
3734 if (bp) {
3735 bufhdrinit(bp);
3736 bp->b_whichq = BQ_EMPTY;
3737 bp->b_timestamp = buf_timestamp();
3738 BLISTNONE(bp);
3739 SET(bp->b_flags, B_HDRALLOC);
3740 *queue = BQ_EMPTY;
3741 }
3742 lck_mtx_lock_spin(buf_mtxp);
3743
3744 if (bp) {
3745 binshash(bp, &invalhash);
3746 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3747 buf_hdr_count++;
3748 goto found;
3749 }
3750 /* subtract already accounted bufcount */
3751 nbuf_headers--;
3752
3753 bufstats.bufs_sleeps++;
3754
3755 /* wait for a free buffer of any kind */
3756 needbuffer = 1;
3757 /* hz value is 100 */
3758 ts.tv_sec = (slptimeo / 1000);
3759 /* the hz value is 100, which leads to 10ms */
3760 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
3761
3762 msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "getnewbuf", &ts);
3763 return NULL;
3764 }
3765
3766 /* Buffer available either on AGE or LRU or META */
3767 bp = NULL;
3768 *queue = -1;
3769
3770 /* Buffer available either on AGE or LRU */
3771 if (!age_bp) {
3772 bp = lru_bp;
3773 *queue = BQ_LRU;
3774 } else if (!lru_bp) {
3775 bp = age_bp;
3776 *queue = BQ_AGE;
3777 } else { /* buffer available on both AGE and LRU */
3778 int t = buf_timestamp();
3779
3780 age_time = t - age_bp->b_timestamp;
3781 lru_time = t - lru_bp->b_timestamp;
3782 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
3783 bp = age_bp;
3784 *queue = BQ_AGE;
3785 /*
3786 * we should probably re-timestamp everything in the
3787 * queues at this point with the current time
3788 */
3789 } else {
3790 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
3791 bp = lru_bp;
3792 *queue = BQ_LRU;
3793 } else {
3794 bp = age_bp;
3795 *queue = BQ_AGE;
3796 }
3797 }
3798 }
3799
3800 if (!bp) { /* Neither on AGE nor on LRU */
3801 bp = meta_bp;
3802 *queue = BQ_META;
3803 } else if (meta_bp) {
3804 int t = buf_timestamp();
3805
3806 bp_time = t - bp->b_timestamp;
3807 meta_time = t - meta_bp->b_timestamp;
3808
3809 if (!(bp_time < 0) && !(meta_time < 0)) {
3810 /* time not set backwards */
3811 int bp_is_stale;
3812 bp_is_stale = (*queue == BQ_LRU) ?
3813 lru_is_stale : age_is_stale;
3814
3815 if ((meta_time >= meta_is_stale) &&
3816 (bp_time < bp_is_stale)) {
3817 bp = meta_bp;
3818 *queue = BQ_META;
3819 }
3820 }
3821 }
3822 found:
3823 if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY)) {
3824 panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
3825 }
3826
3827 /* Clean it */
3828 if (bcleanbuf(bp, FALSE)) {
3829 /*
3830 * moved to the laundry thread, buffer not ready
3831 */
3832 *queue = req;
3833 goto start;
3834 }
3835 return bp;
3836 }
3837
3838
3839 /*
3840 * Clean a buffer.
3841 * Returns 0 if buffer is ready to use,
3842 * Returns 1 if issued a buf_bawrite() to indicate
3843 * that the buffer is not ready.
3844 *
3845 * buf_mtxp is held upon entry
3846 * returns with buf_mtxp locked
3847 */
3848 int
3849 bcleanbuf(buf_t bp, boolean_t discard)
3850 {
3851 /* Remove from the queue */
3852 bremfree_locked(bp);
3853
3854 #ifdef JOE_DEBUG
3855 bp->b_owner = current_thread();
3856 bp->b_tag = 2;
3857 #endif
3858 /*
3859 * If buffer was a delayed write, start the IO by queuing
3860 * it on the LAUNDRY queue, and return 1
3861 */
3862 if (ISSET(bp->b_flags, B_DELWRI)) {
3863 if (discard) {
3864 SET(bp->b_lflags, BL_WANTDEALLOC);
3865 }
3866
3867 bmovelaundry(bp);
3868
3869 lck_mtx_unlock(buf_mtxp);
3870
3871 wakeup(&bufqueues[BQ_LAUNDRY]);
3872 /*
3873 * and give it a chance to run
3874 */
3875 (void)thread_block(THREAD_CONTINUE_NULL);
3876
3877 lck_mtx_lock_spin(buf_mtxp);
3878
3879 return 1;
3880 }
3881 #ifdef JOE_DEBUG
3882 bp->b_owner = current_thread();
3883 bp->b_tag = 8;
3884 #endif
3885 /*
3886 * Buffer is no longer on any free list... we own it
3887 */
3888 SET(bp->b_lflags, BL_BUSY);
3889 buf_busycount++;
3890
3891 bremhash(bp);
3892
3893 /*
3894 * disassociate us from our vnode, if we had one...
3895 */
3896 if (bp->b_vp) {
3897 brelvp_locked(bp);
3898 }
3899
3900 lck_mtx_unlock(buf_mtxp);
3901
3902 BLISTNONE(bp);
3903
3904 if (ISSET(bp->b_flags, B_META)) {
3905 buf_free_meta_store(bp);
3906 }
3907
3908 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
3909
3910 buf_release_credentials(bp);
3911
3912 /* If discarding, just move to the empty queue */
3913 if (discard) {
3914 lck_mtx_lock_spin(buf_mtxp);
3915 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
3916 bp->b_whichq = BQ_EMPTY;
3917 binshash(bp, &invalhash);
3918 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3919 CLR(bp->b_lflags, BL_BUSY);
3920 buf_busycount--;
3921 } else {
3922 /* Not discarding: clean up and prepare for reuse */
3923 bp->b_bufsize = 0;
3924 bp->b_datap = (uintptr_t)NULL;
3925 bp->b_upl = (void *)NULL;
3926 bp->b_fsprivate = (void *)NULL;
3927 /*
3928 * preserve the state of whether this buffer
3929 * was allocated on the fly or not...
3930 * the only other flag that should be set at
3931 * this point is BL_BUSY...
3932 */
3933 #ifdef JOE_DEBUG
3934 bp->b_owner = current_thread();
3935 bp->b_tag = 3;
3936 #endif
3937 bp->b_lflags = BL_BUSY;
3938 bp->b_flags = (bp->b_flags & B_HDRALLOC);
3939 bp->b_redundancy_flags = 0;
3940 bp->b_dev = NODEV;
3941 bp->b_blkno = bp->b_lblkno = 0;
3942 bp->b_iodone = NULL;
3943 bp->b_error = 0;
3944 bp->b_resid = 0;
3945 bp->b_bcount = 0;
3946 bp->b_dirtyoff = bp->b_dirtyend = 0;
3947 bp->b_validoff = bp->b_validend = 0;
3948 bzero(&bp->b_attr, sizeof(struct bufattr));
3949
3950 lck_mtx_lock_spin(buf_mtxp);
3951 }
3952 return 0;
3953 }
3954
3955
3956
3957 errno_t
3958 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
3959 {
3960 buf_t bp;
3961 errno_t error;
3962 struct bufhashhdr *dp;
3963
3964 dp = BUFHASH(vp, lblkno);
3965
3966 relook:
3967 lck_mtx_lock_spin(buf_mtxp);
3968
3969 if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
3970 lck_mtx_unlock(buf_mtxp);
3971 return 0;
3972 }
3973 if (ISSET(bp->b_lflags, BL_BUSY)) {
3974 if (!ISSET(flags, BUF_WAIT)) {
3975 lck_mtx_unlock(buf_mtxp);
3976 return EBUSY;
3977 }
3978 SET(bp->b_lflags, BL_WANTED);
3979
3980 error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
3981
3982 if (error) {
3983 return error;
3984 }
3985 goto relook;
3986 }
3987 bremfree_locked(bp);
3988 SET(bp->b_lflags, BL_BUSY);
3989 SET(bp->b_flags, B_INVAL);
3990 buf_busycount++;
3991 #ifdef JOE_DEBUG
3992 bp->b_owner = current_thread();
3993 bp->b_tag = 4;
3994 #endif
3995 lck_mtx_unlock(buf_mtxp);
3996 buf_brelse(bp);
3997
3998 return 0;
3999 }
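/*
 * A short sketch, assuming a caller (e.g. a file system truncating a
 * file) that must make sure a stale cached block is discarded and is
 * willing to wait for a busy buffer instead of taking EBUSY back:
 *
 *	error = buf_invalblkno(my_vp, lbn, BUF_WAIT);
 */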
4000
4001
4002 void
4003 buf_drop(buf_t bp)
4004 {
4005 int need_wakeup = 0;
4006
4007 lck_mtx_lock_spin(buf_mtxp);
4008
4009 if (ISSET(bp->b_lflags, BL_WANTED)) {
4010 /*
4011 * delay the actual wakeup until after we
4012 * clear BL_BUSY and we've dropped buf_mtxp
4013 */
4014 need_wakeup = 1;
4015 }
4016 #ifdef JOE_DEBUG
4017 bp->b_owner = current_thread();
4018 bp->b_tag = 9;
4019 #endif
4020 /*
4021 * Unlock the buffer.
4022 */
4023 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
4024 buf_busycount--;
4025
4026 lck_mtx_unlock(buf_mtxp);
4027
4028 if (need_wakeup) {
4029 /*
4030 * Wake up any proceeses waiting for _this_ buffer to become free.
4031 */
4032 wakeup(bp);
4033 }
4034 }
4035
4036
4037 errno_t
4038 buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo)
4039 {
4040 errno_t error;
4041
4042 lck_mtx_lock_spin(buf_mtxp);
4043
4044 error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
4045
4046 lck_mtx_unlock(buf_mtxp);
4047
4048 return error;
4049 }
4050
4051
4052 static errno_t
4053 buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
4054 {
4055 errno_t error;
4056 struct timespec ts;
4057
4058 if (ISSET(bp->b_flags, B_LOCKED)) {
4059 if ((flags & BAC_SKIP_LOCKED)) {
4060 return EDEADLK;
4061 }
4062 } else {
4063 if ((flags & BAC_SKIP_NONLOCKED)) {
4064 return EDEADLK;
4065 }
4066 }
4067 if (ISSET(bp->b_lflags, BL_BUSY)) {
4068 /*
4069 * since the lck_mtx_lock may block, the buffer
4070 * may become BUSY, so we need to
4071 * recheck for a NOWAIT request
4072 */
4073 if (flags & BAC_NOWAIT) {
4074 return EBUSY;
4075 }
4076 SET(bp->b_lflags, BL_WANTED);
4077
4078 /* the hz value is 100, which leads to 10ms */
4079 ts.tv_sec = (slptimeo / 100);
4080 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
4081 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
4082
4083 if (error) {
4084 return error;
4085 }
4086 return EAGAIN;
4087 }
4088 if (flags & BAC_REMOVE) {
4089 bremfree_locked(bp);
4090 }
4091 SET(bp->b_lflags, BL_BUSY);
4092 buf_busycount++;
4093
4094 #ifdef JOE_DEBUG
4095 bp->b_owner = current_thread();
4096 bp->b_tag = 5;
4097 #endif
4098 return 0;
4099 }
4100
4101
4102 /*
4103 * Wait for operations on the buffer to complete.
4104 * When they do, extract and return the I/O's error value.
4105 */
4106 errno_t
4107 buf_biowait(buf_t bp)
4108 {
4109 while (!ISSET(bp->b_flags, B_DONE)) {
4110 lck_mtx_lock_spin(buf_mtxp);
4111
4112 if (!ISSET(bp->b_flags, B_DONE)) {
4113 DTRACE_IO1(wait__start, buf_t, bp);
4114 (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_biowait", NULL);
4115 DTRACE_IO1(wait__done, buf_t, bp);
4116 } else {
4117 lck_mtx_unlock(buf_mtxp);
4118 }
4119 }
4120 /* check for interruption of I/O (e.g. via NFS), then errors. */
4121 if (ISSET(bp->b_flags, B_EINTR)) {
4122 CLR(bp->b_flags, B_EINTR);
4123 return EINTR;
4124 } else if (ISSET(bp->b_flags, B_ERROR)) {
4125 return bp->b_error ? bp->b_error : EIO;
4126 } else {
4127 return 0;
4128 }
4129 }
4130
4131
4132 /*
4133 * Mark I/O complete on a buffer.
4134 *
4135 * If a callback has been requested, e.g. the pageout
4136 * daemon, do so. Otherwise, awaken waiting processes.
4137 *
4138 * [ Leffler, et al., says on p.247:
4139 * "This routine wakes up the blocked process, frees the buffer
4140 * for an asynchronous write, or, for a request by the pagedaemon
4141 * process, invokes a procedure specified in the buffer structure" ]
4142 *
4143 * In real life, the pagedaemon (or other system processes) wants
4144 * to do async stuff too, and doesn't want the buffer buf_brelse()'d.
4145 * (for swap pager, that puts swap buffers on the free lists (!!!),
4146 * for the vn device, that puts malloc'd buffers on the free lists!)
4147 */
4148
4149 void
4150 buf_biodone(buf_t bp)
4151 {
4152 mount_t mp;
4153 struct bufattr *bap;
4154 struct timeval real_elapsed;
4155 uint64_t real_elapsed_usec = 0;
4156
4157 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
4158 bp, bp->b_datap, bp->b_flags, 0, 0);
4159
4160 if (ISSET(bp->b_flags, B_DONE)) {
4161 panic("biodone already");
4162 }
4163
4164 bap = &bp->b_attr;
4165
4166 if (bp->b_vp && bp->b_vp->v_mount) {
4167 mp = bp->b_vp->v_mount;
4168 } else {
4169 mp = NULL;
4170 }
4171
4172 if (ISSET(bp->b_flags, B_ERROR)) {
4173 if (mp && (MNT_ROOTFS & mp->mnt_flag)) {
4174 dk_error_description_t desc;
4175 bzero(&desc, sizeof(desc));
4176 desc.description = panic_disk_error_description;
4177 desc.description_size = panic_disk_error_description_size;
4178 VNOP_IOCTL(mp->mnt_devvp, DKIOCGETERRORDESCRIPTION, (caddr_t)&desc, 0, vfs_context_kernel());
4179 }
4180 }
4181
4182 if (mp && (bp->b_flags & B_READ) == 0) {
4183 update_last_io_time(mp);
4184 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
4185 } else if (mp) {
4186 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
4187 }
4188
4189 throttle_info_end_io(bp);
4190
4191 if (kdebug_enable) {
4192 int code = DKIO_DONE;
4193 int io_tier = GET_BUFATTR_IO_TIER(bap);
4194
4195 if (bp->b_flags & B_READ) {
4196 code |= DKIO_READ;
4197 }
4198 if (bp->b_flags & B_ASYNC) {
4199 code |= DKIO_ASYNC;
4200 }
4201
4202 if (bp->b_flags & B_META) {
4203 code |= DKIO_META;
4204 } else if (bp->b_flags & B_PAGEIO) {
4205 code |= DKIO_PAGING;
4206 }
4207
4208 if (io_tier != 0) {
4209 code |= DKIO_THROTTLE;
4210 }
4211
4212 code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
4213
4214 if (bp->b_flags & B_PASSIVE) {
4215 code |= DKIO_PASSIVE;
4216 }
4217
4218 if (bap->ba_flags & BA_NOCACHE) {
4219 code |= DKIO_NOCACHE;
4220 }
4221
4222 if (bap->ba_flags & BA_IO_TIER_UPGRADE) {
4223 code |= DKIO_TIER_UPGRADE;
4224 }
4225
4226 KDBG_RELEASE_NOPROCFILT(FSDBG_CODE(DBG_DKRW, code),
4227 buf_kernel_addrperm_addr(bp),
4228 (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid,
4229 bp->b_error);
4230 }
4231
4232 microuptime(&real_elapsed);
4233 timevalsub(&real_elapsed, &bp->b_timestamp_tv);
4234 real_elapsed_usec = real_elapsed.tv_sec * USEC_PER_SEC + real_elapsed.tv_usec;
4235 disk_conditioner_delay(bp, 1, bp->b_bcount, real_elapsed_usec);
4236
4237 /*
4238 * I/O was done, so don't believe
4239 * the DIRTY state from VM anymore...
4240 * and we need to reset the THROTTLED/PASSIVE
4241 * indicators
4242 */
4243 CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
4244 CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP | BA_IO_TIER_UPGRADE));
4245
4246 SET_BUFATTR_IO_TIER(bap, 0);
4247
4248 DTRACE_IO1(done, buf_t, bp);
4249
4250 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW)) {
4251 /*
4252 * wake up any writer's blocked
4253 * on throttle or waiting for I/O
4254 * to drain
4255 */
4256 vnode_writedone(bp->b_vp);
4257 }
4258
4259 if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */
4260 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
4261 void *arg = bp->b_transaction;
4262 int callout = ISSET(bp->b_flags, B_CALL);
4263
4264 if (iodone_func == NULL) {
4265 panic("biodone: bp @ %p has NULL b_iodone!\n", bp);
4266 }
4267
4268 CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */
4269 bp->b_iodone = NULL;
4270 bp->b_transaction = NULL;
4271
4272 if (callout) {
4273 SET(bp->b_flags, B_DONE); /* note that it's done */
4274 }
4275 (*iodone_func)(bp, arg);
4276
4277 if (callout) {
4278 /*
4279 * assumes that the callback function takes
4280 * ownership of the bp and deals with releasing it if necessary
4281 */
4282 goto biodone_done;
4283 }
4284 /*
4285 * in this case the call back function is acting
4286 * strictly as a filter... it does not take
4287 * ownership of the bp and is expecting us
4288 * to finish cleaning up... this is currently used
4289 * by the HFS journaling code
4290 */
4291 }
4292 if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
4293 SET(bp->b_flags, B_DONE); /* note that it's done */
4294
4295 buf_brelse(bp);
4296 } else { /* or just wakeup the buffer */
4297 /*
4298 * by taking the mutex, we serialize
4299 * the buf owner calling buf_biowait so that we'll
4300 * only see him in one of 2 states...
4301 * state 1: B_DONE wasn't set and he's
4302 * blocked in msleep
4303 * state 2: he's blocked trying to take the
4304 * mutex before looking at B_DONE
4305 * BL_WANTED is cleared in case anyone else
4306 * is blocked waiting for the buffer... note
4307 * that we haven't cleared B_BUSY yet, so if
4308 * they do get to run, they're going to re-set
4309 * BL_WANTED and go back to sleep
4310 */
4311 lck_mtx_lock_spin(buf_mtxp);
4312
4313 CLR(bp->b_lflags, BL_WANTED);
4314 SET(bp->b_flags, B_DONE); /* note that it's done */
4315
4316 lck_mtx_unlock(buf_mtxp);
4317
4318 wakeup(bp);
4319 }
4320 biodone_done:
4321 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
4322 (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0);
4323 }
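/*
 * A hedged sketch of the B_CALL path described above, assuming a caller
 * that builds its own I/O buffer with buf_alloc(); 'my_iodone', 'my_arg'
 * and 'my_vp' are placeholder names:
 *
 *	static void
 *	my_iodone(buf_t bp, void *arg)
 *	{
 *		... check buf_error(bp) and buf_resid(bp) ...
 *		buf_brelse(bp);
 *	}
 *
 *	buf_t bp = buf_alloc(my_vp);
 *	... set up block number, byte count and data pointer ...
 *	buf_setcallback(bp, my_iodone, my_arg);
 *	VNOP_STRATEGY(bp);
 *
 * buf_biodone() then hands ownership of the bp to my_iodone() instead of
 * waking a buf_biowait() sleeper.
 */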
4324
4325 /*
4326 * Obfuscate buf pointers.
4327 */
4328 vm_offset_t
4329 buf_kernel_addrperm_addr(void * addr)
4330 {
4331 if ((vm_offset_t)addr == 0) {
4332 return 0;
4333 } else {
4334 return (vm_offset_t)addr + buf_kernel_addrperm;
4335 }
4336 }
4337
4338 /*
4339 * Return a count of buffers on the "locked" queue.
4340 */
4341 int
4342 count_lock_queue(void)
4343 {
4344 buf_t bp;
4345 int n = 0;
4346
4347 lck_mtx_lock_spin(buf_mtxp);
4348
4349 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
4350 bp = bp->b_freelist.tqe_next) {
4351 n++;
4352 }
4353 lck_mtx_unlock(buf_mtxp);
4354
4355 return n;
4356 }
4357
4358 /*
4359 * Return a count of 'busy' buffers. Used at the time of shutdown.
4360 * note: This is also called from the mach side in debug context in kdp.c
4361 */
4362 int
4363 count_busy_buffers(void)
4364 {
4365 return buf_busycount + bufstats.bufs_iobufinuse;
4366 }
4367
4368 #if DIAGNOSTIC
4369 /*
4370 * Print out statistics on the current allocation of the buffer pool.
4371 * Can be enabled to print out on every ``sync'' by setting "syncprt"
4372 * in vfs_syscalls.c using sysctl.
4373 */
4374 void
4375 vfs_bufstats()
4376 {
4377 int i, j, count;
4378 struct buf *bp;
4379 struct bqueues *dp;
4380 int counts[MAXBSIZE / CLBYTES + 1];
4381 static char *bname[BQUEUES] =
4382 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4383
4384 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
4385 count = 0;
4386 for (j = 0; j <= MAXBSIZE / CLBYTES; j++) {
4387 counts[j] = 0;
4388 }
4389
4390 lck_mtx_lock(buf_mtxp);
4391
4392 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
4393 counts[bp->b_bufsize / CLBYTES]++;
4394 count++;
4395 }
4396 lck_mtx_unlock(buf_mtxp);
4397
4398 printf("%s: total-%d", bname[i], count);
4399 for (j = 0; j <= MAXBSIZE / CLBYTES; j++) {
4400 if (counts[j] != 0) {
4401 printf(", %d-%d", j * CLBYTES, counts[j]);
4402 }
4403 }
4404 printf("\n");
4405 }
4406 }
4407 #endif /* DIAGNOSTIC */
4408
4409 #define NRESERVEDIOBUFS 128
4410
4411 #define MNT_VIRTUALDEV_MAX_IOBUFS 16
4412 #define VIRTUALDEV_MAX_IOBUFS ((40*niobuf_headers)/100)
4413
4414 buf_t
4415 alloc_io_buf(vnode_t vp, int priv)
4416 {
4417 buf_t bp;
4418 mount_t mp = NULL;
4419 int alloc_for_virtualdev = FALSE;
4420
4421 lck_mtx_lock_spin(iobuffer_mtxp);
4422
4423 /*
4424 * We subject iobuf requests for diskimages to additional restrictions.
4425 *
4426 * a) A single diskimage mount cannot use up more than
4427 * MNT_VIRTUALDEV_MAX_IOBUFS. However, vm privileged (pageout) requests
4428 * are not subject to this restriction.
4429 * b) iobuf headers used by all diskimage mounts combined
4430 * cannot exceed VIRTUALDEV_MAX_IOBUFS.
4431 */
4432 if (vp && ((mp = vp->v_mount)) && mp != dead_mountp &&
4433 mp->mnt_kern_flag & MNTK_VIRTUALDEV) {
4434 alloc_for_virtualdev = TRUE;
4435 while ((!priv && mp->mnt_iobufinuse > MNT_VIRTUALDEV_MAX_IOBUFS) ||
4436 bufstats.bufs_iobufinuse_vdev > VIRTUALDEV_MAX_IOBUFS) {
4437 bufstats.bufs_iobufsleeps++;
4438
4439 need_iobuffer = 1;
4440 (void)msleep(&need_iobuffer, iobuffer_mtxp,
4441 PSPIN | (PRIBIO + 1), (const char *)"alloc_io_buf (1)",
4442 NULL);
4443 }
4444 }
4445
4446 while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
4447 (bp = iobufqueue.tqh_first) == NULL) {
4448 bufstats.bufs_iobufsleeps++;
4449
4450 need_iobuffer = 1;
4451 (void)msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO + 1),
4452 (const char *)"alloc_io_buf (2)", NULL);
4453 }
4454 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
4455
4456 bufstats.bufs_iobufinuse++;
4457 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax) {
4458 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
4459 }
4460
4461 if (alloc_for_virtualdev) {
4462 mp->mnt_iobufinuse++;
4463 bufstats.bufs_iobufinuse_vdev++;
4464 }
4465
4466 lck_mtx_unlock(iobuffer_mtxp);
4467
4468 /*
4469 * initialize various fields
4470 * we don't need to hold the mutex since the buffer
4471 * is now private... the vp should have a reference
4472 * on it and is not protected by this mutex in any event
4473 */
4474 bp->b_timestamp = 0;
4475 bp->b_proc = NULL;
4476
4477 bp->b_datap = 0;
4478 bp->b_flags = 0;
4479 bp->b_lflags = BL_BUSY | BL_IOBUF;
4480 if (alloc_for_virtualdev) {
4481 bp->b_lflags |= BL_IOBUF_VDEV;
4482 }
4483 bp->b_redundancy_flags = 0;
4484 bp->b_blkno = bp->b_lblkno = 0;
4485 #ifdef JOE_DEBUG
4486 bp->b_owner = current_thread();
4487 bp->b_tag = 6;
4488 #endif
4489 bp->b_iodone = NULL;
4490 bp->b_error = 0;
4491 bp->b_resid = 0;
4492 bp->b_bcount = 0;
4493 bp->b_bufsize = 0;
4494 bp->b_upl = NULL;
4495 bp->b_fsprivate = (void *)NULL;
4496 bp->b_vp = vp;
4497 bzero(&bp->b_attr, sizeof(struct bufattr));
4498
4499 if (vp && (vp->v_type == VBLK || vp->v_type == VCHR)) {
4500 bp->b_dev = vp->v_rdev;
4501 } else {
4502 bp->b_dev = NODEV;
4503 }
4504
4505 return bp;
4506 }
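/*
 * Illustrative sketch (not part of the original source): the rough life
 * cycle of an iobuf header as the code above implies it -- allocate a
 * header, fill in the transfer parameters, hand it to the device via
 * VNOP_STRATEGY(), and return it with free_io_buf() from the completion
 * callback.  example_issue_read() and example_read_done() are hypothetical
 * names; real callers (cluster I/O, journaling, etc.) differ in detail.
 */
#if 0	/* example only */
static void example_read_done(buf_t bp, void *arg);

static int
example_issue_read(vnode_t devvp, daddr64_t blkno, caddr_t data, int size)
{
	buf_t	bp;

	bp = alloc_io_buf(devvp, 0);		/* may sleep until a header is free */

	bp->b_blkno = bp->b_lblkno = blkno;
	bp->b_datap = (uintptr_t)data;
	bp->b_bcount = size;
	SET(bp->b_flags, B_READ | B_ASYNC);

	buf_setcallback(bp, example_read_done, NULL);	/* completion runs from biodone() */

	return VNOP_STRATEGY(bp);
}

static void
example_read_done(buf_t bp, __unused void *arg)
{
	/* harvest b_error / b_resid here if needed, then return the header */
	free_io_buf(bp);
}
#endif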
4507
4508
4509 void
4510 free_io_buf(buf_t bp)
4511 {
4512 int need_wakeup = 0;
4513 int free_for_virtualdev = FALSE;
4514 mount_t mp = NULL;
4515
4516 /* Was this iobuf for a diskimage? */
4517 if (bp->b_lflags & BL_IOBUF_VDEV) {
4518 free_for_virtualdev = TRUE;
4519 if (bp->b_vp) {
4520 mp = bp->b_vp->v_mount;
4521 }
4522 }
4523
4524 /*
4525 * put buffer back on the head of the iobufqueue
4526 */
4527 bp->b_vp = NULL;
4528 bp->b_flags = B_INVAL;
4529
4530 /* Zero out the bufattr and its flags before relinquishing this iobuf */
4531 bzero(&bp->b_attr, sizeof(struct bufattr));
4532
4533 lck_mtx_lock_spin(iobuffer_mtxp);
4534
4535 binsheadfree(bp, &iobufqueue, -1);
4536
4537 if (need_iobuffer) {
4538 /*
4539 * Wake up any processes waiting because they need an io buffer
4540 *
4541 * do the wakeup after we drop the mutex... it's possible that the
4542 * wakeup will be superfluous if need_iobuffer gets set again and
4543 * another thread runs this path, but that is highly unlikely, it doesn't
4544 * hurt, and it means we don't hold up I/O progress if the wakeup blocks
4545 * trying to grab a task-related lock...
4546 */
4547 need_iobuffer = 0;
4548 need_wakeup = 1;
4549 }
4550 if (bufstats.bufs_iobufinuse <= 0) {
4551 panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse <= 0", bp);
4552 }
4553
4554 bufstats.bufs_iobufinuse--;
4555
4556 if (free_for_virtualdev) {
4557 bufstats.bufs_iobufinuse_vdev--;
4558 if (mp && mp != dead_mountp) {
4559 mp->mnt_iobufinuse--;
4560 }
4561 }
4562
4563 lck_mtx_unlock(iobuffer_mtxp);
4564
4565 if (need_wakeup) {
4566 wakeup(&need_iobuffer);
4567 }
4568 }
4569
4570
4571 void
4572 buf_list_lock(void)
4573 {
4574 lck_mtx_lock_spin(buf_mtxp);
4575 }
4576
4577 void
4578 buf_list_unlock(void)
4579 {
4580 lck_mtx_unlock(buf_mtxp);
4581 }
4582
4583 /*
4584 * If getnewbuf() calls bcleanbuf() on the same thread,
4585 * there is a potential for stack overrun and deadlock,
4586 * so we always hand off the work to a worker thread for completion.
4587 */
4588
4589
4590 static void
4591 bcleanbuf_thread_init(void)
4592 {
4593 thread_t thread = THREAD_NULL;
4594
4595 /* create worker thread */
4596 kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread);
4597 thread_deallocate(thread);
4598 }
4599
4600 typedef int (*bcleanbufcontinuation)(int);
4601
4602 __attribute__((noreturn))
4603 static void
4604 bcleanbuf_thread(void)
4605 {
4606 struct buf *bp;
4607 int error = 0;
4608 int loopcnt = 0;
4609
4610 for (;;) {
4611 lck_mtx_lock_spin(buf_mtxp);
4612
4613 while ((bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) {
4614 (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO | PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread);
4615 }
4616
4617 /*
4618 * Remove from the queue
4619 */
4620 bremfree_locked(bp);
4621
4622 /*
4623 * Buffer is no longer on any free list
4624 */
4625 SET(bp->b_lflags, BL_BUSY);
4626 buf_busycount++;
4627
4628 #ifdef JOE_DEBUG
4629 bp->b_owner = current_thread();
4630 bp->b_tag = 10;
4631 #endif
4632
4633 lck_mtx_unlock(buf_mtxp);
4634 /*
4635 * do the IO
4636 */
4637 error = bawrite_internal(bp, 0);
4638
4639 if (error) {
4640 bp->b_whichq = BQ_LAUNDRY;
4641 bp->b_timestamp = buf_timestamp();
4642
4643 lck_mtx_lock_spin(buf_mtxp);
4644
4645 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
4646 blaundrycnt++;
4647
4648 /* we never leave a busy buffer on the laundry queue */
4649 CLR(bp->b_lflags, BL_BUSY);
4650 buf_busycount--;
4651 #ifdef JOE_DEBUG
4652 bp->b_owner = current_thread();
4653 bp->b_tag = 11;
4654 #endif
4655
4656 lck_mtx_unlock(buf_mtxp);
4657
4658 if (loopcnt > MAXLAUNDRY) {
4659 /*
4660 * bawrite_internal() can return errors if we're throttled. If we've
4661 * done several I/Os and failed, give the system some time to unthrottle
4662 * the vnode
4663 */
4664 (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1);
4665 loopcnt = 0;
4666 } else {
4667 /* give other threads a chance to run */
4668 (void)thread_block(THREAD_CONTINUE_NULL);
4669 loopcnt++;
4670 }
4671 }
4672 }
4673 }
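/*
 * Illustrative sketch (not part of the original source): how a producer
 * defers a dirty buffer to the laundry thread above instead of writing it
 * on its own stack.  This mirrors what buffer_cache_gc() does further down
 * with bmovelaundry() followed by a wakeup on &bufqueues[BQ_LAUNDRY].
 * example_defer_write() is a hypothetical name.
 */
#if 0	/* example only */
static void
example_defer_write(buf_t bp)
{
	/* caller holds buf_mtxp; bp is dirty (B_DELWRI) and not busy */
	bmovelaundry(bp);			/* move to BQ_LAUNDRY */

	lck_mtx_unlock(buf_mtxp);

	wakeup(&bufqueues[BQ_LAUNDRY]);		/* kick bcleanbuf_thread() */
	(void)thread_block(THREAD_CONTINUE_NULL); /* give it a chance to run */
}
#endif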
4674
4675
4676 static int
4677 brecover_data(buf_t bp)
4678 {
4679 int upl_offset;
4680 upl_t upl;
4681 upl_page_info_t *pl;
4682 kern_return_t kret;
4683 vnode_t vp = bp->b_vp;
4684 int upl_flags;
4685
4686
4687 if (!UBCINFOEXISTS(vp) || bp->b_bufsize == 0) {
4688 goto dump_buffer;
4689 }
4690
4691 upl_flags = UPL_PRECIOUS;
4692 if (!(buf_flags(bp) & B_READ)) {
4693 /*
4694 * "write" operation: let the UPL subsystem know
4695 * that we intend to modify the buffer cache pages we're
4696 * gathering.
4697 */
4698 upl_flags |= UPL_WILL_MODIFY;
4699 }
4700
4701 kret = ubc_create_upl_kernel(vp,
4702 ubc_blktooff(vp, bp->b_lblkno),
4703 bp->b_bufsize,
4704 &upl,
4705 &pl,
4706 upl_flags,
4707 VM_KERN_MEMORY_FILE);
4708 if (kret != KERN_SUCCESS) {
4709 panic("Failed to create UPL");
4710 }
4711
4712 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
4713 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
4714 ubc_upl_abort(upl, 0);
4715 goto dump_buffer;
4716 }
4717 }
4718 bp->b_upl = upl;
4719
4720 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
4721
4722 if (kret != KERN_SUCCESS) {
4723 panic("brecover_data: ubc_upl_map() failed with (%d)", kret);
4724 }
4725 return 1;
4726
4727 dump_buffer:
4728 bp->b_bufsize = 0;
4729 SET(bp->b_flags, B_INVAL);
4730 buf_brelse(bp);
4731
4732 return 0;
4733 }
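/*
 * Illustrative sketch (not part of the original source): what a hypothetical
 * caller of brecover_data() has to honor.  On a return of 1 the buffer's
 * data pointer has been re-mapped from the newly created UPL; on 0 the
 * buffer has already been invalidated and released via buf_brelse().
 */
#if 0	/* example only */
static int
example_use_recovered(buf_t bp)
{
	if (brecover_data(bp) == 0) {
		/* contents are gone and bp has been released; don't touch it */
		return EIO;
	}
	/* bp->b_datap is valid again, backed by the dirty, precious pages */
	return 0;
}
#endif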
4734
4735 int
4736 fs_buffer_cache_gc_register(void (* callout)(int, void *), void *context)
4737 {
4738 lck_mtx_lock(buf_gc_callout);
4739 for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4740 if (fs_callouts[i].callout == NULL) {
4741 fs_callouts[i].callout = callout;
4742 fs_callouts[i].context = context;
4743 lck_mtx_unlock(buf_gc_callout);
4744 return 0;
4745 }
4746 }
4747
4748 lck_mtx_unlock(buf_gc_callout);
4749 return ENOMEM;
4750 }
4751
4752 int
4753 fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context)
4754 {
4755 lck_mtx_lock(buf_gc_callout);
4756 for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4757 if (fs_callouts[i].callout == callout &&
4758 fs_callouts[i].context == context) {
4759 fs_callouts[i].callout = NULL;
4760 fs_callouts[i].context = NULL;
4761 }
4762 }
4763 lck_mtx_unlock(buf_gc_callout);
4764 return 0;
4765 }
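/*
 * Illustrative sketch (not part of the original source): a file system
 * registering a buffer-cache GC callout at mount time and removing it at
 * unmount.  "all" is non-zero when buffer_cache_gc() is trying to shed as
 * much cached metadata as possible.  The example_fs_* names and struct are
 * hypothetical.
 */
#if 0	/* example only */
struct example_fs_mount;
extern void example_fs_trim_private_caches(struct example_fs_mount *, int);

static void
example_fs_gc_callout(int all, void *context)
{
	struct example_fs_mount *fsmp = context;

	example_fs_trim_private_caches(fsmp, all);
}

static int
example_fs_mount_setup(struct example_fs_mount *fsmp)
{
	/* fails with ENOMEM if all callout slots are already taken */
	return fs_buffer_cache_gc_register(example_fs_gc_callout, fsmp);
}

static void
example_fs_unmount_teardown(struct example_fs_mount *fsmp)
{
	(void)fs_buffer_cache_gc_unregister(example_fs_gc_callout, fsmp);
}
#endif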
4766
4767 static void
4768 fs_buffer_cache_gc_dispatch_callouts(int all)
4769 {
4770 lck_mtx_lock(buf_gc_callout);
4771 for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
4772 if (fs_callouts[i].callout != NULL) {
4773 fs_callouts[i].callout(all, fs_callouts[i].context);
4774 }
4775 }
4776 lck_mtx_unlock(buf_gc_callout);
4777 }
4778
4779 static boolean_t
4780 buffer_cache_gc(int all)
4781 {
4782 buf_t bp;
4783 boolean_t did_large_zfree = FALSE;
4784 boolean_t need_wakeup = FALSE;
4785 int now = buf_timestamp();
4786 uint32_t found = 0;
4787 struct bqueues privq;
4788 int thresh_hold = BUF_STALE_THRESHHOLD;
4789
4790 if (all) {
4791 thresh_hold = 0;
4792 }
4793 /*
4794 * We only care about metadata (incore storage comes from zalloc()).
4795 * Unless "all" is set (used to evict metadata buffers in preparation
4796 * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers
4797 * that have not been accessed in the last BUF_STALE_THRESHHOLD seconds.
4798 * BUF_MAX_GC_BATCH_SIZE controls both the hold time of the global lock
4799 * "buf_mtxp" and the length of time we spend compute-bound in the GC
4800 * thread that calls this function.
4801 */
4802 lck_mtx_lock(buf_mtxp);
4803
4804 do {
4805 found = 0;
4806 TAILQ_INIT(&privq);
4807 need_wakeup = FALSE;
4808
4809 while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) &&
4810 (now > bp->b_timestamp) &&
4811 (now - bp->b_timestamp > thresh_hold) &&
4812 (found < BUF_MAX_GC_BATCH_SIZE)) {
4813 /* Remove from free list */
4814 bremfree_locked(bp);
4815 found++;
4816
4817 #ifdef JOE_DEBUG
4818 bp->b_owner = current_thread();
4819 bp->b_tag = 12;
4820 #endif
4821
4822 /* If dirty, move to laundry queue and remember to do wakeup */
4823 if (ISSET(bp->b_flags, B_DELWRI)) {
4824 SET(bp->b_lflags, BL_WANTDEALLOC);
4825
4826 bmovelaundry(bp);
4827 need_wakeup = TRUE;
4828
4829 continue;
4830 }
4831
4832 /*
4833 * Mark busy and put on private list. We could technically get
4834 * away without setting BL_BUSY here.
4835 */
4836 SET(bp->b_lflags, BL_BUSY);
4837 buf_busycount++;
4838
4839 /*
4840 * Remove from hash and dissociate from vp.
4841 */
4842 bremhash(bp);
4843 if (bp->b_vp) {
4844 brelvp_locked(bp);
4845 }
4846
4847 TAILQ_INSERT_TAIL(&privq, bp, b_freelist);
4848 }
4849
4850 if (found == 0) {
4851 break;
4852 }
4853
4854 /* Drop lock for batch processing */
4855 lck_mtx_unlock(buf_mtxp);
4856
4857 /* Wakeup and yield for laundry if need be */
4858 if (need_wakeup) {
4859 wakeup(&bufqueues[BQ_LAUNDRY]);
4860 (void)thread_block(THREAD_CONTINUE_NULL);
4861 }
4862
4863 /* Clean up every buffer on private list */
4864 TAILQ_FOREACH(bp, &privq, b_freelist) {
4865 /* Take note if we've definitely freed at least a page to a zone */
4866 if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) {
4867 did_large_zfree = TRUE;
4868 }
4869
4870 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
4871
4872 /* Free Storage */
4873 buf_free_meta_store(bp);
4874
4875 /* Release credentials */
4876 buf_release_credentials(bp);
4877
4878 /* Prepare for moving to empty queue */
4879 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED
4880 | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
4881 bp->b_whichq = BQ_EMPTY;
4882 BLISTNONE(bp);
4883 }
4884 lck_mtx_lock(buf_mtxp);
4885
4886 /* Back under lock, move them all to invalid hash and clear busy */
4887 TAILQ_FOREACH(bp, &privq, b_freelist) {
4888 binshash(bp, &invalhash);
4889 CLR(bp->b_lflags, BL_BUSY);
4890 buf_busycount--;
4891
4892 #ifdef JOE_DEBUG
4893 if (bp->b_owner != current_thread()) {
4894 panic("Buffer stolen from buffer_cache_gc()");
4895 }
4896 bp->b_owner = current_thread();
4897 bp->b_tag = 13;
4898 #endif
4899 }
4900
4901 /* And do a big bulk move to the empty queue */
4902 TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist);
4903 } while (all && (found == BUF_MAX_GC_BATCH_SIZE));
4904
4905 lck_mtx_unlock(buf_mtxp);
4906
4907 fs_buffer_cache_gc_dispatch_callouts(all);
4908
4909 return did_large_zfree;
4910 }
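/*
 * Illustrative sketch (not part of the original source): the staleness test
 * buffer_cache_gc() applies to each BQ_META buffer, written out as a
 * hypothetical helper.  A buffer qualifies once it has sat unused on the
 * free list longer than the threshold (0 when "all" is set, otherwise
 * BUF_STALE_THRESHHOLD seconds).
 */
#if 0	/* example only */
static boolean_t
example_buf_is_stale(buf_t bp, int now, int thresh_hold)
{
	return (now > bp->b_timestamp) &&
	       (now - bp->b_timestamp > thresh_hold);
}
#endif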
4911
4912
4913 /*
4914 * disabled for now
4915 */
4916
4917 #if FLUSH_QUEUES
4918
4919 #define NFLUSH 32
4920
4921 static int
4922 bp_cmp(void *a, void *b)
4923 {
4924 buf_t bp_a = *(buf_t *)a,
4925 bp_b = *(buf_t *)b;
4926 daddr64_t res;
4927
4928 // don't have to worry about negative block
4929 // numbers so this is ok to do.
4930 //
4931 res = (bp_a->b_blkno - bp_b->b_blkno);
4932
4933 return (int)res;
4934 }
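/*
 * Illustrative sketch (not part of the original source): the cast above
 * truncates a daddr64_t difference to int, which is only safe while the
 * block numbers being compared stay close together.  A hypothetical
 * comparator that avoids the truncation entirely:
 */
#if 0	/* example only */
static int
example_bp_cmp(void *a, void *b)
{
	buf_t bp_a = *(buf_t *)a;
	buf_t bp_b = *(buf_t *)b;

	if (bp_a->b_blkno < bp_b->b_blkno) {
		return -1;
	}
	if (bp_a->b_blkno > bp_b->b_blkno) {
		return 1;
	}
	return 0;
}
#endif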
4935
4936
4937 int
4938 bflushq(int whichq, mount_t mp)
4939 {
4940 buf_t bp, next;
4941 int i, buf_count;
4942 int total_writes = 0;
4943 static buf_t flush_table[NFLUSH];
4944
4945 if (whichq < 0 || whichq >= BQUEUES) {
4946 return 0;
4947 }
4948
4949 restart:
4950 lck_mtx_lock(buf_mtxp);
4951
4952 bp = TAILQ_FIRST(&bufqueues[whichq]);
4953
4954 for (buf_count = 0; bp; bp = next) {
4955 next = bp->b_freelist.tqe_next;
4956
4957 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
4958 continue;
4959 }
4960
4961 if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
4962 bremfree_locked(bp);
4963 #ifdef JOE_DEBUG
4964 bp->b_owner = current_thread();
4965 bp->b_tag = 7;
4966 #endif
4967 SET(bp->b_lflags, BL_BUSY);
4968 buf_busycount++;
4969
4970 flush_table[buf_count] = bp;
4971 buf_count++;
4972 total_writes++;
4973
4974 if (buf_count >= NFLUSH) {
4975 lck_mtx_unlock(buf_mtxp);
4976
4977 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4978
4979 for (i = 0; i < buf_count; i++) {
4980 buf_bawrite(flush_table[i]);
4981 }
4982 goto restart;
4983 }
4984 }
4985 }
4986 lck_mtx_unlock(buf_mtxp);
4987
4988 if (buf_count > 0) {
4989 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4990
4991 for (i = 0; i < buf_count; i++) {
4992 buf_bawrite(flush_table[i]);
4993 }
4994 }
4995
4996 return total_writes;
4997 }
4998 #endif