1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*-
30 * Copyright (c) 1994 Christopher G. Demetriou
31 * Copyright (c) 1982, 1986, 1989, 1993
32 * The Regents of the University of California. All rights reserved.
33 * (c) UNIX System Laboratories, Inc.
34 * All or some portions of this file are derived from material licensed
35 * to the University of California by American Telephone and Telegraph
36 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
37 * the permission of UNIX System Laboratories, Inc.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 * must display the following acknowledgement:
49 * This product includes software developed by the University of
50 * California, Berkeley and its contributors.
51 * 4. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
68 */
69
70 /*
71 * Some references:
72 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
73 * Leffler, et al.: The Design and Implementation of the 4.3BSD
74 * UNIX Operating System (Addison-Wesley, 1989)
75 */
76
77 #include <sys/param.h>
78 #include <sys/systm.h>
79 #include <sys/proc_internal.h>
80 #include <sys/buf_internal.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/trace.h>
84 #include <sys/malloc.h>
85 #include <sys/resourcevar.h>
86 #include <miscfs/specfs/specdev.h>
87 #include <sys/ubc.h>
88 #include <sys/kauth.h>
89 #if DIAGNOSTIC
90 #include <kern/assert.h>
91 #endif /* DIAGNOSTIC */
92 #include <kern/task.h>
93 #include <kern/zalloc.h>
94 #include <kern/locks.h>
95 #include <kern/thread.h>
96
97 #include <sys/fslog.h> /* fslog_io_error() */
98
99 #include <mach/mach_types.h>
100 #include <mach/memory_object_types.h>
101 #include <kern/sched_prim.h> /* thread_block() */
102
103 #include <vm/vm_kern.h>
104 #include <vm/vm_pageout.h>
105
106 #include <sys/kdebug.h>
107
108 #include <libkern/OSAtomic.h>
109 #include <libkern/OSDebug.h>
110 #include <sys/ubc_internal.h>
111
112 #include <sys/sdt.h>
113
114 int bcleanbuf(buf_t bp, boolean_t discard);
115 static int brecover_data(buf_t bp);
116 static boolean_t incore(vnode_t vp, daddr64_t blkno);
117 /* timeout is in msecs */
118 static buf_t getnewbuf(int slpflag, int slptimeo, int *queue);
119 static void bremfree_locked(buf_t bp);
120 static void buf_reassign(buf_t bp, vnode_t newvp);
121 static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
122 static int buf_iterprepare(vnode_t vp, struct buflists *, int flags);
123 static void buf_itercomplete(vnode_t vp, struct buflists *, int flags);
124 static boolean_t buffer_cache_gc(int);
125 static buf_t buf_brelse_shadow(buf_t bp);
126 static void buf_free_meta_store(buf_t bp);
127
128 static buf_t buf_create_shadow_internal(buf_t bp, boolean_t force_copy,
129 uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);
130
131
132 __private_extern__ int bdwrite_internal(buf_t, int);
133
134 /* zone allocated buffer headers */
135 static void bufzoneinit(void);
136 static void bcleanbuf_thread_init(void);
137 static void bcleanbuf_thread(void);
138
139 static zone_t buf_hdr_zone;
140 static int buf_hdr_count;
141
142
143 /*
144 * Definitions for the buffer hash lists.
145 */
146 #define BUFHASH(dvp, lbn) \
147 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
148 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
149 u_long bufhash;
150
151 static buf_t incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);
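
/*
 * Illustrative sketch (not part of the original file): how BUFHASH is used
 * to find a cached buffer for (vp, blkno).  This mirrors what the
 * incore_locked() routine declared above does; buf_mtxp must be held by
 * the caller.  "example_hash_lookup" is a hypothetical name.
 */
static buf_t
example_hash_lookup(vnode_t vp, daddr64_t blkno)
{
        buf_t bp;
        struct bufhashhdr *dp = BUFHASH(vp, blkno);

        for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
                if (bp->b_lblkno == blkno && bp->b_vp == vp &&
                    !ISSET(bp->b_flags, B_INVAL))
                        return (bp);    /* cache hit */
        }
        return (NULL);                  /* not currently cached */
}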
152
153 /* Definitions for the buffer stats. */
154 struct bufstats bufstats;
155
156 /* Number of delayed write buffers */
157 long nbdwrite = 0;
158 int blaundrycnt = 0;
159 static int boot_nbuf_headers = 0;
160
161 static TAILQ_HEAD(delayqueue, buf) delaybufqueue;
162
163 static TAILQ_HEAD(ioqueue, buf) iobufqueue;
164 static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
165 static int needbuffer;
166 static int need_iobuffer;
167
168 static lck_grp_t *buf_mtx_grp;
169 static lck_attr_t *buf_mtx_attr;
170 static lck_grp_attr_t *buf_mtx_grp_attr;
171 static lck_mtx_t *iobuffer_mtxp;
172 static lck_mtx_t *buf_mtxp;
173
174 static int buf_busycount;
175
176 static __inline__ int
177 buf_timestamp(void)
178 {
179 struct timeval t;
180 microuptime(&t);
181 return (t.tv_sec);
182 }
183
184 /*
185 * Insq/Remq for the buffer free lists.
186 */
187 #define binsheadfree(bp, dp, whichq) do { \
188 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
189 } while (0)
190
191 #define binstailfree(bp, dp, whichq) do { \
192 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
193 } while (0)
194
195 #define BHASHENTCHECK(bp) \
196 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
197 panic("%p: b_hash.le_prev is not deadbeef", (bp));
198
199 #define BLISTNONE(bp) \
200 (bp)->b_hash.le_next = (struct buf *)0; \
201 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
202
203 /*
204 * Insq/Remq for the vnode usage lists.
205 */
206 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
207 #define bufremvn(bp) { \
208 LIST_REMOVE(bp, b_vnbufs); \
209 (bp)->b_vnbufs.le_next = NOLIST; \
210 }
211
212 /*
213 * Time in seconds before a buffer on a list is
214 * considered as a stale buffer
215 */
216 #define LRU_IS_STALE 120 /* default value for the LRU */
217 #define AGE_IS_STALE 60 /* default value for the AGE */
218 #define META_IS_STALE 180 /* default value for the BQ_META */
219
220 int lru_is_stale = LRU_IS_STALE;
221 int age_is_stale = AGE_IS_STALE;
222 int meta_is_stale = META_IS_STALE;
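
/*
 * Illustrative sketch (not part of the original file): how the staleness
 * cutoffs above are applied.  A buffer sitting on a free queue is stale
 * once its age exceeds the limit for that queue, e.g. for the LRU queue.
 * "example_lru_buf_is_stale" is a hypothetical name.
 */
static __inline__ int
example_lru_buf_is_stale(buf_t bp)
{
        /* b_timestamp is stamped when the buffer goes onto a free queue */
        return ((buf_timestamp() - bp->b_timestamp) > lru_is_stale);
}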
223
224 #define MAXLAUNDRY 10
225
226 /* LIST_INSERT_HEAD() with assertions */
227 static __inline__ void
228 blistenterhead(struct bufhashhdr * head, buf_t bp)
229 {
230 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
231 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
232 (head)->lh_first = bp;
233 bp->b_hash.le_prev = &(head)->lh_first;
234 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
235 panic("blistenterhead: le_prev is deadbeef");
236 }
237
238 static __inline__ void
239 binshash(buf_t bp, struct bufhashhdr *dp)
240 {
241 #if DIAGNOSTIC
242 buf_t nbp;
243 #endif /* DIAGNOSTIC */
244
245 BHASHENTCHECK(bp);
246
247 #if DIAGNOSTIC
248 nbp = dp->lh_first;
249 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
250 if(nbp == bp)
251 panic("buf already in hashlist");
252 }
253 #endif /* DIAGNOSTIC */
254
255 blistenterhead(dp, bp);
256 }
257
258 static __inline__ void
259 bremhash(buf_t bp)
260 {
261 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
262 panic("bremhash le_prev is deadbeef");
263 if (bp->b_hash.le_next == bp)
264 panic("bremhash: next points to self");
265
266 if (bp->b_hash.le_next != NULL)
267 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
268 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
269 }
270
271 /*
272 * buf_mtxp held.
273 */
274 static __inline__ void
275 bmovelaundry(buf_t bp)
276 {
277 bp->b_whichq = BQ_LAUNDRY;
278 bp->b_timestamp = buf_timestamp();
279 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
280 blaundrycnt++;
281 }
282
283 static __inline__ void
284 buf_release_credentials(buf_t bp)
285 {
286 if (IS_VALID_CRED(bp->b_rcred)) {
287 kauth_cred_unref(&bp->b_rcred);
288 }
289 if (IS_VALID_CRED(bp->b_wcred)) {
290 kauth_cred_unref(&bp->b_wcred);
291 }
292 }
293
294
295 int
296 buf_valid(buf_t bp) {
297
298 if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
299 return 1;
300 return 0;
301 }
302
303 int
304 buf_fromcache(buf_t bp) {
305
306 if ( (bp->b_flags & B_CACHE) )
307 return 1;
308 return 0;
309 }
310
311 void
312 buf_markinvalid(buf_t bp) {
313
314 SET(bp->b_flags, B_INVAL);
315 }
316
317 void
318 buf_markdelayed(buf_t bp) {
319
320 if (!ISSET(bp->b_flags, B_DELWRI)) {
321 SET(bp->b_flags, B_DELWRI);
322
323 OSAddAtomicLong(1, &nbdwrite);
324 buf_reassign(bp, bp->b_vp);
325 }
326 SET(bp->b_flags, B_DONE);
327 }
328
329 void
330 buf_markclean(buf_t bp) {
331
332 if (ISSET(bp->b_flags, B_DELWRI)) {
333 CLR(bp->b_flags, B_DELWRI);
334
335 OSAddAtomicLong(-1, &nbdwrite);
336 buf_reassign(bp, bp->b_vp);
337 }
338 }
339
340 void
341 buf_markeintr(buf_t bp) {
342
343 SET(bp->b_flags, B_EINTR);
344 }
345
346
347 void
348 buf_markaged(buf_t bp) {
349
350 SET(bp->b_flags, B_AGE);
351 }
352
353 int
354 buf_fua(buf_t bp) {
355
356 if ((bp->b_flags & B_FUA) == B_FUA)
357 return 1;
358 return 0;
359 }
360
361 void
362 buf_markfua(buf_t bp) {
363
364 SET(bp->b_flags, B_FUA);
365 }
366
367 #if CONFIG_PROTECT
368 cpx_t bufattr_cpx(bufattr_t bap)
369 {
370 return bap->ba_cpx;
371 }
372
373 void bufattr_setcpx(bufattr_t bap, cpx_t cpx)
374 {
375 bap->ba_cpx = cpx;
376 }
377
378 void
379 buf_setcpoff (buf_t bp, uint64_t foffset) {
380 bp->b_attr.ba_cp_file_off = foffset;
381 }
382
383 uint64_t
384 bufattr_cpoff(bufattr_t bap) {
385 return bap->ba_cp_file_off;
386 }
387
388 void
389 bufattr_setcpoff(bufattr_t bap, uint64_t foffset) {
390 bap->ba_cp_file_off = foffset;
391 }
392
393 #else // !CONFIG_PROTECT
394
395 uint64_t
396 bufattr_cpoff(bufattr_t bap __unused) {
397 return 0;
398 }
399
400 void
401 bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) {
402 return;
403 }
404
405 struct cpx *bufattr_cpx(__unused bufattr_t bap)
406 {
407 return NULL;
408 }
409
410 void bufattr_setcpx(__unused bufattr_t bap, __unused struct cpx *cpx)
411 {
412 }
413
414 #endif /* !CONFIG_PROTECT */
415
416 bufattr_t
417 bufattr_alloc() {
418 bufattr_t bap;
419 MALLOC(bap, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
420 if (bap == NULL)
421 return NULL;
422
423 bzero(bap, sizeof(struct bufattr));
424 return bap;
425 }
426
427 void
428 bufattr_free(bufattr_t bap) {
429 if (bap)
430 FREE(bap, M_TEMP);
431 }
432
433 bufattr_t
434 bufattr_dup(bufattr_t bap) {
435 bufattr_t new_bufattr;
436 MALLOC(new_bufattr, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
437 if (new_bufattr == NULL)
438 return NULL;
439
440 /* Copy the provided one into the new copy */
441 memcpy (new_bufattr, bap, sizeof(struct bufattr));
442 return new_bufattr;
443 }
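
/*
 * Illustrative sketch (not part of the original file): allocating a
 * bufattr, tagging it as metadata and handing it to a caller that will
 * eventually bufattr_free() it.  "example_make_meta_attr" is a
 * hypothetical helper.
 */
static bufattr_t
example_make_meta_attr(void)
{
        bufattr_t bap = bufattr_alloc();

        if (bap == NULL)
                return (NULL);
        bufattr_markmeta(bap);          /* sets BA_META; see bufattr_markmeta() below */
        return (bap);
}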
444
445 int
446 bufattr_rawencrypted(bufattr_t bap) {
447 if ( (bap->ba_flags & BA_RAW_ENCRYPTED_IO) )
448 return 1;
449 return 0;
450 }
451
452 int
453 bufattr_throttled(bufattr_t bap) {
454 return (GET_BUFATTR_IO_TIER(bap));
455 }
456
457 int
458 bufattr_passive(bufattr_t bap) {
459 if ( (bap->ba_flags & BA_PASSIVE) )
460 return 1;
461 return 0;
462 }
463
464 int
465 bufattr_nocache(bufattr_t bap) {
466 if ( (bap->ba_flags & BA_NOCACHE) )
467 return 1;
468 return 0;
469 }
470
471 int
472 bufattr_meta(bufattr_t bap) {
473 if ( (bap->ba_flags & BA_META) )
474 return 1;
475 return 0;
476 }
477
478 void
479 bufattr_markmeta(bufattr_t bap) {
480 SET(bap->ba_flags, BA_META);
481 }
482
483 int
484 bufattr_delayidlesleep(bufattr_t bap)
485 {
486 if ( (bap->ba_flags & BA_DELAYIDLESLEEP) )
487 return 1;
488 return 0;
489 }
490
491 bufattr_t
492 buf_attr(buf_t bp) {
493 return &bp->b_attr;
494 }
495
496 void
497 buf_markstatic(buf_t bp __unused) {
498 SET(bp->b_flags, B_STATICCONTENT);
499 }
500
501 int
502 buf_static(buf_t bp) {
503 if ( (bp->b_flags & B_STATICCONTENT) )
504 return 1;
505 return 0;
506 }
507
508 void
509 bufattr_markgreedymode(bufattr_t bap) {
510 SET(bap->ba_flags, BA_GREEDY_MODE);
511 }
512
513 int
514 bufattr_greedymode(bufattr_t bap) {
515 if ( (bap->ba_flags & BA_GREEDY_MODE) )
516 return 1;
517 return 0;
518 }
519
520 void
521 bufattr_markisochronous(bufattr_t bap) {
522 SET(bap->ba_flags, BA_ISOCHRONOUS);
523 }
524
525 int
526 bufattr_isochronous(bufattr_t bap) {
527 if ( (bap->ba_flags & BA_ISOCHRONOUS) )
528 return 1;
529 return 0;
530 }
531
532 void
533 bufattr_markquickcomplete(bufattr_t bap) {
534 SET(bap->ba_flags, BA_QUICK_COMPLETE);
535 }
536
537 int
538 bufattr_quickcomplete(bufattr_t bap) {
539 if ( (bap->ba_flags & BA_QUICK_COMPLETE) )
540 return 1;
541 return 0;
542 }
543
544 errno_t
545 buf_error(buf_t bp) {
546
547 return (bp->b_error);
548 }
549
550 void
551 buf_seterror(buf_t bp, errno_t error) {
552
553 if ((bp->b_error = error))
554 SET(bp->b_flags, B_ERROR);
555 else
556 CLR(bp->b_flags, B_ERROR);
557 }
558
559 void
560 buf_setflags(buf_t bp, int32_t flags) {
561
562 SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
563 }
564
565 void
566 buf_clearflags(buf_t bp, int32_t flags) {
567
568 CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
569 }
570
571 int32_t
572 buf_flags(buf_t bp) {
573
574 return ((bp->b_flags & BUF_X_RDFLAGS));
575 }
576
577 void
578 buf_reset(buf_t bp, int32_t io_flags) {
579
580 CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
581 SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
582
583 bp->b_error = 0;
584 }
585
586 uint32_t
587 buf_count(buf_t bp) {
588
589 return (bp->b_bcount);
590 }
591
592 void
593 buf_setcount(buf_t bp, uint32_t bcount) {
594
595 bp->b_bcount = bcount;
596 }
597
598 uint32_t
599 buf_size(buf_t bp) {
600
601 return (bp->b_bufsize);
602 }
603
604 void
605 buf_setsize(buf_t bp, uint32_t bufsize) {
606
607 bp->b_bufsize = bufsize;
608 }
609
610 uint32_t
611 buf_resid(buf_t bp) {
612
613 return (bp->b_resid);
614 }
615
616 void
617 buf_setresid(buf_t bp, uint32_t resid) {
618
619 bp->b_resid = resid;
620 }
621
622 uint32_t
623 buf_dirtyoff(buf_t bp) {
624
625 return (bp->b_dirtyoff);
626 }
627
628 uint32_t
629 buf_dirtyend(buf_t bp) {
630
631 return (bp->b_dirtyend);
632 }
633
634 void
635 buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
636
637 bp->b_dirtyoff = dirtyoff;
638 }
639
640 void
641 buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
642
643 bp->b_dirtyend = dirtyend;
644 }
645
646 uintptr_t
647 buf_dataptr(buf_t bp) {
648
649 return (bp->b_datap);
650 }
651
652 void
653 buf_setdataptr(buf_t bp, uintptr_t data) {
654
655 bp->b_datap = data;
656 }
657
658 vnode_t
659 buf_vnode(buf_t bp) {
660
661 return (bp->b_vp);
662 }
663
664 void
665 buf_setvnode(buf_t bp, vnode_t vp) {
666
667 bp->b_vp = vp;
668 }
669
670
671 void *
672 buf_callback(buf_t bp)
673 {
674 if ( !(bp->b_flags & B_CALL) )
675 return ((void *) NULL);
676
677 return ((void *)bp->b_iodone);
678 }
679
680
681 errno_t
682 buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
683 {
684 assert(!ISSET(bp->b_flags, B_FILTER) && ISSET(bp->b_lflags, BL_BUSY));
685
686 if (callback)
687 bp->b_flags |= (B_CALL | B_ASYNC);
688 else
689 bp->b_flags &= ~B_CALL;
690 bp->b_transaction = transaction;
691 bp->b_iodone = callback;
692
693 return (0);
694 }
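
/*
 * Illustrative sketch (not part of the original file): registering a
 * one-shot completion callout on a buffer that is held busy (e.g. just
 * returned by buf_getblk()) and issuing it as an async write.
 * buf_setcallback() also sets B_ASYNC, and buf_biodone() hands ownership
 * of the buffer to the callout, which must release it.  The names
 * "example_iodone" and "example_write_async" are hypothetical.
 */
static void
example_iodone(buf_t bp, __unused void *transaction)
{
        if (buf_error(bp))
                printf("example_iodone: error %d on block %lld\n",
                    buf_error(bp), (long long)buf_lblkno(bp));
        buf_brelse(bp);                 /* the callout owns the buffer now */
}

static errno_t
example_write_async(buf_t bp, void *transaction)
{
        (void) buf_setcallback(bp, example_iodone, transaction);
        return (VNOP_BWRITE(bp));       /* returns at once; example_iodone runs on completion */
}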
695
696 errno_t
697 buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
698 {
699
700 if ( !(bp->b_lflags & BL_IOBUF) )
701 return (EINVAL);
702
703 if (upl)
704 bp->b_flags |= B_CLUSTER;
705 else
706 bp->b_flags &= ~B_CLUSTER;
707 bp->b_upl = upl;
708 bp->b_uploffset = offset;
709
710 return (0);
711 }
712
713 buf_t
714 buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
715 {
716 buf_t io_bp;
717
718 if (io_offset < 0 || io_size < 0)
719 return (NULL);
720
721 if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
722 return (NULL);
723
724 if (bp->b_flags & B_CLUSTER) {
725 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
726 return (NULL);
727
728 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
729 return (NULL);
730 }
731 io_bp = alloc_io_buf(bp->b_vp, 0);
732
733 io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);
734
735 if (iodone) {
736 io_bp->b_transaction = arg;
737 io_bp->b_iodone = iodone;
738 io_bp->b_flags |= B_CALL;
739 }
740 if (bp->b_flags & B_CLUSTER) {
741 io_bp->b_upl = bp->b_upl;
742 io_bp->b_uploffset = bp->b_uploffset + io_offset;
743 } else {
744 io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
745 }
746 io_bp->b_bcount = io_size;
747
748 return (io_bp);
749 }
750
751
752 int
753 buf_shadow(buf_t bp)
754 {
755 if (bp->b_lflags & BL_SHADOW)
756 return 1;
757 return 0;
758 }
759
760
761 buf_t
762 buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
763 {
764 return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1));
765 }
766
767 buf_t
768 buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
769 {
770 return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0));
771 }
772
773
774 static buf_t
775 buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv)
776 {
777 buf_t io_bp;
778
779 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0);
780
781 if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) {
782
783 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0);
784 return (NULL);
785 }
786 #ifdef BUF_MAKE_PRIVATE
787 if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0)
788 panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref);
789 #endif
790 io_bp = alloc_io_buf(bp->b_vp, priv);
791
792 io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA);
793 io_bp->b_blkno = bp->b_blkno;
794 io_bp->b_lblkno = bp->b_lblkno;
795
796 if (iodone) {
797 io_bp->b_transaction = arg;
798 io_bp->b_iodone = iodone;
799 io_bp->b_flags |= B_CALL;
800 }
801 if (force_copy == FALSE) {
802 io_bp->b_bcount = bp->b_bcount;
803 io_bp->b_bufsize = bp->b_bufsize;
804
805 if (external_storage) {
806 io_bp->b_datap = external_storage;
807 #ifdef BUF_MAKE_PRIVATE
808 io_bp->b_data_store = NULL;
809 #endif
810 } else {
811 io_bp->b_datap = bp->b_datap;
812 #ifdef BUF_MAKE_PRIVATE
813 io_bp->b_data_store = bp;
814 #endif
815 }
816 *(buf_t *)(&io_bp->b_orig) = bp;
817
818 lck_mtx_lock_spin(buf_mtxp);
819
820 io_bp->b_lflags |= BL_SHADOW;
821 io_bp->b_shadow = bp->b_shadow;
822 bp->b_shadow = io_bp;
823 bp->b_shadow_ref++;
824
825 #ifdef BUF_MAKE_PRIVATE
826 if (external_storage)
827 io_bp->b_lflags |= BL_EXTERNAL;
828 else
829 bp->b_data_ref++;
830 #endif
831 lck_mtx_unlock(buf_mtxp);
832 } else {
833 if (external_storage) {
834 #ifdef BUF_MAKE_PRIVATE
835 io_bp->b_lflags |= BL_EXTERNAL;
836 #endif
837 io_bp->b_bcount = bp->b_bcount;
838 io_bp->b_bufsize = bp->b_bufsize;
839 io_bp->b_datap = external_storage;
840 } else {
841 allocbuf(io_bp, bp->b_bcount);
842
843 io_bp->b_lflags |= BL_IOBUF_ALLOC;
844 }
845 bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount);
846
847 #ifdef BUF_MAKE_PRIVATE
848 io_bp->b_data_store = NULL;
849 #endif
850 }
851 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0);
852
853 return (io_bp);
854 }
855
856
857 #ifdef BUF_MAKE_PRIVATE
858 errno_t
859 buf_make_private(buf_t bp)
860 {
861 buf_t ds_bp;
862 buf_t t_bp;
863 struct buf my_buf;
864
865 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0);
866
867 if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) {
868
869 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
870 return (EINVAL);
871 }
872 my_buf.b_flags = B_META;
873 my_buf.b_datap = (uintptr_t)NULL;
874 allocbuf(&my_buf, bp->b_bcount);
875
876 bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);
877
878 lck_mtx_lock_spin(buf_mtxp);
879
880 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
881 if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
882 break;
883 }
884 ds_bp = t_bp;
885
886 if (ds_bp == NULL && bp->b_data_ref)
887 panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL");
888
889 if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0))
890 panic("buf_make_private: ref_count == 0 && ds_bp != NULL");
891
892 if (ds_bp == NULL) {
893 lck_mtx_unlock(buf_mtxp);
894
895 buf_free_meta_store(&my_buf);
896
897 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
898 return (EINVAL);
899 }
900 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
901 if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
902 t_bp->b_data_store = ds_bp;
903 }
904 ds_bp->b_data_ref = bp->b_data_ref;
905
906 bp->b_data_ref = 0;
907 bp->b_datap = my_buf.b_datap;
908
909 lck_mtx_unlock(buf_mtxp);
910
911 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
912 return (0);
913 }
914 #endif
915
916
917 void
918 buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
919 void (**old_iodone)(buf_t, void *), void **old_transaction)
920 {
921 assert(ISSET(bp->b_lflags, BL_BUSY));
922
923 if (old_iodone)
924 *old_iodone = bp->b_iodone;
925 if (old_transaction)
926 *old_transaction = bp->b_transaction;
927
928 bp->b_transaction = transaction;
929 bp->b_iodone = filter;
930 if (filter)
931 bp->b_flags |= B_FILTER;
932 else
933 bp->b_flags &= ~B_FILTER;
934 }
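
/*
 * Illustrative sketch (not part of the original file): installing a
 * passive completion filter on a busy buffer.  Unlike a B_CALL callout, a
 * filter runs from buf_biodone() ahead of the normal completion handling
 * and does not take ownership of the buffer; the previous iodone and
 * transaction are returned so the caller can chain to or restore them.
 * "example_io_filter" and "example_watch_buf" are hypothetical names.
 */
static void
example_io_filter(buf_t bp, __unused void *transaction)
{
        if (buf_error(bp))
                printf("example_io_filter: buf %p completed with error %d\n",
                    bp, buf_error(bp));
}

static void
example_watch_buf(buf_t bp)
{
        void (*prev_iodone)(buf_t, void *);
        void *prev_transaction;

        buf_setfilter(bp, example_io_filter, NULL, &prev_iodone, &prev_transaction);
}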
935
936
937 daddr64_t
938 buf_blkno(buf_t bp) {
939
940 return (bp->b_blkno);
941 }
942
943 daddr64_t
944 buf_lblkno(buf_t bp) {
945
946 return (bp->b_lblkno);
947 }
948
949 void
950 buf_setblkno(buf_t bp, daddr64_t blkno) {
951
952 bp->b_blkno = blkno;
953 }
954
955 void
956 buf_setlblkno(buf_t bp, daddr64_t lblkno) {
957
958 bp->b_lblkno = lblkno;
959 }
960
961 dev_t
962 buf_device(buf_t bp) {
963
964 return (bp->b_dev);
965 }
966
967 errno_t
968 buf_setdevice(buf_t bp, vnode_t vp) {
969
970 if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
971 return EINVAL;
972 bp->b_dev = vp->v_rdev;
973
974 return 0;
975 }
976
977
978 void *
979 buf_drvdata(buf_t bp) {
980
981 return (bp->b_drvdata);
982 }
983
984 void
985 buf_setdrvdata(buf_t bp, void *drvdata) {
986
987 bp->b_drvdata = drvdata;
988 }
989
990 void *
991 buf_fsprivate(buf_t bp) {
992
993 return (bp->b_fsprivate);
994 }
995
996 void
997 buf_setfsprivate(buf_t bp, void *fsprivate) {
998
999 bp->b_fsprivate = fsprivate;
1000 }
1001
1002 kauth_cred_t
1003 buf_rcred(buf_t bp) {
1004
1005 return (bp->b_rcred);
1006 }
1007
1008 kauth_cred_t
1009 buf_wcred(buf_t bp) {
1010
1011 return (bp->b_wcred);
1012 }
1013
1014 void *
1015 buf_upl(buf_t bp) {
1016
1017 return (bp->b_upl);
1018 }
1019
1020 uint32_t
1021 buf_uploffset(buf_t bp) {
1022
1023 return ((uint32_t)(bp->b_uploffset));
1024 }
1025
1026 proc_t
1027 buf_proc(buf_t bp) {
1028
1029 return (bp->b_proc);
1030 }
1031
1032
1033 errno_t
1034 buf_map(buf_t bp, caddr_t *io_addr)
1035 {
1036 buf_t real_bp;
1037 vm_offset_t vaddr;
1038 kern_return_t kret;
1039
1040 if ( !(bp->b_flags & B_CLUSTER)) {
1041 *io_addr = (caddr_t)bp->b_datap;
1042 return (0);
1043 }
1044 real_bp = (buf_t)(bp->b_real_bp);
1045
1046 if (real_bp && real_bp->b_datap) {
1047 /*
1048 * b_real_bp is only valid if B_CLUSTER is SET
1049 * if it's non-zero, then someone did a cluster_bp call;
1050 * if the backing physical pages were already mapped
1051 * in before the call to cluster_bp (non-zero b_datap),
1052 * then we just use that mapping
1053 */
1054 *io_addr = (caddr_t)real_bp->b_datap;
1055 return (0);
1056 }
1057 kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */
1058
1059 if (kret != KERN_SUCCESS) {
1060 *io_addr = NULL;
1061
1062 return(ENOMEM);
1063 }
1064 vaddr += bp->b_uploffset;
1065
1066 *io_addr = (caddr_t)vaddr;
1067
1068 return (0);
1069 }
1070
1071 errno_t
1072 buf_unmap(buf_t bp)
1073 {
1074 buf_t real_bp;
1075 kern_return_t kret;
1076
1077 if ( !(bp->b_flags & B_CLUSTER))
1078 return (0);
1079 /*
1080 * see buf_map for the explanation
1081 */
1082 real_bp = (buf_t)(bp->b_real_bp);
1083
1084 if (real_bp && real_bp->b_datap)
1085 return (0);
1086
1087 if ((bp->b_lflags & BL_IOBUF) &&
1088 ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
1089 /*
1090 * ignore pageins... the 'right' thing will
1091 * happen due to the way we handle speculative
1092 * clusters...
1093 *
1094 * when we commit these pages, we'll hit
1095 * it with UPL_COMMIT_INACTIVE which
1096 * will clear the reference bit that got
1097 * turned on when we touched the mapping
1098 */
1099 bp->b_flags |= B_AGE;
1100 }
1101 kret = ubc_upl_unmap(bp->b_upl);
1102
1103 if (kret != KERN_SUCCESS)
1104 return (EINVAL);
1105 return (0);
1106 }
1107
1108
1109 void
1110 buf_clear(buf_t bp) {
1111 caddr_t baddr;
1112
1113 if (buf_map(bp, &baddr) == 0) {
1114 bzero(baddr, bp->b_bcount);
1115 buf_unmap(bp);
1116 }
1117 bp->b_resid = 0;
1118 }
1119
1120 /*
1121 * Read or write a buffer that is not contiguous on disk.
1122 * buffer is marked done/error at the conclusion
1123 */
1124 static int
1125 buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
1126 {
1127 vnode_t vp = buf_vnode(bp);
1128 buf_t io_bp; /* For reading or writing a single block */
1129 int io_direction;
1130 int io_resid;
1131 size_t io_contig_bytes;
1132 daddr64_t io_blkno;
1133 int error = 0;
1134 int bmap_flags;
1135
1136 /*
1137 * save our starting point... the bp was already mapped
1138 * in buf_strategy before we got called
1139 * no sense doing it again.
1140 */
1141 io_blkno = bp->b_blkno;
1142 /*
1143 * Make sure we redo this mapping for the next I/O
1144 * i.e. this can never be a 'permanent' mapping
1145 */
1146 bp->b_blkno = bp->b_lblkno;
1147
1148 /*
1149 * Get an io buffer to do the deblocking
1150 */
1151 io_bp = alloc_io_buf(devvp, 0);
1152
1153 io_bp->b_lblkno = bp->b_lblkno;
1154 io_bp->b_datap = bp->b_datap;
1155 io_resid = bp->b_bcount;
1156 io_direction = bp->b_flags & B_READ;
1157 io_contig_bytes = contig_bytes;
1158
1159 if (bp->b_flags & B_READ)
1160 bmap_flags = VNODE_READ;
1161 else
1162 bmap_flags = VNODE_WRITE;
1163
1164 for (;;) {
1165 if (io_blkno == -1)
1166 /*
1167 * this is unexpected, but we'll allow for it
1168 */
1169 bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
1170 else {
1171 io_bp->b_bcount = io_contig_bytes;
1172 io_bp->b_bufsize = io_contig_bytes;
1173 io_bp->b_resid = io_contig_bytes;
1174 io_bp->b_blkno = io_blkno;
1175
1176 buf_reset(io_bp, io_direction);
1177
1178 /*
1179 * Call the device to do the I/O and wait for it. Make sure the appropriate party is charged for write
1180 */
1181
1182 if (!ISSET(bp->b_flags, B_READ))
1183 OSAddAtomic(1, &devvp->v_numoutput);
1184
1185 if ((error = VNOP_STRATEGY(io_bp)))
1186 break;
1187 if ((error = (int)buf_biowait(io_bp)))
1188 break;
1189 if (io_bp->b_resid) {
1190 io_resid -= (io_contig_bytes - io_bp->b_resid);
1191 break;
1192 }
1193 }
1194 if ((io_resid -= io_contig_bytes) == 0)
1195 break;
1196 f_offset += io_contig_bytes;
1197 io_bp->b_datap += io_contig_bytes;
1198
1199 /*
1200 * Map the current position to a physical block number
1201 */
1202 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
1203 break;
1204 }
1205 buf_free(io_bp);
1206
1207 if (error)
1208 buf_seterror(bp, error);
1209 bp->b_resid = io_resid;
1210 /*
1211 * This I/O is now complete
1212 */
1213 buf_biodone(bp);
1214
1215 return error;
1216 }
1217
1218
1219 /*
1220 * struct vnop_strategy_args {
1221 * struct buf *a_bp;
1222 * } *ap;
1223 */
1224 errno_t
1225 buf_strategy(vnode_t devvp, void *ap)
1226 {
1227 buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp;
1228 vnode_t vp = bp->b_vp;
1229 int bmap_flags;
1230 errno_t error;
1231 #if CONFIG_DTRACE
1232 int dtrace_io_start_flag = 0; /* We only want to trip the io:::start
1233 * probe once, with the true physical
1234 * block in place (b_blkno)
1235 */
1236
1237 #endif
1238
1239 if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
1240 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
1241 /*
1242 * associate the physical device with
1243 * with this buf_t even if we don't
1244 * end up issuing the I/O...
1245 */
1246 bp->b_dev = devvp->v_rdev;
1247
1248 if (bp->b_flags & B_READ)
1249 bmap_flags = VNODE_READ;
1250 else
1251 bmap_flags = VNODE_WRITE;
1252
1253 if ( !(bp->b_flags & B_CLUSTER)) {
1254
1255 if ( (bp->b_upl) ) {
1256 /*
1257 * we have a UPL associated with this bp
1258 * go through cluster_bp which knows how
1259 * to deal with filesystem block sizes
1260 * that aren't equal to the page size
1261 */
1262 DTRACE_IO1(start, buf_t, bp);
1263 return (cluster_bp(bp));
1264 }
1265 if (bp->b_blkno == bp->b_lblkno) {
1266 off_t f_offset;
1267 size_t contig_bytes;
1268
1269 if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
1270 DTRACE_IO1(start, buf_t, bp);
1271 buf_seterror(bp, error);
1272 buf_biodone(bp);
1273
1274 return (error);
1275 }
1276
1277 if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
1278 DTRACE_IO1(start, buf_t, bp);
1279 buf_seterror(bp, error);
1280 buf_biodone(bp);
1281
1282 return (error);
1283 }
1284
1285 DTRACE_IO1(start, buf_t, bp);
1286 #if CONFIG_DTRACE
1287 dtrace_io_start_flag = 1;
1288 #endif /* CONFIG_DTRACE */
1289
1290 if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
1291 /* Set block number to force biodone later */
1292 bp->b_blkno = -1;
1293 buf_clear(bp);
1294 }
1295 else if ((long)contig_bytes < bp->b_bcount) {
1296 return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
1297 }
1298 }
1299
1300 #if CONFIG_DTRACE
1301 if (dtrace_io_start_flag == 0) {
1302 DTRACE_IO1(start, buf_t, bp);
1303 dtrace_io_start_flag = 1;
1304 }
1305 #endif /* CONFIG_DTRACE */
1306
1307 if (bp->b_blkno == -1) {
1308 buf_biodone(bp);
1309 return (0);
1310 }
1311 }
1312
1313 #if CONFIG_DTRACE
1314 if (dtrace_io_start_flag == 0)
1315 DTRACE_IO1(start, buf_t, bp);
1316 #endif /* CONFIG_DTRACE */
1317
1318 #if CONFIG_PROTECT
1319 /* Capture f_offset in the bufattr*/
1320 cpx_t cpx = bufattr_cpx(buf_attr(bp));
1321 if (cpx) {
1322 /* No need to go here for older EAs */
1323 if(cpx_use_offset_for_iv(cpx)) {
1324 off_t f_offset;
1325 if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
1326 return error;
1327
1328 /*
1329 * Attach the file offset to this buffer. The
1330 * bufattr attributes will be passed down the stack
1331 * until they reach IOFlashStorage. IOFlashStorage
1332 * will retain the offset in a local variable when it
1333 * issues its I/Os to the NAND controller.
1334 *
1335 * Note that LwVM may end up splitting this I/O
1336 * into sub-I/Os if it crosses a chunk boundary. In this
1337 * case, LwVM will update this field when it dispatches
1338 * each I/O to IOFlashStorage. But from our perspective
1339 * we have only issued a single I/O.
1340 */
1341 buf_setcpoff(bp, f_offset);
1342 CP_DEBUG((CPDBG_OFFSET_IO | DBG_FUNC_NONE), (uint32_t) f_offset, (uint32_t) bp->b_lblkno, (uint32_t) bp->b_blkno, (uint32_t) bp->b_bcount, 0);
1343 }
1344 }
1345 #endif
1346
1347 /*
1348 * we can issue the I/O because...
1349 * either B_CLUSTER is set which
1350 * means that the I/O is properly set
1351 * up to be a multiple of the page size, or
1352 * we were able to successfully set up the
1353 * physical block mapping
1354 */
1355 error = VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap);
1356 DTRACE_FSINFO(strategy, vnode_t, vp);
1357 return (error);
1358 }
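
/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * VNOP_STRATEGY entry point typically just forwards to buf_strategy()
 * with the device vnode of its mount.  "example_fs_devvp" stands in for
 * whatever mount-private field a real filesystem keeps, and
 * "example_vnop_strategy" is a hypothetical name.
 */
static vnode_t example_fs_devvp;        /* set at mount time by the hypothetical filesystem */

static int
example_vnop_strategy(struct vnop_strategy_args *ap)
{
        return (buf_strategy(example_fs_devvp, ap));
}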
1359
1360
1361
1362 buf_t
1363 buf_alloc(vnode_t vp)
1364 {
1365 return(alloc_io_buf(vp, 0));
1366 }
1367
1368 void
1369 buf_free(buf_t bp) {
1370
1371 free_io_buf(bp);
1372 }
1373
1374
1375 /*
1376 * iterate buffers for the specified vp.
1377 * if BUF_SCAN_DIRTY is set, do the dirty list
1378 * if BUF_SCAN_CLEAN is set, do the clean list
1379 * if neither flag is set, default to BUF_SCAN_DIRTY
1380 * if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
1381 */
1382
1383 struct buf_iterate_info_t {
1384 int flag;
1385 struct buflists *listhead;
1386 };
1387
1388 void
1389 buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
1390 {
1391 buf_t bp;
1392 int retval;
1393 struct buflists local_iterblkhd;
1394 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1395 int notify_busy = flags & BUF_NOTIFY_BUSY;
1396 struct buf_iterate_info_t list[2];
1397 int num_lists, i;
1398
1399 if (flags & BUF_SKIP_LOCKED)
1400 lock_flags |= BAC_SKIP_LOCKED;
1401 if (flags & BUF_SKIP_NONLOCKED)
1402 lock_flags |= BAC_SKIP_NONLOCKED;
1403
1404 if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN)))
1405 flags |= BUF_SCAN_DIRTY;
1406
1407 num_lists = 0;
1408
1409 if (flags & BUF_SCAN_DIRTY) {
1410 list[num_lists].flag = VBI_DIRTY;
1411 list[num_lists].listhead = &vp->v_dirtyblkhd;
1412 num_lists++;
1413 }
1414 if (flags & BUF_SCAN_CLEAN) {
1415 list[num_lists].flag = VBI_CLEAN;
1416 list[num_lists].listhead = &vp->v_cleanblkhd;
1417 num_lists++;
1418 }
1419
1420 for (i = 0; i < num_lists; i++) {
1421 lck_mtx_lock(buf_mtxp);
1422
1423 if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) {
1424 lck_mtx_unlock(buf_mtxp);
1425 continue;
1426 }
1427 while (!LIST_EMPTY(&local_iterblkhd)) {
1428 bp = LIST_FIRST(&local_iterblkhd);
1429 LIST_REMOVE(bp, b_vnbufs);
1430 LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
1431
1432 if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
1433 if (notify_busy) {
1434 bp = NULL;
1435 } else {
1436 continue;
1437 }
1438 }
1439
1440 lck_mtx_unlock(buf_mtxp);
1441
1442 retval = callout(bp, arg);
1443
1444 switch (retval) {
1445 case BUF_RETURNED:
1446 if (bp)
1447 buf_brelse(bp);
1448 break;
1449 case BUF_CLAIMED:
1450 break;
1451 case BUF_RETURNED_DONE:
1452 if (bp)
1453 buf_brelse(bp);
1454 lck_mtx_lock(buf_mtxp);
1455 goto out;
1456 case BUF_CLAIMED_DONE:
1457 lck_mtx_lock(buf_mtxp);
1458 goto out;
1459 }
1460 lck_mtx_lock(buf_mtxp);
1461 } /* while list has more nodes */
1462 out:
1463 buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
1464 lck_mtx_unlock(buf_mtxp);
1465 } /* for each list */
1466 } /* buf_iterate */
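
/*
 * Illustrative sketch (not part of the original file): a callout used with
 * buf_iterate() to push every dirty buffer of a vnode with an async write.
 * With BUF_NOTIFY_BUSY set, busy buffers are reported as a NULL bp.
 * "example_push_callout" and "example_push_dirty" are hypothetical names.
 */
static int
example_push_callout(buf_t bp, void *arg)
{
        int *busy_seen = (int *)arg;

        if (bp == NULL) {
                (*busy_seen)++;         /* busy buffer, skipped */
                return (BUF_CLAIMED);
        }
        (void) buf_bawrite(bp);         /* buf_bawrite takes ownership of bp */
        return (BUF_CLAIMED);
}

static int
example_push_dirty(vnode_t vp)
{
        int busy_seen = 0;

        buf_iterate(vp, example_push_callout, BUF_SCAN_DIRTY | BUF_NOTIFY_BUSY, &busy_seen);
        return (busy_seen);
}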
1467
1468
1469 /*
1470 * Flush out and invalidate all buffers associated with a vnode.
1471 */
1472 int
1473 buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
1474 {
1475 buf_t bp;
1476 int aflags;
1477 int error = 0;
1478 int must_rescan = 1;
1479 struct buflists local_iterblkhd;
1480
1481
1482 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1483 return (0);
1484
1485 lck_mtx_lock(buf_mtxp);
1486
1487 for (;;) {
1488 if (must_rescan == 0)
1489 /*
1490 * the lists may not be empty, but all that's left at this
1491 * point are metadata or B_LOCKED buffers which are being
1492 * skipped... we know this because we made it through both
1493 * the clean and dirty lists without dropping buf_mtxp...
1494 * each time we drop buf_mtxp we bump "must_rescan"
1495 */
1496 break;
1497 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1498 break;
1499 must_rescan = 0;
1500 /*
1501 * iterate the clean list
1502 */
1503 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
1504 goto try_dirty_list;
1505 }
1506 while (!LIST_EMPTY(&local_iterblkhd)) {
1507
1508 bp = LIST_FIRST(&local_iterblkhd);
1509
1510 LIST_REMOVE(bp, b_vnbufs);
1511 LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
1512
1513 /*
1514 * some filesystems distinguish meta data blocks with a negative logical block #
1515 */
1516 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1517 continue;
1518
1519 aflags = BAC_REMOVE;
1520
1521 if ( !(flags & BUF_INVALIDATE_LOCKED) )
1522 aflags |= BAC_SKIP_LOCKED;
1523
1524 if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1525 if (error == EDEADLK)
1526 /*
1527 * this buffer was marked B_LOCKED...
1528 * we didn't drop buf_mtxp, so
1529 * we don't need to rescan
1530 */
1531 continue;
1532 if (error == EAGAIN) {
1533 /*
1534 * found a busy buffer... we blocked and
1535 * dropped buf_mtxp, so we're going to
1536 * need to rescan after this pass is completed
1537 */
1538 must_rescan++;
1539 continue;
1540 }
1541 /*
1542 * got some kind of 'real' error out of the msleep
1543 * in buf_acquire_locked, terminate the scan and return the error
1544 */
1545 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1546
1547 lck_mtx_unlock(buf_mtxp);
1548 return (error);
1549 }
1550 lck_mtx_unlock(buf_mtxp);
1551
1552 if (bp->b_flags & B_LOCKED)
1553 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);
1554
1555 CLR(bp->b_flags, B_LOCKED);
1556 SET(bp->b_flags, B_INVAL);
1557 buf_brelse(bp);
1558
1559 lck_mtx_lock(buf_mtxp);
1560
1561 /*
1562 * by dropping buf_mtxp, we allow new
1563 * buffers to be added to the vnode list(s)
1564 * we'll have to rescan at least once more
1565 * if the queues aren't empty
1566 */
1567 must_rescan++;
1568 }
1569 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1570
1571 try_dirty_list:
1572 /*
1573 * Now iterate on dirty blks
1574 */
1575 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1576 continue;
1577 }
1578 while (!LIST_EMPTY(&local_iterblkhd)) {
1579 bp = LIST_FIRST(&local_iterblkhd);
1580
1581 LIST_REMOVE(bp, b_vnbufs);
1582 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1583
1584 /*
1585 * some filesystems distinguish meta data blocks with a negative logical block #
1586 */
1587 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1588 continue;
1589
1590 aflags = BAC_REMOVE;
1591
1592 if ( !(flags & BUF_INVALIDATE_LOCKED) )
1593 aflags |= BAC_SKIP_LOCKED;
1594
1595 if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1596 if (error == EDEADLK)
1597 /*
1598 * this buffer was marked B_LOCKED...
1599 * we didn't drop buf_mtxp, so
1600 * we don't need to rescan
1601 */
1602 continue;
1603 if (error == EAGAIN) {
1604 /*
1605 * found a busy buffer... we blocked and
1606 * dropped buf_mtxp, so we're going to
1607 * need to rescan after this pass is completed
1608 */
1609 must_rescan++;
1610 continue;
1611 }
1612 /*
1613 * got some kind of 'real' error out of the msleep
1614 * in buf_acquire_locked, terminate the scan and return the error
1615 */
1616 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1617
1618 lck_mtx_unlock(buf_mtxp);
1619 return (error);
1620 }
1621 lck_mtx_unlock(buf_mtxp);
1622
1623 if (bp->b_flags & B_LOCKED)
1624 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);
1625
1626 CLR(bp->b_flags, B_LOCKED);
1627 SET(bp->b_flags, B_INVAL);
1628
1629 if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1630 (void) VNOP_BWRITE(bp);
1631 else
1632 buf_brelse(bp);
1633
1634 lck_mtx_lock(buf_mtxp);
1635 /*
1636 * by dropping buf_mtxp, we allow new
1637 * buffers to be added to the vnode list(s)
1638 * we'll have to rescan at least once more
1639 * if the queues aren't empty
1640 */
1641 must_rescan++;
1642 }
1643 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1644 }
1645 lck_mtx_unlock(buf_mtxp);
1646
1647 return (0);
1648 }
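
/*
 * Illustrative sketch (not part of the original file): the call a caller
 * such as vclean() makes when a vnode is reclaimed -- push out any
 * delayed-write data, then discard every cached buffer for the vnode.
 * "example_purge_vnode_buffers" is a hypothetical wrapper.
 */
static int
example_purge_vnode_buffers(vnode_t vp)
{
        /* BUF_WRITE_DATA writes dirty buffers out instead of discarding them */
        return (buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0));
}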
1649
1650 void
1651 buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {
1652
1653 (void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg);
1654 return;
1655 }
1656
1657 int
1658 buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) {
1659 buf_t bp;
1660 int writes_issued = 0;
1661 errno_t error;
1662 int busy = 0;
1663 struct buflists local_iterblkhd;
1664 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1665 int any_locked = 0;
1666
1667 if (flags & BUF_SKIP_LOCKED)
1668 lock_flags |= BAC_SKIP_LOCKED;
1669 if (flags & BUF_SKIP_NONLOCKED)
1670 lock_flags |= BAC_SKIP_NONLOCKED;
1671 loop:
1672 lck_mtx_lock(buf_mtxp);
1673
1674 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
1675 while (!LIST_EMPTY(&local_iterblkhd)) {
1676 bp = LIST_FIRST(&local_iterblkhd);
1677 LIST_REMOVE(bp, b_vnbufs);
1678 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1679
1680 if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) {
1681 busy++;
1682 }
1683 if (error) {
1684 /*
1685 * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED,
1686 * we may want to do something differently if a locked or unlocked
1687 * buffer was encountered (depending on the arg specified).
1688 * In this case, we know that one of those two was set, and the
1689 * buf acquisition failed above.
1690 *
1691 * If it failed with EDEADLK, then save state which can be emitted
1692 * later on to the caller. Most callers should not care.
1693 */
1694 if (error == EDEADLK) {
1695 any_locked++;
1696 }
1697 continue;
1698 }
1699 lck_mtx_unlock(buf_mtxp);
1700
1701 bp->b_flags &= ~B_LOCKED;
1702
1703 /*
1704 * Wait for I/O associated with indirect blocks to complete,
1705 * since there is no way to quickly wait for them below.
1706 */
1707 if ((bp->b_vp == vp) || (wait == 0))
1708 (void) buf_bawrite(bp);
1709 else
1710 (void) VNOP_BWRITE(bp);
1711 writes_issued++;
1712
1713 lck_mtx_lock(buf_mtxp);
1714 }
1715 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1716 }
1717 lck_mtx_unlock(buf_mtxp);
1718
1719 if (wait) {
1720 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1721
1722 if (vp->v_dirtyblkhd.lh_first && busy) {
1723 /*
1724 * we had one or more BUSY buffers on
1725 * the dirtyblock list... most likely
1726 * these are due to delayed writes that
1727 * were moved to the bclean queue but
1728 * have not yet been 'written'.
1729 * if we issued some writes on the
1730 * previous pass, we try again immediately
1731 * if we didn't, we'll sleep for some time
1732 * to allow the state to change...
1733 */
1734 if (writes_issued == 0) {
1735 (void)tsleep((caddr_t)&vp->v_numoutput,
1736 PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1737 }
1738 writes_issued = 0;
1739 busy = 0;
1740
1741 goto loop;
1742 }
1743 }
1744
1745 return any_locked;
1746 }
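
/*
 * Illustrative sketch (not part of the original file): the fsync-style call
 * a filesystem makes to push a vnode's dirty buffers and optionally wait
 * for the writes to drain (compare spec_fsync and similar callers).
 * "example_fsync_buffers" is a hypothetical wrapper.
 */
static void
example_fsync_buffers(vnode_t vp, int waitfor)
{
        buf_flushdirtyblks(vp, waitfor, BUF_SKIP_LOCKED, "example_fsync");
}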
1747
1748
1749 /*
1750 * called with buf_mtxp held...
1751 * this lock protects the queue manipulation
1752 */
1753 static int
1754 buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1755 {
1756 struct buflists * listheadp;
1757
1758 if (flags & VBI_DIRTY)
1759 listheadp = &vp->v_dirtyblkhd;
1760 else
1761 listheadp = &vp->v_cleanblkhd;
1762
1763 while (vp->v_iterblkflags & VBI_ITER) {
1764 vp->v_iterblkflags |= VBI_ITERWANT;
1765 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
1766 }
1767 if (LIST_EMPTY(listheadp)) {
1768 LIST_INIT(iterheadp);
1769 return(EINVAL);
1770 }
1771 vp->v_iterblkflags |= VBI_ITER;
1772
1773 iterheadp->lh_first = listheadp->lh_first;
1774 listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1775 LIST_INIT(listheadp);
1776
1777 return(0);
1778 }
1779
1780 /*
1781 * called with buf_mtxp held...
1782 * this lock protects the queue manipulation
1783 */
1784 static void
1785 buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1786 {
1787 struct buflists * listheadp;
1788 buf_t bp;
1789
1790 if (flags & VBI_DIRTY)
1791 listheadp = &vp->v_dirtyblkhd;
1792 else
1793 listheadp = &vp->v_cleanblkhd;
1794
1795 while (!LIST_EMPTY(iterheadp)) {
1796 bp = LIST_FIRST(iterheadp);
1797 LIST_REMOVE(bp, b_vnbufs);
1798 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1799 }
1800 vp->v_iterblkflags &= ~VBI_ITER;
1801
1802 if (vp->v_iterblkflags & VBI_ITERWANT) {
1803 vp->v_iterblkflags &= ~VBI_ITERWANT;
1804 wakeup(&vp->v_iterblkflags);
1805 }
1806 }
1807
1808
1809 static void
1810 bremfree_locked(buf_t bp)
1811 {
1812 struct bqueues *dp = NULL;
1813 int whichq;
1814
1815 whichq = bp->b_whichq;
1816
1817 if (whichq == -1) {
1818 if (bp->b_shadow_ref == 0)
1819 panic("bremfree_locked: %p not on freelist", bp);
1820 /*
1821 * there are clones pointing to 'bp'...
1822 * therefore, it was not put on a freelist
1823 * when buf_brelse was last called on 'bp'
1824 */
1825 return;
1826 }
1827 /*
1828 * We only calculate the head of the freelist when removing
1829 * the last element of the list as that is the only time that
1830 * it is needed (e.g. to reset the tail pointer).
1831 *
1832 * NB: This makes an assumption about how tailq's are implemented.
1833 */
1834 if (bp->b_freelist.tqe_next == NULL) {
1835 dp = &bufqueues[whichq];
1836
1837 if (dp->tqh_last != &bp->b_freelist.tqe_next)
1838 panic("bremfree: lost tail");
1839 }
1840 TAILQ_REMOVE(dp, bp, b_freelist);
1841
1842 if (whichq == BQ_LAUNDRY)
1843 blaundrycnt--;
1844
1845 bp->b_whichq = -1;
1846 bp->b_timestamp = 0;
1847 bp->b_shadow = 0;
1848 }
1849
1850 /*
1851 * Associate a buffer with a vnode.
1852 * buf_mtxp must be locked on entry
1853 */
1854 static void
1855 bgetvp_locked(vnode_t vp, buf_t bp)
1856 {
1857
1858 if (bp->b_vp != vp)
1859 panic("bgetvp_locked: not free");
1860
1861 if (vp->v_type == VBLK || vp->v_type == VCHR)
1862 bp->b_dev = vp->v_rdev;
1863 else
1864 bp->b_dev = NODEV;
1865 /*
1866 * Insert onto list for new vnode.
1867 */
1868 bufinsvn(bp, &vp->v_cleanblkhd);
1869 }
1870
1871 /*
1872 * Disassociate a buffer from a vnode.
1873 * buf_mtxp must be locked on entry
1874 */
1875 static void
1876 brelvp_locked(buf_t bp)
1877 {
1878 /*
1879 * Delete from old vnode list, if on one.
1880 */
1881 if (bp->b_vnbufs.le_next != NOLIST)
1882 bufremvn(bp);
1883
1884 bp->b_vp = (vnode_t)NULL;
1885 }
1886
1887 /*
1888 * Reassign a buffer from one vnode to another.
1889 * Used to assign file specific control information
1890 * (indirect blocks) to the vnode to which they belong.
1891 */
1892 static void
1893 buf_reassign(buf_t bp, vnode_t newvp)
1894 {
1895 struct buflists *listheadp;
1896
1897 if (newvp == NULL) {
1898 printf("buf_reassign: NULL");
1899 return;
1900 }
1901 lck_mtx_lock_spin(buf_mtxp);
1902
1903 /*
1904 * Delete from old vnode list, if on one.
1905 */
1906 if (bp->b_vnbufs.le_next != NOLIST)
1907 bufremvn(bp);
1908 /*
1909 * If dirty, put on list of dirty buffers;
1910 * otherwise insert onto list of clean buffers.
1911 */
1912 if (ISSET(bp->b_flags, B_DELWRI))
1913 listheadp = &newvp->v_dirtyblkhd;
1914 else
1915 listheadp = &newvp->v_cleanblkhd;
1916 bufinsvn(bp, listheadp);
1917
1918 lck_mtx_unlock(buf_mtxp);
1919 }
1920
1921 static __inline__ void
1922 bufhdrinit(buf_t bp)
1923 {
1924 bzero((char *)bp, sizeof *bp);
1925 bp->b_dev = NODEV;
1926 bp->b_rcred = NOCRED;
1927 bp->b_wcred = NOCRED;
1928 bp->b_vnbufs.le_next = NOLIST;
1929 bp->b_flags = B_INVAL;
1930
1931 return;
1932 }
1933
1934 /*
1935 * Initialize buffers and hash links for buffers.
1936 */
1937 __private_extern__ void
1938 bufinit(void)
1939 {
1940 buf_t bp;
1941 struct bqueues *dp;
1942 int i;
1943
1944 nbuf_headers = 0;
1945 /* Initialize the buffer queues ('freelists') and the hash table */
1946 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1947 TAILQ_INIT(dp);
1948 bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
1949
1950 buf_busycount = 0;
1951
1952 /* Initialize the buffer headers */
1953 for (i = 0; i < max_nbuf_headers; i++) {
1954 nbuf_headers++;
1955 bp = &buf_headers[i];
1956 bufhdrinit(bp);
1957
1958 BLISTNONE(bp);
1959 dp = &bufqueues[BQ_EMPTY];
1960 bp->b_whichq = BQ_EMPTY;
1961 bp->b_timestamp = buf_timestamp();
1962 binsheadfree(bp, dp, BQ_EMPTY);
1963 binshash(bp, &invalhash);
1964 }
1965 boot_nbuf_headers = nbuf_headers;
1966
1967 TAILQ_INIT(&iobufqueue);
1968 TAILQ_INIT(&delaybufqueue);
1969
1970 for (; i < nbuf_headers + niobuf_headers; i++) {
1971 bp = &buf_headers[i];
1972 bufhdrinit(bp);
1973 bp->b_whichq = -1;
1974 binsheadfree(bp, &iobufqueue, -1);
1975 }
1976
1977 /*
1978 * allocate lock group attribute and group
1979 */
1980 buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1981 buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1982
1983 /*
1984 * allocate the lock attribute
1985 */
1986 buf_mtx_attr = lck_attr_alloc_init();
1987
1988 /*
1989 * allocate and initialize mutex's for the buffer and iobuffer pools
1990 */
1991 buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1992 iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1993
1994 if (iobuffer_mtxp == NULL)
1995 panic("couldn't create iobuffer mutex");
1996
1997 if (buf_mtxp == NULL)
1998 panic("couldn't create buf mutex");
1999
2000 /*
2001 * allocate and initialize cluster specific global locks...
2002 */
2003 cluster_init();
2004
2005 printf("using %d buffer headers and %d cluster IO buffer headers\n",
2006 nbuf_headers, niobuf_headers);
2007
2008 /* Set up zones used by the buffer cache */
2009 bufzoneinit();
2010
2011 /* start the bcleanbuf() thread */
2012 bcleanbuf_thread_init();
2013
2014 /* Register a callout for relieving vm pressure */
2015 if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
2016 panic("Couldn't register buffer cache callout for vm pressure!\n");
2017 }
2018
2019 }
2020
2021 /*
2022 * Zones for the meta data buffers
2023 */
2024
2025 #define MINMETA 512
2026 #define MAXMETA 8192
2027
2028 struct meta_zone_entry {
2029 zone_t mz_zone;
2030 vm_size_t mz_size;
2031 vm_size_t mz_max;
2032 const char *mz_name;
2033 };
2034
2035 struct meta_zone_entry meta_zones[] = {
2036 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2037 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
2038 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
2039 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2040 {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
2041 {NULL, 0, 0, "" } /* End */
2042 };
2043
2044 /*
2045 * Initialize the meta data zones
2046 */
2047 static void
2048 bufzoneinit(void)
2049 {
2050 int i;
2051
2052 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2053 meta_zones[i].mz_zone =
2054 zinit(meta_zones[i].mz_size,
2055 meta_zones[i].mz_max,
2056 PAGE_SIZE,
2057 meta_zones[i].mz_name);
2058 zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE);
2059 }
2060 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2061 zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE);
2062 }
2063
2064 static __inline__ zone_t
2065 getbufzone(size_t size)
2066 {
2067 int i;
2068
2069 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2070 panic("getbufzone: incorrect size = %lu", size);
2071
2072 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2073 if (meta_zones[i].mz_size >= size)
2074 break;
2075 }
2076
2077 return (meta_zones[i].mz_zone);
2078 }
2079
2080
2081
2082 static struct buf *
2083 bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
2084 {
2085 buf_t bp;
2086
2087 bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
2088
2089 /*
2090 * If buffer does not have data valid, start a read.
2091 * Note that if buffer is B_INVAL, buf_getblk() won't return it.
2092 * Therefore, it's valid if its I/O has completed or been delayed.
2093 */
2094 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
2095 struct proc *p;
2096
2097 p = current_proc();
2098
2099 /* Start I/O for the buffer (keeping credentials). */
2100 SET(bp->b_flags, B_READ | async);
2101 if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
2102 kauth_cred_ref(cred);
2103 bp->b_rcred = cred;
2104 }
2105
2106 VNOP_STRATEGY(bp);
2107
2108 trace(TR_BREADMISS, pack(vp, size), blkno);
2109
2110 /* Pay for the read. */
2111 if (p && p->p_stats) {
2112 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock); /* XXX */
2113 }
2114
2115 if (async) {
2116 /*
2117 * since we asked for an ASYNC I/O
2118 * the biodone will do the brelse
2119 * we don't want to pass back a bp
2120 * that we don't 'own'
2121 */
2122 bp = NULL;
2123 }
2124 } else if (async) {
2125 buf_brelse(bp);
2126 bp = NULL;
2127 }
2128
2129 trace(TR_BREADHIT, pack(vp, size), blkno);
2130
2131 return (bp);
2132 }
2133
2134 /*
2135 * Perform the reads for buf_breadn() and buf_meta_breadn().
2136 * Trivial modification to the breada algorithm presented in Bach (p.55).
2137 */
2138 static errno_t
2139 do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
2140 int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
2141 {
2142 buf_t bp;
2143 int i;
2144
2145 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
2146
2147 /*
2148 * For each of the read-ahead blocks, start a read, if necessary.
2149 */
2150 for (i = 0; i < nrablks; i++) {
2151 /* If it's in the cache, just go on to next one. */
2152 if (incore(vp, rablks[i]))
2153 continue;
2154
2155 /* Get a buffer for the read-ahead block */
2156 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
2157 }
2158
2159 /* Otherwise, we had to start a read for it; wait until it's valid. */
2160 return (buf_biowait(bp));
2161 }
2162
2163
2164 /*
2165 * Read a disk block.
2166 * This algorithm described in Bach (p.54).
2167 */
2168 errno_t
2169 buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2170 {
2171 buf_t bp;
2172
2173 /* Get buffer for block. */
2174 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
2175
2176 /* Wait for the read to complete, and return result. */
2177 return (buf_biowait(bp));
2178 }
2179
2180 /*
2181 * Read a disk block. [bread() for meta-data]
2182 * This algorithm described in Bach (p.54).
2183 */
2184 errno_t
2185 buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2186 {
2187 buf_t bp;
2188
2189 /* Get buffer for block. */
2190 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
2191
2192 /* Wait for the read to complete, and return result. */
2193 return (buf_biowait(bp));
2194 }
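
/*
 * Illustrative sketch (not part of the original file): the classic
 * bread/brelse pattern a filesystem uses against the routines above.
 * "example_read_block", its consume callback and the 4096-byte block size
 * are hypothetical.
 */
static errno_t
example_read_block(vnode_t vp, daddr64_t lblkno, void (*consume)(caddr_t, uint32_t))
{
        buf_t bp;
        errno_t error;

        if ((error = buf_bread(vp, lblkno, 4096, NOCRED, &bp))) {
                buf_brelse(bp);         /* a buffer is returned even on error */
                return (error);
        }
        consume((caddr_t)buf_dataptr(bp), buf_count(bp));
        buf_brelse(bp);                 /* done with it; leave it cached */
        return (0);
}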
2195
2196 /*
2197 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2198 */
2199 errno_t
2200 buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2201 {
2202 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
2203 }
2204
2205 /*
2206 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2207 * [buf_breadn() for meta-data]
2208 */
2209 errno_t
2210 buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2211 {
2212 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
2213 }
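
/*
 * Illustrative sketch (not part of the original file): reading a block and
 * asking for one block of read-ahead behind it.  On success the caller
 * owns *bpp and must buf_brelse() it.  "example_read_with_ra" is a
 * hypothetical wrapper and the contiguous layout is an assumption.
 */
static errno_t
example_read_with_ra(vnode_t vp, daddr64_t lblkno, int blksize, buf_t *bpp)
{
        daddr64_t ra_blk = lblkno + 1;
        int ra_size = blksize;

        /* the first block is read synchronously, the read-ahead goes out async */
        return (buf_breadn(vp, lblkno, blksize, &ra_blk, &ra_size, 1, NOCRED, bpp));
}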
2214
2215 /*
2216 * Block write. Described in Bach (p.56)
2217 */
2218 errno_t
2219 buf_bwrite(buf_t bp)
2220 {
2221 int sync, wasdelayed;
2222 errno_t rv;
2223 proc_t p = current_proc();
2224 vnode_t vp = bp->b_vp;
2225
2226 if (bp->b_datap == 0) {
2227 if (brecover_data(bp) == 0)
2228 return (0);
2229 }
2230 /* Remember buffer type, to switch on it later. */
2231 sync = !ISSET(bp->b_flags, B_ASYNC);
2232 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
2233 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
2234
2235 if (wasdelayed)
2236 OSAddAtomicLong(-1, &nbdwrite);
2237
2238 if (!sync) {
2239 /*
2240 * If not synchronous, pay for the I/O operation and make
2241 * sure the buf is on the correct vnode queue. We have
2242 * to do this now, because if we don't, the vnode may not
2243 * be properly notified that its I/O has completed.
2244 */
2245 if (wasdelayed)
2246 buf_reassign(bp, vp);
2247 else
2248 if (p && p->p_stats) {
2249 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2250 }
2251 }
2252 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
2253
2254 /* Initiate disk write. Make sure the appropriate party is charged. */
2255
2256 OSAddAtomic(1, &vp->v_numoutput);
2257
2258 VNOP_STRATEGY(bp);
2259
2260 if (sync) {
2261 /*
2262 * If I/O was synchronous, wait for it to complete.
2263 */
2264 rv = buf_biowait(bp);
2265
2266 /*
2267 * Pay for the I/O operation, if it's not been paid for, and
2268 * make sure it's on the correct vnode queue. (async operations
2269 * were paid for above.)
2270 */
2271 if (wasdelayed)
2272 buf_reassign(bp, vp);
2273 else
2274 if (p && p->p_stats) {
2275 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2276 }
2277
2278 /* Release the buffer. */
2279 // XXXdbg - only if the unused bit is set
2280 if (!ISSET(bp->b_flags, B_NORELSE)) {
2281 buf_brelse(bp);
2282 } else {
2283 CLR(bp->b_flags, B_NORELSE);
2284 }
2285
2286 return (rv);
2287 } else {
2288 return (0);
2289 }
2290 }
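/*
 * A synchronous-write sketch ('vp', 'blkno' and 'blksize' are placeholders):
 * the caller obtains the block, modifies its contents and pushes it to disk,
 * waiting for completion.  For a synchronous write, buf_bwrite() consumes
 * the buffer reference unless B_NORELSE is set.
 *
 *	buf_t bp;
 *	errno_t error;
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_WRITE);
 *	(modify the data at buf_dataptr(bp))
 *	error = buf_bwrite(bp);
 */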
2291
2292 int
2293 vn_bwrite(struct vnop_bwrite_args *ap)
2294 {
2295 return (buf_bwrite(ap->a_bp));
2296 }
2297
2298 /*
2299 * Delayed write.
2300 *
2301 * The buffer is marked dirty, but is not queued for I/O.
2302 * This routine should be used when the buffer is expected
2303 * to be modified again soon, typically a small write that
2304 * partially fills a buffer.
2305 *
2306 * NB: magnetic tapes cannot be delayed; they must be
2307 * written in the order that the writes are requested.
2308 *
2309 * Described in Leffler, et al. (pp. 208-213).
2310 *
2311 * Note: With the ability to allocate additional buffer
2312 * headers, we can get into a situation where "too many"
2313 * buf_bdwrite()s allow the kernel to create buffers faster
2314 * than the disks can service them. Doing a buf_bawrite() in
2315 * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
2316 */
2317 __private_extern__ int
2318 bdwrite_internal(buf_t bp, int return_error)
2319 {
2320 proc_t p = current_proc();
2321 vnode_t vp = bp->b_vp;
2322
2323 /*
2324 * If the block hasn't been seen before:
2325 * (1) Mark it as having been seen,
2326 * (2) Charge for the write.
2327 * (3) Make sure it's on its vnode's correct block list,
2328 */
2329 if (!ISSET(bp->b_flags, B_DELWRI)) {
2330 SET(bp->b_flags, B_DELWRI);
2331 if (p && p->p_stats) {
2332 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2333 }
2334 OSAddAtomicLong(1, &nbdwrite);
2335 buf_reassign(bp, vp);
2336 }
2337
2338 /*
2339 * if we're not LOCKED, but the total number of delayed writes
2340 * has climbed above 75% of the total buffers in the system,
2341 * return an error if the caller has indicated that it can
2342 * handle one in this case; otherwise schedule the I/O now.
2343 * this is done to prevent us from allocating tons of extra
2344 * buffers when dealing with virtual disks (i.e. DiskImages),
2345 * because additional buffers are dynamically allocated to prevent
2346 * deadlocks from occurring
2347 *
2348 * however, can't do a buf_bawrite() if the LOCKED bit is set because the
2349 * buffer is part of a transaction and can't go to disk until
2350 * the LOCKED bit is cleared.
2351 */
2352 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers/4)*3)) {
2353 if (return_error)
2354 return (EAGAIN);
2355 /*
2356 * If the vnode has "too many" write operations in progress
2357 * wait for them to finish the IO
2358 */
2359 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");
2360
2361 return (buf_bawrite(bp));
2362 }
2363
2364 /* Otherwise, the "write" is done, so mark and release the buffer. */
2365 SET(bp->b_flags, B_DONE);
2366 buf_brelse(bp);
2367 return (0);
2368 }
2369
2370 errno_t
2371 buf_bdwrite(buf_t bp)
2372 {
2373 return (bdwrite_internal(bp, 0));
2374 }
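/*
 * Sketch of the delayed-write variant of the same sequence: the buffer is
 * only marked B_DELWRI and re-queued, so repeated small updates to the same
 * block cost a single eventual disk write.
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_WRITE);
 *	(partially update the data at buf_dataptr(bp))
 *	(void) buf_bdwrite(bp);
 */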
2375
2376
2377 /*
2378 * Asynchronous block write; just an asynchronous buf_bwrite().
2379 *
2380 * Note: With the ability to allocate additional buffer
2381 * headers, we can get into a situation where "too many"
2382 * buf_bawrite()s allow the kernel to create buffers faster
2383 * than the disks can service them.
2384 * We limit the number of "in flight" writes a vnode can have to
2385 * avoid this.
2386 */
2387 static int
2388 bawrite_internal(buf_t bp, int throttle)
2389 {
2390 vnode_t vp = bp->b_vp;
2391
2392 if (vp) {
2393 if (throttle)
2394 /*
2395 * If the vnode has "too many" write operations in progress
2396 * wait for them to finish the IO
2397 */
2398 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
2399 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
2400 /*
2401 * return to the caller and
2402 * let him decide what to do
2403 */
2404 return (EWOULDBLOCK);
2405 }
2406 SET(bp->b_flags, B_ASYNC);
2407
2408 return (VNOP_BWRITE(bp));
2409 }
2410
2411 errno_t
2412 buf_bawrite(buf_t bp)
2413 {
2414 return (bawrite_internal(bp, 1));
2415 }
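/*
 * Sketch of the asynchronous variant: the I/O is issued immediately (unlike
 * buf_bdwrite()) but the caller does not wait for it (unlike buf_bwrite());
 * buf_biodone() releases the buffer when the write completes.
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_WRITE);
 *	(update the data at buf_dataptr(bp))
 *	(void) buf_bawrite(bp);
 */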
2416
2417
2418
2419 static void
2420 buf_free_meta_store(buf_t bp)
2421 {
2422 if (bp->b_bufsize) {
2423 if (ISSET(bp->b_flags, B_ZALLOC)) {
2424 zone_t z;
2425
2426 z = getbufzone(bp->b_bufsize);
2427 zfree(z, (void *)bp->b_datap);
2428 } else
2429 kmem_free(kernel_map, bp->b_datap, bp->b_bufsize);
2430
2431 bp->b_datap = (uintptr_t)NULL;
2432 bp->b_bufsize = 0;
2433 }
2434 }
2435
2436
2437 static buf_t
2438 buf_brelse_shadow(buf_t bp)
2439 {
2440 buf_t bp_head;
2441 buf_t bp_temp;
2442 buf_t bp_return = NULL;
2443 #ifdef BUF_MAKE_PRIVATE
2444 buf_t bp_data;
2445 int data_ref = 0;
2446 #endif
2447 int need_wakeup = 0;
2448
2449 lck_mtx_lock_spin(buf_mtxp);
2450
2451 __IGNORE_WCASTALIGN(bp_head = (buf_t)bp->b_orig);
2452
2453 if (bp_head->b_whichq != -1)
2454 panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq);
2455
2456 #ifdef BUF_MAKE_PRIVATE
2457 if (bp_data = bp->b_data_store) {
2458 bp_data->b_data_ref--;
2459 /*
2460 * snapshot the ref count so that we can check it
2461 * outside of the lock... we only want the guy going
2462 * from 1 -> 0 to try and release the storage
2463 */
2464 data_ref = bp_data->b_data_ref;
2465 }
2466 #endif
2467 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0);
2468
2469 bp_head->b_shadow_ref--;
2470
2471 for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow);
2472
2473 if (bp_temp == NULL)
2474 panic("buf_brelse_shadow: bp not on list %p", bp_head);
2475
2476 bp_temp->b_shadow = bp_temp->b_shadow->b_shadow;
2477
2478 #ifdef BUF_MAKE_PRIVATE
2479 /*
2480 * we're about to free the current 'owner' of the data buffer and
2481 * there is at least one other shadow buf_t still pointing at it
2482 * so transfer it to the first shadow buf left in the chain
2483 */
2484 if (bp == bp_data && data_ref) {
2485 if ((bp_data = bp_head->b_shadow) == NULL)
2486 panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp);
2487
2488 for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow)
2489 bp_temp->b_data_store = bp_data;
2490 bp_data->b_data_ref = data_ref;
2491 }
2492 #endif
2493 if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow)
2494 panic("buf_relse_shadow: b_shadow != NULL && b_shadow_ref == 0 bp(%p)", bp);
2495 if (bp_head->b_shadow_ref && bp_head->b_shadow == 0)
2496 panic("buf_relse_shadow: b_shadow == NULL && b_shadow_ref != 0 bp(%p)", bp);
2497
2498 if (bp_head->b_shadow_ref == 0) {
2499 if (!ISSET(bp_head->b_lflags, BL_BUSY)) {
2500
2501 CLR(bp_head->b_flags, B_AGE);
2502 bp_head->b_timestamp = buf_timestamp();
2503
2504 if (ISSET(bp_head->b_flags, B_LOCKED)) {
2505 bp_head->b_whichq = BQ_LOCKED;
2506 binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED);
2507 } else {
2508 bp_head->b_whichq = BQ_META;
2509 binstailfree(bp_head, &bufqueues[BQ_META], BQ_META);
2510 }
2511 } else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) {
2512 CLR(bp_head->b_lflags, BL_WAITSHADOW);
2513
2514 bp_return = bp_head;
2515 }
2516 if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) {
2517 CLR(bp_head->b_lflags, BL_WANTED_REF);
2518 need_wakeup = 1;
2519 }
2520 }
2521 lck_mtx_unlock(buf_mtxp);
2522
2523 if (need_wakeup)
2524 wakeup(bp_head);
2525
2526 #ifdef BUF_MAKE_PRIVATE
2527 if (bp == bp_data && data_ref == 0)
2528 buf_free_meta_store(bp);
2529
2530 bp->b_data_store = NULL;
2531 #endif
2532 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0);
2533
2534 return (bp_return);
2535 }
2536
2537
2538 /*
2539 * Release a buffer on to the free lists.
2540 * Described in Bach (p. 46).
2541 */
2542 void
2543 buf_brelse(buf_t bp)
2544 {
2545 struct bqueues *bufq;
2546 long whichq;
2547 upl_t upl;
2548 int need_wakeup = 0;
2549 int need_bp_wakeup = 0;
2550
2551
2552 if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
2553 panic("buf_brelse: bad buffer = %p\n", bp);
2554
2555 #ifdef JOE_DEBUG
2556 (void) OSBacktrace(&bp->b_stackbrelse[0], 6);
2557
2558 bp->b_lastbrelse = current_thread();
2559 bp->b_tag = 0;
2560 #endif
2561 if (bp->b_lflags & BL_IOBUF) {
2562 buf_t shadow_master_bp = NULL;
2563
2564 if (ISSET(bp->b_lflags, BL_SHADOW))
2565 shadow_master_bp = buf_brelse_shadow(bp);
2566 else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC))
2567 buf_free_meta_store(bp);
2568 free_io_buf(bp);
2569
2570 if (shadow_master_bp) {
2571 bp = shadow_master_bp;
2572 goto finish_shadow_master;
2573 }
2574 return;
2575 }
2576
2577 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
2578 bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap,
2579 bp->b_flags, 0);
2580
2581 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2582
2583 /*
2584 * if we're invalidating a buffer that has the B_FILTER bit
2585 * set then call the b_iodone function so it gets cleaned
2586 * up properly.
2587 *
2588 * the HFS journal code depends on this
2589 */
2590 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
2591 if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */
2592 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
2593 void *arg = bp->b_transaction;
2594
2595 CLR(bp->b_flags, B_FILTER); /* but note callout done */
2596 bp->b_iodone = NULL;
2597 bp->b_transaction = NULL;
2598
2599 if (iodone_func == NULL) {
2600 panic("brelse: bp @ %p has NULL b_iodone!\n", bp);
2601 }
2602 (*iodone_func)(bp, arg);
2603 }
2604 }
2605 /*
2606 * I/O is done. Cleanup the UPL state
2607 */
2608 upl = bp->b_upl;
2609
2610 if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2611 kern_return_t kret;
2612 int upl_flags;
2613
2614 if (upl == NULL) {
2615 if ( !ISSET(bp->b_flags, B_INVAL)) {
2616 kret = ubc_create_upl(bp->b_vp,
2617 ubc_blktooff(bp->b_vp, bp->b_lblkno),
2618 bp->b_bufsize,
2619 &upl,
2620 NULL,
2621 UPL_PRECIOUS);
2622
2623 if (kret != KERN_SUCCESS)
2624 panic("brelse: Failed to create UPL");
2625 #if UPL_DEBUG
2626 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5);
2627 #endif /* UPL_DEBUG */
2628 }
2629 } else {
2630 if (bp->b_datap) {
2631 kret = ubc_upl_unmap(upl);
2632
2633 if (kret != KERN_SUCCESS)
2634 panic("ubc_upl_unmap failed");
2635 bp->b_datap = (uintptr_t)NULL;
2636 }
2637 }
2638 if (upl) {
2639 if (bp->b_flags & (B_ERROR | B_INVAL)) {
2640 if (bp->b_flags & (B_READ | B_INVAL))
2641 upl_flags = UPL_ABORT_DUMP_PAGES;
2642 else
2643 upl_flags = 0;
2644
2645 ubc_upl_abort(upl, upl_flags);
2646 } else {
2647 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
2648 upl_flags = UPL_COMMIT_SET_DIRTY ;
2649 else
2650 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
2651
2652 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
2653 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2654 }
2655 bp->b_upl = NULL;
2656 }
2657 } else {
2658 if ( (upl) )
2659 panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp);
2660 }
2661
2662 /*
2663 * If it's locked, don't report an error; try again later.
2664 */
2665 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
2666 CLR(bp->b_flags, B_ERROR);
2667 /*
2668 * If it's not cacheable, or an error, mark it invalid.
2669 */
2670 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
2671 SET(bp->b_flags, B_INVAL);
2672
2673 if ((bp->b_bufsize <= 0) ||
2674 ISSET(bp->b_flags, B_INVAL) ||
2675 (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) {
2676
2677 boolean_t delayed_buf_free_meta_store = FALSE;
2678
2679 /*
2680 * If it's invalid or empty, dissociate it from its vnode,
2681 * release its storage if B_META, and
2682 * clean it up a bit and put it on the EMPTY queue
2683 */
2684 if (ISSET(bp->b_flags, B_DELWRI))
2685 OSAddAtomicLong(-1, &nbdwrite);
2686
2687 if (ISSET(bp->b_flags, B_META)) {
2688 if (bp->b_shadow_ref)
2689 delayed_buf_free_meta_store = TRUE;
2690 else
2691 buf_free_meta_store(bp);
2692 }
2693 /*
2694 * nuke any credentials we were holding
2695 */
2696 buf_release_credentials(bp);
2697
2698 lck_mtx_lock_spin(buf_mtxp);
2699
2700 if (bp->b_shadow_ref) {
2701 SET(bp->b_lflags, BL_WAITSHADOW);
2702
2703 lck_mtx_unlock(buf_mtxp);
2704
2705 return;
2706 }
2707 if (delayed_buf_free_meta_store == TRUE) {
2708
2709 lck_mtx_unlock(buf_mtxp);
2710 finish_shadow_master:
2711 buf_free_meta_store(bp);
2712
2713 lck_mtx_lock_spin(buf_mtxp);
2714 }
2715 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
2716
2717 if (bp->b_vp)
2718 brelvp_locked(bp);
2719
2720 bremhash(bp);
2721 BLISTNONE(bp);
2722 binshash(bp, &invalhash);
2723
2724 bp->b_whichq = BQ_EMPTY;
2725 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2726 } else {
2727
2728 /*
2729 * It has valid data. Put it on the end of the appropriate
2730 * queue, so that it'll stick around for as long as possible.
2731 */
2732 if (ISSET(bp->b_flags, B_LOCKED))
2733 whichq = BQ_LOCKED; /* locked in core */
2734 else if (ISSET(bp->b_flags, B_META))
2735 whichq = BQ_META; /* meta-data */
2736 else if (ISSET(bp->b_flags, B_AGE))
2737 whichq = BQ_AGE; /* stale but valid data */
2738 else
2739 whichq = BQ_LRU; /* valid data */
2740 bufq = &bufqueues[whichq];
2741
2742 bp->b_timestamp = buf_timestamp();
2743
2744 lck_mtx_lock_spin(buf_mtxp);
2745
2746 /*
2747 * the buf_brelse_shadow routine doesn't take 'ownership'
2748 * of the parent buf_t... it updates state that is protected by
2749 * the buf_mtxp, and checks for BL_BUSY to determine whether to
2750 * put the buf_t back on a free list. b_shadow_ref is protected
2751 * by the lock, and since we have not yet cleared B_BUSY, we need
2752 * to check it while holding the lock to insure that one of us
2753 * puts this buf_t back on a free list when it is safe to do so
2754 */
2755 if (bp->b_shadow_ref == 0) {
2756 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
2757 bp->b_whichq = whichq;
2758 binstailfree(bp, bufq, whichq);
2759 } else {
2760 /*
2761 * there are still cloned buf_t's pointing
2762 * at this guy... need to keep it off the
2763 * freelists until a buf_brelse is done on
2764 * the last clone
2765 */
2766 CLR(bp->b_flags, (B_ASYNC | B_NOCACHE));
2767 }
2768 }
2769 if (needbuffer) {
2770 /*
2771 * needbuffer is a global
2772 * we're currently using buf_mtxp to protect it
2773 * delay doing the actual wakeup until after
2774 * we drop buf_mtxp
2775 */
2776 needbuffer = 0;
2777 need_wakeup = 1;
2778 }
2779 if (ISSET(bp->b_lflags, BL_WANTED)) {
2780 /*
2781 * delay the actual wakeup until after we
2782 * clear BL_BUSY and we've dropped buf_mtxp
2783 */
2784 need_bp_wakeup = 1;
2785 }
2786 /*
2787 * Unlock the buffer.
2788 */
2789 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2790 buf_busycount--;
2791
2792 lck_mtx_unlock(buf_mtxp);
2793
2794 if (need_wakeup) {
2795 /*
2796 * Wake up any processes waiting for any buffer to become free.
2797 */
2798 wakeup(&needbuffer);
2799 }
2800 if (need_bp_wakeup) {
2801 /*
2802 * Wake up any processes waiting for _this_ buffer to become free.
2803 */
2804 wakeup(bp);
2805 }
2806 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2807 bp, bp->b_datap, bp->b_flags, 0, 0);
2808 }
2809
2810 /*
2811 * Determine if a block is in the cache.
2812 * Just look on what would be its hash chain. If it's there, return
2813 * a pointer to it, unless it's marked invalid. If it's marked invalid,
2814 * we normally don't return the buffer, unless the caller explicitly
2815 * wants us to.
2816 */
2817 static boolean_t
2818 incore(vnode_t vp, daddr64_t blkno)
2819 {
2820 boolean_t retval;
2821 struct bufhashhdr *dp;
2822
2823 dp = BUFHASH(vp, blkno);
2824
2825 lck_mtx_lock_spin(buf_mtxp);
2826
2827 if (incore_locked(vp, blkno, dp))
2828 retval = TRUE;
2829 else
2830 retval = FALSE;
2831 lck_mtx_unlock(buf_mtxp);
2832
2833 return (retval);
2834 }
2835
2836
2837 static buf_t
2838 incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp)
2839 {
2840 struct buf *bp;
2841
2842 /* Search hash chain */
2843 for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
2844 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2845 !ISSET(bp->b_flags, B_INVAL)) {
2846 return (bp);
2847 }
2848 }
2849 return (NULL);
2850 }
2851
2852
2853 void
2854 buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno)
2855 {
2856 buf_t bp;
2857 struct bufhashhdr *dp;
2858
2859 dp = BUFHASH(vp, blkno);
2860
2861 lck_mtx_lock_spin(buf_mtxp);
2862
2863 for (;;) {
2864 if ((bp = incore_locked(vp, blkno, dp)) == NULL)
2865 break;
2866
2867 if (bp->b_shadow_ref == 0)
2868 break;
2869
2870 SET(bp->b_lflags, BL_WANTED_REF);
2871
2872 (void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO+1), "buf_wait_for_shadow", NULL);
2873 }
2874 lck_mtx_unlock(buf_mtxp);
2875 }
2876
2877 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2878 /*
2879 * Get a block of requested size that is associated with
2880 * a given vnode and block offset. If it is found in the
2881 * block cache, mark it as having been found, make it busy
2882 * and return it. Otherwise, return an empty block of the
2883 * correct size. It is up to the caller to insure that the
2884 * cached blocks be of the correct size.
2885 */
2886 buf_t
2887 buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2888 {
2889 buf_t bp;
2890 int err;
2891 upl_t upl;
2892 upl_page_info_t *pl;
2893 kern_return_t kret;
2894 int ret_only_valid;
2895 struct timespec ts;
2896 int upl_flags;
2897 struct bufhashhdr *dp;
2898
2899 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2900 (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0);
2901
2902 ret_only_valid = operation & BLK_ONLYVALID;
2903 operation &= ~BLK_ONLYVALID;
2904 dp = BUFHASH(vp, blkno);
2905 start:
2906 lck_mtx_lock_spin(buf_mtxp);
2907
2908 if ((bp = incore_locked(vp, blkno, dp))) {
2909 /*
2910 * Found in the Buffer Cache
2911 */
2912 if (ISSET(bp->b_lflags, BL_BUSY)) {
2913 /*
2914 * but is busy
2915 */
2916 switch (operation) {
2917 case BLK_READ:
2918 case BLK_WRITE:
2919 case BLK_META:
2920 SET(bp->b_lflags, BL_WANTED);
2921 bufstats.bufs_busyincore++;
2922
2923 /*
2924 * don't retake the mutex after being awakened...
2925 * the time out is in msecs
2926 */
2927 ts.tv_sec = (slptimeo/1000);
2928 ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000;
2929
2930 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE,
2931 (uintptr_t)blkno, size, operation, 0, 0);
2932
2933 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2934
2935 /*
2936 * Callers who call with PCATCH or timeout are
2937 * willing to deal with the NULL pointer
2938 */
2939 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2940 return (NULL);
2941 goto start;
2942 /*NOTREACHED*/
2943 break;
2944
2945 default:
2946 /*
2947 * unknown operation requested
2948 */
2949 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2950 /*NOTREACHED*/
2951 break;
2952 }
2953 } else {
2954 /*
2955 * buffer in core and not busy
2956 */
2957 SET(bp->b_lflags, BL_BUSY);
2958 SET(bp->b_flags, B_CACHE);
2959 buf_busycount++;
2960
2961 bremfree_locked(bp);
2962 bufstats.bufs_incore++;
2963
2964 lck_mtx_unlock(buf_mtxp);
2965 #ifdef JOE_DEBUG
2966 bp->b_owner = current_thread();
2967 bp->b_tag = 1;
2968 #endif
2969 if ( (bp->b_upl) )
2970 panic("buffer has UPL, but not marked BUSY: %p", bp);
2971
2972 if ( !ret_only_valid && bp->b_bufsize != size)
2973 allocbuf(bp, size);
2974
2975 upl_flags = 0;
2976 switch (operation) {
2977 case BLK_WRITE:
2978 /*
2979 * "write" operation: let the UPL subsystem
2980 * know that we intend to modify the buffer
2981 * cache pages we're gathering.
2982 */
2983 upl_flags |= UPL_WILL_MODIFY;
2984 case BLK_READ:
2985 upl_flags |= UPL_PRECIOUS;
2986 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2987 kret = ubc_create_upl(vp,
2988 ubc_blktooff(vp, bp->b_lblkno),
2989 bp->b_bufsize,
2990 &upl,
2991 &pl,
2992 upl_flags);
2993 if (kret != KERN_SUCCESS)
2994 panic("Failed to create UPL");
2995
2996 bp->b_upl = upl;
2997
2998 if (upl_valid_page(pl, 0)) {
2999 if (upl_dirty_page(pl, 0))
3000 SET(bp->b_flags, B_WASDIRTY);
3001 else
3002 CLR(bp->b_flags, B_WASDIRTY);
3003 } else
3004 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
3005
3006 kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap));
3007
3008 if (kret != KERN_SUCCESS)
3009 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3010 }
3011 break;
3012
3013 case BLK_META:
3014 /*
3015 * VM is not involved in IO for the meta data
3016 * buffer already has valid data
3017 */
3018 break;
3019
3020 default:
3021 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
3022 /*NOTREACHED*/
3023 break;
3024 }
3025 }
3026 } else { /* not incore() */
3027 int queue = BQ_EMPTY; /* Start with no preference */
3028
3029 if (ret_only_valid) {
3030 lck_mtx_unlock(buf_mtxp);
3031 return (NULL);
3032 }
3033 if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/)
3034 operation = BLK_META;
3035
3036 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
3037 goto start;
3038
3039 /*
3040 * getnewbuf may block for a number of different reasons...
3041 * if it does, it's then possible for someone else to
3042 * create a buffer for the same block and insert it into
3043 * the hash... if we see it incore at this point we dump
3044 * the buffer we were working on and start over
3045 */
3046 if (incore_locked(vp, blkno, dp)) {
3047 SET(bp->b_flags, B_INVAL);
3048 binshash(bp, &invalhash);
3049
3050 lck_mtx_unlock(buf_mtxp);
3051
3052 buf_brelse(bp);
3053 goto start;
3054 }
3055 /*
3056 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
3057 * CALLED! BE CAREFUL.
3058 */
3059
3060 /*
3061 * mark the buffer as B_META if indicated
3062 * so that when the buffer is released it will go to the META queue
3063 */
3064 if (operation == BLK_META)
3065 SET(bp->b_flags, B_META);
3066
3067 bp->b_blkno = bp->b_lblkno = blkno;
3068 bp->b_vp = vp;
3069
3070 /*
3071 * Insert in the hash so that incore() can find it
3072 */
3073 binshash(bp, BUFHASH(vp, blkno));
3074
3075 bgetvp_locked(vp, bp);
3076
3077 lck_mtx_unlock(buf_mtxp);
3078
3079 allocbuf(bp, size);
3080
3081 upl_flags = 0;
3082 switch (operation) {
3083 case BLK_META:
3084 /*
3085 * buffer data is invalid...
3086 *
3087 * I don't want to have to retake buf_mtxp,
3088 * so the miss and vmhits counters are done
3089 * with Atomic updates... all other counters
3090 * in bufstats are protected with either
3091 * buf_mtxp or iobuffer_mtxp
3092 */
3093 OSAddAtomicLong(1, &bufstats.bufs_miss);
3094 break;
3095
3096 case BLK_WRITE:
3097 /*
3098 * "write" operation: let the UPL subsystem know
3099 * that we intend to modify the buffer cache pages
3100 * we're gathering.
3101 */
3102 upl_flags |= UPL_WILL_MODIFY;
3103 case BLK_READ:
3104 { off_t f_offset;
3105 size_t contig_bytes;
3106 int bmap_flags;
3107
3108 #if DEVELOPMENT || DEBUG
3109 /*
3110 * Apple-implemented file systems use UBC exclusively; they should
3111 * not call in here.
3112 */
3113 const char* excldfs[] = {"hfs", "afpfs", "smbfs", "acfs",
3114 "exfat", "msdos", "webdav", NULL};
3115
3116 for (int i = 0; excldfs[i] != NULL; i++) {
3117 if (vp->v_mount &&
3118 !strcmp(vp->v_mount->mnt_vfsstat.f_fstypename,
3119 excldfs[i])) {
3120 panic("%s %s calls buf_getblk",
3121 excldfs[i],
3122 operation == BLK_READ ? "BLK_READ" : "BLK_WRITE");
3123 }
3124 }
3125 #endif
3126
3127 if ( (bp->b_upl) )
3128 panic("bp already has UPL: %p",bp);
3129
3130 f_offset = ubc_blktooff(vp, blkno);
3131
3132 upl_flags |= UPL_PRECIOUS;
3133 kret = ubc_create_upl(vp,
3134 f_offset,
3135 bp->b_bufsize,
3136 &upl,
3137 &pl,
3138 upl_flags);
3139
3140 if (kret != KERN_SUCCESS)
3141 panic("Failed to create UPL");
3142 #if UPL_DEBUG
3143 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4);
3144 #endif /* UPL_DEBUG */
3145 bp->b_upl = upl;
3146
3147 if (upl_valid_page(pl, 0)) {
3148
3149 if (operation == BLK_READ)
3150 bmap_flags = VNODE_READ;
3151 else
3152 bmap_flags = VNODE_WRITE;
3153
3154 SET(bp->b_flags, B_CACHE | B_DONE);
3155
3156 OSAddAtomicLong(1, &bufstats.bufs_vmhits);
3157
3158 bp->b_validoff = 0;
3159 bp->b_dirtyoff = 0;
3160
3161 if (upl_dirty_page(pl, 0)) {
3162 /* page is dirty */
3163 SET(bp->b_flags, B_WASDIRTY);
3164
3165 bp->b_validend = bp->b_bcount;
3166 bp->b_dirtyend = bp->b_bcount;
3167 } else {
3168 /* page is clean */
3169 bp->b_validend = bp->b_bcount;
3170 bp->b_dirtyend = 0;
3171 }
3172 /*
3173 * try to recreate the physical block number associated with
3174 * this buffer...
3175 */
3176 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
3177 panic("getblk: VNOP_BLOCKMAP failed");
3178 /*
3179 * if the extent represented by this buffer
3180 * is not completely physically contiguous on
3181 * disk, then we can't cache the physical mapping
3182 * in the buffer header
3183 */
3184 if ((long)contig_bytes < bp->b_bcount)
3185 bp->b_blkno = bp->b_lblkno;
3186 } else {
3187 OSAddAtomicLong(1, &bufstats.bufs_miss);
3188 }
3189 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
3190
3191 if (kret != KERN_SUCCESS)
3192 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3193 break;
3194 }
3195 default:
3196 panic("getblk: paging or unknown operation - %x", operation);
3197 /*NOTREACHED*/
3198 break;
3199 }
3200 }
3201 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
3202 bp, bp->b_datap, bp->b_flags, 3, 0);
3203
3204 #ifdef JOE_DEBUG
3205 (void) OSBacktrace(&bp->b_stackgetblk[0], 6);
3206 #endif
3207 return (bp);
3208 }
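/*
 * A common caller pattern for buf_getblk(), as a sketch ('fill_block' stands
 * in for whatever read path the file system uses): if the block came back
 * from the cache with valid contents there is nothing to do, otherwise it
 * must be filled before use.
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_META);
 *	if (!buf_valid(bp))
 *		fill_block(bp);
 *	(use buf_dataptr(bp))
 *	buf_brelse(bp);
 */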
3209
3210 /*
3211 * Get an empty, disassociated buffer of given size.
3212 */
3213 buf_t
3214 buf_geteblk(int size)
3215 {
3216 buf_t bp = NULL;
3217 int queue = BQ_EMPTY;
3218
3219 do {
3220 lck_mtx_lock_spin(buf_mtxp);
3221
3222 bp = getnewbuf(0, 0, &queue);
3223 } while (bp == NULL);
3224
3225 SET(bp->b_flags, (B_META|B_INVAL));
3226
3227 #if DIAGNOSTIC
3228 assert(queue == BQ_EMPTY);
3229 #endif /* DIAGNOSTIC */
3230 /* XXX need to implement logic to deal with other queues */
3231
3232 binshash(bp, &invalhash);
3233 bufstats.bufs_eblk++;
3234
3235 lck_mtx_unlock(buf_mtxp);
3236
3237 allocbuf(bp, size);
3238
3239 return (bp);
3240 }
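/*
 * Sketch of buf_geteblk() usage: the returned buffer is anonymous (marked
 * B_META | B_INVAL and hashed on invalhash), which makes it suitable as
 * scratch storage that is handed back to the pool with buf_brelse().
 *
 *	buf_t scratch = buf_geteblk(blksize);
 *	(use buf_dataptr(scratch) as temporary storage)
 *	buf_brelse(scratch);
 */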
3241
3242 uint32_t
3243 buf_redundancy_flags(buf_t bp)
3244 {
3245 return bp->b_redundancy_flags;
3246 }
3247
3248 void
3249 buf_set_redundancy_flags(buf_t bp, uint32_t flags)
3250 {
3251 SET(bp->b_redundancy_flags, flags);
3252 }
3253
3254 void
3255 buf_clear_redundancy_flags(buf_t bp, uint32_t flags)
3256 {
3257 CLR(bp->b_redundancy_flags, flags);
3258 }
3259
3260
3261
3262 static void *
3263 recycle_buf_from_pool(int nsize)
3264 {
3265 buf_t bp;
3266 void *ptr = NULL;
3267
3268 lck_mtx_lock_spin(buf_mtxp);
3269
3270 TAILQ_FOREACH(bp, &bufqueues[BQ_META], b_freelist) {
3271 if (ISSET(bp->b_flags, B_DELWRI) || bp->b_bufsize != nsize)
3272 continue;
3273 ptr = (void *)bp->b_datap;
3274 bp->b_bufsize = 0;
3275
3276 bcleanbuf(bp, TRUE);
3277 break;
3278 }
3279 lck_mtx_unlock(buf_mtxp);
3280
3281 return (ptr);
3282 }
3283
3284
3285
3286 int zalloc_nopagewait_failed = 0;
3287 int recycle_buf_failed = 0;
3288
3289 static void *
3290 grab_memory_for_meta_buf(int nsize)
3291 {
3292 zone_t z;
3293 void *ptr;
3294 boolean_t was_vmpriv;
3295
3296 z = getbufzone(nsize);
3297
3298 /*
3299 * make sure we're NOT privileged so that
3300 * if a vm_page_grab is needed, it won't
3301 * block if we're out of free pages... if
3302 * it blocks, then we can't honor the
3303 * nopagewait request
3304 */
3305 was_vmpriv = set_vm_privilege(FALSE);
3306
3307 ptr = zalloc_nopagewait(z);
3308
3309 if (was_vmpriv == TRUE)
3310 set_vm_privilege(TRUE);
3311
3312 if (ptr == NULL) {
3313
3314 zalloc_nopagewait_failed++;
3315
3316 ptr = recycle_buf_from_pool(nsize);
3317
3318 if (ptr == NULL) {
3319
3320 recycle_buf_failed++;
3321
3322 if (was_vmpriv == FALSE)
3323 set_vm_privilege(TRUE);
3324
3325 ptr = zalloc(z);
3326
3327 if (was_vmpriv == FALSE)
3328 set_vm_privilege(FALSE);
3329 }
3330 }
3331 return (ptr);
3332 }
3333
3334 /*
3335 * With UBC, there is no need to expand / shrink the file data
3336 * buffer. The VM uses the same pages, hence no waste.
3337 * All the file data buffers can have one size.
3338 * In fact expand / shrink would be an expensive operation.
3339 *
3340 * The only exception to this is meta-data buffers. Most of the
3341 * meta-data operations are smaller than PAGE_SIZE. Having the
3342 * meta-data buffers grow and shrink as needed optimizes use
3343 * of the kernel's wired memory.
3344 */
3345
3346 int
3347 allocbuf(buf_t bp, int size)
3348 {
3349 vm_size_t desired_size;
3350
3351 desired_size = roundup(size, CLBYTES);
3352
3353 if (desired_size < PAGE_SIZE)
3354 desired_size = PAGE_SIZE;
3355 if (desired_size > MAXBSIZE)
3356 panic("allocbuf: buffer larger than MAXBSIZE requested");
3357
3358 if (ISSET(bp->b_flags, B_META)) {
3359 int nsize = roundup(size, MINMETA);
3360
3361 if (bp->b_datap) {
3362 vm_offset_t elem = (vm_offset_t)bp->b_datap;
3363
3364 if (ISSET(bp->b_flags, B_ZALLOC)) {
3365 if (bp->b_bufsize < nsize) {
3366 zone_t zprev;
3367
3368 /* reallocate to a bigger size */
3369
3370 zprev = getbufzone(bp->b_bufsize);
3371 if (nsize <= MAXMETA) {
3372 desired_size = nsize;
3373
3374 /* b_datap not really a ptr */
3375 *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
3376 } else {
3377 bp->b_datap = (uintptr_t)NULL;
3378 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3379 CLR(bp->b_flags, B_ZALLOC);
3380 }
3381 bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3382 zfree(zprev, (void *)elem);
3383 } else {
3384 desired_size = bp->b_bufsize;
3385 }
3386
3387 } else {
3388 if ((vm_size_t)bp->b_bufsize < desired_size) {
3389 /* reallocate to a bigger size */
3390 bp->b_datap = (uintptr_t)NULL;
3391 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3392 bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3393 kmem_free(kernel_map, elem, bp->b_bufsize);
3394 } else {
3395 desired_size = bp->b_bufsize;
3396 }
3397 }
3398 } else {
3399 /* new allocation */
3400 if (nsize <= MAXMETA) {
3401 desired_size = nsize;
3402
3403 /* b_datap not really a ptr */
3404 *(void **)(&bp->b_datap) = grab_memory_for_meta_buf(nsize);
3405 SET(bp->b_flags, B_ZALLOC);
3406 } else
3407 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size, VM_KERN_MEMORY_FILE);
3408 }
3409
3410 if (bp->b_datap == 0)
3411 panic("allocbuf: NULL b_datap");
3412 }
3413 bp->b_bufsize = desired_size;
3414 bp->b_bcount = size;
3415
3416 return (0);
3417 }
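/*
 * Worked example of the sizing rules above, assuming the usual
 * MINMETA = 512, CLBYTES = PAGE_SIZE = 4096 configuration: a request for a
 * 1100-byte meta-data buffer is rounded up to nsize = 1536 and satisfied
 * from the matching zone, leaving b_bufsize = 1536 and b_bcount = 1100;
 * a non-meta request of the same size is rounded up to CLBYTES and floored
 * at PAGE_SIZE, leaving b_bufsize = 4096.
 */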
3418
3419 /*
3420 * Get a new buffer from one of the free lists.
3421 *
3422 * A request for a queue is passed in. The queue from which the buffer was
3423 * taken is returned. Out of range queue requests get BQ_EMPTY. A request for
3424 * BQUEUE means no preference; use heuristics in that case.
3425 * The heuristics are as follows:
3426 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
3427 * If none available block till one is made available.
3428 * If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
3429 * Pick the most stale buffer.
3430 * If found buffer was marked delayed write, start the async. write
3431 * and restart the search.
3432 * Initialize the fields and disassociate the buffer from the vnode.
3433 * Remove the buffer from the hash. Return the buffer and the queue
3434 * on which it was found.
3435 *
3436 * buf_mtxp is held upon entry
3437 * returns with buf_mtxp locked if new buf available
3438 * returns with buf_mtxp UNlocked if new buf NOT available
3439 */
3440
3441 static buf_t
3442 getnewbuf(int slpflag, int slptimeo, int * queue)
3443 {
3444 buf_t bp;
3445 buf_t lru_bp;
3446 buf_t age_bp;
3447 buf_t meta_bp;
3448 int age_time, lru_time, bp_time, meta_time;
3449 int req = *queue; /* save it for restarts */
3450 struct timespec ts;
3451
3452 start:
3453 /*
3454 * invalid request gets empty queue
3455 */
3456 if ((*queue >= BQUEUES) || (*queue < 0)
3457 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
3458 *queue = BQ_EMPTY;
3459
3460
3461 if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first))
3462 goto found;
3463
3464 /*
3465 * need to grow number of bufs, add another one rather than recycling
3466 */
3467 if (nbuf_headers < max_nbuf_headers) {
3468 /*
3469 * Increment count now as lock
3470 * is dropped for allocation.
3471 * That avoids over commits
3472 */
3473 nbuf_headers++;
3474 goto add_newbufs;
3475 }
3476 /* Try for the requested queue first */
3477 bp = bufqueues[*queue].tqh_first;
3478 if (bp)
3479 goto found;
3480
3481 /* Unable to use requested queue */
3482 age_bp = bufqueues[BQ_AGE].tqh_first;
3483 lru_bp = bufqueues[BQ_LRU].tqh_first;
3484 meta_bp = bufqueues[BQ_META].tqh_first;
3485
3486 if (!age_bp && !lru_bp && !meta_bp) {
3487 /*
3488 * Unavailable on AGE or LRU or META queues
3489 * Try the empty list first
3490 */
3491 bp = bufqueues[BQ_EMPTY].tqh_first;
3492 if (bp) {
3493 *queue = BQ_EMPTY;
3494 goto found;
3495 }
3496 /*
3497 * We have seen that this is hard to trigger.
3498 * This is an overcommit of nbufs but is needed
3499 * in some scenarios with disk images
3500 */
3501
3502 add_newbufs:
3503 lck_mtx_unlock(buf_mtxp);
3504
3505 /* Create a new temporary buffer header */
3506 bp = (struct buf *)zalloc(buf_hdr_zone);
3507
3508 if (bp) {
3509 bufhdrinit(bp);
3510 bp->b_whichq = BQ_EMPTY;
3511 bp->b_timestamp = buf_timestamp();
3512 BLISTNONE(bp);
3513 SET(bp->b_flags, B_HDRALLOC);
3514 *queue = BQ_EMPTY;
3515 }
3516 lck_mtx_lock_spin(buf_mtxp);
3517
3518 if (bp) {
3519 binshash(bp, &invalhash);
3520 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3521 buf_hdr_count++;
3522 goto found;
3523 }
3524 /* subtract already accounted bufcount */
3525 nbuf_headers--;
3526
3527 bufstats.bufs_sleeps++;
3528
3529 /* wait for a free buffer of any kind */
3530 needbuffer = 1;
3531 /* hz value is 100 */
3532 ts.tv_sec = (slptimeo/1000);
3533 /* the hz value is 100; which leads to 10ms */
3534 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
3535
3536 msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO+1), "getnewbuf", &ts);
3537 return (NULL);
3538 }
3539
3540 /* Buffer available either on AGE or LRU or META */
3541 bp = NULL;
3542 *queue = -1;
3543
3544 /* Buffer available either on AGE or LRU */
3545 if (!age_bp) {
3546 bp = lru_bp;
3547 *queue = BQ_LRU;
3548 } else if (!lru_bp) {
3549 bp = age_bp;
3550 *queue = BQ_AGE;
3551 } else { /* buffer available on both AGE and LRU */
3552 int t = buf_timestamp();
3553
3554 age_time = t - age_bp->b_timestamp;
3555 lru_time = t - lru_bp->b_timestamp;
3556 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
3557 bp = age_bp;
3558 *queue = BQ_AGE;
3559 /*
3560 * we should probably re-timestamp everything in the
3561 * queues at this point with the current time
3562 */
3563 } else {
3564 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
3565 bp = lru_bp;
3566 *queue = BQ_LRU;
3567 } else {
3568 bp = age_bp;
3569 *queue = BQ_AGE;
3570 }
3571 }
3572 }
3573
3574 if (!bp) { /* Neither on AGE nor on LRU */
3575 bp = meta_bp;
3576 *queue = BQ_META;
3577 } else if (meta_bp) {
3578 int t = buf_timestamp();
3579
3580 bp_time = t - bp->b_timestamp;
3581 meta_time = t - meta_bp->b_timestamp;
3582
3583 if (!(bp_time < 0) && !(meta_time < 0)) {
3584 /* time not set backwards */
3585 int bp_is_stale;
3586 bp_is_stale = (*queue == BQ_LRU) ?
3587 lru_is_stale : age_is_stale;
3588
3589 if ((meta_time >= meta_is_stale) &&
3590 (bp_time < bp_is_stale)) {
3591 bp = meta_bp;
3592 *queue = BQ_META;
3593 }
3594 }
3595 }
3596 found:
3597 if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
3598 panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
3599
3600 /* Clean it */
3601 if (bcleanbuf(bp, FALSE)) {
3602 /*
3603 * moved to the laundry thread, buffer not ready
3604 */
3605 *queue = req;
3606 goto start;
3607 }
3608 return (bp);
3609 }
3610
3611
3612 /*
3613 * Clean a buffer.
3614 * Returns 0 if buffer is ready to use,
3615 * Returns 1 if issued a buf_bawrite() to indicate
3616 * that the buffer is not ready.
3617 *
3618 * buf_mtxp is held upon entry
3619 * returns with buf_mtxp locked
3620 */
3621 int
3622 bcleanbuf(buf_t bp, boolean_t discard)
3623 {
3624 /* Remove from the queue */
3625 bremfree_locked(bp);
3626
3627 #ifdef JOE_DEBUG
3628 bp->b_owner = current_thread();
3629 bp->b_tag = 2;
3630 #endif
3631 /*
3632 * If buffer was a delayed write, start the IO by queuing
3633 * it on the LAUNDRY queue, and return 1
3634 */
3635 if (ISSET(bp->b_flags, B_DELWRI)) {
3636 if (discard) {
3637 SET(bp->b_lflags, BL_WANTDEALLOC);
3638 }
3639
3640 bmovelaundry(bp);
3641
3642 lck_mtx_unlock(buf_mtxp);
3643
3644 wakeup(&bufqueues[BQ_LAUNDRY]);
3645 /*
3646 * and give it a chance to run
3647 */
3648 (void)thread_block(THREAD_CONTINUE_NULL);
3649
3650 lck_mtx_lock_spin(buf_mtxp);
3651
3652 return (1);
3653 }
3654 #ifdef JOE_DEBUG
3655 bp->b_owner = current_thread();
3656 bp->b_tag = 8;
3657 #endif
3658 /*
3659 * Buffer is no longer on any free list... we own it
3660 */
3661 SET(bp->b_lflags, BL_BUSY);
3662 buf_busycount++;
3663
3664 bremhash(bp);
3665
3666 /*
3667 * disassociate us from our vnode, if we had one...
3668 */
3669 if (bp->b_vp)
3670 brelvp_locked(bp);
3671
3672 lck_mtx_unlock(buf_mtxp);
3673
3674 BLISTNONE(bp);
3675
3676 if (ISSET(bp->b_flags, B_META))
3677 buf_free_meta_store(bp);
3678
3679 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
3680
3681 buf_release_credentials(bp);
3682
3683 /* If discarding, just move to the empty queue */
3684 if (discard) {
3685 lck_mtx_lock_spin(buf_mtxp);
3686 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
3687 bp->b_whichq = BQ_EMPTY;
3688 binshash(bp, &invalhash);
3689 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3690 CLR(bp->b_lflags, BL_BUSY);
3691 buf_busycount--;
3692 } else {
3693 /* Not discarding: clean up and prepare for reuse */
3694 bp->b_bufsize = 0;
3695 bp->b_datap = (uintptr_t)NULL;
3696 bp->b_upl = (void *)NULL;
3697 bp->b_fsprivate = (void *)NULL;
3698 /*
3699 * preserve the state of whether this buffer
3700 * was allocated on the fly or not...
3701 * the only other flag that should be set at
3702 * this point is BL_BUSY...
3703 */
3704 #ifdef JOE_DEBUG
3705 bp->b_owner = current_thread();
3706 bp->b_tag = 3;
3707 #endif
3708 bp->b_lflags = BL_BUSY;
3709 bp->b_flags = (bp->b_flags & B_HDRALLOC);
3710 bp->b_redundancy_flags = 0;
3711 bp->b_dev = NODEV;
3712 bp->b_blkno = bp->b_lblkno = 0;
3713 bp->b_iodone = NULL;
3714 bp->b_error = 0;
3715 bp->b_resid = 0;
3716 bp->b_bcount = 0;
3717 bp->b_dirtyoff = bp->b_dirtyend = 0;
3718 bp->b_validoff = bp->b_validend = 0;
3719 bzero(&bp->b_attr, sizeof(struct bufattr));
3720
3721 lck_mtx_lock_spin(buf_mtxp);
3722 }
3723 return (0);
3724 }
3725
3726
3727
3728 errno_t
3729 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
3730 {
3731 buf_t bp;
3732 errno_t error;
3733 struct bufhashhdr *dp;
3734
3735 dp = BUFHASH(vp, lblkno);
3736
3737 relook:
3738 lck_mtx_lock_spin(buf_mtxp);
3739
3740 if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
3741 lck_mtx_unlock(buf_mtxp);
3742 return (0);
3743 }
3744 if (ISSET(bp->b_lflags, BL_BUSY)) {
3745 if ( !ISSET(flags, BUF_WAIT)) {
3746 lck_mtx_unlock(buf_mtxp);
3747 return (EBUSY);
3748 }
3749 SET(bp->b_lflags, BL_WANTED);
3750
3751 error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
3752
3753 if (error) {
3754 return (error);
3755 }
3756 goto relook;
3757 }
3758 bremfree_locked(bp);
3759 SET(bp->b_lflags, BL_BUSY);
3760 SET(bp->b_flags, B_INVAL);
3761 buf_busycount++;
3762 #ifdef JOE_DEBUG
3763 bp->b_owner = current_thread();
3764 bp->b_tag = 4;
3765 #endif
3766 lck_mtx_unlock(buf_mtxp);
3767 buf_brelse(bp);
3768
3769 return (0);
3770 }
3771
3772
3773 void
3774 buf_drop(buf_t bp)
3775 {
3776 int need_wakeup = 0;
3777
3778 lck_mtx_lock_spin(buf_mtxp);
3779
3780 if (ISSET(bp->b_lflags, BL_WANTED)) {
3781 /*
3782 * delay the actual wakeup until after we
3783 * clear BL_BUSY and we've dropped buf_mtxp
3784 */
3785 need_wakeup = 1;
3786 }
3787 #ifdef JOE_DEBUG
3788 bp->b_owner = current_thread();
3789 bp->b_tag = 9;
3790 #endif
3791 /*
3792 * Unlock the buffer.
3793 */
3794 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
3795 buf_busycount--;
3796
3797 lck_mtx_unlock(buf_mtxp);
3798
3799 if (need_wakeup) {
3800 /*
3801 * Wake up any processes waiting for _this_ buffer to become free.
3802 */
3803 wakeup(bp);
3804 }
3805 }
3806
3807
3808 errno_t
3809 buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
3810 errno_t error;
3811
3812 lck_mtx_lock_spin(buf_mtxp);
3813
3814 error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
3815
3816 lck_mtx_unlock(buf_mtxp);
3817
3818 return (error);
3819 }
3820
3821
3822 static errno_t
3823 buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
3824 {
3825 errno_t error;
3826 struct timespec ts;
3827
3828 if (ISSET(bp->b_flags, B_LOCKED)) {
3829 if ((flags & BAC_SKIP_LOCKED))
3830 return (EDEADLK);
3831 } else {
3832 if ((flags & BAC_SKIP_NONLOCKED))
3833 return (EDEADLK);
3834 }
3835 if (ISSET(bp->b_lflags, BL_BUSY)) {
3836 /*
3837 * since the lck_mtx_lock may block, the buffer
3838 * may become BUSY, so we need to
3839 * recheck for a NOWAIT request
3840 */
3841 if (flags & BAC_NOWAIT)
3842 return (EBUSY);
3843 SET(bp->b_lflags, BL_WANTED);
3844
3845 /* the hz value is 100; which leads to 10ms */
3846 ts.tv_sec = (slptimeo/100);
3847 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
3848 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
3849
3850 if (error)
3851 return (error);
3852 return (EAGAIN);
3853 }
3854 if (flags & BAC_REMOVE)
3855 bremfree_locked(bp);
3856 SET(bp->b_lflags, BL_BUSY);
3857 buf_busycount++;
3858
3859 #ifdef JOE_DEBUG
3860 bp->b_owner = current_thread();
3861 bp->b_tag = 5;
3862 #endif
3863 return (0);
3864 }
3865
3866
3867 /*
3868 * Wait for operations on the buffer to complete.
3869 * When they do, extract and return the I/O's error value.
3870 */
3871 errno_t
3872 buf_biowait(buf_t bp)
3873 {
3874 while (!ISSET(bp->b_flags, B_DONE)) {
3875
3876 lck_mtx_lock_spin(buf_mtxp);
3877
3878 if (!ISSET(bp->b_flags, B_DONE)) {
3879 DTRACE_IO1(wait__start, buf_t, bp);
3880 (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL);
3881 DTRACE_IO1(wait__done, buf_t, bp);
3882 } else
3883 lck_mtx_unlock(buf_mtxp);
3884 }
3885 /* check for interruption of I/O (e.g. via NFS), then errors. */
3886 if (ISSET(bp->b_flags, B_EINTR)) {
3887 CLR(bp->b_flags, B_EINTR);
3888 return (EINTR);
3889 } else if (ISSET(bp->b_flags, B_ERROR))
3890 return (bp->b_error ? bp->b_error : EIO);
3891 else
3892 return (0);
3893 }
3894
3895
3896 /*
3897 * Mark I/O complete on a buffer.
3898 *
3899 * If a callback has been requested, e.g. the pageout
3900 * daemon, do so. Otherwise, awaken waiting processes.
3901 *
3902 * [ Leffler, et al., says on p.247:
3903 * "This routine wakes up the blocked process, frees the buffer
3904 * for an asynchronous write, or, for a request by the pagedaemon
3905 * process, invokes a procedure specified in the buffer structure" ]
3906 *
3907 * In real life, the pagedaemon (or other system processes) wants
3908 * to do async stuff too, and doesn't want the buffer buf_brelse()'d.
3909 * (for swap pager, that puts swap buffers on the free lists (!!!),
3910 * for the vn device, that puts malloc'd buffers on the free lists!)
3911 */
3912
3913 void
3914 buf_biodone(buf_t bp)
3915 {
3916 mount_t mp;
3917 struct bufattr *bap;
3918
3919 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
3920 bp, bp->b_datap, bp->b_flags, 0, 0);
3921
3922 if (ISSET(bp->b_flags, B_DONE))
3923 panic("biodone already");
3924
3925 bap = &bp->b_attr;
3926
3927 if (bp->b_vp && bp->b_vp->v_mount) {
3928 mp = bp->b_vp->v_mount;
3929 } else {
3930 mp = NULL;
3931 }
3932
3933 if (mp && (bp->b_flags & B_READ) == 0) {
3934 update_last_io_time(mp);
3935 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
3936 } else if (mp) {
3937 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
3938 }
3939
3940 if (kdebug_enable) {
3941 int code = DKIO_DONE;
3942 int io_tier = GET_BUFATTR_IO_TIER(bap);
3943
3944 if (bp->b_flags & B_READ)
3945 code |= DKIO_READ;
3946 if (bp->b_flags & B_ASYNC)
3947 code |= DKIO_ASYNC;
3948
3949 if (bp->b_flags & B_META)
3950 code |= DKIO_META;
3951 else if (bp->b_flags & B_PAGEIO)
3952 code |= DKIO_PAGING;
3953
3954 if (io_tier != 0)
3955 code |= DKIO_THROTTLE;
3956
3957 code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
3958
3959 if (bp->b_flags & B_PASSIVE)
3960 code |= DKIO_PASSIVE;
3961
3962 if (bap->ba_flags & BA_NOCACHE)
3963 code |= DKIO_NOCACHE;
3964
3965 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3966 buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0);
3967 }
3968
3969 /*
3970 * I/O was done, so don't believe
3971 * the DIRTY state from VM anymore...
3972 * and we need to reset the THROTTLED/PASSIVE
3973 * indicators
3974 */
3975 CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
3976 CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP));
3977
3978 SET_BUFATTR_IO_TIER(bap, 0);
3979
3980 DTRACE_IO1(done, buf_t, bp);
3981
3982 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3983 /*
3984 * wake up any writer's blocked
3985 * on throttle or waiting for I/O
3986 * to drain
3987 */
3988 vnode_writedone(bp->b_vp);
3989
3990 if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */
3991 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
3992 void *arg = bp->b_transaction;
3993 int callout = ISSET(bp->b_flags, B_CALL);
3994
3995 if (iodone_func == NULL)
3996 panic("biodone: bp @ %p has NULL b_iodone!\n", bp);
3997
3998 CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */
3999 bp->b_iodone = NULL;
4000 bp->b_transaction = NULL;
4001
4002 if (callout)
4003 SET(bp->b_flags, B_DONE); /* note that it's done */
4004
4005 (*iodone_func)(bp, arg);
4006
4007 if (callout) {
4008 /*
4009 * assumes that the callback function takes
4010 * ownership of the bp and deals with releasing it if necessary
4011 */
4012 goto biodone_done;
4013 }
4014 /*
4015 * in this case the call back function is acting
4016 * strictly as a filter... it does not take
4017 * ownership of the bp and is expecting us
4018 * to finish cleaning up... this is currently used
4019 * by the HFS journaling code
4020 */
4021 }
4022 if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
4023 SET(bp->b_flags, B_DONE); /* note that it's done */
4024
4025 buf_brelse(bp);
4026 } else { /* or just wakeup the buffer */
4027 /*
4028 * by taking the mutex, we serialize
4029 * the buf owner calling buf_biowait so that we'll
4030 * only see him in one of 2 states...
4031 * state 1: B_DONE wasn't set and he's
4032 * blocked in msleep
4033 * state 2: he's blocked trying to take the
4034 * mutex before looking at B_DONE
4035 * BL_WANTED is cleared in case anyone else
4036 * is blocked waiting for the buffer... note
4037 * that we haven't cleared B_BUSY yet, so if
4038 * they do get to run, their going to re-set
4039 * BL_WANTED and go back to sleep
4040 */
4041 lck_mtx_lock_spin(buf_mtxp);
4042
4043 CLR(bp->b_lflags, BL_WANTED);
4044 SET(bp->b_flags, B_DONE); /* note that it's done */
4045
4046 lck_mtx_unlock(buf_mtxp);
4047
4048 wakeup(bp);
4049 }
4050 biodone_done:
4051 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
4052 (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0);
4053 }
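/*
 * Sketch of the B_CALL path: a caller that wants completion notification
 * instead of blocking in buf_biowait() registers an iodone handler before
 * issuing the I/O ('my_iodone' and 'my_context' are placeholders); when the
 * driver calls buf_biodone(), the handler runs and takes ownership of the
 * buffer.
 *
 *	static void
 *	my_iodone(buf_t bp, void *arg)
 *	{
 *		(inspect buf_error(bp) and buf_resid(bp))
 *		buf_brelse(bp);
 *	}
 *
 *	buf_setcallback(bp, my_iodone, my_context);
 *	VNOP_STRATEGY(bp);
 */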
4054
4055 /*
4056 * Obfuscate buf pointers.
4057 */
4058 vm_offset_t
4059 buf_kernel_addrperm_addr(void * addr)
4060 {
4061 if ((vm_offset_t)addr == 0)
4062 return 0;
4063 else
4064 return ((vm_offset_t)addr + buf_kernel_addrperm);
4065 }
4066
4067 /*
4068 * Return a count of buffers on the "locked" queue.
4069 */
4070 int
4071 count_lock_queue(void)
4072 {
4073 buf_t bp;
4074 int n = 0;
4075
4076 lck_mtx_lock_spin(buf_mtxp);
4077
4078 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
4079 bp = bp->b_freelist.tqe_next)
4080 n++;
4081 lck_mtx_unlock(buf_mtxp);
4082
4083 return (n);
4084 }
4085
4086 /*
4087 * Return a count of 'busy' buffers. Used at the time of shutdown.
4088 * note: This is also called from the mach side in debug context in kdp.c
4089 */
4090 int
4091 count_busy_buffers(void)
4092 {
4093 return buf_busycount + bufstats.bufs_iobufinuse;
4094 }
4095
4096 #if DIAGNOSTIC
4097 /*
4098 * Print out statistics on the current allocation of the buffer pool.
4099 * Can be enabled to print out on every ``sync'' by setting "syncprt"
4100 * in vfs_syscalls.c using sysctl.
4101 */
4102 void
4103 vfs_bufstats()
4104 {
4105 int i, j, count;
4106 struct buf *bp;
4107 struct bqueues *dp;
4108 int counts[MAXBSIZE/CLBYTES+1];
4109 static char *bname[BQUEUES] =
4110 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4111
4112 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
4113 count = 0;
4114 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
4115 counts[j] = 0;
4116
4117 lck_mtx_lock(buf_mtxp);
4118
4119 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
4120 counts[bp->b_bufsize/CLBYTES]++;
4121 count++;
4122 }
4123 lck_mtx_unlock(buf_mtxp);
4124
4125 printf("%s: total-%d", bname[i], count);
4126 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
4127 if (counts[j] != 0)
4128 printf(", %d-%d", j * CLBYTES, counts[j]);
4129 printf("\n");
4130 }
4131 }
4132 #endif /* DIAGNOSTIC */
4133
4134 #define NRESERVEDIOBUFS 128
4135
4136
4137 buf_t
4138 alloc_io_buf(vnode_t vp, int priv)
4139 {
4140 buf_t bp;
4141
4142 lck_mtx_lock_spin(iobuffer_mtxp);
4143
4144 while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
4145 (bp = iobufqueue.tqh_first) == NULL) {
4146 bufstats.bufs_iobufsleeps++;
4147
4148 need_iobuffer = 1;
4149 (void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL);
4150 }
4151 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
4152
4153 bufstats.bufs_iobufinuse++;
4154 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
4155 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
4156
4157 lck_mtx_unlock(iobuffer_mtxp);
4158
4159 /*
4160 * initialize various fields
4161 * we don't need to hold the mutex since the buffer
4162 * is now private... the vp should have a reference
4163 * on it and is not protected by this mutex in any event
4164 */
4165 bp->b_timestamp = 0;
4166 bp->b_proc = NULL;
4167
4168 bp->b_datap = 0;
4169 bp->b_flags = 0;
4170 bp->b_lflags = BL_BUSY | BL_IOBUF;
4171 bp->b_redundancy_flags = 0;
4172 bp->b_blkno = bp->b_lblkno = 0;
4173 #ifdef JOE_DEBUG
4174 bp->b_owner = current_thread();
4175 bp->b_tag = 6;
4176 #endif
4177 bp->b_iodone = NULL;
4178 bp->b_error = 0;
4179 bp->b_resid = 0;
4180 bp->b_bcount = 0;
4181 bp->b_bufsize = 0;
4182 bp->b_upl = NULL;
4183 bp->b_fsprivate = (void *)NULL;
4184 bp->b_vp = vp;
4185 bzero(&bp->b_attr, sizeof(struct bufattr));
4186
4187 if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
4188 bp->b_dev = vp->v_rdev;
4189 else
4190 bp->b_dev = NODEV;
4191
4192 return (bp);
4193 }
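/*
 * Sketch of the alloc_io_buf()/free_io_buf() pairing used by in-kernel I/O
 * paths such as the cluster layer: the header is private, carries no cached
 * data of its own, and is handed back via buf_brelse(), which routes
 * BL_IOBUF buffers to free_io_buf().
 *
 *	buf_t io_bp = alloc_io_buf(vp, 0);
 *	(point b_datap at the caller's pages, set b_lblkno and b_bcount,
 *	 set an iodone callback, then VNOP_STRATEGY(io_bp))
 */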
4194
4195
4196 void
4197 free_io_buf(buf_t bp)
4198 {
4199 int need_wakeup = 0;
4200
4201 /*
4202 * put buffer back on the head of the iobufqueue
4203 */
4204 bp->b_vp = NULL;
4205 bp->b_flags = B_INVAL;
4206
4207 /* Zero out the bufattr and its flags before relinquishing this iobuf */
4208 bzero (&bp->b_attr, sizeof(struct bufattr));
4209
4210 lck_mtx_lock_spin(iobuffer_mtxp);
4211
4212 binsheadfree(bp, &iobufqueue, -1);
4213
4214 if (need_iobuffer) {
4215 /*
4216 * Wake up any processes waiting because they need an io buffer
4217 *
4218 * do the wakeup after we drop the mutex... it's possible that the
4219 * wakeup will be superfluous if need_iobuffer gets set again and
4220 * another thread runs this path, but it's highly unlikely, doesn't
4221 * hurt, and it means we don't hold up I/O progress if the wakeup blocks
4222 * trying to grab a task related lock...
4223 */
4224 need_iobuffer = 0;
4225 need_wakeup = 1;
4226 }
4227 if (bufstats.bufs_iobufinuse <= 0)
4228 panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse < 0", bp);
4229
4230 bufstats.bufs_iobufinuse--;
4231
4232 lck_mtx_unlock(iobuffer_mtxp);
4233
4234 if (need_wakeup)
4235 wakeup(&need_iobuffer);
4236 }
4237
4238
4239 void
4240 buf_list_lock(void)
4241 {
4242 lck_mtx_lock_spin(buf_mtxp);
4243 }
4244
4245 void
4246 buf_list_unlock(void)
4247 {
4248 lck_mtx_unlock(buf_mtxp);
4249 }
4250
4251 /*
4252 * If getnewbuf() calls bcleanbuf() on the same thread
4253 * there is a potential for stack overrun and deadlocks.
4254 * So we always hand off the work to a worker thread for completion
4255 */
4256
4257
4258 static void
4259 bcleanbuf_thread_init(void)
4260 {
4261 thread_t thread = THREAD_NULL;
4262
4263 /* create worker thread */
4264 kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread);
4265 thread_deallocate(thread);
4266 }
4267
4268 typedef int (*bcleanbufcontinuation)(int);
4269
4270 static void
4271 bcleanbuf_thread(void)
4272 {
4273 struct buf *bp;
4274 int error = 0;
4275 int loopcnt = 0;
4276
4277 for (;;) {
4278 lck_mtx_lock_spin(buf_mtxp);
4279
4280 while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) {
4281 (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO|PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread);
4282 }
4283
4284 /*
4285 * Remove from the queue
4286 */
4287 bremfree_locked(bp);
4288
4289 /*
4290 * Buffer is no longer on any free list
4291 */
4292 SET(bp->b_lflags, BL_BUSY);
4293 buf_busycount++;
4294
4295 #ifdef JOE_DEBUG
4296 bp->b_owner = current_thread();
4297 bp->b_tag = 10;
4298 #endif
4299
4300 lck_mtx_unlock(buf_mtxp);
4301 /*
4302 * do the IO
4303 */
4304 error = bawrite_internal(bp, 0);
4305
4306 if (error) {
4307 bp->b_whichq = BQ_LAUNDRY;
4308 bp->b_timestamp = buf_timestamp();
4309
4310 lck_mtx_lock_spin(buf_mtxp);
4311
4312 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
4313 blaundrycnt++;
4314
4315 /* we never leave a busy page on the laundry queue */
4316 CLR(bp->b_lflags, BL_BUSY);
4317 buf_busycount--;
4318 #ifdef JOE_DEBUG
4319 bp->b_owner = current_thread();
4320 bp->b_tag = 11;
4321 #endif
4322
4323 lck_mtx_unlock(buf_mtxp);
4324
4325 if (loopcnt > MAXLAUNDRY) {
4326 /*
4327 * bawrite_internal() can return errors if we're throttled. If we've
4328 * done several I/Os and failed, give the system some time to unthrottle
4329 * the vnode
4330 */
4331 (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1);
4332 loopcnt = 0;
4333 } else {
4334 /* give other threads a chance to run */
4335 (void)thread_block(THREAD_CONTINUE_NULL);
4336 loopcnt++;
4337 }
4338 }
4339 }
4340 }
4341
4342
4343 static int
4344 brecover_data(buf_t bp)
4345 {
4346 int upl_offset;
4347 upl_t upl;
4348 upl_page_info_t *pl;
4349 kern_return_t kret;
4350 vnode_t vp = bp->b_vp;
4351 int upl_flags;
4352
4353
4354 if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
4355 goto dump_buffer;
4356
4357 upl_flags = UPL_PRECIOUS;
4358 if (! (buf_flags(bp) & B_READ)) {
4359 /*
4360 * "write" operation: let the UPL subsystem know
4361 * that we intend to modify the buffer cache pages we're
4362 * gathering.
4363 */
4364 upl_flags |= UPL_WILL_MODIFY;
4365 }
4366
4367 kret = ubc_create_upl(vp,
4368 ubc_blktooff(vp, bp->b_lblkno),
4369 bp->b_bufsize,
4370 &upl,
4371 &pl,
4372 upl_flags);
4373 if (kret != KERN_SUCCESS)
4374 panic("Failed to create UPL");
4375
4376 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
4377
4378 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
4379 ubc_upl_abort(upl, 0);
4380 goto dump_buffer;
4381 }
4382 }
4383 bp->b_upl = upl;
4384
4385 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
4386
4387 if (kret != KERN_SUCCESS)
4388 panic("getblk: ubc_upl_map() failed with (%d)", kret);
4389 return (1);
4390
4391 dump_buffer:
4392 bp->b_bufsize = 0;
4393 SET(bp->b_flags, B_INVAL);
4394 buf_brelse(bp);
4395
4396 return(0);
4397 }
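/*
 * Hypothetical caller sketch for brecover_data(): a return of 1 means
 * the buffer's pages were remapped from the UBC and bp->b_datap is
 * valid again; a return of 0 means the buffer was marked B_INVAL and
 * released via buf_brelse(), so bp must not be touched afterwards.
 */
#if 0
if (brecover_data(bp) == 0)
        return;         /* bp was invalidated and released */
#endif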
4398
4399 boolean_t
4400 buffer_cache_gc(int all)
4401 {
4402 buf_t bp;
4403 boolean_t did_large_zfree = FALSE;
4404 boolean_t need_wakeup = FALSE;
4405 int now = buf_timestamp();
4406 uint32_t found = 0;
4407 struct bqueues privq;
4408 int thresh_hold = BUF_STALE_THRESHHOLD;
4409
4410 if (all)
4411 thresh_hold = 0;
4412 /*
4413 * We only care about metadata (incore storage comes from zalloc()).
4414 * Unless "all" is set (used to evict metadata buffers in preparation
4415 * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers
4416 * that have not been accessed in the last BUF_STALE_THRESHHOLD seconds.
4417 * BUF_MAX_GC_BATCH_SIZE bounds both the hold time of the global lock
4418 * "buf_mtxp" and the length of time we spend compute-bound in the GC
4419 * thread which calls this function.
4420 */
4421 lck_mtx_lock(buf_mtxp);
4422
4423 do {
4424 found = 0;
4425 TAILQ_INIT(&privq);
4426 need_wakeup = FALSE;
4427
4428 while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) &&
4429 (now > bp->b_timestamp) &&
4430 (now - bp->b_timestamp > thresh_hold) &&
4431 (found < BUF_MAX_GC_BATCH_SIZE)) {
4432
4433 /* Remove from free list */
4434 bremfree_locked(bp);
4435 found++;
4436
4437 #ifdef JOE_DEBUG
4438 bp->b_owner = current_thread();
4439 bp->b_tag = 12;
4440 #endif
4441
4442 /* If dirty, move to laundry queue and remember to do wakeup */
4443 if (ISSET(bp->b_flags, B_DELWRI)) {
4444 SET(bp->b_lflags, BL_WANTDEALLOC);
4445
4446 bmovelaundry(bp);
4447 need_wakeup = TRUE;
4448
4449 continue;
4450 }
4451
4452 /*
4453 * Mark busy and put on private list. We could technically get
4454 * away without setting BL_BUSY here.
4455 */
4456 SET(bp->b_lflags, BL_BUSY);
4457 buf_busycount++;
4458
4459 /*
4460 * Remove from hash and dissociate from vp.
4461 */
4462 bremhash(bp);
4463 if (bp->b_vp) {
4464 brelvp_locked(bp);
4465 }
4466
4467 TAILQ_INSERT_TAIL(&privq, bp, b_freelist);
4468 }
4469
4470 if (found == 0) {
4471 break;
4472 }
4473
4474 /* Drop lock for batch processing */
4475 lck_mtx_unlock(buf_mtxp);
4476
4477 /* Wakeup and yield for laundry if need be */
4478 if (need_wakeup) {
4479 wakeup(&bufqueues[BQ_LAUNDRY]);
4480 (void)thread_block(THREAD_CONTINUE_NULL);
4481 }
4482
4483 /* Clean up every buffer on private list */
4484 TAILQ_FOREACH(bp, &privq, b_freelist) {
4485 /* Take note if we've definitely freed at least a page to a zone */
4486 if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) {
4487 did_large_zfree = TRUE;
4488 }
4489
4490 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
4491
4492 /* Free Storage */
4493 buf_free_meta_store(bp);
4494
4495 /* Release credentials */
4496 buf_release_credentials(bp);
4497
4498 /* Prepare for moving to empty queue */
4499 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED
4500 | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
4501 bp->b_whichq = BQ_EMPTY;
4502 BLISTNONE(bp);
4503 }
4504 lck_mtx_lock(buf_mtxp);
4505
4506 /* Back under lock, move them all to invalid hash and clear busy */
4507 TAILQ_FOREACH(bp, &privq, b_freelist) {
4508 binshash(bp, &invalhash);
4509 CLR(bp->b_lflags, BL_BUSY);
4510 buf_busycount--;
4511
4512 #ifdef JOE_DEBUG
4513 if (bp->b_owner != current_thread()) {
4514 panic("Buffer stolen from buffer_cache_gc()");
4515 }
4516 bp->b_owner = current_thread();
4517 bp->b_tag = 13;
4518 #endif
4519 }
4520
4521 /* And do a big bulk move to the empty queue */
4522 TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist);
4523
4524 } while (all && (found == BUF_MAX_GC_BATCH_SIZE));
4525
4526 lck_mtx_unlock(buf_mtxp);
4527
4528 return did_large_zfree;
4529 }
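/*
 * Hypothetical usage sketch (the actual call sites live elsewhere in
 * xnu): all = 1 evicts every eligible metadata buffer, e.g. in
 * preparation for deep sleep; all = 0 performs a single incremental
 * pass, reclaiming only buffers idle for more than BUF_STALE_THRESHHOLD
 * seconds and at most BUF_MAX_GC_BATCH_SIZE of them.  The return value
 * reports whether at least one zalloc()'ed buffer of a page or more
 * was freed.
 */
#if 0
boolean_t freed_large_zalloc;

freed_large_zalloc = buffer_cache_gc(1);        /* full sweep before sleep */
(void) buffer_cache_gc(0);                      /* routine incremental pass */
#endif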
4530
4531
4532 /*
4533 * disabled for now
4534 */
4535
4536 #if FLUSH_QUEUES
4537
4538 #define NFLUSH 32
4539
4540 static int
4541 bp_cmp(void *a, void *b)
4542 {
4543 buf_t bp_a = *(buf_t *)a,
4544 bp_b = *(buf_t *)b;
4545 daddr64_t res;
4546
4547 // block numbers are non-negative and their difference is
4548 // assumed to fit in an int, so this truncating cast is ok.
4549 //
4550 res = (bp_a->b_blkno - bp_b->b_blkno);
4551
4552 return (int)res;
4553 }
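/*
 * Alternative sketch: bp_cmp() relies on the difference of two block
 * numbers fitting in an int.  Comparing directly, as below, avoids that
 * assumption; bp_cmp_safe is illustrative only and not used anywhere.
 */
#if 0
static int
bp_cmp_safe(void *a, void *b)
{
        struct buf *bp_a = *(struct buf **)a;
        struct buf *bp_b = *(struct buf **)b;

        if (bp_a->b_blkno < bp_b->b_blkno)
                return (-1);
        if (bp_a->b_blkno > bp_b->b_blkno)
                return (1);
        return (0);
}
#endif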
4554
4555
4556 int
4557 bflushq(int whichq, mount_t mp)
4558 {
4559 buf_t bp, next;
4560 int i, buf_count;
4561 int total_writes = 0;
4562 static buf_t flush_table[NFLUSH];
4563
4564 if (whichq < 0 || whichq >= BQUEUES) {
4565 return (0);
4566 }
4567
4568 restart:
4569 lck_mtx_lock(buf_mtxp);
4570
4571 bp = TAILQ_FIRST(&bufqueues[whichq]);
4572
4573 for (buf_count = 0; bp; bp = next) {
4574 next = bp->b_freelist.tqe_next;
4575
4576 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
4577 continue;
4578 }
4579
4580 if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
4581
4582 bremfree_locked(bp);
4583 #ifdef JOE_DEBUG
4584 bp->b_owner = current_thread();
4585 bp->b_tag = 7;
4586 #endif
4587 SET(bp->b_lflags, BL_BUSY);
4588 buf_busycount++;
4589
4590 flush_table[buf_count] = bp;
4591 buf_count++;
4592 total_writes++;
4593
4594 if (buf_count >= NFLUSH) {
4595 lck_mtx_unlock(buf_mtxp);
4596
4597 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4598
4599 for (i = 0; i < buf_count; i++) {
4600 buf_bawrite(flush_table[i]);
4601 }
4602 goto restart;
4603 }
4604 }
4605 }
4606 lck_mtx_unlock(buf_mtxp);
4607
4608 if (buf_count > 0) {
4609 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4610
4611 for (i = 0; i < buf_count; i++) {
4612 buf_bawrite(flush_table[i]);
4613 }
4614 }
4615
4616 return (total_writes);
4617 }
4618 #endif