1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*-
30 * Copyright (c) 1994 Christopher G. Demetriou
31 * Copyright (c) 1982, 1986, 1989, 1993
32 * The Regents of the University of California. All rights reserved.
33 * (c) UNIX System Laboratories, Inc.
34 * All or some portions of this file are derived from material licensed
35 * to the University of California by American Telephone and Telegraph
36 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
37 * the permission of UNIX System Laboratories, Inc.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 * must display the following acknowledgement:
49 * This product includes software developed by the University of
50 * California, Berkeley and its contributors.
51 * 4. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
68 */
69
70/*
71 * Some references:
72 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
73 * Leffler, et al.: The Design and Implementation of the 4.3BSD
 74 * UNIX Operating System (Addison-Wesley, 1989)
75 */
76
77#include <sys/param.h>
78#include <sys/systm.h>
79#include <sys/proc_internal.h>
80#include <sys/buf_internal.h>
81#include <sys/vnode_internal.h>
82#include <sys/mount_internal.h>
83#include <sys/trace.h>
84#include <sys/malloc.h>
85#include <sys/resourcevar.h>
86#include <miscfs/specfs/specdev.h>
87#include <sys/ubc.h>
88#include <sys/kauth.h>
89#if DIAGNOSTIC
90#include <kern/assert.h>
91#endif /* DIAGNOSTIC */
92#include <kern/task.h>
93#include <kern/zalloc.h>
94#include <kern/lock.h>
95
96#include <sys/fslog.h> /* fslog_io_error() */
97
98#include <mach/mach_types.h>
99#include <mach/memory_object_types.h>
100#include <kern/sched_prim.h> /* thread_block() */
101
102#include <vm/vm_kern.h>
103#include <vm/vm_pageout.h>
104
105#include <sys/kdebug.h>
106
107#include <libkern/OSAtomic.h>
108#include <libkern/OSDebug.h>
109#include <sys/ubc_internal.h>
110
111#include <sys/sdt.h>
112#include <sys/cprotect.h>
113
114
115#if BALANCE_QUEUES
116static __inline__ void bufqinc(int q);
117static __inline__ void bufqdec(int q);
118#endif
119
120int bcleanbuf(buf_t bp, boolean_t discard);
121static int brecover_data(buf_t bp);
122static boolean_t incore(vnode_t vp, daddr64_t blkno);
123/* timeout is in msecs */
124static buf_t getnewbuf(int slpflag, int slptimeo, int *queue);
125static void bremfree_locked(buf_t bp);
126static void buf_reassign(buf_t bp, vnode_t newvp);
127static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
128static int buf_iterprepare(vnode_t vp, struct buflists *, int flags);
129static void buf_itercomplete(vnode_t vp, struct buflists *, int flags);
130static boolean_t buffer_cache_gc(int);
131static buf_t buf_brelse_shadow(buf_t bp);
132static void buf_free_meta_store(buf_t bp);
133
134static buf_t buf_create_shadow_internal(buf_t bp, boolean_t force_copy,
135 uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);
136
137
138__private_extern__ int bdwrite_internal(buf_t, int);
139
140/* zone allocated buffer headers */
141static void bufzoneinit(void);
142static void bcleanbuf_thread_init(void);
143static void bcleanbuf_thread(void);
144
145static zone_t buf_hdr_zone;
146static int buf_hdr_count;
147
148
149/*
150 * Definitions for the buffer hash lists.
151 */
152#define BUFHASH(dvp, lbn) \
153 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
154LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
155u_long bufhash;
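/*
 * Editor's note -- illustrative only, not part of the original source:
 * BUFHASH() folds the vnode pointer and the logical block number into a
 * table index and masks it with 'bufhash', which hashinit() sets to a
 * power of two minus one.  A lookup therefore amounts to:
 *
 *	struct bufhashhdr *dp = BUFHASH(vp, blkno);
 *	    which expands to
 *	&bufhashtbl[((long)vp / sizeof(*vp) + (int)blkno) & bufhash]
 */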
156
157static buf_t incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);
158
159/* Definitions for the buffer stats. */
160struct bufstats bufstats;
161
162/* Number of delayed write buffers */
163long nbdwrite = 0;
164int blaundrycnt = 0;
165static int boot_nbuf_headers = 0;
166
167static TAILQ_HEAD(delayqueue, buf) delaybufqueue;
168
169static TAILQ_HEAD(ioqueue, buf) iobufqueue;
170static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
171static int needbuffer;
172static int need_iobuffer;
173
174static lck_grp_t *buf_mtx_grp;
175static lck_attr_t *buf_mtx_attr;
176static lck_grp_attr_t *buf_mtx_grp_attr;
177static lck_mtx_t *iobuffer_mtxp;
178static lck_mtx_t *buf_mtxp;
179
180static int buf_busycount;
181
182static __inline__ int
183buf_timestamp(void)
184{
185 struct timeval t;
186 microuptime(&t);
187 return (t.tv_sec);
188}
189
190/*
191 * Insq/Remq for the buffer free lists.
192 */
193#if BALANCE_QUEUES
194#define binsheadfree(bp, dp, whichq) do { \
195 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
196 bufqinc((whichq)); \
197 } while (0)
198
199#define binstailfree(bp, dp, whichq) do { \
200 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
201 bufqinc((whichq)); \
202 } while (0)
203#else
204#define binsheadfree(bp, dp, whichq) do { \
205 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
206 } while (0)
207
208#define binstailfree(bp, dp, whichq) do { \
209 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
210 } while (0)
211#endif
212
213
214#define BHASHENTCHECK(bp) \
215 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
216 panic("%p: b_hash.le_prev is not deadbeef", (bp));
217
218#define BLISTNONE(bp) \
219 (bp)->b_hash.le_next = (struct buf *)0; \
220 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
221
222/*
223 * Insq/Remq for the vnode usage lists.
224 */
225#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
226#define bufremvn(bp) { \
227 LIST_REMOVE(bp, b_vnbufs); \
228 (bp)->b_vnbufs.le_next = NOLIST; \
229}
230
231/*
232 * Time in seconds before a buffer on a list is
233 * considered as a stale buffer
234 */
235#define LRU_IS_STALE 120 /* default value for the LRU */
236#define AGE_IS_STALE 60 /* default value for the AGE */
237#define META_IS_STALE 180 /* default value for the BQ_META */
238
239int lru_is_stale = LRU_IS_STALE;
240int age_is_stale = AGE_IS_STALE;
241int meta_is_stale = META_IS_STALE;
242
243#define MAXLAUNDRY 10
244
245/* LIST_INSERT_HEAD() with assertions */
246static __inline__ void
247blistenterhead(struct bufhashhdr * head, buf_t bp)
248{
249 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
250 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
251 (head)->lh_first = bp;
252 bp->b_hash.le_prev = &(head)->lh_first;
253 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
254 panic("blistenterhead: le_prev is deadbeef");
255}
256
257static __inline__ void
258binshash(buf_t bp, struct bufhashhdr *dp)
259{
260#if DIAGNOSTIC
261 buf_t nbp;
262#endif /* DIAGNOSTIC */
263
264 BHASHENTCHECK(bp);
265
266#if DIAGNOSTIC
267 nbp = dp->lh_first;
268 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
269 if(nbp == bp)
270 panic("buf already in hashlist");
271 }
272#endif /* DIAGNOSTIC */
273
274 blistenterhead(dp, bp);
275}
276
277static __inline__ void
278bremhash(buf_t bp)
279{
280 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
281 panic("bremhash le_prev is deadbeef");
282 if (bp->b_hash.le_next == bp)
283 panic("bremhash: next points to self");
284
285 if (bp->b_hash.le_next != NULL)
286 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
287 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
288}
289
290/*
291 * buf_mtxp held.
292 */
293static __inline__ void
294bmovelaundry(buf_t bp)
295{
296 bp->b_whichq = BQ_LAUNDRY;
297 bp->b_timestamp = buf_timestamp();
298 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
299 blaundrycnt++;
300}
301
302static __inline__ void
303buf_release_credentials(buf_t bp)
304{
305 if (IS_VALID_CRED(bp->b_rcred)) {
306 kauth_cred_unref(&bp->b_rcred);
307 }
308 if (IS_VALID_CRED(bp->b_wcred)) {
309 kauth_cred_unref(&bp->b_wcred);
310 }
311}
312
313
314int
315buf_valid(buf_t bp) {
316
317 if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
318 return 1;
319 return 0;
320}
321
322int
323buf_fromcache(buf_t bp) {
324
325 if ( (bp->b_flags & B_CACHE) )
326 return 1;
327 return 0;
328}
329
330void
331buf_markinvalid(buf_t bp) {
332
333 SET(bp->b_flags, B_INVAL);
334}
335
336void
337buf_markdelayed(buf_t bp) {
338
339 if (!ISSET(bp->b_flags, B_DELWRI)) {
340 SET(bp->b_flags, B_DELWRI);
341
342 OSAddAtomicLong(1, &nbdwrite);
343 buf_reassign(bp, bp->b_vp);
344 }
345 SET(bp->b_flags, B_DONE);
346}
347
348void
349buf_markclean(buf_t bp) {
350
351 if (ISSET(bp->b_flags, B_DELWRI)) {
352 CLR(bp->b_flags, B_DELWRI);
353
354 OSAddAtomicLong(-1, &nbdwrite);
355 buf_reassign(bp, bp->b_vp);
356 }
357}
358
359void
360buf_markeintr(buf_t bp) {
361
362 SET(bp->b_flags, B_EINTR);
363}
364
365
366void
367buf_markaged(buf_t bp) {
368
369 SET(bp->b_flags, B_AGE);
370}
371
372int
373buf_fua(buf_t bp) {
374
375 if ((bp->b_flags & B_FUA) == B_FUA)
376 return 1;
377 return 0;
378}
379
380void
381buf_markfua(buf_t bp) {
382
383 SET(bp->b_flags, B_FUA);
384}
385
386#if CONFIG_PROTECT
387void
388buf_setcpaddr(buf_t bp, struct cprotect *entry) {
389 bp->b_attr.ba_cpentry = entry;
390}
391
392void
393buf_setcpoff (buf_t bp, uint64_t foffset) {
394 bp->b_attr.ba_cp_file_off = foffset;
395}
396
397void *
398bufattr_cpaddr(bufattr_t bap) {
399 return (bap->ba_cpentry);
400}
401
402uint64_t
403bufattr_cpoff(bufattr_t bap) {
404 return (bap->ba_cp_file_off);
405}
406
407void
408bufattr_setcpaddr(bufattr_t bap, void *cp_entry_addr) {
409 bap->ba_cpentry = cp_entry_addr;
410}
411
412void
413bufattr_setcpoff(bufattr_t bap, uint64_t foffset) {
414 bap->ba_cp_file_off = foffset;
415}
416
417#else
418void *
419bufattr_cpaddr(bufattr_t bap __unused) {
420 return NULL;
421}
422
423uint64_t
424bufattr_cpoff(bufattr_t bap __unused) {
425 return 0;
426}
427
428void
429bufattr_setcpaddr(bufattr_t bap __unused, void *cp_entry_addr __unused) {
430}
431
432void
433bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) {
434 return;
435}
436#endif /* CONFIG_PROTECT */
437
438bufattr_t
439bufattr_alloc() {
440 bufattr_t bap;
441 MALLOC(bap, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
442 if (bap == NULL)
443 return NULL;
444
445 bzero(bap, sizeof(struct bufattr));
446 return bap;
447}
448
449void
450bufattr_free(bufattr_t bap) {
451 if (bap)
452 FREE(bap, M_TEMP);
453}
454
455int
456bufattr_rawencrypted(bufattr_t bap) {
457 if ( (bap->ba_flags & BA_RAW_ENCRYPTED_IO) )
458 return 1;
459 return 0;
460}
461
462int
463bufattr_throttled(bufattr_t bap) {
464 return (GET_BUFATTR_IO_TIER(bap));
465}
466
467int
468bufattr_nocache(bufattr_t bap) {
469 if ( (bap->ba_flags & BA_NOCACHE) )
470 return 1;
471 return 0;
472}
473
474int
475bufattr_meta(bufattr_t bap) {
476 if ( (bap->ba_flags & BA_META) )
477 return 1;
478 return 0;
479}
480
481int
482bufattr_delayidlesleep(bufattr_t bap)
483{
484 if ( (bap->ba_flags & BA_DELAYIDLESLEEP) )
485 return 1;
486 return 0;
487}
488
489bufattr_t
490buf_attr(buf_t bp) {
491 return &bp->b_attr;
492}
493
494void
 495buf_markstatic(buf_t bp) {
496 SET(bp->b_flags, B_STATICCONTENT);
497}
498
499int
500buf_static(buf_t bp) {
501 if ( (bp->b_flags & B_STATICCONTENT) )
502 return 1;
503 return 0;
504}
505
506void
507bufattr_markgreedymode(bufattr_t bap) {
508 SET(bap->ba_flags, BA_GREEDY_MODE);
509}
510
511int
512bufattr_greedymode(bufattr_t bap) {
513 if ( (bap->ba_flags & BA_GREEDY_MODE) )
514 return 1;
515 return 0;
516}
517
518void
519bufattr_markquickcomplete(bufattr_t bap) {
520 SET(bap->ba_flags, BA_QUICK_COMPLETE);
521}
522
523int
524bufattr_quickcomplete(bufattr_t bap) {
525 if ( (bap->ba_flags & BA_QUICK_COMPLETE) )
526 return 1;
527 return 0;
528}
529
530errno_t
531buf_error(buf_t bp) {
532
533 return (bp->b_error);
534}
535
536void
537buf_seterror(buf_t bp, errno_t error) {
538
539 if ((bp->b_error = error))
540 SET(bp->b_flags, B_ERROR);
541 else
542 CLR(bp->b_flags, B_ERROR);
543}
544
545void
546buf_setflags(buf_t bp, int32_t flags) {
547
548 SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
549}
550
551void
552buf_clearflags(buf_t bp, int32_t flags) {
553
554 CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
555}
556
557int32_t
558buf_flags(buf_t bp) {
559
560 return ((bp->b_flags & BUF_X_RDFLAGS));
561}
562
563void
564buf_reset(buf_t bp, int32_t io_flags) {
565
566 CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
567 SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
568
569 bp->b_error = 0;
570}
571
572uint32_t
573buf_count(buf_t bp) {
574
575 return (bp->b_bcount);
576}
577
578void
579buf_setcount(buf_t bp, uint32_t bcount) {
580
581 bp->b_bcount = bcount;
582}
583
584uint32_t
585buf_size(buf_t bp) {
586
587 return (bp->b_bufsize);
588}
589
590void
591buf_setsize(buf_t bp, uint32_t bufsize) {
592
593 bp->b_bufsize = bufsize;
594}
595
596uint32_t
597buf_resid(buf_t bp) {
598
599 return (bp->b_resid);
600}
601
602void
603buf_setresid(buf_t bp, uint32_t resid) {
604
605 bp->b_resid = resid;
606}
607
608uint32_t
609buf_dirtyoff(buf_t bp) {
610
611 return (bp->b_dirtyoff);
612}
613
614uint32_t
615buf_dirtyend(buf_t bp) {
616
617 return (bp->b_dirtyend);
618}
619
620void
621buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
622
623 bp->b_dirtyoff = dirtyoff;
624}
625
626void
627buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
628
629 bp->b_dirtyend = dirtyend;
630}
631
632uintptr_t
633buf_dataptr(buf_t bp) {
634
635 return (bp->b_datap);
636}
637
638void
639buf_setdataptr(buf_t bp, uintptr_t data) {
640
641 bp->b_datap = data;
642}
643
644vnode_t
645buf_vnode(buf_t bp) {
646
647 return (bp->b_vp);
648}
649
650void
651buf_setvnode(buf_t bp, vnode_t vp) {
652
653 bp->b_vp = vp;
654}
655
656
657void *
658buf_callback(buf_t bp)
659{
660 if ( !(bp->b_flags & B_CALL) )
661 return ((void *) NULL);
662
663 return ((void *)bp->b_iodone);
664}
665
666
667errno_t
668buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
669{
670 if (callback)
671 bp->b_flags |= (B_CALL | B_ASYNC);
672 else
673 bp->b_flags &= ~B_CALL;
674 bp->b_transaction = transaction;
675 bp->b_iodone = callback;
676
677 return (0);
678}
679
680errno_t
681buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
682{
683
684 if ( !(bp->b_lflags & BL_IOBUF) )
685 return (EINVAL);
686
687 if (upl)
688 bp->b_flags |= B_CLUSTER;
689 else
690 bp->b_flags &= ~B_CLUSTER;
691 bp->b_upl = upl;
692 bp->b_uploffset = offset;
693
694 return (0);
695}
696
697buf_t
698buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
699{
700 buf_t io_bp;
701
702 if (io_offset < 0 || io_size < 0)
703 return (NULL);
704
705 if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
706 return (NULL);
707
708 if (bp->b_flags & B_CLUSTER) {
709 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
710 return (NULL);
711
712 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
713 return (NULL);
714 }
715 io_bp = alloc_io_buf(bp->b_vp, 0);
716
717 io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);
718
719 if (iodone) {
720 io_bp->b_transaction = arg;
721 io_bp->b_iodone = iodone;
722 io_bp->b_flags |= B_CALL;
723 }
724 if (bp->b_flags & B_CLUSTER) {
725 io_bp->b_upl = bp->b_upl;
726 io_bp->b_uploffset = bp->b_uploffset + io_offset;
727 } else {
728 io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
729 }
730 io_bp->b_bcount = io_size;
731
732 return (io_bp);
733}
734
735
736int
737buf_shadow(buf_t bp)
738{
739 if (bp->b_lflags & BL_SHADOW)
740 return 1;
741 return 0;
742}
743
744
745buf_t
746buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
747{
748 return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1));
749}
750
751buf_t
752buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
753{
754 return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0));
755}
756
757
758static buf_t
759buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv)
760{
761 buf_t io_bp;
762
763 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0);
764
765 if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) {
766
767 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0);
768 return (NULL);
769 }
770#ifdef BUF_MAKE_PRIVATE
771 if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0)
772 panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref);
773#endif
774 io_bp = alloc_io_buf(bp->b_vp, priv);
775
776 io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA);
777 io_bp->b_blkno = bp->b_blkno;
778 io_bp->b_lblkno = bp->b_lblkno;
779
780 if (iodone) {
781 io_bp->b_transaction = arg;
782 io_bp->b_iodone = iodone;
783 io_bp->b_flags |= B_CALL;
784 }
785 if (force_copy == FALSE) {
786 io_bp->b_bcount = bp->b_bcount;
787 io_bp->b_bufsize = bp->b_bufsize;
788
789 if (external_storage) {
790 io_bp->b_datap = external_storage;
791#ifdef BUF_MAKE_PRIVATE
792 io_bp->b_data_store = NULL;
793#endif
794 } else {
795 io_bp->b_datap = bp->b_datap;
796#ifdef BUF_MAKE_PRIVATE
797 io_bp->b_data_store = bp;
798#endif
799 }
800 *(buf_t *)(&io_bp->b_orig) = bp;
801
802 lck_mtx_lock_spin(buf_mtxp);
803
804 io_bp->b_lflags |= BL_SHADOW;
805 io_bp->b_shadow = bp->b_shadow;
806 bp->b_shadow = io_bp;
807 bp->b_shadow_ref++;
808
809#ifdef BUF_MAKE_PRIVATE
810 if (external_storage)
811 io_bp->b_lflags |= BL_EXTERNAL;
812 else
813 bp->b_data_ref++;
814#endif
815 lck_mtx_unlock(buf_mtxp);
816 } else {
817 if (external_storage) {
818#ifdef BUF_MAKE_PRIVATE
819 io_bp->b_lflags |= BL_EXTERNAL;
820#endif
821 io_bp->b_bcount = bp->b_bcount;
822 io_bp->b_bufsize = bp->b_bufsize;
823 io_bp->b_datap = external_storage;
824 } else {
825 allocbuf(io_bp, bp->b_bcount);
826
827 io_bp->b_lflags |= BL_IOBUF_ALLOC;
828 }
829 bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount);
830
831#ifdef BUF_MAKE_PRIVATE
832 io_bp->b_data_store = NULL;
833#endif
834 }
835 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0);
836
837 return (io_bp);
838}
839
840
841#ifdef BUF_MAKE_PRIVATE
842errno_t
843buf_make_private(buf_t bp)
844{
845 buf_t ds_bp;
846 buf_t t_bp;
847 struct buf my_buf;
848
849 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0);
850
851 if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) {
852
853 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
854 return (EINVAL);
855 }
856 my_buf.b_flags = B_META;
857 my_buf.b_datap = (uintptr_t)NULL;
858 allocbuf(&my_buf, bp->b_bcount);
859
860 bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);
861
862 lck_mtx_lock_spin(buf_mtxp);
863
864 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
865 if ( !ISSET(bp->b_lflags, BL_EXTERNAL))
866 break;
867 }
868 ds_bp = t_bp;
869
870 if (ds_bp == NULL && bp->b_data_ref)
871 panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL");
872
873 if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0))
874 panic("buf_make_private: ref_count == 0 && ds_bp != NULL");
875
876 if (ds_bp == NULL) {
877 lck_mtx_unlock(buf_mtxp);
878
879 buf_free_meta_store(&my_buf);
880
881 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
882 return (EINVAL);
883 }
884 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
885 if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
886 t_bp->b_data_store = ds_bp;
887 }
888 ds_bp->b_data_ref = bp->b_data_ref;
889
890 bp->b_data_ref = 0;
891 bp->b_datap = my_buf.b_datap;
892
893 lck_mtx_unlock(buf_mtxp);
894
895 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
896 return (0);
897}
898#endif
899
900
901void
902buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
903 void (**old_iodone)(buf_t, void *), void **old_transaction)
904{
905 if (old_iodone)
906 *old_iodone = bp->b_iodone;
907 if (old_transaction)
908 *old_transaction = bp->b_transaction;
909
910 bp->b_transaction = transaction;
911 bp->b_iodone = filter;
912 if (filter)
913 bp->b_flags |= B_FILTER;
914 else
915 bp->b_flags &= ~B_FILTER;
916}
917
918
919daddr64_t
920buf_blkno(buf_t bp) {
921
922 return (bp->b_blkno);
923}
924
925daddr64_t
926buf_lblkno(buf_t bp) {
927
928 return (bp->b_lblkno);
929}
930
931void
932buf_setblkno(buf_t bp, daddr64_t blkno) {
933
934 bp->b_blkno = blkno;
935}
936
937void
938buf_setlblkno(buf_t bp, daddr64_t lblkno) {
939
940 bp->b_lblkno = lblkno;
941}
942
943dev_t
944buf_device(buf_t bp) {
945
946 return (bp->b_dev);
947}
948
949errno_t
950buf_setdevice(buf_t bp, vnode_t vp) {
951
952 if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
953 return EINVAL;
954 bp->b_dev = vp->v_rdev;
955
956 return 0;
957}
958
959
960void *
961buf_drvdata(buf_t bp) {
962
963 return (bp->b_drvdata);
964}
965
966void
967buf_setdrvdata(buf_t bp, void *drvdata) {
968
969 bp->b_drvdata = drvdata;
970}
971
972void *
973buf_fsprivate(buf_t bp) {
974
975 return (bp->b_fsprivate);
976}
977
978void
979buf_setfsprivate(buf_t bp, void *fsprivate) {
980
981 bp->b_fsprivate = fsprivate;
982}
983
984kauth_cred_t
985buf_rcred(buf_t bp) {
986
987 return (bp->b_rcred);
988}
989
990kauth_cred_t
991buf_wcred(buf_t bp) {
992
993 return (bp->b_wcred);
994}
995
996void *
997buf_upl(buf_t bp) {
998
999 return (bp->b_upl);
1000}
1001
1002uint32_t
1003buf_uploffset(buf_t bp) {
1004
1005 return ((uint32_t)(bp->b_uploffset));
1006}
1007
1008proc_t
1009buf_proc(buf_t bp) {
1010
1011 return (bp->b_proc);
1012}
1013
1014
1015errno_t
1016buf_map(buf_t bp, caddr_t *io_addr)
1017{
1018 buf_t real_bp;
1019 vm_offset_t vaddr;
1020 kern_return_t kret;
1021
1022 if ( !(bp->b_flags & B_CLUSTER)) {
1023 *io_addr = (caddr_t)bp->b_datap;
1024 return (0);
1025 }
1026 real_bp = (buf_t)(bp->b_real_bp);
1027
1028 if (real_bp && real_bp->b_datap) {
1029 /*
1030 * b_real_bp is only valid if B_CLUSTER is SET
 1031 * if it's non-zero, then someone did a cluster_bp call;
1032 * if the backing physical pages were already mapped
1033 * in before the call to cluster_bp (non-zero b_datap),
 1034 * then we just use that mapping
1035 */
1036 *io_addr = (caddr_t)real_bp->b_datap;
1037 return (0);
1038 }
1039 kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */
1040
1041 if (kret != KERN_SUCCESS) {
1042 *io_addr = NULL;
1043
1044 return(ENOMEM);
1045 }
1046 vaddr += bp->b_uploffset;
1047
1048 *io_addr = (caddr_t)vaddr;
1049
1050 return (0);
1051}
1052
1053errno_t
1054buf_unmap(buf_t bp)
1055{
1056 buf_t real_bp;
1057 kern_return_t kret;
1058
1059 if ( !(bp->b_flags & B_CLUSTER))
1060 return (0);
1061 /*
1062 * see buf_map for the explanation
1063 */
1064 real_bp = (buf_t)(bp->b_real_bp);
1065
1066 if (real_bp && real_bp->b_datap)
1067 return (0);
1068
1069 if ((bp->b_lflags & BL_IOBUF) &&
1070 ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
1071 /*
1072 * ignore pageins... the 'right' thing will
1073 * happen due to the way we handle speculative
1074 * clusters...
1075 *
1076 * when we commit these pages, we'll hit
1077 * it with UPL_COMMIT_INACTIVE which
1078 * will clear the reference bit that got
1079 * turned on when we touched the mapping
1080 */
1081 bp->b_flags |= B_AGE;
1082 }
1083 kret = ubc_upl_unmap(bp->b_upl);
1084
1085 if (kret != KERN_SUCCESS)
1086 return (EINVAL);
1087 return (0);
1088}
1089
1090
1091void
1092buf_clear(buf_t bp) {
1093 caddr_t baddr;
1094
1095 if (buf_map(bp, &baddr) == 0) {
1096 bzero(baddr, bp->b_bcount);
1097 buf_unmap(bp);
1098 }
1099 bp->b_resid = 0;
1100}
1101
1102/*
1103 * Read or write a buffer that is not contiguous on disk.
1104 * buffer is marked done/error at the conclusion
1105 */
1106static int
1107buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
1108{
1109 vnode_t vp = buf_vnode(bp);
1110 buf_t io_bp; /* For reading or writing a single block */
1111 int io_direction;
1112 int io_resid;
1113 size_t io_contig_bytes;
1114 daddr64_t io_blkno;
1115 int error = 0;
1116 int bmap_flags;
1117
1118 /*
1119 * save our starting point... the bp was already mapped
1120 * in buf_strategy before we got called
1121 * no sense doing it again.
1122 */
1123 io_blkno = bp->b_blkno;
1124 /*
1125 * Make sure we redo this mapping for the next I/O
1126 * i.e. this can never be a 'permanent' mapping
1127 */
1128 bp->b_blkno = bp->b_lblkno;
1129
1130 /*
1131 * Get an io buffer to do the deblocking
1132 */
1133 io_bp = alloc_io_buf(devvp, 0);
1134
1135 io_bp->b_lblkno = bp->b_lblkno;
1136 io_bp->b_datap = bp->b_datap;
1137 io_resid = bp->b_bcount;
1138 io_direction = bp->b_flags & B_READ;
1139 io_contig_bytes = contig_bytes;
1140
1141 if (bp->b_flags & B_READ)
1142 bmap_flags = VNODE_READ;
1143 else
1144 bmap_flags = VNODE_WRITE;
1145
1146 for (;;) {
1147 if (io_blkno == -1)
1148 /*
 1149 * this is unexpected, but we'll allow for it
1150 */
1151 bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
1152 else {
1153 io_bp->b_bcount = io_contig_bytes;
1154 io_bp->b_bufsize = io_contig_bytes;
1155 io_bp->b_resid = io_contig_bytes;
1156 io_bp->b_blkno = io_blkno;
1157
1158 buf_reset(io_bp, io_direction);
1159
1160 /*
 1161 * Call the device to do the I/O and wait for it. Make sure the appropriate party is charged for the write
1162 */
1163
1164 if (!ISSET(bp->b_flags, B_READ))
1165 OSAddAtomic(1, &devvp->v_numoutput);
1166
1167 if ((error = VNOP_STRATEGY(io_bp)))
1168 break;
1169 if ((error = (int)buf_biowait(io_bp)))
1170 break;
1171 if (io_bp->b_resid) {
1172 io_resid -= (io_contig_bytes - io_bp->b_resid);
1173 break;
1174 }
1175 }
1176 if ((io_resid -= io_contig_bytes) == 0)
1177 break;
1178 f_offset += io_contig_bytes;
1179 io_bp->b_datap += io_contig_bytes;
1180
1181 /*
1182 * Map the current position to a physical block number
1183 */
1184 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
1185 break;
1186 }
1187 buf_free(io_bp);
1188
1189 if (error)
1190 buf_seterror(bp, error);
1191 bp->b_resid = io_resid;
1192 /*
1193 * This I/O is now complete
1194 */
1195 buf_biodone(bp);
1196
1197 return error;
1198}
1199
1200
1201/*
1202 * struct vnop_strategy_args {
1203 * struct buf *a_bp;
1204 * } *ap;
1205 */
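/*
 * Editor's note: a minimal sketch, with hypothetical filesystem names, of
 * how a filesystem's own strategy entry point typically just forwards to
 * buf_strategy() with the device vnode recorded at mount time (this example
 * is not part of the original source):
 *
 *	static int
 *	examplefs_vnop_strategy(struct vnop_strategy_args *ap)
 *	{
 *		buf_t bp = ap->a_bp;
 *		struct examplefs_mount *emp = EXAMPLEFS_MOUNT(buf_vnode(bp));
 *
 *		return (buf_strategy(emp->em_devvp, ap));
 *	}
 */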
1206errno_t
1207buf_strategy(vnode_t devvp, void *ap)
1208{
1209 buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp;
1210 vnode_t vp = bp->b_vp;
1211 int bmap_flags;
1212 errno_t error;
1213#if CONFIG_DTRACE
1214 int dtrace_io_start_flag = 0; /* We only want to trip the io:::start
1215 * probe once, with the true physical
1216 * block in place (b_blkno)
1217 */
1218
1219#endif
1220
1221 if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
1222 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
1223 /*
 1224 * associate the physical device
 1225 * with this buf_t even if we don't
1226 * end up issuing the I/O...
1227 */
1228 bp->b_dev = devvp->v_rdev;
1229
1230 if (bp->b_flags & B_READ)
1231 bmap_flags = VNODE_READ;
1232 else
1233 bmap_flags = VNODE_WRITE;
1234
1235 if ( !(bp->b_flags & B_CLUSTER)) {
1236
1237 if ( (bp->b_upl) ) {
1238 /*
1239 * we have a UPL associated with this bp
1240 * go through cluster_bp which knows how
1241 * to deal with filesystem block sizes
1242 * that aren't equal to the page size
1243 */
1244 DTRACE_IO1(start, buf_t, bp);
1245 return (cluster_bp(bp));
1246 }
1247 if (bp->b_blkno == bp->b_lblkno) {
1248 off_t f_offset;
1249 size_t contig_bytes;
1250
1251 if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
1252 DTRACE_IO1(start, buf_t, bp);
1253 buf_seterror(bp, error);
1254 buf_biodone(bp);
1255
1256 return (error);
1257 }
1258
1259 if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
1260 DTRACE_IO1(start, buf_t, bp);
1261 buf_seterror(bp, error);
1262 buf_biodone(bp);
1263
1264 return (error);
1265 }
1266
1267 DTRACE_IO1(start, buf_t, bp);
1268#if CONFIG_DTRACE
1269 dtrace_io_start_flag = 1;
1270#endif /* CONFIG_DTRACE */
1271
1272 if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
1273 /* Set block number to force biodone later */
1274 bp->b_blkno = -1;
1275 buf_clear(bp);
1276 }
1277 else if ((long)contig_bytes < bp->b_bcount) {
1278 return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
1279 }
1280 }
1281
1282#if CONFIG_DTRACE
1283 if (dtrace_io_start_flag == 0) {
1284 DTRACE_IO1(start, buf_t, bp);
1285 dtrace_io_start_flag = 1;
1286 }
1287#endif /* CONFIG_DTRACE */
1288
1289 if (bp->b_blkno == -1) {
1290 buf_biodone(bp);
1291 return (0);
1292 }
1293 }
1294
1295#if CONFIG_DTRACE
1296 if (dtrace_io_start_flag == 0)
1297 DTRACE_IO1(start, buf_t, bp);
1298#endif /* CONFIG_DTRACE */
1299
1300#if CONFIG_PROTECT
1301 /* Capture f_offset in the bufattr*/
1302 if (bp->b_attr.ba_cpentry != 0) {
1303 /* No need to go here for older EAs */
1304 if(bp->b_attr.ba_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
1305 off_t f_offset;
1306 if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
1307 return error;
1308
1309 /*
1310 * Attach the file offset to this buffer. The
1311 * bufattr attributes will be passed down the stack
1312 * until they reach IOFlashStorage. IOFlashStorage
1313 * will retain the offset in a local variable when it
1314 * issues its I/Os to the NAND controller.
1315 *
1316 * Note that LwVM may end up splitting this I/O
1317 * into sub-I/Os if it crosses a chunk boundary. In this
1318 * case, LwVM will update this field when it dispatches
1319 * each I/O to IOFlashStorage. But from our perspective
1320 * we have only issued a single I/O.
1321 */
1322 bufattr_setcpoff (&(bp->b_attr), (u_int64_t)f_offset);
1323 }
1324 }
1325#endif
1326
1327 /*
1328 * we can issue the I/O because...
1329 * either B_CLUSTER is set which
1330 * means that the I/O is properly set
1331 * up to be a multiple of the page size, or
1332 * we were able to successfully set up the
1333 * physical block mapping
1334 */
1335 error = VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap);
1336 DTRACE_FSINFO(strategy, vnode_t, vp);
1337 return (error);
1338}
1339
1340
1341
1342buf_t
1343buf_alloc(vnode_t vp)
1344{
1345 return(alloc_io_buf(vp, 0));
1346}
1347
1348void
1349buf_free(buf_t bp) {
1350
1351 free_io_buf(bp);
1352}
1353
1354
1355/*
1356 * iterate buffers for the specified vp.
1357 * if BUF_SCAN_DIRTY is set, do the dirty list
1358 * if BUF_SCAN_CLEAN is set, do the clean list
1359 * if neither flag is set, default to BUF_SCAN_DIRTY
1360 * if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
1361 */
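/*
 * Editor's note: a hedged usage sketch (not part of the original source).
 * The callout returns BUF_RETURNED to have buf_iterate release the buffer,
 * BUF_CLAIMED if it disposed of the buffer itself, or one of the *_DONE
 * variants to stop the iteration early.  Pushing every dirty buffer of a
 * vnode to disk might look like:
 *
 *	static int
 *	example_flush_callout(buf_t bp, __unused void *arg)
 *	{
 *		buf_bawrite(bp);	(the async write consumes the buffer)
 *		return (BUF_CLAIMED);
 *	}
 *
 *	buf_iterate(vp, example_flush_callout, BUF_SCAN_DIRTY | BUF_SKIP_LOCKED, NULL);
 */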
1362
1363struct buf_iterate_info_t {
1364 int flag;
1365 struct buflists *listhead;
1366};
1367
1368void
1369buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
1370{
1371 buf_t bp;
1372 int retval;
1373 struct buflists local_iterblkhd;
1374 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1375 int notify_busy = flags & BUF_NOTIFY_BUSY;
1376 struct buf_iterate_info_t list[2];
1377 int num_lists, i;
1378
1379 if (flags & BUF_SKIP_LOCKED)
1380 lock_flags |= BAC_SKIP_LOCKED;
1381 if (flags & BUF_SKIP_NONLOCKED)
1382 lock_flags |= BAC_SKIP_NONLOCKED;
1383
1384 if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN)))
1385 flags |= BUF_SCAN_DIRTY;
1386
1387 num_lists = 0;
1388
1389 if (flags & BUF_SCAN_DIRTY) {
1390 list[num_lists].flag = VBI_DIRTY;
1391 list[num_lists].listhead = &vp->v_dirtyblkhd;
1392 num_lists++;
1393 }
1394 if (flags & BUF_SCAN_CLEAN) {
1395 list[num_lists].flag = VBI_CLEAN;
1396 list[num_lists].listhead = &vp->v_cleanblkhd;
1397 num_lists++;
1398 }
1399
1400 for (i = 0; i < num_lists; i++) {
1401 lck_mtx_lock(buf_mtxp);
1402
1403 if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) {
1404 lck_mtx_unlock(buf_mtxp);
1405 continue;
1406 }
1407 while (!LIST_EMPTY(&local_iterblkhd)) {
1408 bp = LIST_FIRST(&local_iterblkhd);
1409 LIST_REMOVE(bp, b_vnbufs);
1410 LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
1411
1412 if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
1413 if (notify_busy) {
1414 bp = NULL;
1415 } else {
1416 continue;
1417 }
1418 }
1419
1420 lck_mtx_unlock(buf_mtxp);
1421
1422 retval = callout(bp, arg);
1423
1424 switch (retval) {
1425 case BUF_RETURNED:
1426 if (bp)
1427 buf_brelse(bp);
1428 break;
1429 case BUF_CLAIMED:
1430 break;
1431 case BUF_RETURNED_DONE:
1432 if (bp)
1433 buf_brelse(bp);
1434 lck_mtx_lock(buf_mtxp);
1435 goto out;
1436 case BUF_CLAIMED_DONE:
1437 lck_mtx_lock(buf_mtxp);
1438 goto out;
1439 }
1440 lck_mtx_lock(buf_mtxp);
1441 } /* while list has more nodes */
1442 out:
1443 buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
1444 lck_mtx_unlock(buf_mtxp);
1445 } /* for each list */
1446} /* buf_iterate */
1447
1448
1449/*
1450 * Flush out and invalidate all buffers associated with a vnode.
1451 */
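/*
 * Editor's note (hedged usage sketch, not part of the original source):
 * a filesystem typically calls this when tearing down or truncating a
 * vnode; passing BUF_WRITE_DATA flushes delayed writes before the buffers
 * are invalidated:
 *
 *	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)))
 *		return (error);
 */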
1452int
1453buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
1454{
1455 buf_t bp;
1456 int aflags;
1457 int error = 0;
1458 int must_rescan = 1;
1459 struct buflists local_iterblkhd;
1460
1461
1462 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1463 return (0);
1464
1465 lck_mtx_lock(buf_mtxp);
1466
1467 for (;;) {
1468 if (must_rescan == 0)
1469 /*
1470 * the lists may not be empty, but all that's left at this
1471 * point are metadata or B_LOCKED buffers which are being
1472 * skipped... we know this because we made it through both
1473 * the clean and dirty lists without dropping buf_mtxp...
1474 * each time we drop buf_mtxp we bump "must_rescan"
1475 */
1476 break;
1477 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1478 break;
1479 must_rescan = 0;
1480 /*
1481 * iterate the clean list
1482 */
1483 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
1484 goto try_dirty_list;
1485 }
1486 while (!LIST_EMPTY(&local_iterblkhd)) {
1487
1488 bp = LIST_FIRST(&local_iterblkhd);
1489
1490 LIST_REMOVE(bp, b_vnbufs);
1491 LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
1492
1493 /*
1494 * some filesystems distinguish meta data blocks with a negative logical block #
1495 */
1496 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1497 continue;
1498
1499 aflags = BAC_REMOVE;
1500
1501 if ( !(flags & BUF_INVALIDATE_LOCKED) )
1502 aflags |= BAC_SKIP_LOCKED;
1503
1504 if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1505 if (error == EDEADLK)
1506 /*
1507 * this buffer was marked B_LOCKED...
 1508 * we didn't drop buf_mtxp, so
 1509 * we don't need to rescan
1510 */
1511 continue;
1512 if (error == EAGAIN) {
1513 /*
1514 * found a busy buffer... we blocked and
1515 * dropped buf_mtxp, so we're going to
1516 * need to rescan after this pass is completed
1517 */
1518 must_rescan++;
1519 continue;
1520 }
1521 /*
1522 * got some kind of 'real' error out of the msleep
1523 * in buf_acquire_locked, terminate the scan and return the error
1524 */
1525 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1526
1527 lck_mtx_unlock(buf_mtxp);
1528 return (error);
1529 }
1530 lck_mtx_unlock(buf_mtxp);
1531
1532 if (bp->b_flags & B_LOCKED)
1533 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);
1534
1535 CLR(bp->b_flags, B_LOCKED);
1536 SET(bp->b_flags, B_INVAL);
1537 buf_brelse(bp);
1538
1539 lck_mtx_lock(buf_mtxp);
1540
1541 /*
1542 * by dropping buf_mtxp, we allow new
1543 * buffers to be added to the vnode list(s)
1544 * we'll have to rescan at least once more
1545 * if the queues aren't empty
1546 */
1547 must_rescan++;
1548 }
1549 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1550
1551try_dirty_list:
1552 /*
1553 * Now iterate on dirty blks
1554 */
1555 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1556 continue;
1557 }
1558 while (!LIST_EMPTY(&local_iterblkhd)) {
1559 bp = LIST_FIRST(&local_iterblkhd);
1560
1561 LIST_REMOVE(bp, b_vnbufs);
1562 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1563
1564 /*
1565 * some filesystems distinguish meta data blocks with a negative logical block #
1566 */
1567 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1568 continue;
1569
1570 aflags = BAC_REMOVE;
1571
1572 if ( !(flags & BUF_INVALIDATE_LOCKED) )
1573 aflags |= BAC_SKIP_LOCKED;
1574
1575 if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1576 if (error == EDEADLK)
1577 /*
1578 * this buffer was marked B_LOCKED...
 1579 * we didn't drop buf_mtxp, so
 1580 * we don't need to rescan
1581 */
1582 continue;
1583 if (error == EAGAIN) {
1584 /*
1585 * found a busy buffer... we blocked and
1586 * dropped buf_mtxp, so we're going to
1587 * need to rescan after this pass is completed
1588 */
1589 must_rescan++;
1590 continue;
1591 }
1592 /*
1593 * got some kind of 'real' error out of the msleep
1594 * in buf_acquire_locked, terminate the scan and return the error
1595 */
1596 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1597
1598 lck_mtx_unlock(buf_mtxp);
1599 return (error);
1600 }
1601 lck_mtx_unlock(buf_mtxp);
1602
1603 if (bp->b_flags & B_LOCKED)
1604 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);
1605
1606 CLR(bp->b_flags, B_LOCKED);
1607 SET(bp->b_flags, B_INVAL);
1608
1609 if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1610 (void) VNOP_BWRITE(bp);
1611 else
1612 buf_brelse(bp);
1613
1614 lck_mtx_lock(buf_mtxp);
1615 /*
1616 * by dropping buf_mtxp, we allow new
1617 * buffers to be added to the vnode list(s)
1618 * we'll have to rescan at least once more
1619 * if the queues aren't empty
1620 */
1621 must_rescan++;
1622 }
1623 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1624 }
1625 lck_mtx_unlock(buf_mtxp);
1626
1627 return (0);
1628}
1629
1630void
1631buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {
1632
1633 (void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg);
1634 return;
1635}
1636
1637int
1638buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) {
1639 buf_t bp;
1640 int writes_issued = 0;
1641 errno_t error;
1642 int busy = 0;
1643 struct buflists local_iterblkhd;
1644 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1645 int any_locked = 0;
1646
1647 if (flags & BUF_SKIP_LOCKED)
1648 lock_flags |= BAC_SKIP_LOCKED;
1649 if (flags & BUF_SKIP_NONLOCKED)
1650 lock_flags |= BAC_SKIP_NONLOCKED;
1651loop:
1652 lck_mtx_lock(buf_mtxp);
1653
1654 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
1655 while (!LIST_EMPTY(&local_iterblkhd)) {
1656 bp = LIST_FIRST(&local_iterblkhd);
1657 LIST_REMOVE(bp, b_vnbufs);
1658 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1659
1660 if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) {
1661 busy++;
1662 }
1663 if (error) {
1664 /*
1665 * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED,
 1666 * we may want to do something differently if a locked or unlocked
1667 * buffer was encountered (depending on the arg specified).
1668 * In this case, we know that one of those two was set, and the
1669 * buf acquisition failed above.
1670 *
1671 * If it failed with EDEADLK, then save state which can be emitted
1672 * later on to the caller. Most callers should not care.
1673 */
1674 if (error == EDEADLK) {
1675 any_locked++;
1676 }
1677 continue;
1678 }
1679 lck_mtx_unlock(buf_mtxp);
1680
1681 bp->b_flags &= ~B_LOCKED;
1682
1683 /*
1684 * Wait for I/O associated with indirect blocks to complete,
1685 * since there is no way to quickly wait for them below.
1686 */
1687 if ((bp->b_vp == vp) || (wait == 0))
1688 (void) buf_bawrite(bp);
1689 else
1690 (void) VNOP_BWRITE(bp);
1691 writes_issued++;
1692
1693 lck_mtx_lock(buf_mtxp);
1694 }
1695 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1696 }
1697 lck_mtx_unlock(buf_mtxp);
1698
1699 if (wait) {
1700 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1701
1702 if (vp->v_dirtyblkhd.lh_first && busy) {
1703 /*
1704 * we had one or more BUSY buffers on
1705 * the dirtyblock list... most likely
1706 * these are due to delayed writes that
1707 * were moved to the bclean queue but
1708 * have not yet been 'written'.
1709 * if we issued some writes on the
1710 * previous pass, we try again immediately
1711 * if we didn't, we'll sleep for some time
1712 * to allow the state to change...
1713 */
1714 if (writes_issued == 0) {
1715 (void)tsleep((caddr_t)&vp->v_numoutput,
1716 PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1717 }
1718 writes_issued = 0;
1719 busy = 0;
1720
1721 goto loop;
1722 }
1723 }
1724
1725 return any_locked;
1726}
1727
1728
1729/*
1730 * called with buf_mtxp held...
1731 * this lock protects the queue manipulation
1732 */
1733static int
1734buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1735{
1736 struct buflists * listheadp;
1737
1738 if (flags & VBI_DIRTY)
1739 listheadp = &vp->v_dirtyblkhd;
1740 else
1741 listheadp = &vp->v_cleanblkhd;
1742
1743 while (vp->v_iterblkflags & VBI_ITER) {
1744 vp->v_iterblkflags |= VBI_ITERWANT;
1745 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
1746 }
1747 if (LIST_EMPTY(listheadp)) {
1748 LIST_INIT(iterheadp);
1749 return(EINVAL);
1750 }
1751 vp->v_iterblkflags |= VBI_ITER;
1752
1753 iterheadp->lh_first = listheadp->lh_first;
1754 listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1755 LIST_INIT(listheadp);
1756
1757 return(0);
1758}
1759
1760/*
1761 * called with buf_mtxp held...
1762 * this lock protects the queue manipulation
1763 */
1764static void
1765buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1766{
1767 struct buflists * listheadp;
1768 buf_t bp;
1769
1770 if (flags & VBI_DIRTY)
1771 listheadp = &vp->v_dirtyblkhd;
1772 else
1773 listheadp = &vp->v_cleanblkhd;
1774
1775 while (!LIST_EMPTY(iterheadp)) {
1776 bp = LIST_FIRST(iterheadp);
1777 LIST_REMOVE(bp, b_vnbufs);
1778 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1779 }
1780 vp->v_iterblkflags &= ~VBI_ITER;
1781
1782 if (vp->v_iterblkflags & VBI_ITERWANT) {
1783 vp->v_iterblkflags &= ~VBI_ITERWANT;
1784 wakeup(&vp->v_iterblkflags);
1785 }
1786}
1787
1788
1789static void
1790bremfree_locked(buf_t bp)
1791{
1792 struct bqueues *dp = NULL;
1793 int whichq;
1794
1795 whichq = bp->b_whichq;
1796
1797 if (whichq == -1) {
1798 if (bp->b_shadow_ref == 0)
1799 panic("bremfree_locked: %p not on freelist", bp);
1800 /*
1801 * there are clones pointing to 'bp'...
1802 * therefore, it was not put on a freelist
1803 * when buf_brelse was last called on 'bp'
1804 */
1805 return;
1806 }
1807 /*
1808 * We only calculate the head of the freelist when removing
1809 * the last element of the list as that is the only time that
1810 * it is needed (e.g. to reset the tail pointer).
1811 *
1812 * NB: This makes an assumption about how tailq's are implemented.
1813 */
1814 if (bp->b_freelist.tqe_next == NULL) {
1815 dp = &bufqueues[whichq];
1816
1817 if (dp->tqh_last != &bp->b_freelist.tqe_next)
1818 panic("bremfree: lost tail");
1819 }
1820 TAILQ_REMOVE(dp, bp, b_freelist);
1821
1822#if BALANCE_QUEUES
1823 bufqdec(whichq);
1824#endif
1825 if (whichq == BQ_LAUNDRY)
1826 blaundrycnt--;
1827
1828 bp->b_whichq = -1;
1829 bp->b_timestamp = 0;
1830 bp->b_shadow = 0;
1831}
1832
1833/*
1834 * Associate a buffer with a vnode.
1835 * buf_mtxp must be locked on entry
1836 */
1837static void
1838bgetvp_locked(vnode_t vp, buf_t bp)
1839{
1840
1841 if (bp->b_vp != vp)
1842 panic("bgetvp_locked: not free");
1843
1844 if (vp->v_type == VBLK || vp->v_type == VCHR)
1845 bp->b_dev = vp->v_rdev;
1846 else
1847 bp->b_dev = NODEV;
1848 /*
1849 * Insert onto list for new vnode.
1850 */
1851 bufinsvn(bp, &vp->v_cleanblkhd);
1852}
1853
1854/*
1855 * Disassociate a buffer from a vnode.
1856 * buf_mtxp must be locked on entry
1857 */
1858static void
1859brelvp_locked(buf_t bp)
1860{
1861 /*
1862 * Delete from old vnode list, if on one.
1863 */
1864 if (bp->b_vnbufs.le_next != NOLIST)
1865 bufremvn(bp);
1866
1867 bp->b_vp = (vnode_t)NULL;
1868}
1869
1870/*
1871 * Reassign a buffer from one vnode to another.
1872 * Used to assign file specific control information
1873 * (indirect blocks) to the vnode to which they belong.
1874 */
1875static void
1876buf_reassign(buf_t bp, vnode_t newvp)
1877{
1878 struct buflists *listheadp;
1879
1880 if (newvp == NULL) {
1881 printf("buf_reassign: NULL");
1882 return;
1883 }
1884 lck_mtx_lock_spin(buf_mtxp);
1885
1886 /*
1887 * Delete from old vnode list, if on one.
1888 */
1889 if (bp->b_vnbufs.le_next != NOLIST)
1890 bufremvn(bp);
1891 /*
1892 * If dirty, put on list of dirty buffers;
1893 * otherwise insert onto list of clean buffers.
1894 */
1895 if (ISSET(bp->b_flags, B_DELWRI))
1896 listheadp = &newvp->v_dirtyblkhd;
1897 else
1898 listheadp = &newvp->v_cleanblkhd;
1899 bufinsvn(bp, listheadp);
1900
1901 lck_mtx_unlock(buf_mtxp);
1902}
1903
1904static __inline__ void
1905bufhdrinit(buf_t bp)
1906{
1907 bzero((char *)bp, sizeof *bp);
1908 bp->b_dev = NODEV;
1909 bp->b_rcred = NOCRED;
1910 bp->b_wcred = NOCRED;
1911 bp->b_vnbufs.le_next = NOLIST;
1912 bp->b_flags = B_INVAL;
1913
1914 return;
1915}
1916
1917/*
1918 * Initialize buffers and hash links for buffers.
1919 */
1920__private_extern__ void
1921bufinit(void)
1922{
1923 buf_t bp;
1924 struct bqueues *dp;
1925 int i;
1926
1927 nbuf_headers = 0;
1928 /* Initialize the buffer queues ('freelists') and the hash table */
1929 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1930 TAILQ_INIT(dp);
1931 bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
1932
1933 buf_busycount = 0;
1934
1935 /* Initialize the buffer headers */
1936 for (i = 0; i < max_nbuf_headers; i++) {
1937 nbuf_headers++;
1938 bp = &buf_headers[i];
1939 bufhdrinit(bp);
1940
1941 BLISTNONE(bp);
1942 dp = &bufqueues[BQ_EMPTY];
1943 bp->b_whichq = BQ_EMPTY;
1944 bp->b_timestamp = buf_timestamp();
1945 binsheadfree(bp, dp, BQ_EMPTY);
1946 binshash(bp, &invalhash);
1947 }
1948 boot_nbuf_headers = nbuf_headers;
1949
1950 TAILQ_INIT(&iobufqueue);
1951 TAILQ_INIT(&delaybufqueue);
1952
1953 for (; i < nbuf_headers + niobuf_headers; i++) {
1954 bp = &buf_headers[i];
1955 bufhdrinit(bp);
1956 bp->b_whichq = -1;
1957 binsheadfree(bp, &iobufqueue, -1);
1958 }
1959
1960 /*
1961 * allocate lock group attribute and group
1962 */
1963 buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1964 buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1965
1966 /*
1967 * allocate the lock attribute
1968 */
1969 buf_mtx_attr = lck_attr_alloc_init();
1970
1971 /*
1972 * allocate and initialize mutex's for the buffer and iobuffer pools
1973 */
1974 buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1975 iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1976
1977 if (iobuffer_mtxp == NULL)
1978 panic("couldn't create iobuffer mutex");
1979
1980 if (buf_mtxp == NULL)
1981 panic("couldn't create buf mutex");
1982
1983 /*
1984 * allocate and initialize cluster specific global locks...
1985 */
1986 cluster_init();
1987
1988 printf("using %d buffer headers and %d cluster IO buffer headers\n",
1989 nbuf_headers, niobuf_headers);
1990
1991 /* Set up zones used by the buffer cache */
1992 bufzoneinit();
1993
1994 /* start the bcleanbuf() thread */
1995 bcleanbuf_thread_init();
1996
1997 /* Register a callout for relieving vm pressure */
1998 if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
1999 panic("Couldn't register buffer cache callout for vm pressure!\n");
2000 }
2001
2002#if BALANCE_QUEUES
2003 {
2004 static void bufq_balance_thread_init(void);
2005 /* create a thread to do dynamic buffer queue balancing */
2006 bufq_balance_thread_init();
2007 }
 2008#endif /* BALANCE_QUEUES */
2009}
2010
2011
2012
2013/*
2014 * Zones for the meta data buffers
2015 */
2016
2017#define MINMETA 512
2018#define MAXMETA 8192
2019
2020struct meta_zone_entry {
2021 zone_t mz_zone;
2022 vm_size_t mz_size;
2023 vm_size_t mz_max;
2024 const char *mz_name;
2025};
2026
2027struct meta_zone_entry meta_zones[] = {
2028 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2029 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
2030 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
2031 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2032 {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
2033 {NULL, 0, 0, "" } /* End */
2034};
2035
2036/*
2037 * Initialize the meta data zones
2038 */
2039static void
2040bufzoneinit(void)
2041{
2042 int i;
2043
2044 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2045 meta_zones[i].mz_zone =
2046 zinit(meta_zones[i].mz_size,
2047 meta_zones[i].mz_max,
2048 PAGE_SIZE,
2049 meta_zones[i].mz_name);
2050 zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE);
2051 }
2052 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2053 zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE);
2054}
2055
2056static __inline__ zone_t
2057getbufzone(size_t size)
2058{
2059 int i;
2060
2061 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2062 panic("getbufzone: incorect size = %lu", size);
2063
2064 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2065 if (meta_zones[i].mz_size >= size)
2066 break;
2067 }
2068
2069 return (meta_zones[i].mz_zone);
2070}
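/*
 * Editor's note (illustrative, not part of the original source):
 * getbufzone() hands back the first zone whose element size covers the
 * request, so a 1024-byte metadata buffer comes from "buf.1024" while a
 * 3072-byte request rounds up to "buf.4096".  Sizes must be 512-byte
 * multiples in the range [MINMETA, MAXMETA]:
 *
 *	zone_t z = getbufzone(3072);	(selects the "buf.4096" zone)
 */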
2071
2072
2073
2074static struct buf *
2075bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
2076{
2077 buf_t bp;
2078
2079 bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
2080
2081 /*
 2082 * If the buffer does not have valid data, start a read.
 2083 * Note that if the buffer is B_INVAL, buf_getblk() won't return it.
 2084 * Therefore, it's valid if its I/O has completed or been delayed.
2085 */
2086 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
2087 struct proc *p;
2088
2089 p = current_proc();
2090
2091 /* Start I/O for the buffer (keeping credentials). */
2092 SET(bp->b_flags, B_READ | async);
2093 if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
2094 kauth_cred_ref(cred);
2095 bp->b_rcred = cred;
2096 }
2097
2098 VNOP_STRATEGY(bp);
2099
2100 trace(TR_BREADMISS, pack(vp, size), blkno);
2101
2102 /* Pay for the read. */
2103 if (p && p->p_stats) {
2104 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock); /* XXX */
2105 OSAddAtomic64(size, &p->p_stats->ri_diskiobytes.ri_bytesread);
2106 }
2107
2108 if (async) {
2109 /*
2110 * since we asked for an ASYNC I/O
2111 * the biodone will do the brelse
2112 * we don't want to pass back a bp
2113 * that we don't 'own'
2114 */
2115 bp = NULL;
2116 }
2117 } else if (async) {
2118 buf_brelse(bp);
2119 bp = NULL;
2120 }
2121
2122 trace(TR_BREADHIT, pack(vp, size), blkno);
2123
2124 return (bp);
2125}
2126
2127/*
2128 * Perform the reads for buf_breadn() and buf_meta_breadn().
2129 * Trivial modification to the breada algorithm presented in Bach (p.55).
2130 */
2131static errno_t
2132do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
2133 int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
2134{
2135 buf_t bp;
2136 int i;
2137
2138 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
2139
2140 /*
2141 * For each of the read-ahead blocks, start a read, if necessary.
2142 */
2143 for (i = 0; i < nrablks; i++) {
2144 /* If it's in the cache, just go on to next one. */
2145 if (incore(vp, rablks[i]))
2146 continue;
2147
2148 /* Get a buffer for the read-ahead block */
2149 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
2150 }
2151
2152 /* Otherwise, we had to start a read for it; wait until it's valid. */
2153 return (buf_biowait(bp));
2154}
2155
2156
2157/*
2158 * Read a disk block.
 2159 * This algorithm is described in Bach (p.54).
2160 */
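/*
 * Editor's note: a minimal, hedged usage sketch of the classic bread/brelse
 * pairing (not part of the original source; vp, blkno and blksize are
 * assumed to come from the caller):
 *
 *	buf_t	bp;
 *	char	*datap;
 *	errno_t	error;
 *
 *	if ((error = buf_bread(vp, blkno, blksize, NOCRED, &bp))) {
 *		buf_brelse(bp);		(a buffer is returned even on error)
 *		return (error);
 *	}
 *	datap = (char *)buf_dataptr(bp);
 *	... use 'datap' for up to buf_count(bp) bytes ...
 *	buf_brelse(bp);
 */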
2161errno_t
2162buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2163{
2164 buf_t bp;
2165
2166 /* Get buffer for block. */
2167 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
2168
2169 /* Wait for the read to complete, and return result. */
2170 return (buf_biowait(bp));
2171}
2172
2173/*
2174 * Read a disk block. [bread() for meta-data]
 2175 * This algorithm is described in Bach (p.54).
2176 */
2177errno_t
2178buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2179{
2180 buf_t bp;
2181
2182 /* Get buffer for block. */
2183 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
2184
2185 /* Wait for the read to complete, and return result. */
2186 return (buf_biowait(bp));
2187}
2188
2189/*
2190 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2191 */
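/*
 * Editor's note: hedged sketch (not part of the original source).  The
 * caller passes parallel arrays of read-ahead block numbers and sizes;
 * only the first block is read synchronously and waited for:
 *
 *	daddr64_t rablks[2] = { blkno + 1, blkno + 2 };
 *	int	  rasizes[2] = { blksize, blksize };
 *	buf_t	  bp;
 *
 *	error = buf_breadn(vp, blkno, blksize, rablks, rasizes, 2, NOCRED, &bp);
 */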
2192errno_t
2193buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2194{
2195 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
2196}
2197
2198/*
2199 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2200 * [buf_breadn() for meta-data]
2201 */
2202errno_t
2203buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2204{
2205 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
2206}
2207
2208/*
2209 * Block write. Described in Bach (p.56)
2210 */
2211errno_t
2212buf_bwrite(buf_t bp)
2213{
2214 int sync, wasdelayed;
2215 errno_t rv;
2216 proc_t p = current_proc();
2217 vnode_t vp = bp->b_vp;
2218
2219 if (bp->b_datap == 0) {
2220 if (brecover_data(bp) == 0)
2221 return (0);
2222 }
2223 /* Remember buffer type, to switch on it later. */
2224 sync = !ISSET(bp->b_flags, B_ASYNC);
2225 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
2226 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
2227
2228 if (wasdelayed)
2229 OSAddAtomicLong(-1, &nbdwrite);
2230
2231 if (!sync) {
2232 /*
2233 * If not synchronous, pay for the I/O operation and make
2234 * sure the buf is on the correct vnode queue. We have
2235 * to do this now, because if we don't, the vnode may not
2236 * be properly notified that its I/O has completed.
2237 */
2238 if (wasdelayed)
2239 buf_reassign(bp, vp);
2240 else
2241 if (p && p->p_stats) {
2242 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2243 OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten);
2244 }
2245 }
2246 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
2247
2248 /* Initiate disk write. Make sure the appropriate party is charged. */
2249
2250 OSAddAtomic(1, &vp->v_numoutput);
2251
2252 VNOP_STRATEGY(bp);
2253
2254 if (sync) {
2255 /*
2256 * If I/O was synchronous, wait for it to complete.
2257 */
2258 rv = buf_biowait(bp);
2259
2260 /*
2261 * Pay for the I/O operation, if it's not been paid for, and
2262 * make sure it's on the correct vnode queue. (async operations
2263 * were paid for above.)
2264 */
2265 if (wasdelayed)
2266 buf_reassign(bp, vp);
2267 else
2268 if (p && p->p_stats) {
2269 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2270 OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten);
2271 }
2272
2273 /* Release the buffer. */
2274 // XXXdbg - only if the unused bit is set
2275 if (!ISSET(bp->b_flags, B_NORELSE)) {
2276 buf_brelse(bp);
2277 } else {
2278 CLR(bp->b_flags, B_NORELSE);
2279 }
2280
2281 return (rv);
2282 } else {
2283 return (0);
2284 }
2285}
2286
2287int
2288vn_bwrite(struct vnop_bwrite_args *ap)
2289{
2290 return (buf_bwrite(ap->a_bp));
2291}
2292
2293/*
2294 * Delayed write.
2295 *
2296 * The buffer is marked dirty, but is not queued for I/O.
2297 * This routine should be used when the buffer is expected
2298 * to be modified again soon, typically a small write that
2299 * partially fills a buffer.
2300 *
2301 * NB: magnetic tapes cannot be delayed; they must be
2302 * written in the order that the writes are requested.
2303 *
2304 * Described in Leffler, et al. (pp. 208-213).
2305 *
2306 * Note: With the ability to allocate additional buffer
2307 * headers, we can get into a situation where "too many"
2308 * buf_bdwrite()s let the kernel create buffers faster than the
2309 * disks can service them. Doing a buf_bawrite() in
2310 * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
2311 */
2312__private_extern__ int
2313bdwrite_internal(buf_t bp, int return_error)
2314{
2315 proc_t p = current_proc();
2316 vnode_t vp = bp->b_vp;
2317
2318 /*
2319 * If the block hasn't been seen before:
2320 * (1) Mark it as having been seen,
2321 * (2) Charge for the write.
2322 * (3) Make sure it's on its vnode's correct block list,
2323 */
2324 if (!ISSET(bp->b_flags, B_DELWRI)) {
2325 SET(bp->b_flags, B_DELWRI);
2326 if (p && p->p_stats) {
2327 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2328 OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten);
2329 }
2330 OSAddAtomicLong(1, &nbdwrite);
2331 buf_reassign(bp, vp);
2332 }
2333
2334 /*
2335 * If we're not LOCKED, but the total number of delayed writes
2336 * has climbed above 75% of the total buffers in the system,
2337 * return an error if the caller has indicated that it can
2338 * handle one in this case; otherwise schedule the I/O now.
2339 * This is done to prevent us from allocating tons of extra
2340 * buffers when dealing with virtual disks (e.g. DiskImages),
2341 * because additional buffers are dynamically allocated to prevent
2342 * deadlocks from occurring.
2343 *
2344 * However, we can't do a buf_bawrite() if the LOCKED bit is set, because
2345 * the buffer is part of a transaction and can't go to disk until
2346 * the LOCKED bit is cleared.
2347 */
2348 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers/4)*3)) {
2349 if (return_error)
2350 return (EAGAIN);
2351 /*
2352 * If the vnode has "too many" write operations in progress
2353 * wait for them to finish the IO
2354 */
2355 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");
2356
2357 return (buf_bawrite(bp));
2358 }
2359
2360 /* Otherwise, the "write" is done, so mark and release the buffer. */
2361 SET(bp->b_flags, B_DONE);
2362 buf_brelse(bp);
2363 return (0);
2364}
2365
2366errno_t
2367buf_bdwrite(buf_t bp)
2368{
2369 return (bdwrite_internal(bp, 0));
2370}
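
/*
 * Illustrative sketch (the vnode, block number, 4096-byte block size and
 * the byte being modified are assumptions for the example): a hypothetical
 * small, partial update that uses buf_bdwrite(), above, to defer the disk
 * I/O instead of writing synchronously.
 */
#if 0	/* example only */
static errno_t
example_delayed_write(vnode_t vp, daddr64_t blkno)
{
	buf_t bp;
	char *datap;

	/* gather the block for writing; bp is returned busy */
	bp = buf_getblk(vp, blkno, 4096, 0, 0, BLK_WRITE);
	datap = (char *)buf_dataptr(bp);

	datap[0] = 0x7f;	/* small change that only partially fills the buffer */

	/* mark dirty and release; the actual write happens later */
	return (buf_bdwrite(bp));
}
#endif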
2371
2372
2373/*
2374 * Asynchronous block write; just an asynchronous buf_bwrite().
2375 *
2376 * Note: With the ability to allocate additional buffer
2377 * headers, we can get into a situation where "too many"
2378 * buf_bawrite()s let the kernel create buffers faster than
2379 * the disks can service them.
2380 * We limit the number of "in flight" writes a vnode can have to
2381 * avoid this.
2382 */
2383static int
2384bawrite_internal(buf_t bp, int throttle)
2385{
2386 vnode_t vp = bp->b_vp;
2387
2388 if (vp) {
2389 if (throttle)
2390 /*
2391 * If the vnode has "too many" write operations in progress
2392 * wait for them to finish the IO
2393 */
2394 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
2395 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
2396 /*
2397 * return to the caller and
2398 * let him decide what to do
2399 */
2400 return (EWOULDBLOCK);
2401 }
2402 SET(bp->b_flags, B_ASYNC);
2403
2404 return (VNOP_BWRITE(bp));
2405}
2406
2407errno_t
2408buf_bawrite(buf_t bp)
2409{
2410 return (bawrite_internal(bp, 1));
2411}
2412
2413
2414
2415static void
2416buf_free_meta_store(buf_t bp)
2417{
2418 if (bp->b_bufsize) {
2419 if (ISSET(bp->b_flags, B_ZALLOC)) {
2420 zone_t z;
2421
2422 z = getbufzone(bp->b_bufsize);
2423 zfree(z, (void *)bp->b_datap);
2424 } else
2425 kmem_free(kernel_map, bp->b_datap, bp->b_bufsize);
2426
2427 bp->b_datap = (uintptr_t)NULL;
2428 bp->b_bufsize = 0;
2429 }
2430}
2431
2432
2433static buf_t
2434buf_brelse_shadow(buf_t bp)
2435{
2436 buf_t bp_head;
2437 buf_t bp_temp;
2438 buf_t bp_return = NULL;
2439#ifdef BUF_MAKE_PRIVATE
2440 buf_t bp_data;
2441 int data_ref = 0;
2442#endif
2443 int need_wakeup = 0;
2444
2445 lck_mtx_lock_spin(buf_mtxp);
2446
2447 bp_head = (buf_t)bp->b_orig;
2448
2449 if (bp_head->b_whichq != -1)
2450 panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq);
2451
2452#ifdef BUF_MAKE_PRIVATE
2453 if (bp_data = bp->b_data_store) {
2454 bp_data->b_data_ref--;
2455 /*
2456 * snapshot the ref count so that we can check it
2457 * outside of the lock... we only want the guy going
2458 * from 1 -> 0 to try and release the storage
2459 */
2460 data_ref = bp_data->b_data_ref;
2461 }
2462#endif
2463 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0);
2464
2465 bp_head->b_shadow_ref--;
2466
2467 for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow);
2468
2469 if (bp_temp == NULL)
2470 panic("buf_brelse_shadow: bp not on list %p", bp_head);
2471
2472 bp_temp->b_shadow = bp_temp->b_shadow->b_shadow;
2473
2474#ifdef BUF_MAKE_PRIVATE
2475 /*
2476 * we're about to free the current 'owner' of the data buffer and
2477 * there is at least one other shadow buf_t still pointing at it
2478 * so transfer it to the first shadow buf left in the chain
2479 */
2480 if (bp == bp_data && data_ref) {
2481 if ((bp_data = bp_head->b_shadow) == NULL)
2482 panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp);
2483
2484 for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow)
2485 bp_temp->b_data_store = bp_data;
2486 bp_data->b_data_ref = data_ref;
2487 }
2488#endif
2489 if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow)
2490 panic("buf_relse_shadow: b_shadow != NULL && b_shadow_ref == 0 bp(%p)", bp);
2491 if (bp_head->b_shadow_ref && bp_head->b_shadow == 0)
2492 panic("buf_relse_shadow: b_shadow == NULL && b_shadow_ref != 0 bp(%p)", bp);
2493
2494 if (bp_head->b_shadow_ref == 0) {
2495 if (!ISSET(bp_head->b_lflags, BL_BUSY)) {
2496
2497 CLR(bp_head->b_flags, B_AGE);
2498 bp_head->b_timestamp = buf_timestamp();
2499
2500 if (ISSET(bp_head->b_flags, B_LOCKED)) {
2501 bp_head->b_whichq = BQ_LOCKED;
2502 binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED);
2503 } else {
2504 bp_head->b_whichq = BQ_META;
2505 binstailfree(bp_head, &bufqueues[BQ_META], BQ_META);
2506 }
2507 } else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) {
2508 CLR(bp_head->b_lflags, BL_WAITSHADOW);
2509
2510 bp_return = bp_head;
2511 }
2512 if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) {
2513 CLR(bp_head->b_lflags, BL_WANTED_REF);
2514 need_wakeup = 1;
2515 }
2516 }
2517 lck_mtx_unlock(buf_mtxp);
2518
2519 if (need_wakeup)
2520 wakeup(bp_head);
2521
2522#ifdef BUF_MAKE_PRIVATE
2523 if (bp == bp_data && data_ref == 0)
2524 buf_free_meta_store(bp);
2525
2526 bp->b_data_store = NULL;
2527#endif
2528 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0);
2529
2530 return (bp_return);
2531}
2532
2533
2534/*
2535 * Release a buffer on to the free lists.
2536 * Described in Bach (p. 46).
2537 */
2538void
2539buf_brelse(buf_t bp)
2540{
2541 struct bqueues *bufq;
2542 long whichq;
2543 upl_t upl;
2544 int need_wakeup = 0;
2545 int need_bp_wakeup = 0;
2546
2547
2548 if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
2549 panic("buf_brelse: bad buffer = %p\n", bp);
2550
2551#ifdef JOE_DEBUG
2552 (void) OSBacktrace(&bp->b_stackbrelse[0], 6);
2553
2554 bp->b_lastbrelse = current_thread();
2555 bp->b_tag = 0;
2556#endif
2557 if (bp->b_lflags & BL_IOBUF) {
2558 buf_t shadow_master_bp = NULL;
2559
2560 if (ISSET(bp->b_lflags, BL_SHADOW))
2561 shadow_master_bp = buf_brelse_shadow(bp);
2562 else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC))
2563 buf_free_meta_store(bp);
2564 free_io_buf(bp);
2565
2566 if (shadow_master_bp) {
2567 bp = shadow_master_bp;
2568 goto finish_shadow_master;
2569 }
2570 return;
2571 }
2572
2573 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
2574 bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap,
2575 bp->b_flags, 0);
2576
2577 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2578
2579 /*
2580 * if we're invalidating a buffer that has the B_FILTER bit
2581 * set then call the b_iodone function so it gets cleaned
2582 * up properly.
2583 *
2584 * the HFS journal code depends on this
2585 */
2586 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
2587 if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */
2588 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
2589 void *arg = bp->b_transaction;
2590
2591 CLR(bp->b_flags, B_FILTER); /* but note callout done */
2592 bp->b_iodone = NULL;
2593 bp->b_transaction = NULL;
2594
2595 if (iodone_func == NULL) {
2596 panic("brelse: bp @ %p has NULL b_iodone!\n", bp);
2597 }
2598 (*iodone_func)(bp, arg);
2599 }
2600 }
2601 /*
2602 * I/O is done. Cleanup the UPL state
2603 */
2604 upl = bp->b_upl;
2605
2606 if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2607 kern_return_t kret;
2608 int upl_flags;
2609
2610 if (upl == NULL) {
2611 if ( !ISSET(bp->b_flags, B_INVAL)) {
2612 kret = ubc_create_upl(bp->b_vp,
2613 ubc_blktooff(bp->b_vp, bp->b_lblkno),
2614 bp->b_bufsize,
2615 &upl,
2616 NULL,
2617 UPL_PRECIOUS);
2618
2619 if (kret != KERN_SUCCESS)
2620 panic("brelse: Failed to create UPL");
2621#if UPL_DEBUG
2622 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5);
2623#endif /* UPL_DEBUG */
2624 }
2625 } else {
2626 if (bp->b_datap) {
2627 kret = ubc_upl_unmap(upl);
2628
2629 if (kret != KERN_SUCCESS)
2630 panic("ubc_upl_unmap failed");
2631 bp->b_datap = (uintptr_t)NULL;
2632 }
2633 }
2634 if (upl) {
2635 if (bp->b_flags & (B_ERROR | B_INVAL)) {
2636 if (bp->b_flags & (B_READ | B_INVAL))
2637 upl_flags = UPL_ABORT_DUMP_PAGES;
2638 else
2639 upl_flags = 0;
2640
2641 ubc_upl_abort(upl, upl_flags);
2642 } else {
2643 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
2644 upl_flags = UPL_COMMIT_SET_DIRTY ;
2645 else
2646 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
2647
2648 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
2649 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2650 }
2651 bp->b_upl = NULL;
2652 }
2653 } else {
2654 if ( (upl) )
2655 panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp);
2656 }
2657
2658 /*
2659 * If it's locked, don't report an error; try again later.
2660 */
2661 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
2662 CLR(bp->b_flags, B_ERROR);
2663 /*
2664 * If it's not cacheable, or an error, mark it invalid.
2665 */
2666 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
2667 SET(bp->b_flags, B_INVAL);
2668
2669 if ((bp->b_bufsize <= 0) ||
2670 ISSET(bp->b_flags, B_INVAL) ||
2671 (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) {
2672
2673 boolean_t delayed_buf_free_meta_store = FALSE;
2674
2675 /*
2676 * If it's invalid or empty, dissociate it from its vnode,
2677 * release its storage if B_META, clean it up a bit,
2678 * and put it on the EMPTY queue
2679 */
2680 if (ISSET(bp->b_flags, B_DELWRI))
2681 OSAddAtomicLong(-1, &nbdwrite);
2682
2683 if (ISSET(bp->b_flags, B_META)) {
2684 if (bp->b_shadow_ref)
2685 delayed_buf_free_meta_store = TRUE;
2686 else
2687 buf_free_meta_store(bp);
2688 }
2689 /*
2690 * nuke any credentials we were holding
2691 */
2692 buf_release_credentials(bp);
2693
2694 lck_mtx_lock_spin(buf_mtxp);
2695
2696 if (bp->b_shadow_ref) {
2697 SET(bp->b_lflags, BL_WAITSHADOW);
2698
2699 lck_mtx_unlock(buf_mtxp);
2700
2701 return;
2702 }
2703 if (delayed_buf_free_meta_store == TRUE) {
2704
2705 lck_mtx_unlock(buf_mtxp);
2706finish_shadow_master:
2707 buf_free_meta_store(bp);
2708
2709 lck_mtx_lock_spin(buf_mtxp);
2710 }
2711 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
2712
2713 if (bp->b_vp)
2714 brelvp_locked(bp);
2715
2716 bremhash(bp);
2717 BLISTNONE(bp);
2718 binshash(bp, &invalhash);
2719
2720 bp->b_whichq = BQ_EMPTY;
2721 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2722 } else {
2723
2724 /*
2725 * It has valid data. Put it on the end of the appropriate
2726 * queue, so that it'll stick around for as long as possible.
2727 */
2728 if (ISSET(bp->b_flags, B_LOCKED))
2729 whichq = BQ_LOCKED; /* locked in core */
2730 else if (ISSET(bp->b_flags, B_META))
2731 whichq = BQ_META; /* meta-data */
2732 else if (ISSET(bp->b_flags, B_AGE))
2733 whichq = BQ_AGE; /* stale but valid data */
2734 else
2735 whichq = BQ_LRU; /* valid data */
2736 bufq = &bufqueues[whichq];
2737
2738 bp->b_timestamp = buf_timestamp();
2739
2740 lck_mtx_lock_spin(buf_mtxp);
2741
2742 /*
2743 * the buf_brelse_shadow routine doesn't take 'ownership'
2744 * of the parent buf_t... it updates state that is protected by
2745 * the buf_mtxp, and checks for BL_BUSY to determine whether to
2746 * put the buf_t back on a free list. b_shadow_ref is protected
2747 * by the lock, and since we have not yet cleared B_BUSY, we need
2748 * to check it while holding the lock to insure that one of us
2749 * puts this buf_t back on a free list when it is safe to do so
2750 */
2751 if (bp->b_shadow_ref == 0) {
2752 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
2753 bp->b_whichq = whichq;
2754 binstailfree(bp, bufq, whichq);
2755 } else {
2756 /*
2757 * there are still cloned buf_t's pointing
2758 * at this guy... need to keep it off the
2759 * freelists until a buf_brelse is done on
2760 * the last clone
2761 */
2762 CLR(bp->b_flags, (B_ASYNC | B_NOCACHE));
2763 }
2764 }
2765 if (needbuffer) {
2766 /*
2767 * needbuffer is a global
2768 * we're currently using buf_mtxp to protect it
2769 * delay doing the actual wakeup until after
2770 * we drop buf_mtxp
2771 */
2772 needbuffer = 0;
2773 need_wakeup = 1;
2774 }
2775 if (ISSET(bp->b_lflags, BL_WANTED)) {
2776 /*
2777 * delay the actual wakeup until after we
2778 * clear BL_BUSY and we've dropped buf_mtxp
2779 */
2780 need_bp_wakeup = 1;
2781 }
2782 /*
2783 * Unlock the buffer.
2784 */
2785 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2786 buf_busycount--;
2787
2788 lck_mtx_unlock(buf_mtxp);
2789
2790 if (need_wakeup) {
2791 /*
2792 * Wake up any processes waiting for any buffer to become free.
2793 */
2794 wakeup(&needbuffer);
2795 }
2796 if (need_bp_wakeup) {
2797 /*
2798 * Wake up any processes waiting for _this_ buffer to become free.
2799 */
2800 wakeup(bp);
2801 }
2802 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2803 bp, bp->b_datap, bp->b_flags, 0, 0);
2804}
2805
2806/*
2807 * Determine if a block is in the cache.
2808 * Just look on what would be its hash chain. If it's there, return
2809 * a pointer to it, unless it's marked invalid. If it's marked invalid,
2810 * we normally don't return the buffer, unless the caller explicitly
2811 * wants us to.
2812 */
2813static boolean_t
2814incore(vnode_t vp, daddr64_t blkno)
2815{
2816 boolean_t retval;
2817 struct bufhashhdr *dp;
2818
2819 dp = BUFHASH(vp, blkno);
2820
2821 lck_mtx_lock_spin(buf_mtxp);
2822
2823 if (incore_locked(vp, blkno, dp))
2824 retval = TRUE;
2825 else
2826 retval = FALSE;
2827 lck_mtx_unlock(buf_mtxp);
2828
2829 return (retval);
2830}
2831
2832
2833static buf_t
2834incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp)
2835{
2836 struct buf *bp;
2837
2838 /* Search hash chain */
2839 for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
2840 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2841 !ISSET(bp->b_flags, B_INVAL)) {
2842 return (bp);
2843 }
2844 }
2845 return (NULL);
2846}
2847
2848
2849void
2850buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno)
2851{
2852 buf_t bp;
2853 struct bufhashhdr *dp;
2854
2855 dp = BUFHASH(vp, blkno);
2856
2857 lck_mtx_lock_spin(buf_mtxp);
2858
2859 for (;;) {
2860 if ((bp = incore_locked(vp, blkno, dp)) == NULL)
2861 break;
2862
2863 if (bp->b_shadow_ref == 0)
2864 break;
2865
2866 SET(bp->b_lflags, BL_WANTED_REF);
2867
2868 (void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO+1), "buf_wait_for_shadow", NULL);
2869 }
2870 lck_mtx_unlock(buf_mtxp);
2871}
2872
2873/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2874/*
2875 * Get a block of requested size that is associated with
2876 * a given vnode and block offset. If it is found in the
2877 * block cache, mark it as having been found, make it busy
2878 * and return it. Otherwise, return an empty block of the
2879 * correct size. It is up to the caller to ensure that the
2880 * cached blocks are of the correct size.
2881 */
2882buf_t
2883buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2884{
2885 buf_t bp;
2886 int err;
2887 upl_t upl;
2888 upl_page_info_t *pl;
2889 kern_return_t kret;
2890 int ret_only_valid;
2891 struct timespec ts;
2892 int upl_flags;
2893 struct bufhashhdr *dp;
2894
2895 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2896 (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0);
2897
2898 ret_only_valid = operation & BLK_ONLYVALID;
2899 operation &= ~BLK_ONLYVALID;
2900 dp = BUFHASH(vp, blkno);
2901start:
2902 lck_mtx_lock_spin(buf_mtxp);
2903
2904 if ((bp = incore_locked(vp, blkno, dp))) {
2905 /*
2906 * Found in the Buffer Cache
2907 */
2908 if (ISSET(bp->b_lflags, BL_BUSY)) {
2909 /*
2910 * but is busy
2911 */
2912 switch (operation) {
2913 case BLK_READ:
2914 case BLK_WRITE:
2915 case BLK_META:
2916 SET(bp->b_lflags, BL_WANTED);
2917 bufstats.bufs_busyincore++;
2918
2919 /*
2920 * don't retake the mutex after being awakened...
2921 * the time out is in msecs
2922 */
2923 ts.tv_sec = (slptimeo/1000);
2924 ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000;
2925
2926 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE,
2927 (uintptr_t)blkno, size, operation, 0, 0);
2928
2929 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2930
2931 /*
2932 * Callers who call with PCATCH or timeout are
2933 * willing to deal with the NULL pointer
2934 */
2935 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2936 return (NULL);
2937 goto start;
2938 /*NOTREACHED*/
2939 break;
2940
2941 default:
2942 /*
2943 * unknown operation requested
2944 */
2945 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2946 /*NOTREACHED*/
2947 break;
2948 }
2949 } else {
2950 /*
2951 * buffer in core and not busy
2952 */
2953 SET(bp->b_lflags, BL_BUSY);
2954 SET(bp->b_flags, B_CACHE);
2955 buf_busycount++;
2956
2957 bremfree_locked(bp);
2958 bufstats.bufs_incore++;
2959
2960 lck_mtx_unlock(buf_mtxp);
2961#ifdef JOE_DEBUG
2962 bp->b_owner = current_thread();
2963 bp->b_tag = 1;
2964#endif
2965 if ( (bp->b_upl) )
2966 panic("buffer has UPL, but not marked BUSY: %p", bp);
2967
2968 if ( !ret_only_valid && bp->b_bufsize != size)
2969 allocbuf(bp, size);
2970
2971 upl_flags = 0;
2972 switch (operation) {
2973 case BLK_WRITE:
2974 /*
2975 * "write" operation: let the UPL subsystem
2976 * know that we intend to modify the buffer
2977 * cache pages we're gathering.
2978 */
2979 upl_flags |= UPL_WILL_MODIFY;
2980 case BLK_READ:
2981 upl_flags |= UPL_PRECIOUS;
2982 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2983 kret = ubc_create_upl(vp,
2984 ubc_blktooff(vp, bp->b_lblkno),
2985 bp->b_bufsize,
2986 &upl,
2987 &pl,
2988 upl_flags);
2989 if (kret != KERN_SUCCESS)
2990 panic("Failed to create UPL");
2991
2992 bp->b_upl = upl;
2993
2994 if (upl_valid_page(pl, 0)) {
2995 if (upl_dirty_page(pl, 0))
2996 SET(bp->b_flags, B_WASDIRTY);
2997 else
2998 CLR(bp->b_flags, B_WASDIRTY);
2999 } else
3000 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
3001
3002 kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap));
3003
3004 if (kret != KERN_SUCCESS)
3005 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3006 }
3007 break;
3008
3009 case BLK_META:
3010 /*
3011 * VM is not involved in I/O for the meta data buffer;
3012 * it already has valid data
3013 */
3014 break;
3015
3016 default:
3017 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
3018 /*NOTREACHED*/
3019 break;
3020 }
3021 }
3022 } else { /* not incore() */
3023 int queue = BQ_EMPTY; /* Start with no preference */
3024
3025 if (ret_only_valid) {
3026 lck_mtx_unlock(buf_mtxp);
3027 return (NULL);
3028 }
3029 if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/)
3030 operation = BLK_META;
3031
3032 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
3033 goto start;
3034
3035 /*
3036 * getnewbuf may block for a number of different reasons...
3037 * if it does, it's then possible for someone else to
3038 * create a buffer for the same block and insert it into
3039 * the hash... if we see it incore at this point we dump
3040 * the buffer we were working on and start over
3041 */
3042 if (incore_locked(vp, blkno, dp)) {
3043 SET(bp->b_flags, B_INVAL);
3044 binshash(bp, &invalhash);
3045
3046 lck_mtx_unlock(buf_mtxp);
3047
3048 buf_brelse(bp);
3049 goto start;
3050 }
3051 /*
3052 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
3053 * CALLED! BE CAREFUL.
3054 */
3055
3056 /*
3057 * mark the buffer as B_META if indicated
3058 * so that when the buffer is released it will go to the META queue
3059 */
3060 if (operation == BLK_META)
3061 SET(bp->b_flags, B_META);
3062
3063 bp->b_blkno = bp->b_lblkno = blkno;
3064 bp->b_vp = vp;
3065
3066 /*
3067 * Insert in the hash so that incore() can find it
3068 */
3069 binshash(bp, BUFHASH(vp, blkno));
3070
3071 bgetvp_locked(vp, bp);
3072
3073 lck_mtx_unlock(buf_mtxp);
3074
3075 allocbuf(bp, size);
3076
3077 upl_flags = 0;
3078 switch (operation) {
3079 case BLK_META:
3080 /*
3081 * buffer data is invalid...
3082 *
3083 * I don't want to have to retake buf_mtxp,
3084 * so the miss and vmhits counters are done
3085 * with Atomic updates... all other counters
3086 * in bufstats are protected with either
3087 * buf_mtxp or iobuffer_mtxp
3088 */
3089 OSAddAtomicLong(1, &bufstats.bufs_miss);
3090 break;
3091
3092 case BLK_WRITE:
3093 /*
3094 * "write" operation: let the UPL subsystem know
3095 * that we intend to modify the buffer cache pages
3096 * we're gathering.
3097 */
3098 upl_flags |= UPL_WILL_MODIFY;
3099 case BLK_READ:
3100 { off_t f_offset;
3101 size_t contig_bytes;
3102 int bmap_flags;
3103
3104 if ( (bp->b_upl) )
3105 panic("bp already has UPL: %p",bp);
3106
3107 f_offset = ubc_blktooff(vp, blkno);
3108
3109 upl_flags |= UPL_PRECIOUS;
3110 kret = ubc_create_upl(vp,
3111 f_offset,
3112 bp->b_bufsize,
3113 &upl,
3114 &pl,
3115 upl_flags);
3116
3117 if (kret != KERN_SUCCESS)
3118 panic("Failed to create UPL");
3119#if UPL_DEBUG
3120 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4);
3121#endif /* UPL_DEBUG */
3122 bp->b_upl = upl;
3123
3124 if (upl_valid_page(pl, 0)) {
3125
3126 if (operation == BLK_READ)
3127 bmap_flags = VNODE_READ;
3128 else
3129 bmap_flags = VNODE_WRITE;
3130
3131 SET(bp->b_flags, B_CACHE | B_DONE);
3132
3133 OSAddAtomicLong(1, &bufstats.bufs_vmhits);
3134
3135 bp->b_validoff = 0;
3136 bp->b_dirtyoff = 0;
3137
3138 if (upl_dirty_page(pl, 0)) {
3139 /* page is dirty */
3140 SET(bp->b_flags, B_WASDIRTY);
3141
3142 bp->b_validend = bp->b_bcount;
3143 bp->b_dirtyend = bp->b_bcount;
3144 } else {
3145 /* page is clean */
3146 bp->b_validend = bp->b_bcount;
3147 bp->b_dirtyend = 0;
3148 }
3149 /*
3150 * try to recreate the physical block number associated with
3151 * this buffer...
3152 */
3153 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
3154 panic("getblk: VNOP_BLOCKMAP failed");
3155 /*
3156 * if the extent represented by this buffer
3157 * is not completely physically contiguous on
3158 * disk, then we can't cache the physical mapping
3159 * in the buffer header
3160 */
3161 if ((long)contig_bytes < bp->b_bcount)
3162 bp->b_blkno = bp->b_lblkno;
3163 } else {
3164 OSAddAtomicLong(1, &bufstats.bufs_miss);
3165 }
3166 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
3167
3168 if (kret != KERN_SUCCESS)
3169 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3170 break;
3171 }
3172 default:
3173 panic("getblk: paging or unknown operation - %x", operation);
3174 /*NOTREACHED*/
3175 break;
3176 }
3177 }
3178 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
3179 bp, bp->b_datap, bp->b_flags, 3, 0);
3180
3181#ifdef JOE_DEBUG
3182 (void) OSBacktrace(&bp->b_stackgetblk[0], 6);
3183#endif
3184 return (bp);
3185}
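
/*
 * Illustrative sketch (the helper name is an assumption): BLK_ONLYVALID
 * turns buf_getblk(), above, into a pure cache lookup -- when the block is
 * not already incore it returns NULL instead of allocating a new buffer.
 */
#if 0	/* example only */
static buf_t
example_lookup_cached_block(vnode_t vp, daddr64_t blkno, int size)
{
	/* returns the buffer busy if it is cached and valid, NULL otherwise */
	return (buf_getblk(vp, blkno, size, 0, 0, BLK_META | BLK_ONLYVALID));
}
#endif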
3186
3187/*
3188 * Get an empty, disassociated buffer of given size.
3189 */
3190buf_t
3191buf_geteblk(int size)
3192{
3193 buf_t bp = NULL;
3194 int queue = BQ_EMPTY;
3195
3196 do {
3197 lck_mtx_lock_spin(buf_mtxp);
3198
3199 bp = getnewbuf(0, 0, &queue);
3200 } while (bp == NULL);
3201
3202 SET(bp->b_flags, (B_META|B_INVAL));
3203
3204#if DIAGNOSTIC
3205 assert(queue == BQ_EMPTY);
3206#endif /* DIAGNOSTIC */
3207 /* XXX need to implement logic to deal with other queues */
3208
3209 binshash(bp, &invalhash);
3210 bufstats.bufs_eblk++;
3211
3212 lck_mtx_unlock(buf_mtxp);
3213
3214 allocbuf(bp, size);
3215
3216 return (bp);
3217}
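
/*
 * Illustrative sketch (the 8192-byte size is an assumption): buf_geteblk(),
 * above, never fails; it hands back a private scratch buffer that is not
 * associated with any vnode and is released with buf_brelse().
 */
#if 0	/* example only */
static void
example_scratch_buffer(void)
{
	buf_t bp = buf_geteblk(8192);

	/* use buf_dataptr(bp) as temporary kernel storage ... */

	buf_brelse(bp);		/* B_INVAL is set, so it returns to the empty pool */
}
#endif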
3218
3219uint32_t
3220buf_redundancy_flags(buf_t bp)
3221{
3222 return bp->b_redundancy_flags;
3223}
3224
3225void
3226buf_set_redundancy_flags(buf_t bp, uint32_t flags)
3227{
3228 SET(bp->b_redundancy_flags, flags);
3229}
3230
3231void
3232buf_clear_redundancy_flags(buf_t bp, uint32_t flags)
3233{
3234 CLR(bp->b_redundancy_flags, flags);
3235}
3236
3237/*
3238 * With UBC, there is no need to expand / shrink the file data
3239 * buffer. The VM uses the same pages, hence no waste.
3240 * All the file data buffers can have one size.
3241 * In fact expand / shrink would be an expensive operation.
3242 *
3243 * Only exception to this is meta-data buffers. Most of the
3244 * meta data operations are smaller than PAGE_SIZE. Having the
3245 * meta-data buffers grow and shrink as needed, optimizes use
3246 * of the kernel wired memory.
3247 */
3248
3249int
3250allocbuf(buf_t bp, int size)
3251{
3252 vm_size_t desired_size;
3253
3254 desired_size = roundup(size, CLBYTES);
3255
3256 if (desired_size < PAGE_SIZE)
3257 desired_size = PAGE_SIZE;
3258 if (desired_size > MAXBSIZE)
3259 panic("allocbuf: buffer larger than MAXBSIZE requested");
3260
3261 if (ISSET(bp->b_flags, B_META)) {
3262 zone_t zprev, z;
3263 int nsize = roundup(size, MINMETA);
3264
3265 if (bp->b_datap) {
3266 vm_offset_t elem = (vm_offset_t)bp->b_datap;
3267
3268 if (ISSET(bp->b_flags, B_ZALLOC)) {
3269 if (bp->b_bufsize < nsize) {
3270 /* reallocate to a bigger size */
3271
3272 zprev = getbufzone(bp->b_bufsize);
3273 if (nsize <= MAXMETA) {
3274 desired_size = nsize;
3275 z = getbufzone(nsize);
3276 /* b_datap not really a ptr */
3277 *(void **)(&bp->b_datap) = zalloc(z);
3278 } else {
3279 bp->b_datap = (uintptr_t)NULL;
3280 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3281 CLR(bp->b_flags, B_ZALLOC);
3282 }
3283 bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3284 zfree(zprev, (void *)elem);
3285 } else {
3286 desired_size = bp->b_bufsize;
3287 }
3288
3289 } else {
3290 if ((vm_size_t)bp->b_bufsize < desired_size) {
3291 /* reallocate to a bigger size */
3292 bp->b_datap = (uintptr_t)NULL;
3293 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3294 bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3295 kmem_free(kernel_map, elem, bp->b_bufsize);
3296 } else {
3297 desired_size = bp->b_bufsize;
3298 }
3299 }
3300 } else {
3301 /* new allocation */
3302 if (nsize <= MAXMETA) {
3303 desired_size = nsize;
3304 z = getbufzone(nsize);
3305 /* b_datap not really a ptr */
3306 *(void **)(&bp->b_datap) = zalloc(z);
3307 SET(bp->b_flags, B_ZALLOC);
3308 } else
3309 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3310 }
3311
3312 if (bp->b_datap == 0)
3313 panic("allocbuf: NULL b_datap");
3314 }
3315 bp->b_bufsize = desired_size;
3316 bp->b_bcount = size;
3317
3318 return (0);
3319}
3320
3321/*
3322 * Get a new buffer from one of the free lists.
3323 *
3324 * A request for a queue is passed in. The queue from which the buffer
3325 * was taken is returned. Out-of-range queue requests get BQ_EMPTY. A request
3326 * for BQUEUE means no preference; use heuristics in that case.
3327 * The heuristics are as follows:
3328 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
3329 * If none are available, block until one is made available.
3330 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
3331 * Pick the most stale buffer.
3332 * If the found buffer was marked for delayed write, start the async write
3333 * and restart the search.
3334 * Initialize the fields and disassociate the buffer from the vnode.
3335 * Remove the buffer from the hash. Return the buffer and the queue
3336 * on which it was found.
3337 *
3338 * buf_mtxp is held upon entry
3339 * returns with buf_mtxp locked if new buf available
3340 * returns with buf_mtxp UNlocked if new buf NOT available
3341 */
3342
3343static buf_t
3344getnewbuf(int slpflag, int slptimeo, int * queue)
3345{
3346 buf_t bp;
3347 buf_t lru_bp;
3348 buf_t age_bp;
3349 buf_t meta_bp;
3350 int age_time, lru_time, bp_time, meta_time;
3351 int req = *queue; /* save it for restarts */
3352 struct timespec ts;
3353
3354start:
3355 /*
3356 * invalid request gets empty queue
3357 */
3358 if ((*queue >= BQUEUES) || (*queue < 0)
3359 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
3360 *queue = BQ_EMPTY;
3361
3362
3363 if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first))
3364 goto found;
3365
3366 /*
3367 * need to grow the number of bufs; add another one rather than recycling
3368 */
3369 if (nbuf_headers < max_nbuf_headers) {
3370 /*
3371 * Increment count now as lock
3372 * is dropped for allocation.
3373 * That avoids over commits
3374 */
3375 nbuf_headers++;
3376 goto add_newbufs;
3377 }
3378 /* Try for the requested queue first */
3379 bp = bufqueues[*queue].tqh_first;
3380 if (bp)
3381 goto found;
3382
3383 /* Unable to use requested queue */
3384 age_bp = bufqueues[BQ_AGE].tqh_first;
3385 lru_bp = bufqueues[BQ_LRU].tqh_first;
3386 meta_bp = bufqueues[BQ_META].tqh_first;
3387
3388 if (!age_bp && !lru_bp && !meta_bp) {
3389 /*
3390 * Unavailable on AGE or LRU or META queues
3391 * Try the empty list first
3392 */
3393 bp = bufqueues[BQ_EMPTY].tqh_first;
3394 if (bp) {
3395 *queue = BQ_EMPTY;
3396 goto found;
3397 }
3398 /*
3399 * We have seen that this is hard to trigger.
3400 * This is an overcommit of nbufs but needed
3401 * in some scenarios with disk images.
3402 */
3403
3404add_newbufs:
3405 lck_mtx_unlock(buf_mtxp);
3406
3407 /* Create a new temporary buffer header */
3408 bp = (struct buf *)zalloc(buf_hdr_zone);
3409
3410 if (bp) {
3411 bufhdrinit(bp);
3412 bp->b_whichq = BQ_EMPTY;
3413 bp->b_timestamp = buf_timestamp();
3414 BLISTNONE(bp);
3415 SET(bp->b_flags, B_HDRALLOC);
3416 *queue = BQ_EMPTY;
3417 }
3418 lck_mtx_lock_spin(buf_mtxp);
3419
3420 if (bp) {
3421 binshash(bp, &invalhash);
3422 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3423 buf_hdr_count++;
3424 goto found;
3425 }
3426 /* subtract already accounted bufcount */
3427 nbuf_headers--;
3428
3429 bufstats.bufs_sleeps++;
3430
3431 /* wait for a free buffer of any kind */
3432 needbuffer = 1;
3433 /* hz value is 100 */
3434 ts.tv_sec = (slptimeo/1000);
3435 /* the hz value is 100; which leads to 10ms */
3436 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
3437
3438 msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO+1), "getnewbuf", &ts);
3439 return (NULL);
3440 }
3441
3442 /* Buffer available either on AGE or LRU or META */
3443 bp = NULL;
3444 *queue = -1;
3445
3446 /* Buffer available either on AGE or LRU */
3447 if (!age_bp) {
3448 bp = lru_bp;
3449 *queue = BQ_LRU;
3450 } else if (!lru_bp) {
3451 bp = age_bp;
3452 *queue = BQ_AGE;
3453 } else { /* buffer available on both AGE and LRU */
3454 int t = buf_timestamp();
3455
3456 age_time = t - age_bp->b_timestamp;
3457 lru_time = t - lru_bp->b_timestamp;
3458 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
3459 bp = age_bp;
3460 *queue = BQ_AGE;
3461 /*
3462 * we should probably re-timestamp everything in the
3463 * queues at this point with the current time
3464 */
3465 } else {
3466 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
3467 bp = lru_bp;
3468 *queue = BQ_LRU;
3469 } else {
3470 bp = age_bp;
3471 *queue = BQ_AGE;
3472 }
3473 }
3474 }
3475
3476 if (!bp) { /* Neither on AGE nor on LRU */
3477 bp = meta_bp;
3478 *queue = BQ_META;
3479 } else if (meta_bp) {
3480 int t = buf_timestamp();
3481
3482 bp_time = t - bp->b_timestamp;
3483 meta_time = t - meta_bp->b_timestamp;
3484
3485 if (!(bp_time < 0) && !(meta_time < 0)) {
3486 /* time not set backwards */
3487 int bp_is_stale;
3488 bp_is_stale = (*queue == BQ_LRU) ?
3489 lru_is_stale : age_is_stale;
3490
3491 if ((meta_time >= meta_is_stale) &&
3492 (bp_time < bp_is_stale)) {
3493 bp = meta_bp;
3494 *queue = BQ_META;
3495 }
3496 }
3497 }
3498found:
3499 if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
3500 panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
3501
3502 /* Clean it */
3503 if (bcleanbuf(bp, FALSE)) {
3504 /*
3505 * moved to the laundry thread, buffer not ready
3506 */
3507 *queue = req;
3508 goto start;
3509 }
3510 return (bp);
3511}
3512
3513
3514/*
3515 * Clean a buffer.
3516 * Returns 0 if buffer is ready to use,
3517 * Returns 1 if it issued a buf_bawrite() to indicate
3518 * that the buffer is not ready.
3519 *
3520 * buf_mtxp is held upon entry
3521 * returns with buf_mtxp locked
3522 */
3523int
3524bcleanbuf(buf_t bp, boolean_t discard)
3525{
3526 /* Remove from the queue */
3527 bremfree_locked(bp);
3528
3529#ifdef JOE_DEBUG
3530 bp->b_owner = current_thread();
3531 bp->b_tag = 2;
3532#endif
3533 /*
3534 * If buffer was a delayed write, start the IO by queuing
3535 * it on the LAUNDRY queue, and return 1
3536 */
3537 if (ISSET(bp->b_flags, B_DELWRI)) {
3538 if (discard) {
3539 SET(bp->b_lflags, BL_WANTDEALLOC);
3540 }
3541
3542 bmovelaundry(bp);
3543
3544 lck_mtx_unlock(buf_mtxp);
3545
3546 wakeup(&bufqueues[BQ_LAUNDRY]);
3547 /*
3548 * and give it a chance to run
3549 */
3550 (void)thread_block(THREAD_CONTINUE_NULL);
3551
3552 lck_mtx_lock_spin(buf_mtxp);
3553
3554 return (1);
3555 }
3556#ifdef JOE_DEBUG
3557 bp->b_owner = current_thread();
3558 bp->b_tag = 8;
3559#endif
3560 /*
3561 * Buffer is no longer on any free list... we own it
3562 */
3563 SET(bp->b_lflags, BL_BUSY);
3564 buf_busycount++;
3565
3566 bremhash(bp);
3567
3568 /*
3569 * disassociate us from our vnode, if we had one...
3570 */
3571 if (bp->b_vp)
3572 brelvp_locked(bp);
3573
3574 lck_mtx_unlock(buf_mtxp);
3575
3576 BLISTNONE(bp);
3577
3578 if (ISSET(bp->b_flags, B_META))
3579 buf_free_meta_store(bp);
3580
3581 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
3582
3583 buf_release_credentials(bp);
3584
3585 /* If discarding, just move to the empty queue */
3586 if (discard) {
3587 lck_mtx_lock_spin(buf_mtxp);
3588 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
3589 bp->b_whichq = BQ_EMPTY;
3590 binshash(bp, &invalhash);
3591 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3592 CLR(bp->b_lflags, BL_BUSY);
3593 buf_busycount--;
3594 } else {
3595 /* Not discarding: clean up and prepare for reuse */
3596 bp->b_bufsize = 0;
3597 bp->b_datap = (uintptr_t)NULL;
3598 bp->b_upl = (void *)NULL;
3599 /*
3600 * preserve the state of whether this buffer
3601 * was allocated on the fly or not...
3602 * the only other flag that should be set at
3603 * this point is BL_BUSY...
3604 */
3605#ifdef JOE_DEBUG
3606 bp->b_owner = current_thread();
3607 bp->b_tag = 3;
3608#endif
3609 bp->b_lflags = BL_BUSY;
3610 bp->b_flags = (bp->b_flags & B_HDRALLOC);
3611 bp->b_dev = NODEV;
3612 bp->b_blkno = bp->b_lblkno = 0;
3613 bp->b_iodone = NULL;
3614 bp->b_error = 0;
3615 bp->b_resid = 0;
3616 bp->b_bcount = 0;
3617 bp->b_dirtyoff = bp->b_dirtyend = 0;
3618 bp->b_validoff = bp->b_validend = 0;
3619 bzero(&bp->b_attr, sizeof(struct bufattr));
3620
3621 lck_mtx_lock_spin(buf_mtxp);
3622 }
3623 return (0);
3624}
3625
3626
3627
3628errno_t
3629buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
3630{
3631 buf_t bp;
3632 errno_t error;
3633 struct bufhashhdr *dp;
3634
3635 dp = BUFHASH(vp, lblkno);
3636
3637relook:
3638 lck_mtx_lock_spin(buf_mtxp);
3639
3640 if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
3641 lck_mtx_unlock(buf_mtxp);
3642 return (0);
3643 }
3644 if (ISSET(bp->b_lflags, BL_BUSY)) {
3645 if ( !ISSET(flags, BUF_WAIT)) {
3646 lck_mtx_unlock(buf_mtxp);
3647 return (EBUSY);
3648 }
3649 SET(bp->b_lflags, BL_WANTED);
3650
3651 error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
3652
3653 if (error) {
3654 return (error);
3655 }
3656 goto relook;
3657 }
3658 bremfree_locked(bp);
3659 SET(bp->b_lflags, BL_BUSY);
3660 SET(bp->b_flags, B_INVAL);
3661 buf_busycount++;
3662#ifdef JOE_DEBUG
3663 bp->b_owner = current_thread();
3664 bp->b_tag = 4;
3665#endif
3666 lck_mtx_unlock(buf_mtxp);
3667 buf_brelse(bp);
3668
3669 return (0);
3670}
3671
3672
3673void
3674buf_drop(buf_t bp)
3675{
3676 int need_wakeup = 0;
3677
3678 lck_mtx_lock_spin(buf_mtxp);
3679
3680 if (ISSET(bp->b_lflags, BL_WANTED)) {
3681 /*
3682 * delay the actual wakeup until after we
3683 * clear BL_BUSY and we've dropped buf_mtxp
3684 */
3685 need_wakeup = 1;
3686 }
3687#ifdef JOE_DEBUG
3688 bp->b_owner = current_thread();
3689 bp->b_tag = 9;
3690#endif
3691 /*
3692 * Unlock the buffer.
3693 */
3694 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
3695 buf_busycount--;
3696
3697 lck_mtx_unlock(buf_mtxp);
3698
3699 if (need_wakeup) {
3700 /*
3701 * Wake up any processes waiting for _this_ buffer to become free.
3702 */
3703 wakeup(bp);
3704 }
3705}
3706
3707
3708errno_t
3709buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
3710 errno_t error;
3711
3712 lck_mtx_lock_spin(buf_mtxp);
3713
3714 error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
3715
3716 lck_mtx_unlock(buf_mtxp);
3717
3718 return (error);
3719}
3720
3721
3722static errno_t
3723buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
3724{
3725 errno_t error;
3726 struct timespec ts;
3727
3728 if (ISSET(bp->b_flags, B_LOCKED)) {
3729 if ((flags & BAC_SKIP_LOCKED))
3730 return (EDEADLK);
3731 } else {
3732 if ((flags & BAC_SKIP_NONLOCKED))
3733 return (EDEADLK);
3734 }
3735 if (ISSET(bp->b_lflags, BL_BUSY)) {
3736 /*
3737 * since the lck_mtx_lock may block, the buffer
3738 * may become BUSY, so we need to
3739 * recheck for a NOWAIT request
3740 */
3741 if (flags & BAC_NOWAIT)
3742 return (EBUSY);
3743 SET(bp->b_lflags, BL_WANTED);
3744
3745 /* the hz value is 100; which leads to 10ms */
3746 ts.tv_sec = (slptimeo/100);
3747 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
3748 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
3749
3750 if (error)
3751 return (error);
3752 return (EAGAIN);
3753 }
3754 if (flags & BAC_REMOVE)
3755 bremfree_locked(bp);
3756 SET(bp->b_lflags, BL_BUSY);
3757 buf_busycount++;
3758
3759#ifdef JOE_DEBUG
3760 bp->b_owner = current_thread();
3761 bp->b_tag = 5;
3762#endif
3763 return (0);
3764}
3765
3766
3767/*
3768 * Wait for operations on the buffer to complete.
3769 * When they do, extract and return the I/O's error value.
3770 */
3771errno_t
3772buf_biowait(buf_t bp)
3773{
3774 while (!ISSET(bp->b_flags, B_DONE)) {
3775
3776 lck_mtx_lock_spin(buf_mtxp);
3777
3778 if (!ISSET(bp->b_flags, B_DONE)) {
3779 DTRACE_IO1(wait__start, buf_t, bp);
3780 (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL);
3781 DTRACE_IO1(wait__done, buf_t, bp);
3782 } else
3783 lck_mtx_unlock(buf_mtxp);
3784 }
3785 /* check for interruption of I/O (e.g. via NFS), then errors. */
3786 if (ISSET(bp->b_flags, B_EINTR)) {
3787 CLR(bp->b_flags, B_EINTR);
3788 return (EINTR);
3789 } else if (ISSET(bp->b_flags, B_ERROR))
3790 return (bp->b_error ? bp->b_error : EIO);
3791 else
3792 return (0);
3793}
3794
3795
3796/*
3797 * Mark I/O complete on a buffer.
3798 *
3799 * If a callback has been requested, e.g. the pageout
3800 * daemon, do so. Otherwise, awaken waiting processes.
3801 *
3802 * [ Leffler, et al., says on p.247:
3803 * "This routine wakes up the blocked process, frees the buffer
3804 * for an asynchronous write, or, for a request by the pagedaemon
3805 * process, invokes a procedure specified in the buffer structure" ]
3806 *
3807 * In real life, the pagedaemon (or other system processes) wants
3808 * to do async stuff too, and doesn't want the buffer buf_brelse()'d.
3809 * (for swap pager, that puts swap buffers on the free lists (!!!),
3810 * for the vn device, that puts malloc'd buffers on the free lists!)
3811 */
3812
3813void
3814buf_biodone(buf_t bp)
3815{
3816 mount_t mp;
3817 struct bufattr *bap;
3818
3819 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
3820 bp, bp->b_datap, bp->b_flags, 0, 0);
3821
3822 if (ISSET(bp->b_flags, B_DONE))
3823 panic("biodone already");
3824
3825 if (ISSET(bp->b_flags, B_ERROR)) {
3826 fslog_io_error(bp);
3827 }
3828
3829 bap = &bp->b_attr;
3830
3831 if (bp->b_vp && bp->b_vp->v_mount) {
3832 mp = bp->b_vp->v_mount;
3833 } else {
3834 mp = NULL;
3835 }
3836
3837 if (mp && (bp->b_flags & B_READ) == 0) {
3838 update_last_io_time(mp);
3839 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
3840 } else if (mp) {
3841 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
3842 }
3843
3844 if (kdebug_enable) {
3845 int code = DKIO_DONE;
3846 int io_tier = GET_BUFATTR_IO_TIER(bap);
3847
3848 if (bp->b_flags & B_READ)
3849 code |= DKIO_READ;
3850 if (bp->b_flags & B_ASYNC)
3851 code |= DKIO_ASYNC;
3852
3853 if (bp->b_flags & B_META)
3854 code |= DKIO_META;
3855 else if (bp->b_flags & B_PAGEIO)
3856 code |= DKIO_PAGING;
3857
3858 if (io_tier != 0)
3859 code |= DKIO_THROTTLE;
3860
3861 code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
3862
3863 if (bp->b_flags & B_PASSIVE)
3864 code |= DKIO_PASSIVE;
3865
3866 if (bap->ba_flags & BA_NOCACHE)
3867 code |= DKIO_NOCACHE;
3868
3869 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3870 buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0);
3871 }
3872
3873 /*
3874 * I/O was done, so don't believe
3875 * the DIRTY state from VM anymore...
3876 * and we need to reset the THROTTLED/PASSIVE
3877 * indicators
3878 */
3879 CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
3880 CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP));
3881
3882 SET_BUFATTR_IO_TIER(bap, 0);
3883
3884 DTRACE_IO1(done, buf_t, bp);
3885
3886 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3887 /*
3888 * wake up any writers blocked
3889 * on throttle or waiting for I/O
3890 * to drain
3891 */
3892 vnode_writedone(bp->b_vp);
3893
3894 if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */
3895 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
3896 void *arg = bp->b_transaction;
3897 int callout = ISSET(bp->b_flags, B_CALL);
3898
3899 if (iodone_func == NULL)
3900 panic("biodone: bp @ %p has NULL b_iodone!\n", bp);
3901
3902 CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */
3903 bp->b_iodone = NULL;
3904 bp->b_transaction = NULL;
3905
3906 if (callout)
3907 SET(bp->b_flags, B_DONE); /* note that it's done */
3908
3909 (*iodone_func)(bp, arg);
3910
3911 if (callout) {
3912 /*
3913 * assumes that the callback function takes
3914 * ownership of the bp and deals with releasing it if necessary
3915 */
3916 goto biodone_done;
3917 }
3918 /*
3919 * in this case the callback function is acting
3920 * strictly as a filter... it does not take
3921 * ownership of the bp and is expecting us
3922 * to finish cleaning up... this is currently used
3923 * by the HFS journaling code
3924 */
3925 }
3926 if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
3927 SET(bp->b_flags, B_DONE); /* note that it's done */
3928
3929 buf_brelse(bp);
3930 } else { /* or just wakeup the buffer */
3931 /*
3932 * by taking the mutex, we serialize
3933 * the buf owner calling buf_biowait so that we'll
3934 * only see him in one of 2 states...
3935 * state 1: B_DONE wasn't set and he's
3936 * blocked in msleep
3937 * state 2: he's blocked trying to take the
3938 * mutex before looking at B_DONE
3939 * BL_WANTED is cleared in case anyone else
3940 * is blocked waiting for the buffer... note
3941 * that we haven't cleared B_BUSY yet, so if
3942 * they do get to run, they're going to re-set
3943 * BL_WANTED and go back to sleep
3944 */
3945 lck_mtx_lock_spin(buf_mtxp);
3946
3947 CLR(bp->b_lflags, BL_WANTED);
3948 SET(bp->b_flags, B_DONE); /* note that it's done */
3949
3950 lck_mtx_unlock(buf_mtxp);
3951
3952 wakeup(bp);
3953 }
3954biodone_done:
3955 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3956 (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0);
3957}
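
/*
 * Illustrative sketch (the function names are assumptions, and the issue
 * path is simplified): instead of blocking in buf_biowait(), an async
 * caller can attach a completion callback with buf_setcallback();
 * buf_biodone(), above, then invokes it with B_CALL set, and the callback
 * takes ownership of the buffer.
 */
#if 0	/* example only */
static void
example_io_done(buf_t bp, void *arg)
{
	/* the callout owns bp: check for errors, then release it */
	if (buf_error(bp))
		printf("example_io_done: I/O failed (%d)\n", buf_error(bp));
	buf_brelse(bp);
}

static void
example_issue_async(buf_t bp)
{
	buf_setcallback(bp, example_io_done, NULL);

	/*
	 * issue the I/O (simplified; a real caller sets up the buffer fully
	 * before going through the strategy path)... completion arrives via
	 * buf_biodone()
	 */
	VNOP_STRATEGY(bp);
}
#endif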
3958
3959/*
3960 * Obfuscate buf pointers.
3961 */
3962vm_offset_t
3963buf_kernel_addrperm_addr(void * addr)
3964{
3965 if ((vm_offset_t)addr == 0)
3966 return 0;
3967 else
3968 return ((vm_offset_t)addr + buf_kernel_addrperm);
3969}
3970
3971/*
3972 * Return a count of buffers on the "locked" queue.
3973 */
3974int
3975count_lock_queue(void)
3976{
3977 buf_t bp;
3978 int n = 0;
3979
3980 lck_mtx_lock_spin(buf_mtxp);
3981
3982 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3983 bp = bp->b_freelist.tqe_next)
3984 n++;
3985 lck_mtx_unlock(buf_mtxp);
3986
3987 return (n);
3988}
3989
3990/*
3991 * Return a count of 'busy' buffers. Used at the time of shutdown.
3992 * note: This is also called from the mach side in debug context in kdp.c
3993 */
3994int
3995count_busy_buffers(void)
3996{
3997 return buf_busycount + bufstats.bufs_iobufinuse;
3998}
3999
4000#if DIAGNOSTIC
4001/*
4002 * Print out statistics on the current allocation of the buffer pool.
4003 * Can be enabled to print out on every ``sync'' by setting "syncprt"
4004 * in vfs_syscalls.c using sysctl.
4005 */
4006void
4007vfs_bufstats()
4008{
4009 int i, j, count;
4010 struct buf *bp;
4011 struct bqueues *dp;
4012 int counts[MAXBSIZE/CLBYTES+1];
4013 static char *bname[BQUEUES] =
4014 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4015
4016 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
4017 count = 0;
4018 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
4019 counts[j] = 0;
4020
4021 lck_mtx_lock(buf_mtxp);
4022
4023 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
4024 counts[bp->b_bufsize/CLBYTES]++;
4025 count++;
4026 }
4027 lck_mtx_unlock(buf_mtxp);
4028
4029 printf("%s: total-%d", bname[i], count);
4030 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
4031 if (counts[j] != 0)
4032 printf(", %d-%d", j * CLBYTES, counts[j]);
4033 printf("\n");
4034 }
4035}
4036#endif /* DIAGNOSTIC */
4037
4038#define NRESERVEDIOBUFS 128
4039
4040
4041buf_t
4042alloc_io_buf(vnode_t vp, int priv)
4043{
4044 buf_t bp;
4045
4046 lck_mtx_lock_spin(iobuffer_mtxp);
4047
4048 while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
4049 (bp = iobufqueue.tqh_first) == NULL) {
4050 bufstats.bufs_iobufsleeps++;
4051
4052 need_iobuffer = 1;
4053 (void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL);
4054 }
4055 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
4056
4057 bufstats.bufs_iobufinuse++;
4058 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
4059 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
4060
4061 lck_mtx_unlock(iobuffer_mtxp);
4062
4063 /*
4064 * initialize various fields
4065 * we don't need to hold the mutex since the buffer
4066 * is now private... the vp should have a reference
4067 * on it and is not protected by this mutex in any event
4068 */
4069 bp->b_timestamp = 0;
4070 bp->b_proc = NULL;
4071
4072 bp->b_datap = 0;
4073 bp->b_flags = 0;
4074 bp->b_lflags = BL_BUSY | BL_IOBUF;
4075 bp->b_redundancy_flags = 0;
4076 bp->b_blkno = bp->b_lblkno = 0;
4077#ifdef JOE_DEBUG
4078 bp->b_owner = current_thread();
4079 bp->b_tag = 6;
4080#endif
4081 bp->b_iodone = NULL;
4082 bp->b_error = 0;
4083 bp->b_resid = 0;
4084 bp->b_bcount = 0;
4085 bp->b_bufsize = 0;
4086 bp->b_upl = NULL;
4087 bp->b_vp = vp;
4088 bzero(&bp->b_attr, sizeof(struct bufattr));
4089
4090 if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
4091 bp->b_dev = vp->v_rdev;
4092 else
4093 bp->b_dev = NODEV;
4094
4095 return (bp);
4096}
4097
4098
4099void
4100free_io_buf(buf_t bp)
4101{
4102 int need_wakeup = 0;
4103
4104 /*
4105 * put buffer back on the head of the iobufqueue
4106 */
4107 bp->b_vp = NULL;
4108 bp->b_flags = B_INVAL;
4109
4110 lck_mtx_lock_spin(iobuffer_mtxp);
4111
4112 binsheadfree(bp, &iobufqueue, -1);
4113
4114 if (need_iobuffer) {
4115 /*
4116 * Wake up any processes waiting because they need an io buffer
4117 *
4118 * do the wakeup after we drop the mutex... it's possible that the
4119 * wakeup will be superfluous if need_iobuffer gets set again and
4120 * another thread runs this path, but it's highly unlikely, doesn't
4121 * hurt, and it means we don't hold up I/O progress if the wakeup blocks
4122 * trying to grab a task related lock...
4123 */
4124 need_iobuffer = 0;
4125 need_wakeup = 1;
4126 }
4127 if (bufstats.bufs_iobufinuse <= 0)
4128 panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse < 0", bp);
4129
4130 bufstats.bufs_iobufinuse--;
4131
4132 lck_mtx_unlock(iobuffer_mtxp);
4133
4134 if (need_wakeup)
4135 wakeup(&need_iobuffer);
4136}
4137
4138
4139void
4140buf_list_lock(void)
4141{
4142 lck_mtx_lock_spin(buf_mtxp);
4143}
4144
4145void
4146buf_list_unlock(void)
4147{
4148 lck_mtx_unlock(buf_mtxp);
4149}
4150
4151/*
4152 * If getnewbuf() calls bcleanbuf() on the same thread
4153 * there is a potential for stack overrun and deadlocks.
4154 * So we always hand off the work to a worker thread for completion
4155 */
4156
4157
4158static void
4159bcleanbuf_thread_init(void)
4160{
4161 thread_t thread = THREAD_NULL;
4162
4163 /* create worker thread */
4164 kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread);
4165 thread_deallocate(thread);
4166}
4167
4168typedef int (*bcleanbufcontinuation)(int);
4169
4170static void
4171bcleanbuf_thread(void)
4172{
4173 struct buf *bp;
4174 int error = 0;
4175 int loopcnt = 0;
4176
4177 for (;;) {
4178 lck_mtx_lock_spin(buf_mtxp);
4179
4180 while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) {
4181 (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO|PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread);
4182 }
4183
4184 /*
4185 * Remove from the queue
4186 */
4187 bremfree_locked(bp);
4188
4189 /*
4190 * Buffer is no longer on any free list
4191 */
4192 SET(bp->b_lflags, BL_BUSY);
4193 buf_busycount++;
4194
4195#ifdef JOE_DEBUG
4196 bp->b_owner = current_thread();
4197 bp->b_tag = 10;
4198#endif
4199
4200 lck_mtx_unlock(buf_mtxp);
4201 /*
4202 * do the IO
4203 */
4204 error = bawrite_internal(bp, 0);
4205
4206 if (error) {
4207 bp->b_whichq = BQ_LAUNDRY;
4208 bp->b_timestamp = buf_timestamp();
4209
4210 lck_mtx_lock_spin(buf_mtxp);
4211
4212 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
4213 blaundrycnt++;
4214
4215 /* we never leave a busy page on the laundry queue */
4216 CLR(bp->b_lflags, BL_BUSY);
4217 buf_busycount--;
4218#ifdef JOE_DEBUG
4219 bp->b_owner = current_thread();
4220 bp->b_tag = 11;
4221#endif
4222
4223 lck_mtx_unlock(buf_mtxp);
4224
4225 if (loopcnt > MAXLAUNDRY) {
4226 /*
4227 * bawrite_internal() can return errors if we're throttled. If we've
4228 * done several I/Os and failed, give the system some time to unthrottle
4229 * the vnode
4230 */
4231 (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1);
4232 loopcnt = 0;
4233 } else {
4234 /* give other threads a chance to run */
4235 (void)thread_block(THREAD_CONTINUE_NULL);
4236 loopcnt++;
4237 }
4238 }
4239 }
4240}
4241
4242
4243static int
4244brecover_data(buf_t bp)
4245{
4246 int upl_offset;
4247 upl_t upl;
4248 upl_page_info_t *pl;
4249 kern_return_t kret;
4250 vnode_t vp = bp->b_vp;
4251 int upl_flags;
4252
4253
4254 if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
4255 goto dump_buffer;
4256
4257 upl_flags = UPL_PRECIOUS;
4258 if (! (buf_flags(bp) & B_READ)) {
4259 /*
4260 * "write" operation: let the UPL subsystem know
4261 * that we intend to modify the buffer cache pages we're
4262 * gathering.
4263 */
4264 upl_flags |= UPL_WILL_MODIFY;
4265 }
4266
4267 kret = ubc_create_upl(vp,
4268 ubc_blktooff(vp, bp->b_lblkno),
4269 bp->b_bufsize,
4270 &upl,
4271 &pl,
4272 upl_flags);
4273 if (kret != KERN_SUCCESS)
4274 panic("Failed to create UPL");
4275
4276 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
4277
4278 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
4279 ubc_upl_abort(upl, 0);
4280 goto dump_buffer;
4281 }
4282 }
4283 bp->b_upl = upl;
4284
4285 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
4286
4287 if (kret != KERN_SUCCESS)
4288 panic("getblk: ubc_upl_map() failed with (%d)", kret);
4289 return (1);
4290
4291dump_buffer:
4292 bp->b_bufsize = 0;
4293 SET(bp->b_flags, B_INVAL);
4294 buf_brelse(bp);
4295
4296 return(0);
4297}
4298
4299boolean_t
4300buffer_cache_gc(int all)
4301{
4302 buf_t bp;
4303 boolean_t did_large_zfree = FALSE;
4304 boolean_t need_wakeup = FALSE;
4305 int now = buf_timestamp();
4306 uint32_t found = 0;
4307 struct bqueues privq;
4308 int thresh_hold = BUF_STALE_THRESHHOLD;
4309
4310 if (all)
4311 thresh_hold = 0;
4312 /*
4313 * We only care about metadata (incore storage comes from zalloc()).
4314 * Unless "all" is set (used to evict meta data buffers in preparation
4315 * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers
4316 * that have not been accessed in the last 30s. This limit controls both
4317 * the hold time of the global lock "buf_mtxp" and the length of time
4318 * we spend compute bound in the GC thread which calls this function
4319 */
4320 lck_mtx_lock(buf_mtxp);
4321
4322 do {
4323 found = 0;
4324 TAILQ_INIT(&privq);
4325 need_wakeup = FALSE;
4326
4327 while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) &&
4328 (now > bp->b_timestamp) &&
4329 (now - bp->b_timestamp > thresh_hold) &&
4330 (found < BUF_MAX_GC_BATCH_SIZE)) {
4331
4332 /* Remove from free list */
4333 bremfree_locked(bp);
4334 found++;
4335
4336#ifdef JOE_DEBUG
4337 bp->b_owner = current_thread();
4338 bp->b_tag = 12;
4339#endif
4340
4341 /* If dirty, move to laundry queue and remember to do wakeup */
4342 if (ISSET(bp->b_flags, B_DELWRI)) {
4343 SET(bp->b_lflags, BL_WANTDEALLOC);
4344
4345 bmovelaundry(bp);
4346 need_wakeup = TRUE;
4347
4348 continue;
4349 }
4350
4351 /*
4352 * Mark busy and put on private list. We could technically get
4353 * away without setting BL_BUSY here.
4354 */
4355 SET(bp->b_lflags, BL_BUSY);
4356 buf_busycount++;
4357
4358 /*
4359 * Remove from hash and dissociate from vp.
4360 */
4361 bremhash(bp);
4362 if (bp->b_vp) {
4363 brelvp_locked(bp);
4364 }
4365
4366 TAILQ_INSERT_TAIL(&privq, bp, b_freelist);
4367 }
4368
4369 if (found == 0) {
4370 break;
4371 }
4372
4373 /* Drop lock for batch processing */
4374 lck_mtx_unlock(buf_mtxp);
4375
4376 /* Wakeup and yield for laundry if need be */
4377 if (need_wakeup) {
4378 wakeup(&bufqueues[BQ_LAUNDRY]);
4379 (void)thread_block(THREAD_CONTINUE_NULL);
4380 }
4381
4382 /* Clean up every buffer on private list */
4383 TAILQ_FOREACH(bp, &privq, b_freelist) {
4384 /* Take note if we've definitely freed at least a page to a zone */
4385 if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) {
4386 did_large_zfree = TRUE;
4387 }
4388
4389 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
4390
4391 /* Free Storage */
4392 buf_free_meta_store(bp);
4393
4394 /* Release credentials */
4395 buf_release_credentials(bp);
4396
4397 /* Prepare for moving to empty queue */
4398 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED
4399 | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
4400 bp->b_whichq = BQ_EMPTY;
4401 BLISTNONE(bp);
4402 }
4403 lck_mtx_lock(buf_mtxp);
4404
4405 /* Back under lock, move them all to invalid hash and clear busy */
4406 TAILQ_FOREACH(bp, &privq, b_freelist) {
4407 binshash(bp, &invalhash);
4408 CLR(bp->b_lflags, BL_BUSY);
4409 buf_busycount--;
4410
4411#ifdef JOE_DEBUG
4412 if (bp->b_owner != current_thread()) {
4413 panic("Buffer stolen from buffer_cache_gc()");
4414 }
4415 bp->b_owner = current_thread();
4416 bp->b_tag = 13;
4417#endif
4418 }
4419
4420 /* And do a big bulk move to the empty queue */
4421 TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist);
4422
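	/*
	 * When "all" is set, keep looping as long as each pass drains a
	 * full batch; otherwise a single batch per call is enough.
	 */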
4423 } while (all && (found == BUF_MAX_GC_BATCH_SIZE));
4424
4425 lck_mtx_unlock(buf_mtxp);
4426
4427 return did_large_zfree;
4428}
4429
4430
4431/*
4432 * disabled for now
4433 */
4434
4435#if FLUSH_QUEUES
4436
4437#define NFLUSH 32
4438
4439static int
4440bp_cmp(void *a, void *b)
4441{
4442 buf_t *bp_a = *(buf_t **)a,
4443 *bp_b = *(buf_t **)b;
4444 daddr64_t res;
4445
4446 // block numbers are never negative, but the difference of two
4447 // daddr64_t values can overflow an int, so compute the result
4448 // with explicit comparisons instead of a truncating cast
4449 res = (bp_a->b_blkno - bp_b->b_blkno);
4450
4451 return (res < 0) ? -1 : ((res > 0) ? 1 : 0);
4452}
4453
4454
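/*
 * Flush, in batches of at most NFLUSH, the delayed-write buffers on
 * 'whichq' that belong to mount 'mp'.  Each batch is sorted by block
 * number (bp_cmp) before the buffers are written asynchronously with
 * buf_bawrite().  Returns the total number of writes issued.
 */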
4455int
4456bflushq(int whichq, mount_t mp)
4457{
4458 buf_t bp, next;
4459 int i, buf_count;
4460 int total_writes = 0;
4461 static buf_t flush_table[NFLUSH];
4462
4463 if (whichq < 0 || whichq >= BQUEUES) {
4464 return (0);
4465 }
4466
4467 restart:
4468 lck_mtx_lock(buf_mtxp);
4469
4470 bp = TAILQ_FIRST(&bufqueues[whichq]);
4471
4472 for (buf_count = 0; bp; bp = next) {
4473 next = bp->b_freelist.tqe_next;
4474
4475 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
4476 continue;
4477 }
4478
4479 if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
4480
4481 bremfree_locked(bp);
4482#ifdef JOE_DEBUG
4483 bp->b_owner = current_thread();
4484 bp->b_tag = 7;
4485#endif
4486 SET(bp->b_lflags, BL_BUSY);
4487 buf_busycount++;
4488
4489 flush_table[buf_count] = bp;
4490 buf_count++;
4491 total_writes++;
4492
4493 if (buf_count >= NFLUSH) {
4494 lck_mtx_unlock(buf_mtxp);
4495
4496 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4497
4498 for (i = 0; i < buf_count; i++) {
4499 buf_bawrite(flush_table[i]);
4500 }
4501 goto restart;
4502 }
4503 }
4504 }
4505 lck_mtx_unlock(buf_mtxp);
4506
4507 if (buf_count > 0) {
4508 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4509
4510 for (i = 0; i < buf_count; i++) {
4511 buf_bawrite(flush_table[i]);
4512 }
4513 }
4514
4515 return (total_writes);
4516}
4517#endif
4518
4519
4520#if BALANCE_QUEUES
4521
4522/* XXX move this to a separate file */
4523
4524/*
4525 * NOTE: THIS CODE HAS NOT BEEN UPDATED
4526 * WITH RESPECT TO THE NEW LOCKING MODEL
4527 */
4528
4529
4530/*
4531 * Dynamic Scaling of the Buffer Queues
4532 */
4533
4534typedef long long blsize_t;
4535
4536blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
4537/* Global tunable limits */
4538blsize_t nbufh; /* number of buffer headers */
4539blsize_t nbuflow; /* minimum number of buffer headers required */
4540blsize_t nbufhigh; /* maximum number of buffer headers allowed */
4541blsize_t nbuftarget; /* preferred number of buffer headers */
4542
4543/*
4544 * assertions:
4545 *
4546 * 1. 0 < nbuflow <= nbufh <= nbufhigh
4547 * 2. nbufhigh <= MAXNBUF
4548 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
4549 * 4. nbufh cannot be set by sysctl().
4550 */
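/*
 * A minimal, illustrative sketch (not part of the original code) of how
 * the global-limit assertions above could be checked on DIAGNOSTIC
 * builds.  The helper name "bufq_check_global_limits" is hypothetical.
 */
#if DIAGNOSTIC
static __inline__ void
bufq_check_global_limits(void)
{
	/* 1. 0 < nbuflow <= nbufh <= nbufhigh */
	if (!(0 < nbuflow && nbuflow <= nbufh && nbufh <= nbufhigh))
		panic("bufq limits: nbuflow/nbufh/nbufhigh out of order");
	/* 2. nbufhigh <= MAXNBUF */
	if (nbufhigh > MAXNBUF)
		panic("bufq limits: nbufhigh exceeds MAXNBUF");
	/* 3. 0 < nbuflow <= nbuftarget <= nbufhigh */
	if (!(0 < nbuflow && nbuflow <= nbuftarget && nbuftarget <= nbufhigh))
		panic("bufq limits: nbuftarget out of range");
}
#endif /* DIAGNOSTIC */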
4551
4552/* Per queue tunable limits */
4553
4554struct bufqlim {
4555 blsize_t bl_nlow; /* minimum number of buffer headers required */
4556 blsize_t bl_num; /* number of buffer headers on the queue */
4557 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
4558 blsize_t bl_target; /* preferred number of buffer headers */
4559 long bl_stale; /* Seconds after which a buffer is considered stale */
4560} bufqlim[BQUEUES];
4561
4562/*
4563 * assertions:
4564 *
4565 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
4566 * 2. bl_nlhigh <= MAXNBUF
4567 * 3. bufqlim[BQ_META].bl_nlow != 0
4568 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
4569 * file system IO operations)
4570 * 5. bl_num cannot be set by sysctl().
4571 * 6. bl_nlhigh <= nbufhigh
4572 */
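/*
 * Likewise, a hypothetical (not original) DIAGNOSTIC-only walk over
 * bufqlim[] checking the per-queue assertions that can be expressed
 * directly in code.
 */
#if DIAGNOSTIC
static __inline__ void
bufq_check_queue_limits(void)
{
	int i;

	for (i = 0; i < BQUEUES; i++) {
		/* 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh */
		if (bufqlim[i].bl_nlow < 0 ||
		    bufqlim[i].bl_nlow > bufqlim[i].bl_num ||
		    bufqlim[i].bl_num > bufqlim[i].bl_nlhigh)
			panic("bufqlim[%d]: nlow/num/nlhigh out of order", i);
		/* 2. bl_nlhigh <= MAXNBUF */
		if (bufqlim[i].bl_nlhigh > MAXNBUF)
			panic("bufqlim[%d]: nlhigh exceeds MAXNBUF", i);
	}
}
#endif /* DIAGNOSTIC */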
4573
4574/*
4575 * Rationale:
4576 * ----------
4577 * Defining blsize_t as a long would permit 2^31 buffer headers per queue,
4578 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
4579 *
4580 * These limits are exported by means of sysctl().
4581 * It was decided to define blsize_t as a 64-bit quantity instead,
4582 * which ensures that it will not need to change
4583 * as long as the kernel address space does not exceed 64 bits.
4584 *
4585 * The low and high limits are initialized at compile time, and boot
4586 * arguments can be used to override them. sysctl()
4587 * does not change those values. sysctl() can read all of the values
4588 * but can set only the target. num is the current level.
4589 *
4590 * Advantages of having a "bufqscan" thread do the balancing:
4591 * Keep enough bufs on BQ_EMPTY.
4592 * getnewbuf() by default will always select a buffer from BQ_EMPTY.
4593 * getnewbuf() performs best if a buffer is found there.
4594 * This also minimizes the possibility of starting I/O
4595 * from getnewbuf(). That's a performance win, too.
4596 *
4597 * Localize complex logic [balancing as well as time aging]
4598 * to balancebufq().
4599 *
4600 * Simplify getnewbuf() logic by eliminating the time-aging code.
4601 */
4602
4603/*
4604 * Algorithm:
4605 * -----------
4606 * The goal of the dynamic scaling of the buffer queues is to keep
4607 * the size of the LRU close to bl_target. Buffers on a queue would
4608 * be time aged.
4609 *
4610 * There would be a thread responsible for "balancing"
4611 * the buffer cache queues.
4612 *
4613 * The scan order would be: AGE, LRU, META, EMPTY.
4614 */
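/*
 * That scan order is encoded in the order[] table consumed by nextbufq()
 * below; BQ_LOCKED and BQ_LAUNDRY are never balanced (balancebufq()
 * rejects them).
 */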
4615
4616long bufqscanwait = 0;
4617
4618static void bufqscan_thread();
4619static int balancebufq(int q);
4620static int btrimempty(int n);
4621static __inline__ int initbufqscan(void);
4622static __inline__ int nextbufq(int q);
4623static void buqlimprt(int all);
4624
4625
4626static __inline__ void
4627bufqinc(int q)
4628{
4629 if ((q < 0) || (q >= BQUEUES))
4630 return;
4631
4632 bufqlim[q].bl_num++;
4633 return;
4634}
4635
4636static __inline__ void
4637bufqdec(int q)
4638{
4639 if ((q < 0) || (q >= BQUEUES))
4640 return;
4641
4642 bufqlim[q].bl_num--;
4643 return;
4644}
4645
4646static void
4647bufq_balance_thread_init(void)
4648{
4649 thread_t thread = THREAD_NULL;
4650
4651 if (bufqscanwait++ == 0) {
4652
4653 /* Initialize globals */
4654 MAXNBUF = (sane_size / PAGE_SIZE);
4655 nbufh = nbuf_headers;
4656 nbuflow = min(nbufh, 100);
4657 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
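		/*
		 * Target roughly 1/32 of physical memory worth of buffer
		 * headers, clamped to the [nbuflow, nbufhigh] range.
		 */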
4658 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
4659 nbuftarget = max(nbuflow, nbuftarget);
4660 nbuftarget = min(nbufhigh, nbuftarget);
4661
4662 /*
4663 * Initialize the bufqlim
4664 */
4665
4666 /* LOCKED queue */
4667 bufqlim[BQ_LOCKED].bl_nlow = 0;
4668 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
4669 bufqlim[BQ_LOCKED].bl_target = 0;
4670 bufqlim[BQ_LOCKED].bl_stale = 30;
4671
4672 /* LRU queue */
4673 bufqlim[BQ_LRU].bl_nlow = 0;
4674 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
4675 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
4676 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
4677
4678 /* AGE queue */
4679 bufqlim[BQ_AGE].bl_nlow = 0;
4680 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
4681 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
4682 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
4683
4684 /* EMPTY queue */
4685 bufqlim[BQ_EMPTY].bl_nlow = 0;
4686 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
4687 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
4688 bufqlim[BQ_EMPTY].bl_stale = 600000;
4689
4690 /* META queue */
4691 bufqlim[BQ_META].bl_nlow = 0;
4692 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
4693 bufqlim[BQ_META].bl_target = nbuftarget/4;
4694 bufqlim[BQ_META].bl_stale = META_IS_STALE;
4695
4696 /* LAUNDRY queue */
4697 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
4698 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
4699 bufqlim[BQ_LAUNDRY].bl_target = 0;
4700 bufqlim[BQ_LAUNDRY].bl_stale = 30;
4701
4702 buqlimprt(1);
4703 }
4704
4705 /* create worker thread */
4706 kernel_thread_start((thread_continue_t)bufqscan_thread, NULL, &thread);
4707 thread_deallocate(thread);
4708}
4709
4710/* The workloop for the buffer balancing thread */
4711static void
4712bufqscan_thread()
4713{
4714 int moretodo = 0;
4715
4716 for(;;) {
4717 do {
4718 int q; /* buffer queue to process */
4719
4720 q = initbufqscan();
4721 for (; q; ) {
4722 moretodo |= balancebufq(q);
4723 q = nextbufq(q);
4724 }
4725 } while (moretodo);
4726
4727#if DIAGNOSTIC
4728 vfs_bufstats();
4729 buqlimprt(0);
4730#endif
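		/* nothing left to rebalance; sleep for up to 60 seconds before the next scan */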
4731 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
4732 moretodo = 0;
4733 }
4734}
4735
4736/* Seed for the buffer queue balancing */
4737static __inline__ int
4738initbufqscan()
4739{
4740 /* Start with AGE queue */
4741 return (BQ_AGE);
4742}
4743
4744/* Pick next buffer queue to balance */
4745static __inline__ int
4746nextbufq(int q)
4747{
4748 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
4749
4750 q++;
4751 q %= (int)(sizeof(order) / sizeof(order[0]));	/* element count, not byte count */
4752 return (order[q]);
4753}
4754
4755/* function to balance the buffer queues */
4756static int
4757balancebufq(int q)
4758{
4759 int moretodo = 0;
4760 int n, t;
4761
4762 /* reject invalid q */
4763 if ((q < 0) || (q >= BQUEUES))
4764 goto out;
4765
4766 /* LOCKED or LAUNDRY queue MUST not be balanced */
4767 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
4768 goto out;
4769
4770 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
4771
4772 /* If queue has less than target nothing more to do */
4773 if (n < 0)
4774 goto out;
4775
4776 if ( n > 8 ) {
4777 /* Balance only a small amount (12.5%) at a time */
4778 n >>= 3;
4779 }
4780
4781 /* EMPTY queue needs special handling */
4782 if (q == BQ_EMPTY) {
4783 moretodo |= btrimempty(n);
4784 goto out;
4785 }
4786
4787 t = buf_timestamp();
4788
4789 for (; n > 0; n--) {
4790 struct buf *bp = bufqueues[q].tqh_first;
4791 if (!bp)
4792 break;
4793
4794 /* check if it's stale */
4795 if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
4796 if (bcleanbuf(bp, FALSE)) {
4797 /* buf_bawrite() issued, bp not ready */
4798 moretodo = 1;
4799 } else {
4800 /* release the cleaned buffer to BQ_EMPTY */
4801 SET(bp->b_flags, B_INVAL);
4802 buf_brelse(bp);
4803 }
4804 } else
4805 break;
4806 }
4807
4808out:
4809 return (moretodo);
4810}
4811
4812static int
4813btrimempty(int n)
4814{
4815 /*
4816 * When struct buf are allocated dynamically, this would
4817 * reclaim up to 'n' struct buf from the empty queue.
4818 */
4819
4820 return (0);
4821}
4822
4823static void
4824buqlimprt(int all)
4825{
4826 int i;
4827 static const char *bname[BQUEUES] =
4828 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4829
4830 if (all)
4831 for (i = 0; i < BQUEUES; i++) {
4832 printf("%s : ", bname[i]);
4833 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
4834 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
4835 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
4836 printf("target = %ld, ", (long)bufqlim[i].bl_target);
4837 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
4838 }
4839 else
4840 for (i = 0; i < BQUEUES; i++) {
4841 printf("%s : ", bname[i]);
4842 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
4843 }
4844}
4845
4846#endif
4847
4848