bsd/vfs/vfs_bio.c
1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*-
30 * Copyright (c) 1994 Christopher G. Demetriou
31 * Copyright (c) 1982, 1986, 1989, 1993
32 * The Regents of the University of California. All rights reserved.
33 * (c) UNIX System Laboratories, Inc.
34 * All or some portions of this file are derived from material licensed
35 * to the University of California by American Telephone and Telegraph
36 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
37 * the permission of UNIX System Laboratories, Inc.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 * must display the following acknowledgement:
49 * This product includes software developed by the University of
50 * California, Berkeley and its contributors.
51 * 4. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
68 */
69
70 /*
71 * Some references:
72 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
73 * Leffler, et al.: The Design and Implementation of the 4.3BSD
 74  *	UNIX Operating System (Addison-Wesley, 1989)
75 */
76
77 #include <sys/param.h>
78 #include <sys/systm.h>
79 #include <sys/proc_internal.h>
80 #include <sys/buf_internal.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/trace.h>
84 #include <sys/malloc.h>
85 #include <sys/resourcevar.h>
86 #include <miscfs/specfs/specdev.h>
87 #include <sys/ubc.h>
88 #include <sys/kauth.h>
89 #if DIAGNOSTIC
90 #include <kern/assert.h>
91 #endif /* DIAGNOSTIC */
92 #include <kern/task.h>
93 #include <kern/zalloc.h>
94 #include <kern/lock.h>
95
96 #include <vm/vm_kern.h>
97
98 #include <sys/kdebug.h>
99 #include <machine/spl.h>
100
101 #if BALANCE_QUEUES
102 static __inline__ void bufqinc(int q);
103 static __inline__ void bufqdec(int q);
104 #endif
105
106 static int bcleanbuf(buf_t bp);
107 static int brecover_data(buf_t bp);
108 static boolean_t incore(vnode_t vp, daddr64_t blkno);
109 static buf_t incore_locked(vnode_t vp, daddr64_t blkno);
110 /* timeout is in msecs */
111 static buf_t getnewbuf(int slpflag, int slptimeo, int *queue);
112 static void bremfree_locked(buf_t bp);
113 static void buf_reassign(buf_t bp, vnode_t newvp);
114 static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
115 static int buf_iterprepare(vnode_t vp, struct buflists *, int flags);
116 static void buf_itercomplete(vnode_t vp, struct buflists *, int flags);
117
118 __private_extern__ int bdwrite_internal(buf_t, int);
119
120 /* zone allocated buffer headers */
121 static void bufzoneinit(void);
122 static void bcleanbuf_thread_init(void);
123 static void bcleanbuf_thread(void);
124
125 static zone_t buf_hdr_zone;
126 static int buf_hdr_count;
127
128
129 /*
130 * Definitions for the buffer hash lists.
131 */
132 #define BUFHASH(dvp, lbn) \
133 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
134 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
135 u_long bufhash;
136
137 /* Definitions for the buffer stats. */
138 struct bufstats bufstats;
139
140 /* Number of delayed write buffers */
141 int nbdwrite = 0;
142 int blaundrycnt = 0;
143
144
145 static TAILQ_HEAD(ioqueue, buf) iobufqueue;
146 static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
147 static int needbuffer;
148 static int need_iobuffer;
149
150 static lck_grp_t *buf_mtx_grp;
151 static lck_attr_t *buf_mtx_attr;
152 static lck_grp_attr_t *buf_mtx_grp_attr;
153 static lck_mtx_t *iobuffer_mtxp;
154 static lck_mtx_t *buf_mtxp;
155
156 static __inline__ int
157 buf_timestamp(void)
158 {
159 struct timeval t;
160 microuptime(&t);
161 return (t.tv_sec);
162 }
163
164 /*
165 * Insq/Remq for the buffer free lists.
166 */
167 #if BALANCE_QUEUES
168 #define binsheadfree(bp, dp, whichq) do { \
169 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
170 bufqinc((whichq)); \
171 (bp)->b_whichq = whichq; \
172 (bp)->b_timestamp = buf_timestamp(); \
173 } while (0)
174
175 #define binstailfree(bp, dp, whichq) do { \
176 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
177 bufqinc((whichq)); \
178 (bp)->b_whichq = whichq; \
179 (bp)->b_timestamp = buf_timestamp(); \
180 } while (0)
181 #else
182 #define binsheadfree(bp, dp, whichq) do { \
183 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
184 (bp)->b_whichq = whichq; \
185 (bp)->b_timestamp = buf_timestamp(); \
186 } while (0)
187
188 #define binstailfree(bp, dp, whichq) do { \
189 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
190 (bp)->b_whichq = whichq; \
191 (bp)->b_timestamp = buf_timestamp(); \
192 } while (0)
193 #endif
194
195
196 #define BHASHENTCHECK(bp) \
197 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
198 panic("%x: b_hash.le_prev is not deadbeef", (bp));
199
200 #define BLISTNONE(bp) \
201 (bp)->b_hash.le_next = (struct buf *)0; \
202 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
203
204 /*
205 * Insq/Remq for the vnode usage lists.
206 */
207 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
208 #define bufremvn(bp) { \
209 LIST_REMOVE(bp, b_vnbufs); \
210 (bp)->b_vnbufs.le_next = NOLIST; \
211 }
212
213 /*
214 * Time in seconds before a buffer on a list is
215 * considered as a stale buffer
216 */
217 #define LRU_IS_STALE 120 /* default value for the LRU */
218 #define AGE_IS_STALE 60 /* default value for the AGE */
219 #define META_IS_STALE 180 /* default value for the BQ_META */
220
221 int lru_is_stale = LRU_IS_STALE;
222 int age_is_stale = AGE_IS_STALE;
223 int meta_is_stale = META_IS_STALE;
224 static int boot_nbuf = 0;
225
226
227 /* LIST_INSERT_HEAD() with assertions */
228 static __inline__ void
229 blistenterhead(struct bufhashhdr * head, buf_t bp)
230 {
231 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
232 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
233 (head)->lh_first = bp;
234 bp->b_hash.le_prev = &(head)->lh_first;
235 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
236 panic("blistenterhead: le_prev is deadbeef");
237 }
238
239 static __inline__ void
240 binshash(buf_t bp, struct bufhashhdr *dp)
241 {
242 #if DIAGNOSTIC
243 buf_t nbp;
244 #endif /* DIAGNOSTIC */
245
246 BHASHENTCHECK(bp);
247
248 #if DIAGNOSTIC
249 nbp = dp->lh_first;
250 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
251 if(nbp == bp)
252 panic("buf already in hashlist");
253 }
254 #endif /* DIAGNOSTIC */
255
256 blistenterhead(dp, bp);
257 }
258
259 static __inline__ void
260 bremhash(buf_t bp)
261 {
262 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
263 panic("bremhash le_prev is deadbeef");
264 if (bp->b_hash.le_next == bp)
265 panic("bremhash: next points to self");
266
267 if (bp->b_hash.le_next != NULL)
268 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
269 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
270 }
271
272
273
274
275 int
276 buf_valid(buf_t bp) {
277
278 if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
279 return 1;
280 return 0;
281 }
282
283 int
284 buf_fromcache(buf_t bp) {
285
286 if ( (bp->b_flags & B_CACHE) )
287 return 1;
288 return 0;
289 }
290
291 void
292 buf_markinvalid(buf_t bp) {
293
294 SET(bp->b_flags, B_INVAL);
295 }
296
297 void
298 buf_markdelayed(buf_t bp) {
299
300 SET(bp->b_flags, B_DELWRI);
301 buf_reassign(bp, bp->b_vp);
302 }
303
304 void
305 buf_markeintr(buf_t bp) {
306
307 SET(bp->b_flags, B_EINTR);
308 }
309
310 void
311 buf_markaged(buf_t bp) {
312
313 SET(bp->b_flags, B_AGE);
314 }
315
316 errno_t
317 buf_error(buf_t bp) {
318
319 return (bp->b_error);
320 }
321
322 void
323 buf_seterror(buf_t bp, errno_t error) {
324
325 if ((bp->b_error = error))
326 SET(bp->b_flags, B_ERROR);
327 else
328 CLR(bp->b_flags, B_ERROR);
329 }
330
331 void
332 buf_setflags(buf_t bp, int32_t flags) {
333
334 SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
335 }
336
337 void
338 buf_clearflags(buf_t bp, int32_t flags) {
339
340 CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
341 }
342
343 int32_t
344 buf_flags(buf_t bp) {
345
346 return ((bp->b_flags & BUF_X_RDFLAGS));
347 }
348
349 void
350 buf_reset(buf_t bp, int32_t io_flags) {
351
352 CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE));
353 SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
354
355 bp->b_error = 0;
356 }
357
358 uint32_t
359 buf_count(buf_t bp) {
360
361 return (bp->b_bcount);
362 }
363
364 void
365 buf_setcount(buf_t bp, uint32_t bcount) {
366
367 bp->b_bcount = bcount;
368 }
369
370 uint32_t
371 buf_size(buf_t bp) {
372
373 return (bp->b_bufsize);
374 }
375
376 void
377 buf_setsize(buf_t bp, uint32_t bufsize) {
378
379 bp->b_bufsize = bufsize;
380 }
381
382 uint32_t
383 buf_resid(buf_t bp) {
384
385 return (bp->b_resid);
386 }
387
388 void
389 buf_setresid(buf_t bp, uint32_t resid) {
390
391 bp->b_resid = resid;
392 }
393
394 uint32_t
395 buf_dirtyoff(buf_t bp) {
396
397 return (bp->b_dirtyoff);
398 }
399
400 uint32_t
401 buf_dirtyend(buf_t bp) {
402
403 return (bp->b_dirtyend);
404 }
405
406 void
407 buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
408
409 bp->b_dirtyoff = dirtyoff;
410 }
411
412 void
413 buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
414
415 bp->b_dirtyend = dirtyend;
416 }
417
418 uintptr_t
419 buf_dataptr(buf_t bp) {
420
421 return (bp->b_datap);
422 }
423
424 void
425 buf_setdataptr(buf_t bp, uintptr_t data) {
426
427 bp->b_datap = data;
428 }
429
430 vnode_t
431 buf_vnode(buf_t bp) {
432
433 return (bp->b_vp);
434 }
435
436 void
437 buf_setvnode(buf_t bp, vnode_t vp) {
438
439 bp->b_vp = vp;
440 }
441
442
443 void *
444 buf_callback(buf_t bp)
445 {
446 if ( !(bp->b_lflags & BL_IOBUF) )
447 return ((void *) NULL);
448 if ( !(bp->b_flags & B_CALL) )
449 return ((void *) NULL);
450
451 return ((void *)bp->b_iodone);
452 }
453
454
455 errno_t
456 buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
457 {
458
459 if ( !(bp->b_lflags & BL_IOBUF) )
460 return (EINVAL);
461
462 if (callback)
463 bp->b_flags |= (B_CALL | B_ASYNC);
464 else
465 bp->b_flags &= ~B_CALL;
466 bp->b_transaction = transaction;
467 bp->b_iodone = callback;
468
469 return (0);
470 }
471
472 errno_t
473 buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
474 {
475
476 if ( !(bp->b_lflags & BL_IOBUF) )
477 return (EINVAL);
478
479 if (upl)
480 bp->b_flags |= B_CLUSTER;
481 else
482 bp->b_flags &= ~B_CLUSTER;
483 bp->b_upl = upl;
484 bp->b_uploffset = offset;
485
486 return (0);
487 }
488
489 buf_t
490 buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
491 {
492 buf_t io_bp;
493
494 if (io_offset < 0 || io_size < 0)
495 return (NULL);
496
497 if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
498 return (NULL);
499
500 if (bp->b_flags & B_CLUSTER) {
501 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
502 return (NULL);
503
504 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
505 return (NULL);
506 }
507 io_bp = alloc_io_buf(bp->b_vp, 0);
508
509 io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_ASYNC | B_READ);
510
511 if (iodone) {
512 io_bp->b_transaction = arg;
513 io_bp->b_iodone = iodone;
514 io_bp->b_flags |= B_CALL;
515 }
516 if (bp->b_flags & B_CLUSTER) {
517 io_bp->b_upl = bp->b_upl;
518 io_bp->b_uploffset = bp->b_uploffset + io_offset;
519 } else {
520 io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
521 }
522 io_bp->b_bcount = io_size;
523
524 return (io_bp);
525 }
526
527
528
529 void
530 buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
531 void **old_iodone, void **old_transaction)
532 {
533 if (old_iodone)
534 *old_iodone = (void *)(bp->b_iodone);
535 if (old_transaction)
536 *old_transaction = (void *)(bp->b_transaction);
537
538 bp->b_transaction = transaction;
539 bp->b_iodone = filter;
540 bp->b_flags |= B_FILTER;
541 }
542
543
544 daddr64_t
545 buf_blkno(buf_t bp) {
546
547 return (bp->b_blkno);
548 }
549
550 daddr64_t
551 buf_lblkno(buf_t bp) {
552
553 return (bp->b_lblkno);
554 }
555
556 void
557 buf_setblkno(buf_t bp, daddr64_t blkno) {
558
559 bp->b_blkno = blkno;
560 }
561
562 void
563 buf_setlblkno(buf_t bp, daddr64_t lblkno) {
564
565 bp->b_lblkno = lblkno;
566 }
567
568 dev_t
569 buf_device(buf_t bp) {
570
571 return (bp->b_dev);
572 }
573
574 errno_t
575 buf_setdevice(buf_t bp, vnode_t vp) {
576
577 if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
578 return EINVAL;
579 bp->b_dev = vp->v_rdev;
580
581 return 0;
582 }
583
584
585 void *
586 buf_drvdata(buf_t bp) {
587
588 return (bp->b_drvdata);
589 }
590
591 void
592 buf_setdrvdata(buf_t bp, void *drvdata) {
593
594 bp->b_drvdata = drvdata;
595 }
596
597 void *
598 buf_fsprivate(buf_t bp) {
599
600 return (bp->b_fsprivate);
601 }
602
603 void
604 buf_setfsprivate(buf_t bp, void *fsprivate) {
605
606 bp->b_fsprivate = fsprivate;
607 }
608
609 ucred_t
610 buf_rcred(buf_t bp) {
611
612 return (bp->b_rcred);
613 }
614
615 ucred_t
616 buf_wcred(buf_t bp) {
617
618 return (bp->b_wcred);
619 }
620
621 void *
622 buf_upl(buf_t bp) {
623
624 return (bp->b_upl);
625 }
626
627 uint32_t
628 buf_uploffset(buf_t bp) {
629
630 return ((uint32_t)(bp->b_uploffset));
631 }
632
633 proc_t
634 buf_proc(buf_t bp) {
635
636 return (bp->b_proc);
637 }
638
639
640 errno_t
641 buf_map(buf_t bp, caddr_t *io_addr)
642 {
643 buf_t real_bp;
644 vm_offset_t vaddr;
645 kern_return_t kret;
646
647 if ( !(bp->b_flags & B_CLUSTER)) {
648 *io_addr = (caddr_t)bp->b_datap;
649 return (0);
650 }
651 real_bp = (buf_t)(bp->b_real_bp);
652
653 if (real_bp && real_bp->b_datap) {
654 /*
655 * b_real_bp is only valid if B_CLUSTER is SET
 656  * if it's non-zero, then someone did a cluster_bp call;
 657  * if the backing physical pages were already mapped
 658  * in before the call to cluster_bp (non-zero b_datap),
 659  * then we just use that mapping
660 */
661 *io_addr = (caddr_t)real_bp->b_datap;
662 return (0);
663 }
664 kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */
665
666 if (kret != KERN_SUCCESS) {
667 *io_addr = 0;
668
669 return(ENOMEM);
670 }
671 vaddr += bp->b_uploffset;
672
673 *io_addr = (caddr_t)vaddr;
674
675 return (0);
676 }
677
678 errno_t
679 buf_unmap(buf_t bp)
680 {
681 buf_t real_bp;
682 kern_return_t kret;
683
684 if ( !(bp->b_flags & B_CLUSTER))
685 return (0);
686 /*
687 * see buf_map for the explanation
688 */
689 real_bp = (buf_t)(bp->b_real_bp);
690
691 if (real_bp && real_bp->b_datap)
692 return (0);
693
694 if (bp->b_lflags & BL_IOBUF) {
695 /*
696 * when we commit these pages, we'll hit
697 * it with UPL_COMMIT_INACTIVE which
698 * will clear the reference bit that got
699 * turned on when we touched the mapping
700 */
701 bp->b_flags |= B_AGE;
702 }
703 kret = ubc_upl_unmap(bp->b_upl);
704
705 if (kret != KERN_SUCCESS)
706 return (EINVAL);
707 return (0);
708 }
709
710
711 void
712 buf_clear(buf_t bp) {
713 caddr_t baddr;
714
715 if (buf_map(bp, &baddr) == 0) {
716 bzero(baddr, bp->b_bcount);
717 buf_unmap(bp);
718 }
719 bp->b_resid = 0;
720 }
721
722
723
724 /*
725 * Read or write a buffer that is not contiguous on disk.
726 * buffer is marked done/error at the conclusion
727 */
728 static int
729 buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
730 {
731 vnode_t vp = buf_vnode(bp);
732 buf_t io_bp; /* For reading or writing a single block */
733 int io_direction;
734 int io_resid;
735 size_t io_contig_bytes;
736 daddr64_t io_blkno;
737 int error = 0;
738 int bmap_flags;
739
740 /*
741 * save our starting point... the bp was already mapped
742 * in buf_strategy before we got called
743 * no sense doing it again.
744 */
745 io_blkno = bp->b_blkno;
746 /*
747 * Make sure we redo this mapping for the next I/O
748 * i.e. this can never be a 'permanent' mapping
749 */
750 bp->b_blkno = bp->b_lblkno;
751
752 /*
753 * Get an io buffer to do the deblocking
754 */
755 io_bp = alloc_io_buf(devvp, 0);
756
757 io_bp->b_lblkno = bp->b_lblkno;
758 io_bp->b_datap = bp->b_datap;
759 io_resid = bp->b_bcount;
760 io_direction = bp->b_flags & B_READ;
761 io_contig_bytes = contig_bytes;
762
763 if (bp->b_flags & B_READ)
764 bmap_flags = VNODE_READ;
765 else
766 bmap_flags = VNODE_WRITE;
767
768 for (;;) {
769 if (io_blkno == -1)
770 /*
 771  * this is unexpected, but we'll allow for it
772 */
773 bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
774 else {
775 io_bp->b_bcount = io_contig_bytes;
776 io_bp->b_bufsize = io_contig_bytes;
777 io_bp->b_resid = io_contig_bytes;
778 io_bp->b_blkno = io_blkno;
779
780 buf_reset(io_bp, io_direction);
781 /*
782 * Call the device to do the I/O and wait for it
783 */
784 if ((error = VNOP_STRATEGY(io_bp)))
785 break;
786 if ((error = (int)buf_biowait(io_bp)))
787 break;
788 if (io_bp->b_resid) {
789 io_resid -= (io_contig_bytes - io_bp->b_resid);
790 break;
791 }
792 }
793 if ((io_resid -= io_contig_bytes) == 0)
794 break;
795 f_offset += io_contig_bytes;
796 io_bp->b_datap += io_contig_bytes;
797
798 /*
799 * Map the current position to a physical block number
800 */
801 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
802 break;
803 }
804 buf_free(io_bp);
805
806 if (error)
807 buf_seterror(bp, error);
808 bp->b_resid = io_resid;
809 /*
810 * This I/O is now complete
811 */
812 buf_biodone(bp);
813
814 return error;
815 }
816
817
818 /*
819 * struct vnop_strategy_args {
820 * struct buf *a_bp;
821 * } *ap;
822 */
823 errno_t
824 buf_strategy(vnode_t devvp, void *ap)
825 {
826 buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp;
827 vnode_t vp = bp->b_vp;
828 int bmap_flags;
829 errno_t error;
830
831 if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
832 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
833 /*
 834  * associate the physical device
 835  * with this buf_t even if we don't
836 * end up issuing the I/O...
837 */
838 bp->b_dev = devvp->v_rdev;
839
840 if (bp->b_flags & B_READ)
841 bmap_flags = VNODE_READ;
842 else
843 bmap_flags = VNODE_WRITE;
844
845 if ( !(bp->b_flags & B_CLUSTER)) {
846
847 if ( (bp->b_upl) ) {
848 /*
849 * we have a UPL associated with this bp
850 * go through cluster_bp which knows how
851 * to deal with filesystem block sizes
852 * that aren't equal to the page size
853 */
854 return (cluster_bp(bp));
855 }
856 if (bp->b_blkno == bp->b_lblkno) {
857 off_t f_offset;
858 size_t contig_bytes;
859
860 if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
861 buf_seterror(bp, error);
862 buf_biodone(bp);
863
864 return (error);
865 }
866 if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
867 buf_seterror(bp, error);
868 buf_biodone(bp);
869
870 return (error);
871 }
872 if (bp->b_blkno == -1)
873 buf_clear(bp);
874 else if ((long)contig_bytes < bp->b_bcount)
875 return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
876 }
877 if (bp->b_blkno == -1) {
878 buf_biodone(bp);
879 return (0);
880 }
881 }
882 /*
883 * we can issue the I/O because...
884 * either B_CLUSTER is set which
885 * means that the I/O is properly set
886 * up to be a multiple of the page size, or
887 * we were able to successfully set up the
 888  * physical block mapping
889 */
890 return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap));
891 }
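/*
 * A minimal usage sketch (hypothetical filesystem code, not part of this
 * file): a filesystem backed by a block device can usually implement its
 * own vnop_strategy entry point by forwarding the request to buf_strategy()
 * along with the device vnode it keeps in its mount structure ("devvp" and
 * EXAMPLEFS_MOUNT() are assumed names):
 *
 *	static int
 *	examplefs_vnop_strategy(struct vnop_strategy_args *ap)
 *	{
 *		vnode_t devvp = EXAMPLEFS_MOUNT(buf_vnode(ap->a_bp))->devvp;
 *
 *		return (buf_strategy(devvp, ap));
 *	}
 *
 * buf_strategy() then resolves the logical-to-physical mapping via
 * VNOP_BLKTOOFF()/VNOP_BLOCKMAP() before handing the buf to the device.
 */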
892
893
894
895 buf_t
896 buf_alloc(vnode_t vp)
897 {
898 return(alloc_io_buf(vp, 0));
899 }
900
901 void
902 buf_free(buf_t bp) {
903
904 free_io_buf(bp);
905 }
906
907
908
909 void
910 buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) {
911 buf_t bp;
912 int retval;
913 struct buflists local_iterblkhd;
914 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
915
916 if (flags & BUF_SKIP_LOCKED)
917 lock_flags |= BAC_SKIP_LOCKED;
918 if (flags & BUF_SKIP_NONLOCKED)
919 lock_flags |= BAC_SKIP_NONLOCKED;
920
921 lck_mtx_lock(buf_mtxp);
922
923 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
924 lck_mtx_unlock(buf_mtxp);
925 return;
926 }
927 while (!LIST_EMPTY(&local_iterblkhd)) {
928 bp = LIST_FIRST(&local_iterblkhd);
929 LIST_REMOVE(bp, b_vnbufs);
930 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
931
932 if (buf_acquire_locked(bp, lock_flags, 0, 0))
933 continue;
934
935 lck_mtx_unlock(buf_mtxp);
936
937 retval = callout(bp, arg);
938
939 switch (retval) {
940 case BUF_RETURNED:
941 buf_brelse(bp);
942 break;
943 case BUF_CLAIMED:
944 break;
945 case BUF_RETURNED_DONE:
946 buf_brelse(bp);
947 lck_mtx_lock(buf_mtxp);
948 goto out;
949 case BUF_CLAIMED_DONE:
950 lck_mtx_lock(buf_mtxp);
951 goto out;
952 }
953 lck_mtx_lock(buf_mtxp);
954 }
955 out:
956 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
957
958 lck_mtx_unlock(buf_mtxp);
959 }
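/*
 * A hedged sketch (hypothetical caller, not from this file) of the callout
 * protocol buf_iterate() expects: each dirty buffer is handed to the callout
 * already acquired, and the BUF_* return code tells buf_iterate() whether it
 * still owns the buffer (BUF_RETURNED*) or whether the callout disposed of
 * it (BUF_CLAIMED*), with the *_DONE variants terminating the walk:
 *
 *	static int
 *	examplefs_push_one(buf_t bp, __unused void *arg)
 *	{
 *		(void) buf_bawrite(bp);		// write path will release it
 *		return (BUF_CLAIMED);
 *	}
 *
 *	buf_iterate(vp, examplefs_push_one, BUF_SKIP_LOCKED, NULL);
 */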
960
961
962 /*
963 * Flush out and invalidate all buffers associated with a vnode.
964 */
965 int
966 buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
967 {
968 buf_t bp;
969 int error = 0;
970 int must_rescan = 1;
971 struct buflists local_iterblkhd;
972
973 lck_mtx_lock(buf_mtxp);
974
975 for (;;) {
976 if (must_rescan == 0)
977 /*
978 * the lists may not be empty, but all that's left at this
979 * point are metadata or B_LOCKED buffers which are being
980 * skipped... we know this because we made it through both
981 * the clean and dirty lists without dropping buf_mtxp...
982 * each time we drop buf_mtxp we bump "must_rescan"
983 */
984 break;
985 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
986 break;
987 must_rescan = 0;
988 /*
989 * iterate the clean list
990 */
991 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
992 goto try_dirty_list;
993 }
994 while (!LIST_EMPTY(&local_iterblkhd)) {
995 bp = LIST_FIRST(&local_iterblkhd);
996
997 LIST_REMOVE(bp, b_vnbufs);
998 LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
999
1000 /*
1001 * some filesystems distinguish meta data blocks with a negative logical block #
1002 */
1003 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1004 continue;
1005
1006 if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
1007 if (error == EDEADLK)
1008 /*
1009 * this buffer was marked B_LOCKED...
1010 * we didn't drop buf_mtxp, so we
1011 * we don't need to rescan
1012 */
1013 continue;
1014 if (error == EAGAIN) {
1015 /*
1016 * found a busy buffer... we blocked and
1017 * dropped buf_mtxp, so we're going to
1018 * need to rescan after this pass is completed
1019 */
1020 must_rescan++;
1021 continue;
1022 }
1023 /*
1024 * got some kind of 'real' error out of the msleep
1025 * in buf_acquire_locked, terminate the scan and return the error
1026 */
1027 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1028
1029 lck_mtx_unlock(buf_mtxp);
1030 return (error);
1031 }
1032 lck_mtx_unlock(buf_mtxp);
1033
1034 SET(bp->b_flags, B_INVAL);
1035 buf_brelse(bp);
1036
1037 lck_mtx_lock(buf_mtxp);
1038
1039 /*
1040 * by dropping buf_mtxp, we allow new
1041 * buffers to be added to the vnode list(s)
1042 * we'll have to rescan at least once more
1043 * if the queues aren't empty
1044 */
1045 must_rescan++;
1046 }
1047 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1048
1049 try_dirty_list:
1050 /*
1051 * Now iterate on dirty blks
1052 */
1053 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1054 continue;
1055 }
1056 while (!LIST_EMPTY(&local_iterblkhd)) {
1057 bp = LIST_FIRST(&local_iterblkhd);
1058
1059 LIST_REMOVE(bp, b_vnbufs);
1060 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1061
1062 /*
1063 * some filesystems distinguish meta data blocks with a negative logical block #
1064 */
1065 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1066 continue;
1067
1068 if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
1069 if (error == EDEADLK)
1070 /*
1071 * this buffer was marked B_LOCKED...
1072  * we didn't drop buf_mtxp, so we
1073  * don't need to rescan
1074 */
1075 continue;
1076 if (error == EAGAIN) {
1077 /*
1078 * found a busy buffer... we blocked and
1079 * dropped buf_mtxp, so we're going to
1080 * need to rescan after this pass is completed
1081 */
1082 must_rescan++;
1083 continue;
1084 }
1085 /*
1086 * got some kind of 'real' error out of the msleep
1087 * in buf_acquire_locked, terminate the scan and return the error
1088 */
1089 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1090
1091 lck_mtx_unlock(buf_mtxp);
1092 return (error);
1093 }
1094 lck_mtx_unlock(buf_mtxp);
1095
1096 SET(bp->b_flags, B_INVAL);
1097
1098 if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1099 (void) VNOP_BWRITE(bp);
1100 else
1101 buf_brelse(bp);
1102
1103 lck_mtx_lock(buf_mtxp);
1104 /*
1105 * by dropping buf_mtxp, we allow new
1106 * buffers to be added to the vnode list(s)
1107 * we'll have to rescan at least once more
1108 * if the queues aren't empty
1109 */
1110 must_rescan++;
1111 }
1112 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1113 }
1114 lck_mtx_unlock(buf_mtxp);
1115
1116 return (0);
1117 }
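/*
 * A minimal usage sketch (hypothetical caller): a filesystem reclaiming a
 * vnode, or truncating a file to zero length, typically pushes any remaining
 * dirty data and discards every cached buffer in a single call:
 *
 *	error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
 *
 * Adding BUF_SKIP_META to the flags would leave meta-data buffers (negative
 * logical block numbers or B_META) untouched on the vnode's lists.
 */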
1118
1119 void
1120 buf_flushdirtyblks(vnode_t vp, int wait, int flags, char *msg) {
1121 buf_t bp;
1122 int writes_issued = 0;
1123 errno_t error;
1124 int busy = 0;
1125 struct buflists local_iterblkhd;
1126 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1127
1128 if (flags & BUF_SKIP_LOCKED)
1129 lock_flags |= BAC_SKIP_LOCKED;
1130 if (flags & BUF_SKIP_NONLOCKED)
1131 lock_flags |= BAC_SKIP_NONLOCKED;
1132 loop:
1133 lck_mtx_lock(buf_mtxp);
1134
1135 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
1136 while (!LIST_EMPTY(&local_iterblkhd)) {
1137 bp = LIST_FIRST(&local_iterblkhd);
1138 LIST_REMOVE(bp, b_vnbufs);
1139 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1140
1141 if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY)
1142 busy++;
1143 if (error)
1144 continue;
1145 lck_mtx_unlock(buf_mtxp);
1146
1147 bp->b_flags &= ~B_LOCKED;
1148
1149 /*
1150 * Wait for I/O associated with indirect blocks to complete,
1151 * since there is no way to quickly wait for them below.
1152 */
1153 if ((bp->b_vp == vp) || (wait == 0))
1154 (void) buf_bawrite(bp);
1155 else
1156 (void) VNOP_BWRITE(bp);
1157 writes_issued++;
1158
1159 lck_mtx_lock(buf_mtxp);
1160 }
1161 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1162 }
1163 lck_mtx_unlock(buf_mtxp);
1164
1165 if (wait) {
1166 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1167
1168 if (vp->v_dirtyblkhd.lh_first && busy) {
1169 /*
1170 * we had one or more BUSY buffers on
1171 * the dirtyblock list... most likely
1172 * these are due to delayed writes that
1173 * were moved to the bclean queue but
1174 * have not yet been 'written'.
1175  * if we issued some writes on the
1176  * previous pass, we try again immediately;
1177  * if we didn't, we'll sleep for some time
1178  * to allow the state to change...
1179 */
1180 if (writes_issued == 0) {
1181 (void)tsleep((caddr_t)&vp->v_numoutput,
1182 PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1183 }
1184 writes_issued = 0;
1185 busy = 0;
1186
1187 goto loop;
1188 }
1189 }
1190 }
1191
1192
1193 /*
1194 * called with buf_mtxp held...
1195 * this lock protects the queue manipulation
1196 */
1197 static int
1198 buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1199 {
1200 struct buflists * listheadp;
1201
1202 if (flags & VBI_DIRTY)
1203 listheadp = &vp->v_dirtyblkhd;
1204 else
1205 listheadp = &vp->v_cleanblkhd;
1206
1207 while (vp->v_iterblkflags & VBI_ITER) {
1208 vp->v_iterblkflags |= VBI_ITERWANT;
1209 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", 0);
1210 }
1211 if (LIST_EMPTY(listheadp)) {
1212 LIST_INIT(iterheadp);
1213 return(EINVAL);
1214 }
1215 vp->v_iterblkflags |= VBI_ITER;
1216
1217 iterheadp->lh_first = listheadp->lh_first;
1218 listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1219 LIST_INIT(listheadp);
1220
1221 return(0);
1222 }
1223
1224 /*
1225 * called with buf_mtxp held...
1226 * this lock protects the queue manipulation
1227 */
1228 static void
1229 buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1230 {
1231 struct buflists * listheadp;
1232 buf_t bp;
1233
1234 if (flags & VBI_DIRTY)
1235 listheadp = &vp->v_dirtyblkhd;
1236 else
1237 listheadp = &vp->v_cleanblkhd;
1238
1239 while (!LIST_EMPTY(iterheadp)) {
1240 bp = LIST_FIRST(iterheadp);
1241 LIST_REMOVE(bp, b_vnbufs);
1242 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1243 }
1244 vp->v_iterblkflags &= ~VBI_ITER;
1245
1246 if (vp->v_iterblkflags & VBI_ITERWANT) {
1247 vp->v_iterblkflags &= ~VBI_ITERWANT;
1248 wakeup(&vp->v_iterblkflags);
1249 }
1250 }
1251
1252
1253 static void
1254 bremfree_locked(buf_t bp)
1255 {
1256 struct bqueues *dp = NULL;
1257 int whichq = -1;
1258
1259 /*
1260 * We only calculate the head of the freelist when removing
1261 * the last element of the list as that is the only time that
1262 * it is needed (e.g. to reset the tail pointer).
1263 *
1264 * NB: This makes an assumption about how tailq's are implemented.
1265 */
1266 if (bp->b_freelist.tqe_next == NULL) {
1267 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1268 if (dp->tqh_last == &bp->b_freelist.tqe_next)
1269 break;
1270 if (dp == &bufqueues[BQUEUES])
1271 panic("bremfree: lost tail");
1272 }
1273 TAILQ_REMOVE(dp, bp, b_freelist);
1274 whichq = bp->b_whichq;
1275 #if BALANCE_QUEUES
1276 bufqdec(whichq);
1277 #endif
1278 bp->b_whichq = -1;
1279 bp->b_timestamp = 0;
1280 }
1281
1282 /*
1283 * Associate a buffer with a vnode.
1284 */
1285 static void
1286 bgetvp(vnode_t vp, buf_t bp)
1287 {
1288
1289 if (bp->b_vp != vp)
1290 panic("bgetvp: not free");
1291
1292 if (vp->v_type == VBLK || vp->v_type == VCHR)
1293 bp->b_dev = vp->v_rdev;
1294 else
1295 bp->b_dev = NODEV;
1296 /*
1297 * Insert onto list for new vnode.
1298 */
1299 lck_mtx_lock(buf_mtxp);
1300 bufinsvn(bp, &vp->v_cleanblkhd);
1301 lck_mtx_unlock(buf_mtxp);
1302 }
1303
1304 /*
1305 * Disassociate a buffer from a vnode.
1306 */
1307 static void
1308 brelvp(buf_t bp)
1309 {
1310 vnode_t vp;
1311
1312 if ((vp = bp->b_vp) == (vnode_t)NULL)
1313 panic("brelvp: NULL vp");
1314 /*
1315 * Delete from old vnode list, if on one.
1316 */
1317 lck_mtx_lock(buf_mtxp);
1318 if (bp->b_vnbufs.le_next != NOLIST)
1319 bufremvn(bp);
1320 lck_mtx_unlock(buf_mtxp);
1321
1322 bp->b_vp = (vnode_t)NULL;
1323 }
1324
1325 /*
1326 * Reassign a buffer from one vnode to another.
1327 * Used to assign file specific control information
1328 * (indirect blocks) to the vnode to which they belong.
1329 */
1330 static void
1331 buf_reassign(buf_t bp, vnode_t newvp)
1332 {
1333 register struct buflists *listheadp;
1334
1335 if (newvp == NULL) {
1336 printf("buf_reassign: NULL");
1337 return;
1338 }
1339 lck_mtx_lock(buf_mtxp);
1340
1341 /*
1342 * Delete from old vnode list, if on one.
1343 */
1344 if (bp->b_vnbufs.le_next != NOLIST)
1345 bufremvn(bp);
1346 /*
1347 * If dirty, put on list of dirty buffers;
1348 * otherwise insert onto list of clean buffers.
1349 */
1350 if (ISSET(bp->b_flags, B_DELWRI))
1351 listheadp = &newvp->v_dirtyblkhd;
1352 else
1353 listheadp = &newvp->v_cleanblkhd;
1354 bufinsvn(bp, listheadp);
1355
1356 lck_mtx_unlock(buf_mtxp);
1357 }
1358
1359 static __inline__ void
1360 bufhdrinit(buf_t bp)
1361 {
1362 bzero((char *)bp, sizeof *bp);
1363 bp->b_dev = NODEV;
1364 bp->b_rcred = NOCRED;
1365 bp->b_wcred = NOCRED;
1366 bp->b_vnbufs.le_next = NOLIST;
1367 bp->b_flags = B_INVAL;
1368
1369 return;
1370 }
1371
1372 /*
1373 * Initialize buffers and hash links for buffers.
1374 */
1375 __private_extern__ void
1376 bufinit()
1377 {
1378 buf_t bp;
1379 struct bqueues *dp;
1380 int i;
1381 int metabuf;
1382 long whichq;
1383
1384 nbuf = 0;
1385 /* Initialize the buffer queues ('freelists') and the hash table */
1386 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1387 TAILQ_INIT(dp);
1388 bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
1389
1390 metabuf = max_nbuf_headers/8; /* reserved for meta buf */
1391
1392 /* Initialize the buffer headers */
1393 for (i = 0; i < max_nbuf_headers; i++) {
1394 nbuf++;
1395 bp = &buf[i];
1396 bufhdrinit(bp);
1397
1398 /*
1399  * put 'metabuf' buffer headers on the meta-data list and
1400  * the rest of the buffer headers on the empty list
1401 */
1402 if (--metabuf)
1403 whichq = BQ_META;
1404 else
1405 whichq = BQ_EMPTY;
1406
1407 BLISTNONE(bp);
1408 dp = &bufqueues[whichq];
1409 binsheadfree(bp, dp, whichq);
1410 binshash(bp, &invalhash);
1411 }
1412
1413 boot_nbuf = nbuf;
1414
1415 for (; i < nbuf + niobuf; i++) {
1416 bp = &buf[i];
1417 bufhdrinit(bp);
1418 binsheadfree(bp, &iobufqueue, -1);
1419 }
1420
1421 /*
1422 * allocate lock group attribute and group
1423 */
1424 buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1425 buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1426
1427 /*
1428 * allocate the lock attribute
1429 */
1430 buf_mtx_attr = lck_attr_alloc_init();
1431
1432 /*
1433 * allocate and initialize mutex's for the buffer and iobuffer pools
1434 */
1435 buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1436 iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1437
1438 if (iobuffer_mtxp == NULL)
1439 panic("couldn't create iobuffer mutex");
1440
1441 if (buf_mtxp == NULL)
1442 panic("couldn't create buf mutex");
1443
1444 /*
1445 * allocate and initialize cluster specific global locks...
1446 */
1447 cluster_init();
1448
1449 printf("using %d buffer headers and %d cluster IO buffer headers\n",
1450 nbuf, niobuf);
1451
1452 /* Set up zones used by the buffer cache */
1453 bufzoneinit();
1454
1455 /* start the bcleanbuf() thread */
1456 bcleanbuf_thread_init();
1457
1458 #if BALANCE_QUEUES
1459 {
1460 static void bufq_balance_thread_init();
1461 /* create a thread to do dynamic buffer queue balancing */
1462 bufq_balance_thread_init();
1463 }
1464 #endif /* BALANCE_QUEUES */
1465 }
1466
1467 static struct buf *
1468 bio_doread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, int async, int queuetype)
1469 {
1470 buf_t bp;
1471
1472 bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
1473
1474 /*
1475  * If the buffer does not have valid data, start a read.
1476  * Note that if the buffer is B_INVAL, buf_getblk() won't return it.
1477  * Therefore, it's valid if its I/O has completed or been delayed.
1478 */
1479 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
1480 struct proc *p;
1481
1482 p = current_proc();
1483
1484 /* Start I/O for the buffer (keeping credentials). */
1485 SET(bp->b_flags, B_READ | async);
1486 if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
1487 kauth_cred_ref(cred);
1488 bp->b_rcred = cred;
1489 }
1490
1491 VNOP_STRATEGY(bp);
1492
1493 trace(TR_BREADMISS, pack(vp, size), blkno);
1494
1495 /* Pay for the read. */
1496 if (p && p->p_stats)
1497 p->p_stats->p_ru.ru_inblock++; /* XXX */
1498
1499 if (async) {
1500 /*
1501  * since we asked for an ASYNC I/O,
1502  * the biodone will do the brelse;
1503  * we don't want to pass back a bp
1504  * that we don't 'own'
1505 */
1506 bp = NULL;
1507 }
1508 } else if (async) {
1509 buf_brelse(bp);
1510 bp = NULL;
1511 }
1512
1513 trace(TR_BREADHIT, pack(vp, size), blkno);
1514
1515 return (bp);
1516 }
1517
1518 /*
1519 * Perform the reads for buf_breadn() and buf_meta_breadn().
1520 * Trivial modification to the breada algorithm presented in Bach (p.55).
1521 */
1522 static errno_t
1523 do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
1524 int nrablks, ucred_t cred, buf_t *bpp, int queuetype)
1525 {
1526 buf_t bp;
1527 int i;
1528
1529 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
1530
1531 /*
1532 * For each of the read-ahead blocks, start a read, if necessary.
1533 */
1534 for (i = 0; i < nrablks; i++) {
1535 /* If it's in the cache, just go on to next one. */
1536 if (incore(vp, rablks[i]))
1537 continue;
1538
1539 /* Get a buffer for the read-ahead block */
1540 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
1541 }
1542
1543 /* Otherwise, we had to start a read for it; wait until it's valid. */
1544 return (buf_biowait(bp));
1545 }
1546
1547
1548 /*
1549 * Read a disk block.
1550  * This algorithm is described in Bach (p.54).
1551 */
1552 errno_t
1553 buf_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1554 {
1555 buf_t bp;
1556
1557 /* Get buffer for block. */
1558 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
1559
1560 /* Wait for the read to complete, and return result. */
1561 return (buf_biowait(bp));
1562 }
1563
1564 /*
1565 * Read a disk block. [bread() for meta-data]
1566  * This algorithm is described in Bach (p.54).
1567 */
1568 errno_t
1569 buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1570 {
1571 buf_t bp;
1572
1573 /* Get buffer for block. */
1574 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
1575
1576 /* Wait for the read to complete, and return result. */
1577 return (buf_biowait(bp));
1578 }
1579
1580 /*
1581 * Read-ahead multiple disk blocks. The first is sync, the rest async.
1582 */
1583 errno_t
1584 buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1585 {
1586 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
1587 }
1588
1589 /*
1590 * Read-ahead multiple disk blocks. The first is sync, the rest async.
1591 * [buf_breadn() for meta-data]
1592 */
1593 errno_t
1594 buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1595 {
1596 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
1597 }
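/*
 * A minimal usage sketch (hypothetical filesystem code; "blksize", "cred"
 * and the read-ahead arrays are assumptions): the common pattern is a
 * synchronous read followed by buf_brelse() once the data has been
 * consumed, with buf_breadn() used when the next blocks can be predicted:
 *
 *	buf_t	  bp;
 *	daddr64_t rablks[1]  = { blkno + 1 };
 *	int	  rasizes[1] = { blksize };
 *
 *	error = buf_breadn(vp, blkno, blksize, rablks, rasizes, 1, cred, &bp);
 *	if (error == 0) {
 *		// consume (char *)buf_dataptr(bp) ...
 *		buf_brelse(bp);
 *	}
 *
 * Even on error the buffer returned through bpp is owned by the caller and
 * is normally released with buf_brelse().
 */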
1598
1599 /*
1600 * Block write. Described in Bach (p.56)
1601 */
1602 errno_t
1603 buf_bwrite(buf_t bp)
1604 {
1605 int sync, wasdelayed;
1606 errno_t rv;
1607 proc_t p = current_proc();
1608 vnode_t vp = bp->b_vp;
1609
1610 if (bp->b_datap == 0) {
1611 if (brecover_data(bp) == 0)
1612 return (0);
1613 }
1614 /* Remember buffer type, to switch on it later. */
1615 sync = !ISSET(bp->b_flags, B_ASYNC);
1616 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
1617 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
1618
1619 if (wasdelayed)
1620 OSAddAtomic(-1, &nbdwrite);
1621
1622 if (!sync) {
1623 /*
1624 * If not synchronous, pay for the I/O operation and make
1625 * sure the buf is on the correct vnode queue. We have
1626 * to do this now, because if we don't, the vnode may not
1627 * be properly notified that its I/O has completed.
1628 */
1629 if (wasdelayed)
1630 buf_reassign(bp, vp);
1631 else
1632 if (p && p->p_stats)
1633 p->p_stats->p_ru.ru_oublock++; /* XXX */
1634 }
1635 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
1636
1637 /* Initiate disk write. Make sure the appropriate party is charged. */
1638
1639 OSAddAtomic(1, &vp->v_numoutput);
1640
1641 VNOP_STRATEGY(bp);
1642
1643 if (sync) {
1644 /*
1645 * If I/O was synchronous, wait for it to complete.
1646 */
1647 rv = buf_biowait(bp);
1648
1649 /*
1650  * Pay for the I/O operation, if it hasn't been paid for, and
1651  * make sure it's on the correct vnode queue. (async operations
1652  * were paid for above.)
1653 */
1654 if (wasdelayed)
1655 buf_reassign(bp, vp);
1656 else
1657 if (p && p->p_stats)
1658 p->p_stats->p_ru.ru_oublock++; /* XXX */
1659
1660 /* Release the buffer. */
1661 // XXXdbg - only if the unused bit is set
1662 if (!ISSET(bp->b_flags, B_NORELSE)) {
1663 buf_brelse(bp);
1664 } else {
1665 CLR(bp->b_flags, B_NORELSE);
1666 }
1667
1668 return (rv);
1669 } else {
1670 return (0);
1671 }
1672 }
1673
1674 int
1675 vn_bwrite(ap)
1676 struct vnop_bwrite_args *ap;
1677 {
1678 return (buf_bwrite(ap->a_bp));
1679 }
1680
1681 /*
1682 * Delayed write.
1683 *
1684 * The buffer is marked dirty, but is not queued for I/O.
1685 * This routine should be used when the buffer is expected
1686 * to be modified again soon, typically a small write that
1687 * partially fills a buffer.
1688 *
1689 * NB: magnetic tapes cannot be delayed; they must be
1690 * written in the order that the writes are requested.
1691 *
1692 * Described in Leffler, et al. (pp. 208-213).
1693 *
1694  * Note: With the ability to allocate additional buffer
1695  * headers, we can get into the situation where "too" many
1696  * buf_bdwrite()s can create a situation where the kernel can create
1697  * buffers faster than the disks can service them. Doing a buf_bawrite() in
1698  * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
1699 */
1700 __private_extern__ int
1701 bdwrite_internal(buf_t bp, int return_error)
1702 {
1703 proc_t p = current_proc();
1704 vnode_t vp = bp->b_vp;
1705
1706 /*
1707 * If the block hasn't been seen before:
1708 * (1) Mark it as having been seen,
1709 * (2) Charge for the write.
1710 * (3) Make sure it's on its vnode's correct block list,
1711 */
1712 if (!ISSET(bp->b_flags, B_DELWRI)) {
1713 SET(bp->b_flags, B_DELWRI);
1714 if (p && p->p_stats)
1715 p->p_stats->p_ru.ru_oublock++; /* XXX */
1716 OSAddAtomic(1, &nbdwrite);
1717 buf_reassign(bp, vp);
1718 }
1719
1720 	/* If this is a tape block, write the block now. */
1721 if (ISSET(bp->b_flags, B_TAPE)) {
1722 VNOP_BWRITE(bp);
1723 return (0);
1724 }
1725
1726 /*
1727 	 * if we're not LOCKED, but the total number of delayed writes
1728 	 * has climbed above 75% of the total buffers in the system,
1729 	 * return an error if the caller has indicated that it can
1730 	 * handle one in this case; otherwise schedule the I/O now.
1731 	 * this is done to prevent us from allocating tons of extra
1732 	 * buffers when dealing with virtual disks (i.e. DiskImages),
1733 	 * because additional buffers are dynamically allocated to prevent
1734 	 * deadlocks from occurring
1735 	 *
1736 	 * however, we can't do a buf_bawrite() if the LOCKED bit is set because the
1737 * buffer is part of a transaction and can't go to disk until
1738 * the LOCKED bit is cleared.
1739 */
1740 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
1741 if (return_error)
1742 return (EAGAIN);
1743 /*
1744 * If the vnode has "too many" write operations in progress
1745 * wait for them to finish the IO
1746 */
1747 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (char *)"buf_bdwrite");
1748
1749 return (buf_bawrite(bp));
1750 }
1751
1752 /* Otherwise, the "write" is done, so mark and release the buffer. */
1753 SET(bp->b_flags, B_DONE);
1754 buf_brelse(bp);
1755 return (0);
1756 }
1757
1758 errno_t
1759 buf_bdwrite(buf_t bp)
1760 {
1761 return (bdwrite_internal(bp, 0));
1762 }
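/*
 * A hedged usage sketch (hypothetical filesystem code; "devvp", "bmap_blk",
 * "blksize" and "cred" are assumed names): a delayed write is the natural
 * choice for a block that is dirtied a few bytes at a time, such as an
 * allocation bitmap or directory block:
 *
 *	buf_t	bp;
 *
 *	if (buf_meta_bread(devvp, bmap_blk, blksize, cred, &bp) == 0) {
 *		// modify a small portion of (char *)buf_dataptr(bp) ...
 *		buf_bdwrite(bp);	// mark dirty and release; written later
 *	}
 *
 * As described above, buf_bdwrite() may fall back to an immediate
 * buf_bawrite() when too many delayed writes are already outstanding.
 */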
1763
1764
1765 /*
1766 * Asynchronous block write; just an asynchronous buf_bwrite().
1767 *
1768  * Note: With the ability to allocate additional buffer
1769  * headers, we can get into the situation where "too" many
1770  * buf_bawrite()s can create a situation where the kernel can create
1771  * buffers faster than the disks can service them.
1772 * We limit the number of "in flight" writes a vnode can have to
1773 * avoid this.
1774 */
1775 static int
1776 bawrite_internal(buf_t bp, int throttle)
1777 {
1778 vnode_t vp = bp->b_vp;
1779
1780 if (vp) {
1781 if (throttle)
1782 /*
1783 * If the vnode has "too many" write operations in progress
1784 * wait for them to finish the IO
1785 */
1786 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
1787 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
1788 /*
1789 * return to the caller and
1790 * let him decide what to do
1791 */
1792 return (EWOULDBLOCK);
1793 }
1794 SET(bp->b_flags, B_ASYNC);
1795
1796 return (VNOP_BWRITE(bp));
1797 }
1798
1799 errno_t
1800 buf_bawrite(buf_t bp)
1801 {
1802 return (bawrite_internal(bp, 1));
1803 }
1804
1805
1806 /*
1807  * Release a buffer onto the free lists.
1808 * Described in Bach (p. 46).
1809 */
1810 void
1811 buf_brelse(buf_t bp)
1812 {
1813 struct bqueues *bufq;
1814 long whichq;
1815 upl_t upl;
1816 int need_wakeup = 0;
1817 int need_bp_wakeup = 0;
1818
1819
1820 if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
1821 panic("buf_brelse: bad buffer = %x\n", bp);
1822
1823 #ifdef JOE_DEBUG
1824 bp->b_stackbrelse[0] = __builtin_return_address(0);
1825 bp->b_stackbrelse[1] = __builtin_return_address(1);
1826 bp->b_stackbrelse[2] = __builtin_return_address(2);
1827 bp->b_stackbrelse[3] = __builtin_return_address(3);
1828 bp->b_stackbrelse[4] = __builtin_return_address(4);
1829 bp->b_stackbrelse[5] = __builtin_return_address(5);
1830
1831 bp->b_lastbrelse = current_thread();
1832 bp->b_tag = 0;
1833 #endif
1834 if (bp->b_lflags & BL_IOBUF) {
1835 free_io_buf(bp);
1836 return;
1837 }
1838
1839 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
1840 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_datap,
1841 bp->b_flags, 0);
1842
1843 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1844
1845 /*
1846 * if we're invalidating a buffer that has the B_FILTER bit
1847 * set then call the b_iodone function so it gets cleaned
1848 * up properly.
1849 *
1850 * the HFS journal code depends on this
1851 */
1852 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
1853 if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */
1854 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
1855 void *arg = (void *)bp->b_transaction;
1856
1857 CLR(bp->b_flags, B_FILTER); /* but note callout done */
1858 bp->b_iodone = NULL;
1859 bp->b_transaction = NULL;
1860
1861 if (iodone_func == NULL) {
1862 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
1863 }
1864 (*iodone_func)(bp, arg);
1865 }
1866 }
1867 /*
1868 * I/O is done. Cleanup the UPL state
1869 */
1870 upl = bp->b_upl;
1871
1872 if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
1873 kern_return_t kret;
1874 int upl_flags;
1875
1876 if ( (upl == NULL) ) {
1877 if ( !ISSET(bp->b_flags, B_INVAL)) {
1878 kret = ubc_create_upl(bp->b_vp,
1879 ubc_blktooff(bp->b_vp, bp->b_lblkno),
1880 bp->b_bufsize,
1881 &upl,
1882 NULL,
1883 UPL_PRECIOUS);
1884
1885 if (kret != KERN_SUCCESS)
1886 panic("brelse: Failed to create UPL");
1887 #ifdef UPL_DEBUG
1888 upl_ubc_alias_set(upl, bp, 5);
1889 #endif /* UPL_DEBUG */
1890 }
1891 } else {
1892 if (bp->b_datap) {
1893 kret = ubc_upl_unmap(upl);
1894
1895 if (kret != KERN_SUCCESS)
1896 panic("ubc_upl_unmap failed");
1897 bp->b_datap = (uintptr_t)NULL;
1898 }
1899 }
1900 if (upl) {
1901 if (bp->b_flags & (B_ERROR | B_INVAL)) {
1902 if (bp->b_flags & (B_READ | B_INVAL))
1903 upl_flags = UPL_ABORT_DUMP_PAGES;
1904 else
1905 upl_flags = 0;
1906
1907 ubc_upl_abort(upl, upl_flags);
1908 } else {
1909 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
1910 upl_flags = UPL_COMMIT_SET_DIRTY ;
1911 else
1912 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
1913
1914 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
1915 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1916 }
1917 bp->b_upl = NULL;
1918 }
1919 } else {
1920 if ( (upl) )
1921 panic("brelse: UPL set for non VREG; vp=%x", bp->b_vp);
1922 }
1923
1924 /*
1925 * If it's locked, don't report an error; try again later.
1926 */
1927 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
1928 CLR(bp->b_flags, B_ERROR);
1929 /*
1930 * If it's not cacheable, or an error, mark it invalid.
1931 */
1932 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
1933 SET(bp->b_flags, B_INVAL);
1934
1935 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
1936 /*
1937 * If it's invalid or empty, dissociate it from its vnode
1938 * and put on the head of the appropriate queue.
1939 */
1940 if (bp->b_vp)
1941 brelvp(bp);
1942
1943 if (ISSET(bp->b_flags, B_DELWRI))
1944 OSAddAtomic(-1, &nbdwrite);
1945
1946 CLR(bp->b_flags, (B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE));
1947 /*
1948 * Determine which queue the buffer should be on, then put it there.
1949 */
1950 if (bp->b_bufsize <= 0)
1951 whichq = BQ_EMPTY; /* no data */
1952 else if (ISSET(bp->b_flags, B_META))
1953 whichq = BQ_META; /* meta-data */
1954 else
1955 whichq = BQ_AGE; /* invalid data */
1956 bufq = &bufqueues[whichq];
1957
1958 lck_mtx_lock(buf_mtxp);
1959
1960 binsheadfree(bp, bufq, whichq);
1961 } else {
1962 /*
1963 * It has valid data. Put it on the end of the appropriate
1964 * queue, so that it'll stick around for as long as possible.
1965 */
1966 if (ISSET(bp->b_flags, B_LOCKED))
1967 whichq = BQ_LOCKED; /* locked in core */
1968 else if (ISSET(bp->b_flags, B_META))
1969 whichq = BQ_META; /* meta-data */
1970 else if (ISSET(bp->b_flags, B_AGE))
1971 whichq = BQ_AGE; /* stale but valid data */
1972 else
1973 whichq = BQ_LRU; /* valid data */
1974 bufq = &bufqueues[whichq];
1975
1976 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
1977
1978 lck_mtx_lock(buf_mtxp);
1979
1980 binstailfree(bp, bufq, whichq);
1981 }
1982 if (needbuffer) {
1983 /*
1984 * needbuffer is a global
1985 * we're currently using buf_mtxp to protect it
1986 * delay doing the actual wakeup until after
1987 * we drop buf_mtxp
1988 */
1989 needbuffer = 0;
1990 need_wakeup = 1;
1991 }
1992 if (ISSET(bp->b_lflags, BL_WANTED)) {
1993 /*
1994 * delay the actual wakeup until after we
1995 * clear BL_BUSY and we've dropped buf_mtxp
1996 */
1997 need_bp_wakeup = 1;
1998 }
1999 /*
2000 * Unlock the buffer.
2001 */
2002 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2003
2004 lck_mtx_unlock(buf_mtxp);
2005
2006 if (need_wakeup) {
2007 /*
2008 * Wake up any processes waiting for any buffer to become free.
2009 */
2010 wakeup(&needbuffer);
2011 }
2012 if (need_bp_wakeup) {
2013 /*
2014 	         * Wake up any processes waiting for _this_ buffer to become free.
2015 */
2016 wakeup(bp);
2017 }
2018 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2019 (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
2020 }
2021
2022 /*
2023 * Determine if a block is in the cache.
2024 * Just look on what would be its hash chain. If it's there, return
2025 * a pointer to it, unless it's marked invalid. If it's marked invalid,
2026 * we normally don't return the buffer, unless the caller explicitly
2027 * wants us to.
2028 */
2029 static boolean_t
2030 incore(vnode_t vp, daddr64_t blkno)
2031 {
2032 boolean_t retval;
2033
2034 lck_mtx_lock(buf_mtxp);
2035
2036 if (incore_locked(vp, blkno))
2037 retval = TRUE;
2038 else
2039 retval = FALSE;
2040 lck_mtx_unlock(buf_mtxp);
2041
2042 return (retval);
2043 }
2044
2045
2046 static buf_t
2047 incore_locked(vnode_t vp, daddr64_t blkno)
2048 {
2049 struct buf *bp;
2050
2051 bp = BUFHASH(vp, blkno)->lh_first;
2052
2053 /* Search hash chain */
2054 for (; bp != NULL; bp = bp->b_hash.le_next) {
2055 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2056 !ISSET(bp->b_flags, B_INVAL)) {
2057 return (bp);
2058 }
2059 }
2060 return (0);
2061 }
2062
2063
2064 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2065 /*
2066 * Get a block of requested size that is associated with
2067 * a given vnode and block offset. If it is found in the
2068 * block cache, mark it as having been found, make it busy
2069 * and return it. Otherwise, return an empty block of the
2070  * correct size. It is up to the caller to ensure that the
2071  * cached blocks are of the correct size.
2072 */
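/*
 * A hedged usage sketch (hypothetical filesystem code, not from this file):
 * buf_getblk() is typically called directly when the caller intends to
 * overwrite the entire block and so has no need to read it first:
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_WRITE);
 *	bzero((char *)buf_dataptr(bp), blksize);
 *	// fill in the new contents ...
 *	buf_bdwrite(bp);
 *
 * Read paths normally arrive here indirectly, via buf_bread() or
 * buf_meta_bread(), which pass BLK_READ or BLK_META respectively.
 */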
2073 buf_t
2074 buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2075 {
2076 buf_t bp;
2077 int err;
2078 upl_t upl;
2079 upl_page_info_t *pl;
2080 kern_return_t kret;
2081 int ret_only_valid;
2082 struct timespec ts;
2083 int upl_flags;
2084
2085 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2086 (int)(blkno * PAGE_SIZE), size, operation, 0, 0);
2087
2088 ret_only_valid = operation & BLK_ONLYVALID;
2089 operation &= ~BLK_ONLYVALID;
2090 start:
2091 lck_mtx_lock(buf_mtxp);
2092 start_locked:
2093 if ((bp = incore_locked(vp, blkno))) {
2094 /*
2095 * Found in the Buffer Cache
2096 */
2097 if (ISSET(bp->b_lflags, BL_BUSY)) {
2098 /*
2099 * but is busy
2100 */
2101 switch (operation) {
2102 case BLK_READ:
2103 case BLK_WRITE:
2104 case BLK_META:
2105 SET(bp->b_lflags, BL_WANTED);
2106 bufstats.bufs_busyincore++;
2107
2108 /*
2109 * don't retake the mutex after being awakened...
2110 			 * the timeout is in msecs
2111 */
2112 ts.tv_sec = (slptimeo/1000);
2113 ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000;
2114
2115 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2116
2117 /*
2118 * Callers who call with PCATCH or timeout are
2119 * willing to deal with the NULL pointer
2120 */
2121 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2122 return (NULL);
2123 goto start;
2124 /*NOTREACHED*/
2125 break;
2126
2127 default:
2128 /*
2129 * unknown operation requested
2130 */
2131 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2132 /*NOTREACHED*/
2133 break;
2134 }
2135 } else {
2136 /*
2137 * buffer in core and not busy
2138 */
2139 if ( (bp->b_upl) )
2140 panic("buffer has UPL, but not marked BUSY: %x", bp);
2141 SET(bp->b_lflags, BL_BUSY);
2142 SET(bp->b_flags, B_CACHE);
2143 #ifdef JOE_DEBUG
2144 bp->b_owner = current_thread();
2145 bp->b_tag = 1;
2146 #endif
2147 bremfree_locked(bp);
2148 bufstats.bufs_incore++;
2149
2150 lck_mtx_unlock(buf_mtxp);
2151
2152 if ( !ret_only_valid)
2153 allocbuf(bp, size);
2154
2155 upl_flags = 0;
2156 switch (operation) {
2157 case BLK_WRITE:
2158 /*
2159 * "write" operation: let the UPL subsystem
2160 * know that we intend to modify the buffer
2161 * cache pages we're gathering.
2162 */
2163 upl_flags |= UPL_WILL_MODIFY;
2164 case BLK_READ:
2165 upl_flags |= UPL_PRECIOUS;
2166 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2167 kret = ubc_create_upl(vp,
2168 ubc_blktooff(vp, bp->b_lblkno),
2169 bp->b_bufsize,
2170 &upl,
2171 &pl,
2172 upl_flags);
2173 if (kret != KERN_SUCCESS)
2174 panic("Failed to create UPL");
2175
2176 bp->b_upl = upl;
2177
2178 if (upl_valid_page(pl, 0)) {
2179 if (upl_dirty_page(pl, 0))
2180 SET(bp->b_flags, B_WASDIRTY);
2181 else
2182 CLR(bp->b_flags, B_WASDIRTY);
2183 } else
2184 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
2185
2186 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2187
2188 if (kret != KERN_SUCCESS)
2189 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2190 }
2191 break;
2192
2193 case BLK_META:
2194 /*
2195 * VM is not involved in I/O for the meta data
2196 * buffer; it already has valid data
2197 */
2198 break;
2199
2200 default:
2201 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
2202 /*NOTREACHED*/
2203 break;
2204 }
2205 }
2206 } else { /* not incore() */
2207 int queue = BQ_EMPTY; /* Start with no preference */
2208
2209 if (ret_only_valid) {
2210 lck_mtx_unlock(buf_mtxp);
2211 return (NULL);
2212 }
2213
2214 if ((UBCINVALID(vp)) || !(UBCINFOEXISTS(vp)))
2215 operation = BLK_META;
2216
2217 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
2218 goto start_locked;
2219
2220 /*
2221 * getnewbuf may block for a number of different reasons...
2222 * if it does, it's then possible for someone else to
2223 * create a buffer for the same block and insert it into
2224 * the hash... if we see it incore at this point we dump
2225 * the buffer we were working on and start over
2226 */
2227 if (incore_locked(vp, blkno)) {
2228 SET(bp->b_flags, B_INVAL);
2229 binshash(bp, &invalhash);
2230
2231 lck_mtx_unlock(buf_mtxp);
2232
2233 buf_brelse(bp);
2234 goto start;
2235 }
2236 /*
2237 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
2238 * CALLED! BE CAREFUL.
2239 */
2240
2241 /*
2242 * mark the buffer as B_META if indicated
2243 * so that when the buffer is released it will go to the META queue
2244 */
2245 if (operation == BLK_META)
2246 SET(bp->b_flags, B_META);
2247
2248 bp->b_blkno = bp->b_lblkno = blkno;
2249 bp->b_vp = vp;
2250
2251 /*
2252 * Insert in the hash so that incore() can find it
2253 */
2254 binshash(bp, BUFHASH(vp, blkno));
2255
2256 lck_mtx_unlock(buf_mtxp);
2257
2258 bgetvp(vp, bp);
2259
2260 allocbuf(bp, size);
2261
2262 upl_flags = 0;
2263 switch (operation) {
2264 case BLK_META:
2265 /*
2266 * buffer data is invalid...
2267 *
2268 * I don't want to have to retake buf_mtxp,
2269 * so the miss and vmhits counters are done
2270 * with Atomic updates... all other counters
2271 * in bufstats are protected with either
2272 * buf_mtxp or iobuffer_mtxp
2273 */
2274 OSAddAtomic(1, &bufstats.bufs_miss);
2275 break;
2276
2277 case BLK_WRITE:
2278 /*
2279 * "write" operation: let the UPL subsystem know
2280 * that we intend to modify the buffer cache pages
2281 * we're gathering.
2282 */
2283 upl_flags |= UPL_WILL_MODIFY;
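/* FALLTHROUGH -- the write case also needs the UPL created in the read case below */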
2284 case BLK_READ:
2285 { off_t f_offset;
2286 size_t contig_bytes;
2287 int bmap_flags;
2288
2289 if ( (bp->b_upl) )
2290 panic("bp already has UPL: %x",bp);
2291
2292 f_offset = ubc_blktooff(vp, blkno);
2293
2294 upl_flags |= UPL_PRECIOUS;
2295 kret = ubc_create_upl(vp,
2296 f_offset,
2297 bp->b_bufsize,
2298 &upl,
2299 &pl,
2300 upl_flags);
2301
2302 if (kret != KERN_SUCCESS)
2303 panic("Failed to create UPL");
2304 #ifdef UPL_DEBUG
2305 upl_ubc_alias_set(upl, bp, 4);
2306 #endif /* UPL_DEBUG */
2307 bp->b_upl = upl;
2308
2309 if (upl_valid_page(pl, 0)) {
2310
2311 if (operation == BLK_READ)
2312 bmap_flags = VNODE_READ;
2313 else
2314 bmap_flags = VNODE_WRITE;
2315
2316 SET(bp->b_flags, B_CACHE | B_DONE);
2317
2318 OSAddAtomic(1, &bufstats.bufs_vmhits);
2319
2320 bp->b_validoff = 0;
2321 bp->b_dirtyoff = 0;
2322
2323 if (upl_dirty_page(pl, 0)) {
2324 /* page is dirty */
2325 SET(bp->b_flags, B_WASDIRTY);
2326
2327 bp->b_validend = bp->b_bcount;
2328 bp->b_dirtyend = bp->b_bcount;
2329 } else {
2330 /* page is clean */
2331 bp->b_validend = bp->b_bcount;
2332 bp->b_dirtyend = 0;
2333 }
2334 /*
2335 * try to recreate the physical block number associated with
2336 * this buffer...
2337 */
2338 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
2339 panic("getblk: VNOP_BLOCKMAP failed");
2340 /*
2341 * if the extent represented by this buffer
2342 * is not completely physically contiguous on
2343 * disk, then we can't cache the physical mapping
2344 * in the buffer header
2345 */
2346 if ((long)contig_bytes < bp->b_bcount)
2347 bp->b_blkno = bp->b_lblkno;
2348 } else {
2349 OSAddAtomic(1, &bufstats.bufs_miss);
2350 }
2351 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2352
2353 if (kret != KERN_SUCCESS)
2354 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2355 break;
2356 }
2357 default:
2358 panic("getblk: paging or unknown operation - %x", operation);
2359 /*NOTREACHED*/
2360 break;
2361 }
2362 }
2363 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
2364 (int)bp, (int)bp->b_datap, bp->b_flags, 3, 0);
2365
2366 #ifdef JOE_DEBUG
2367 bp->b_stackgetblk[0] = __builtin_return_address(0);
2368 bp->b_stackgetblk[1] = __builtin_return_address(1);
2369 bp->b_stackgetblk[2] = __builtin_return_address(2);
2370 bp->b_stackgetblk[3] = __builtin_return_address(3);
2371 bp->b_stackgetblk[4] = __builtin_return_address(4);
2372 bp->b_stackgetblk[5] = __builtin_return_address(5);
2373 #endif
2374 return (bp);
2375 }
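/*
 * Illustrative sketch (hypothetical; not part of this file): how a file
 * system might use buf_getblk() for a metadata block.  A buffer found in
 * the cache comes back busy with B_CACHE set; a miss comes back empty and
 * the caller is responsible for filling it.  File systems normally reach
 * this path through buf_bread()/buf_meta_bread() rather than directly.
 */
#if 0
static errno_t
example_init_meta_block(vnode_t vp, daddr64_t blkno, int blocksize)
{
	buf_t	bp;

	/* returns a busy buffer; sleeps if another thread owns it */
	bp = buf_getblk(vp, blkno, blocksize, 0, 0, BLK_META);
	if (bp == NULL)
		return (EAGAIN);	/* only possible with PCATCH or a timeout */

	if ( !ISSET(buf_flags(bp), B_CACHE)) {
		/* cache miss... the data area is uninitialized */
		bzero((void *)buf_dataptr(bp), blocksize);
	}
	/* mark it dirty and let it be written lazily */
	return (buf_bdwrite(bp));
}
#endif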
2376
2377 /*
2378 * Get an empty, disassociated buffer of given size.
2379 */
2380 buf_t
2381 buf_geteblk(int size)
2383 {
2384 buf_t bp;
2385 int queue = BQ_EMPTY;
2386
2387 lck_mtx_lock(buf_mtxp);
2388
2389 while ((bp = getnewbuf(0, 0, &queue)) == 0)
2390 ;
2391 SET(bp->b_flags, (B_META|B_INVAL));
2392
2393 #if DIAGNOSTIC
2394 assert(queue == BQ_EMPTY);
2395 #endif /* DIAGNOSTIC */
2396 /* XXX need to implement logic to deal with other queues */
2397
2398 binshash(bp, &invalhash);
2399 bufstats.bufs_eblk++;
2400
2401 lck_mtx_unlock(buf_mtxp);
2402
2403 allocbuf(bp, size);
2404
2405 return (bp);
2406 }
2407
2408 /*
2409 * Zones for the meta data buffers
2410 */
2411
2412 #define MINMETA 512
2413 #define MAXMETA 4096
2414
2415 struct meta_zone_entry {
2416 zone_t mz_zone;
2417 vm_size_t mz_size;
2418 vm_size_t mz_max;
2419 char *mz_name;
2420 };
2421
2422 struct meta_zone_entry meta_zones[] = {
2423 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2424 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
2425 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
2426 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2427 {NULL, 0, 0, "" } /* End */
2428 };
2429
2430 /*
2431 * Initialize the meta data zones
2432 */
2433 static void
2434 bufzoneinit(void)
2435 {
2436 int i;
2437
2438 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2439 meta_zones[i].mz_zone =
2440 zinit(meta_zones[i].mz_size,
2441 meta_zones[i].mz_max,
2442 PAGE_SIZE,
2443 meta_zones[i].mz_name);
2444 }
2445 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2446 }
2447
2448 static __inline__ zone_t
2449 getbufzone(size_t size)
2450 {
2451 int i;
2452
2453 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2454 panic("getbufzone: incorect size = %d", size);
2455
2456 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2457 if (meta_zones[i].mz_size >= size)
2458 break;
2459 }
2460
2461 return (meta_zones[i].mz_zone);
2462 }
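/*
 * Illustrative sketch (hypothetical; not part of this file): the zone
 * selected is the first one whose element size can hold the request,
 * so a legal-but-odd 1536 byte request is carved from "buf.2048".
 */
#if 0
static void
example_getbufzone_mapping(void)
{
	/* an exact fit selects the matching zone */
	if (getbufzone(512) != meta_zones[0].mz_zone)		/* "buf.512" */
		panic("example: 512 byte request mapped to the wrong zone");

	/* 1536 is a multiple of 512 but not a zone size, so it rounds up */
	if (getbufzone(1536) != meta_zones[2].mz_zone)		/* "buf.2048" */
		panic("example: 1536 byte request mapped to the wrong zone");
}
#endif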
2463
2464 /*
2465 * With UBC, there is no need to expand / shrink the file data
2466 * buffer. The VM uses the same pages, hence no waste.
2467 * All the file data buffers can have one size.
2468 * In fact expand / shrink would be an expensive operation.
2469 *
2470 * The only exception to this is meta-data buffers. Most of the
2471 * meta data operations are smaller than PAGE_SIZE. Having the
2472 * meta-data buffers grow and shrink as needed optimizes the use
2473 * of kernel wired memory.
2474 */
2475
2476 int
2477 allocbuf(buf_t bp, int size)
2478 {
2479 vm_size_t desired_size;
2480
2481 desired_size = roundup(size, CLBYTES);
2482
2483 if (desired_size < PAGE_SIZE)
2484 desired_size = PAGE_SIZE;
2485 if (desired_size > MAXBSIZE)
2486 panic("allocbuf: buffer larger than MAXBSIZE requested");
2487
2488 if (ISSET(bp->b_flags, B_META)) {
2489 zone_t zprev, z;
2490 int nsize = roundup(size, MINMETA);
2491
2492 if (bp->b_datap) {
2493 vm_offset_t elem = (vm_offset_t)bp->b_datap;
2494
2495 if (ISSET(bp->b_flags, B_ZALLOC)) {
2496 if (bp->b_bufsize < nsize) {
2497 /* reallocate to a bigger size */
2498
2499 zprev = getbufzone(bp->b_bufsize);
2500 if (nsize <= MAXMETA) {
2501 desired_size = nsize;
2502 z = getbufzone(nsize);
2503 bp->b_datap = (uintptr_t)zalloc(z);
2504 } else {
2505 bp->b_datap = (uintptr_t)NULL;
2506 kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2507 CLR(bp->b_flags, B_ZALLOC);
2508 }
2509 bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2510 zfree(zprev, (void *)elem);
2511 } else {
2512 desired_size = bp->b_bufsize;
2513 }
2514
2515 } else {
2516 if ((vm_size_t)bp->b_bufsize < desired_size) {
2517 /* reallocate to a bigger size */
2518 bp->b_datap = (uintptr_t)NULL;
2519 kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2520 bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2521 kmem_free(kernel_map, elem, bp->b_bufsize);
2522 } else {
2523 desired_size = bp->b_bufsize;
2524 }
2525 }
2526 } else {
2527 /* new allocation */
2528 if (nsize <= MAXMETA) {
2529 desired_size = nsize;
2530 z = getbufzone(nsize);
2531 bp->b_datap = (uintptr_t)zalloc(z);
2532 SET(bp->b_flags, B_ZALLOC);
2533 } else
2534 kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2535 }
2536 }
2537 bp->b_bufsize = desired_size;
2538 bp->b_bcount = size;
2539
2540 return (0);
2541 }
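/*
 * Illustrative sketch (hypothetical; not part of this file): the two
 * sizing policies above, assuming a 4K page and the MINMETA/MAXMETA
 * limits defined earlier.
 */
#if 0
static void
example_allocbuf_sizing(buf_t file_bp, buf_t meta_bp)
{
	/*
	 * file data buffer: the pages come from the UBC, so allocbuf()
	 * only records a size that is at least one page
	 */
	allocbuf(file_bp, 100);		/* b_bufsize rounded up to >= PAGE_SIZE, b_bcount == 100 */

	/*
	 * meta data buffer (freshly cleaned header, b_datap == 0):
	 * roundup(600, MINMETA) == 1024, so it is carved from "buf.1024"
	 */
	SET(meta_bp->b_flags, B_META);
	allocbuf(meta_bp, 600);		/* b_bufsize == 1024, b_bcount == 600 */
}
#endif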
2542
2543 /*
2544 * Get a new buffer from one of the free lists.
2545 *
2546 * A request for a queue is passed in. The queue from which the buffer
2547 * was taken is returned. Out of range queue requests get BQ_EMPTY. A request
2548 * for BQUEUES means no preference. Use heuristics in that case.
2549 * The heuristic is as follows:
2550 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
2551 * If none is available, block until one is made available.
2552 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
2553 * Pick the most stale buffer.
2554 * If the buffer found was marked for delayed write, start the async write
2555 * and restart the search.
2556 * Initialize the fields and disassociate the buffer from the vnode.
2557 * Remove the buffer from the hash. Return the buffer and the queue
2558 * on which it was found.
2559 *
2560 * buf_mtxp is held upon entry
2561 * returns with buf_mtxp locked
2562 */
2563
2564 static buf_t
2565 getnewbuf(int slpflag, int slptimeo, int * queue)
2566 {
2567 buf_t bp;
2568 buf_t lru_bp;
2569 buf_t age_bp;
2570 buf_t meta_bp;
2571 int age_time, lru_time, bp_time, meta_time;
2572 int req = *queue; /* save it for restarts */
2573 struct timespec ts;
2574
2575 start:
2576 /*
2577 * invalid request gets empty queue
2578 */
2579 if ((*queue > BQUEUES) || (*queue < 0)
2580 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
2581 *queue = BQ_EMPTY;
2582 /* need to grow number of bufs, add another one rather than recycling */
2583 if (nbuf < max_nbuf_headers) {
2584 /*
2585 * Increment count now as lock
2586 * is dropped for allocation.
2587 * That avoids over commits
2588 */
2589 nbuf++;
2590 goto add_newbufs;
2591 }
2592
2593 /*
2594 * (*queue == BQUEUES) means no preference
2595 */
2596 if (*queue != BQUEUES) {
2597 /* Try for the requested queue first */
2598 bp = bufqueues[*queue].tqh_first;
2599 if (bp)
2600 goto found;
2601 }
2602
2603 /* Unable to use requested queue */
2604 age_bp = bufqueues[BQ_AGE].tqh_first;
2605 lru_bp = bufqueues[BQ_LRU].tqh_first;
2606 meta_bp = bufqueues[BQ_META].tqh_first;
2607
2608 if (!age_bp && !lru_bp && !meta_bp) {
2609 /*
2610 * Unavailable on the AGE, LRU, and META queues
2611 * Try the empty list first
2612 */
2613 bp = bufqueues[BQ_EMPTY].tqh_first;
2614 if (bp) {
2615 *queue = BQ_EMPTY;
2616 goto found;
2617 }
2618 /*
2619 * We have seen that this is hard to trigger.
2620 * This is an overcommit of nbufs but is needed
2621 * in some scenarios with disk images
2622 */
2623
2624 add_newbufs:
2625 lck_mtx_unlock(buf_mtxp);
2626
2627 /* Create a new temporary buffer header */
2628 bp = (struct buf *)zalloc(buf_hdr_zone);
2629
2630 lck_mtx_lock(buf_mtxp);
2631
2632 if (bp) {
2633 bufhdrinit(bp);
2634 BLISTNONE(bp);
2635 binshash(bp, &invalhash);
2636 SET(bp->b_flags, B_HDRALLOC);
2637 *queue = BQ_EMPTY;
2638 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2639 buf_hdr_count++;
2640 goto found;
2641 }
2642 /* subtract already accounted bufcount */
2643 nbuf--;
2644
2645 bufstats.bufs_sleeps++;
2646
2647 /* wait for a free buffer of any kind */
2648 needbuffer = 1;
2649 /* hz value is 100 */
2650 ts.tv_sec = (slptimeo/1000);
2651 /* the hz value is 100; which leads to 10ms */
2652 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
2653 msleep(&needbuffer, buf_mtxp, slpflag|(PRIBIO+1), (char *)"getnewbuf", &ts);
2654 return (0);
2655 }
2656
2657 /* Buffer available either on AGE or LRU or META */
2658 bp = NULL;
2659 *queue = -1;
2660
2661 /* Buffer available either on AGE or LRU */
2662 if (!age_bp) {
2663 bp = lru_bp;
2664 *queue = BQ_LRU;
2665 } else if (!lru_bp) {
2666 bp = age_bp;
2667 *queue = BQ_AGE;
2668 } else { /* buffer available on both AGE and LRU */
2669 int t = buf_timestamp();
2670
2671 age_time = t - age_bp->b_timestamp;
2672 lru_time = t - lru_bp->b_timestamp;
2673 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
2674 bp = age_bp;
2675 *queue = BQ_AGE;
2676 /*
2677 * we should probably re-timestamp everything in the
2678 * queues at this point with the current time
2679 */
2680 } else {
2681 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
2682 bp = lru_bp;
2683 *queue = BQ_LRU;
2684 } else {
2685 bp = age_bp;
2686 *queue = BQ_AGE;
2687 }
2688 }
2689 }
2690
2691 if (!bp) { /* Neither on AGE nor on LRU */
2692 bp = meta_bp;
2693 *queue = BQ_META;
2694 } else if (meta_bp) {
2695 int t = buf_timestamp();
2696
2697 bp_time = t - bp->b_timestamp;
2698 meta_time = t - meta_bp->b_timestamp;
2699
2700 if ((bp_time >= 0) && (meta_time >= 0)) {
2701 /* time not set backwards */
2702 int bp_is_stale;
2703 bp_is_stale = (*queue == BQ_LRU) ?
2704 lru_is_stale : age_is_stale;
2705
2706 if ((meta_time >= meta_is_stale) &&
2707 (bp_time < bp_is_stale)) {
2708 bp = meta_bp;
2709 *queue = BQ_META;
2710 }
2711 }
2712 }
2713 found:
2714 if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
2715 panic("getnewbuf: bp @ 0x%x is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
2716
2717 /* Clean it */
2718 if (bcleanbuf(bp)) {
2719 /*
2720 * moved to the laundry thread, buffer not ready
2721 */
2722 *queue = req;
2723 goto start;
2724 }
2725 return (bp);
2726 }
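/*
 * Illustrative sketch (hypothetical; not part of this file): the
 * AGE-vs-LRU choice above, pulled out as a pure function.  The LRU
 * head is preferred only when it is already stale while the AGE head
 * is not; otherwise the AGE head gets recycled first.
 */
#if 0
static int
example_pick_age_or_lru(int age_time, int lru_time)
{
	if ((age_time < 0) || (lru_time < 0))
		return (BQ_AGE);	/* a timestamp went backwards */

	if ((lru_time >= lru_is_stale) && (age_time < age_is_stale))
		return (BQ_LRU);

	return (BQ_AGE);
}
#endif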
2727
2728
2729 /*
2730 * Clean a buffer.
2731 * Returns 0 if the buffer is ready to use,
2732 * returns 1 if it issued a buf_bawrite() to indicate
2733 * that the buffer is not ready.
2734 *
2735 * buf_mtxp is held upon entry
2736 * returns with buf_mtxp locked
2737 */
2738 static int
2739 bcleanbuf(buf_t bp)
2740 {
2741 /* Remove from the queue */
2742 bremfree_locked(bp);
2743
2744 /* Buffer is no longer on free lists. */
2745 SET(bp->b_lflags, BL_BUSY);
2746 #ifdef JOE_DEBUG
2747 bp->b_owner = current_thread();
2748 bp->b_tag = 2;
2749 #endif
2750 /*
2751 * If buffer was a delayed write, start the IO by queuing
2752 * it on the LAUNDRY queue, and return 1
2753 */
2754 if (ISSET(bp->b_flags, B_DELWRI)) {
2755 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2756 blaundrycnt++;
2757
2758 lck_mtx_unlock(buf_mtxp);
2759
2760 wakeup(&blaundrycnt);
2761 /* and give it a chance to run */
2762 (void)thread_block(THREAD_CONTINUE_NULL);
2763
2764 lck_mtx_lock(buf_mtxp);
2765 return (1);
2766 }
2767 bremhash(bp);
2768
2769 lck_mtx_unlock(buf_mtxp);
2770
2771 BLISTNONE(bp);
2772 /*
2773 * disassociate us from our vnode, if we had one...
2774 */
2775 if (bp->b_vp)
2776 brelvp(bp);
2777
2778 if (ISSET(bp->b_flags, B_META)) {
2779 vm_offset_t elem;
2780
2781 elem = (vm_offset_t)bp->b_datap;
2782 bp->b_datap = (uintptr_t)0xdeadbeef;
2783
2784 if (ISSET(bp->b_flags, B_ZALLOC)) {
2785 zone_t z;
2786
2787 z = getbufzone(bp->b_bufsize);
2788 zfree(z, (void *)elem);
2789 } else
2790 kmem_free(kernel_map, elem, bp->b_bufsize);
2791 }
2792
2793 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2794
2795 /* clear out various other fields */
2796 bp->b_bufsize = 0;
2797 bp->b_datap = (uintptr_t)NULL;
2798 bp->b_upl = (void *)NULL;
2799 /*
2800 * preserve the state of whether this buffer
2801 * was allocated on the fly or not...
2802 * the only other flag that should be set at
2803 * this point is BL_BUSY...
2804 */
2805 #ifdef JOE_DEBUG
2806 bp->b_owner = current_thread();
2807 bp->b_tag = 3;
2808 #endif
2809 bp->b_lflags = BL_BUSY;
2810 bp->b_flags = (bp->b_flags & B_HDRALLOC);
2811 bp->b_dev = NODEV;
2812 bp->b_blkno = bp->b_lblkno = 0;
2813 bp->b_iodone = NULL;
2814 bp->b_error = 0;
2815 bp->b_resid = 0;
2816 bp->b_bcount = 0;
2817 bp->b_dirtyoff = bp->b_dirtyend = 0;
2818 bp->b_validoff = bp->b_validend = 0;
2819
2820 /* nuke any credentials we were holding */
2821 if (IS_VALID_CRED(bp->b_rcred)) {
2822 kauth_cred_unref(&bp->b_rcred);
2823 }
2824 if (IS_VALID_CRED(bp->b_wcred)) {
2825 kauth_cred_unref(&bp->b_wcred);
2826 }
2827 lck_mtx_lock(buf_mtxp);
2828
2829 return (0);
2830 }
2831
2832
2833
2834 errno_t
2835 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
2836 {
2837 buf_t bp;
2838 errno_t error;
2839
2840 lck_mtx_lock(buf_mtxp);
2841 relook:
2842 if ((bp = incore_locked(vp, lblkno)) == (struct buf *)0) {
2843 lck_mtx_unlock(buf_mtxp);
2844 return (0);
2845 }
2846 if (ISSET(bp->b_lflags, BL_BUSY)) {
2847 if ( !ISSET(flags, BUF_WAIT)) {
2848 lck_mtx_unlock(buf_mtxp);
2849 return (EBUSY);
2850 }
2851 SET(bp->b_lflags, BL_WANTED);
2852
2853 error = msleep((caddr_t)bp, buf_mtxp, (PRIBIO + 1), (char *)"buf_invalblkno", 0);
2854
2855 if (error)
2856 return (error);
2857 goto relook;
2858 }
2859 bremfree_locked(bp);
2860 SET(bp->b_lflags, BL_BUSY);
2861 SET(bp->b_flags, B_INVAL);
2862 #ifdef JOE_DEBUG
2863 bp->b_owner = current_thread();
2864 bp->b_tag = 4;
2865 #endif
2866 lck_mtx_unlock(buf_mtxp);
2867 buf_brelse(bp);
2868
2869 return (0);
2870 }
2871
2872
2873 void
2874 buf_drop(buf_t bp)
2875 {
2876 int need_wakeup = 0;
2877
2878 lck_mtx_lock(buf_mtxp);
2879
2880 if (ISSET(bp->b_lflags, BL_WANTED)) {
2881 /*
2882 * delay the actual wakeup until after we
2883 * clear BL_BUSY and we've dropped buf_mtxp
2884 */
2885 need_wakeup = 1;
2886 }
2887 /*
2888 * Unlock the buffer.
2889 */
2890 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2891
2892 lck_mtx_unlock(buf_mtxp);
2893
2894 if (need_wakeup) {
2895 /*
2896 * Wake up any processes waiting for _this_ buffer to become free.
2897 */
2898 wakeup(bp);
2899 }
2900 }
2901
2902
2903 errno_t
2904 buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
2905 errno_t error;
2906
2907 lck_mtx_lock(buf_mtxp);
2908
2909 error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
2910
2911 lck_mtx_unlock(buf_mtxp);
2912
2913 return (error);
2914 }
2915
2916
2917 static errno_t
2918 buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
2919 {
2920 errno_t error;
2921 struct timespec ts;
2922
2923 if (ISSET(bp->b_flags, B_LOCKED)) {
2924 if ((flags & BAC_SKIP_LOCKED))
2925 return (EDEADLK);
2926 } else {
2927 if ((flags & BAC_SKIP_NONLOCKED))
2928 return (EDEADLK);
2929 }
2930 if (ISSET(bp->b_lflags, BL_BUSY)) {
2931 /*
2932 * since the mutex_lock may block, the buffer
2933 * may become BUSY, so we need to
2934 * recheck for a NOWAIT request
2935 */
2936 if (flags & BAC_NOWAIT)
2937 return (EBUSY);
2938 SET(bp->b_lflags, BL_WANTED);
2939
2940 /* the hz value is 100; which leads to 10ms */
2941 ts.tv_sec = (slptimeo/100);
2942 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
2943 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), (char *)"buf_acquire", &ts);
2944
2945 if (error)
2946 return (error);
2947 return (EAGAIN);
2948 }
2949 if (flags & BAC_REMOVE)
2950 bremfree_locked(bp);
2951 SET(bp->b_lflags, BL_BUSY);
2952 #ifdef JOE_DEBUG
2953 bp->b_owner = current_thread();
2954 bp->b_tag = 5;
2955 #endif
2956 return (0);
2957 }
2958
2959
2960 /*
2961 * Wait for operations on the buffer to complete.
2962 * When they do, extract and return the I/O's error value.
2963 */
2964 errno_t
2965 buf_biowait(buf_t bp)
2966 {
2967 lck_mtx_lock(buf_mtxp);
2968
2969 while (!ISSET(bp->b_flags, B_DONE))
2970 (void) msleep(bp, buf_mtxp, (PRIBIO+1), (char *)"buf_biowait", 0);
2971
2972 lck_mtx_unlock(buf_mtxp);
2973
2974 /* check for interruption of I/O (e.g. via NFS), then errors. */
2975 if (ISSET(bp->b_flags, B_EINTR)) {
2976 CLR(bp->b_flags, B_EINTR);
2977 return (EINTR);
2978 } else if (ISSET(bp->b_flags, B_ERROR))
2979 return (bp->b_error ? bp->b_error : EIO);
2980 else
2981 return (0);
2982 }
2983
2984 /*
2985 * Mark I/O complete on a buffer.
2986 *
2987 * If a callback has been requested, e.g. the pageout
2988 * daemon, do so. Otherwise, awaken waiting processes.
2989 *
2990 * [ Leffler, et al., says on p.247:
2991 * "This routine wakes up the blocked process, frees the buffer
2992 * for an asynchronous write, or, for a request by the pagedaemon
2993 * process, invokes a procedure specified in the buffer structure" ]
2994 *
2995 * In real life, the pagedaemon (or other system processes) wants
2996 * to do async stuff too, and doesn't want the buffer buf_brelse()'d.
2997 * (for swap pager, that puts swap buffers on the free lists (!!!),
2998 * for the vn device, that puts malloc'd buffers on the free lists!)
2999 */
3000 extern struct timeval priority_IO_timestamp_for_root;
3001 extern int hard_throttle_on_root;
3002
3003 void
3004 buf_biodone(buf_t bp)
3005 {
3006 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
3007 (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
3008
3009 if (ISSET(bp->b_flags, B_DONE))
3010 panic("biodone already");
3011
3012 if (kdebug_enable) {
3013 int code = DKIO_DONE;
3014
3015 if (bp->b_flags & B_READ)
3016 code |= DKIO_READ;
3017 if (bp->b_flags & B_ASYNC)
3018 code |= DKIO_ASYNC;
3019
3020 if (bp->b_flags & B_META)
3021 code |= DKIO_META;
3022 else if (bp->b_flags & B_PAGEIO)
3023 code |= DKIO_PAGING;
3024
3025 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3026 (unsigned int)bp, (unsigned int)bp->b_vp,
3027 bp->b_resid, bp->b_error, 0);
3028 }
3029 if ((bp->b_vp != NULLVP) &&
3030 ((bp->b_flags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
3031 (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
3032 microuptime(&priority_IO_timestamp_for_root);
3033 hard_throttle_on_root = 0;
3034 }
3035 /*
3036 * I/O was done, so don't believe
3037 * the DIRTY state from VM anymore
3038 */
3039 CLR(bp->b_flags, B_WASDIRTY);
3040
3041 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3042 /*
3043 * wake up any writers blocked
3044 * on throttle or waiting for I/O
3045 * to drain
3046 */
3047 vnode_writedone(bp->b_vp);
3048
3049 if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */
3050 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
3051 void *arg = (void *)bp->b_transaction;
3052 int callout = ISSET(bp->b_flags, B_CALL);
3053
3054 CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */
3055 bp->b_iodone = NULL;
3056 bp->b_transaction = NULL;
3057
3058 if (iodone_func == NULL) {
3059 panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
3060 } else {
3061 if (callout)
3062 SET(bp->b_flags, B_DONE); /* note that it's done */
3063 (*iodone_func)(bp, arg);
3064 }
3065 if (callout)
3066 /*
3067 * assumes that the call back function takes
3068 * ownership of the bp and deals with releasing it if necessary
3069 */
3070 goto biodone_done;
3071 /*
3072 * in this case the call back function is acting
3073 * strictly as a filter... it does not take
3074 * ownership of the bp and is expecting us
3075 * to finish cleaning up... this is currently used
3076 * by the HFS journaling code
3077 */
3078 }
3079 if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
3080 SET(bp->b_flags, B_DONE); /* note that it's done */
3081
3082 buf_brelse(bp);
3083 } else { /* or just wakeup the buffer */
3084 /*
3085 * by taking the mutex, we serialize
3086 * the buf owner calling buf_biowait so that we'll
3087 * only see him in one of 2 states...
3088 * state 1: B_DONE wasn't set and he's
3089 * blocked in msleep
3090 * state 2: he's blocked trying to take the
3091 * mutex before looking at B_DONE
3092 * BL_WANTED is cleared in case anyone else
3093 * is blocked waiting for the buffer... note
3094 * that we haven't cleared BL_BUSY yet, so if
3095 * they do get to run, they're going to re-set
3096 * BL_WANTED and go back to sleep
3097 */
3098 lck_mtx_lock(buf_mtxp);
3099
3100 CLR(bp->b_lflags, BL_WANTED);
3101 SET(bp->b_flags, B_DONE); /* note that it's done */
3102
3103 lck_mtx_unlock(buf_mtxp);
3104
3105 wakeup(bp);
3106 }
3107 biodone_done:
3108 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3109 (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
3110 }
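/*
 * Illustrative sketch (hypothetical; not part of this file): an async
 * caller can hand ownership of the buffer to an iodone callback (the
 * B_CALL path above) instead of blocking in buf_biowait().  The
 * buf_setcallback() KPI records the callback and transaction pointer
 * consumed by that path.
 */
#if 0
static void
example_iodone(buf_t bp, void *arg)
{
	/* a B_CALL callback owns the buffer... release it when done */
	if (buf_error(bp))
		printf("example_iodone: I/O failed (%d)\n", buf_error(bp));
	buf_brelse(bp);
}

static void
example_async_io(buf_t bp)
{
	buf_setcallback(bp, example_iodone, NULL);
	(void) VNOP_STRATEGY(bp);	/* buf_biodone() will invoke the callback */
}
#endif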
3111
3112 /*
3113 * Return a count of buffers on the "locked" queue.
3114 */
3115 int
3116 count_lock_queue(void)
3117 {
3118 buf_t bp;
3119 int n = 0;
3120
3121 lck_mtx_lock(buf_mtxp);
3122
3123 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3124 bp = bp->b_freelist.tqe_next)
3125 n++;
3126 lck_mtx_unlock(buf_mtxp);
3127
3128 return (n);
3129 }
3130
3131 /*
3132 * Return a count of 'busy' buffers. Used at the time of shutdown.
3133 */
3134 int
3135 count_busy_buffers(void)
3136 {
3137 buf_t bp;
3138 int nbusy = 0;
3139
3140 lck_mtx_lock(buf_mtxp);
3141 for (bp = &buf[boot_nbuf]; --bp >= buf; )
3142 if (!ISSET(bp->b_flags, B_INVAL) && ISSET(bp->b_lflags, BL_BUSY))
3143 nbusy++;
3144 lck_mtx_unlock(buf_mtxp);
3145
3146 return (nbusy);
3147 }
3148
3149 #if DIAGNOSTIC
3150 /*
3151 * Print out statistics on the current allocation of the buffer pool.
3152 * Can be enabled to print out on every ``sync'' by setting "syncprt"
3153 * in vfs_syscalls.c using sysctl.
3154 */
3155 void
3156 vfs_bufstats()
3157 {
3158 int i, j, count;
3159 register struct buf *bp;
3160 register struct bqueues *dp;
3161 int counts[MAXBSIZE/CLBYTES+1];
3162 static char *bname[BQUEUES] =
3163 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3164
3165 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
3166 count = 0;
3167 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3168 counts[j] = 0;
3169
3170 lck_mtx_lock(buf_mtxp);
3171
3172 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
3173 counts[bp->b_bufsize/CLBYTES]++;
3174 count++;
3175 }
3176 lck_mtx_unlock(buf_mtxp);
3177
3178 printf("%s: total-%d", bname[i], count);
3179 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3180 if (counts[j] != 0)
3181 printf(", %d-%d", j * CLBYTES, counts[j]);
3182 printf("\n");
3183 }
3184 }
3185 #endif /* DIAGNOSTIC */
3186
3187 #define NRESERVEDIOBUFS 64
3188
3189
3190 buf_t
3191 alloc_io_buf(vnode_t vp, int priv)
3192 {
3193 buf_t bp;
3194
3195 lck_mtx_lock(iobuffer_mtxp);
3196
3197 while (((niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
3198 (bp = iobufqueue.tqh_first) == NULL) {
3199 bufstats.bufs_iobufsleeps++;
3200
3201 need_iobuffer = 1;
3202 (void) msleep(&need_iobuffer, iobuffer_mtxp, (PRIBIO+1), (const char *)"alloc_io_buf", 0);
3203 }
3204 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
3205
3206 bufstats.bufs_iobufinuse++;
3207 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
3208 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
3209
3210 lck_mtx_unlock(iobuffer_mtxp);
3211
3212 /*
3213 * initialize various fields
3214 * we don't need to hold the mutex since the buffer
3215 * is now private... the vp should have a reference
3216 * on it and is not protected by this mutex in any event
3217 */
3218 bp->b_timestamp = 0;
3219 bp->b_proc = NULL;
3220
3221 bp->b_datap = 0;
3222 bp->b_flags = 0;
3223 bp->b_lflags = BL_BUSY | BL_IOBUF;
3224 bp->b_blkno = bp->b_lblkno = 0;
3225 #ifdef JOE_DEBUG
3226 bp->b_owner = current_thread();
3227 bp->b_tag = 6;
3228 #endif
3229 bp->b_iodone = NULL;
3230 bp->b_error = 0;
3231 bp->b_resid = 0;
3232 bp->b_bcount = 0;
3233 bp->b_bufsize = 0;
3234 bp->b_upl = NULL;
3235 bp->b_vp = vp;
3236
3237 if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
3238 bp->b_dev = vp->v_rdev;
3239 else
3240 bp->b_dev = NODEV;
3241
3242 return (bp);
3243 }
3244
3245
3246 void
3247 free_io_buf(buf_t bp)
3248 {
3249 int need_wakeup = 0;
3250
3251 /*
3252 * put buffer back on the head of the iobufqueue
3253 */
3254 bp->b_vp = NULL;
3255 bp->b_flags = B_INVAL;
3256
3257 lck_mtx_lock(iobuffer_mtxp);
3258
3259 binsheadfree(bp, &iobufqueue, -1);
3260
3261 if (need_iobuffer) {
3262 /*
3263 * Wake up any processes waiting because they need an io buffer
3264 *
3265 * do the wakeup after we drop the mutex... it's possible that the
3266 * wakeup will be superfluous if need_iobuffer gets set again and
3267 * another thread runs this path, but it's highly unlikely, doesn't
3268 * hurt, and it means we don't hold up I/O progress if the wakeup blocks
3269 * trying to grab a task related lock...
3270 */
3271 need_iobuffer = 0;
3272 need_wakeup = 1;
3273 }
3274 bufstats.bufs_iobufinuse--;
3275
3276 lck_mtx_unlock(iobuffer_mtxp);
3277
3278 if (need_wakeup)
3279 wakeup(&need_iobuffer);
3280 }
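/*
 * Illustrative sketch (hypothetical; not part of this file): the usual
 * pairing of alloc_io_buf()/free_io_buf() in the style of the cluster
 * layer.  The caller supplies its own data area; the pool only provides
 * headers.
 */
#if 0
static int
example_raw_read(vnode_t devvp, daddr64_t blkno, caddr_t data, int len)
{
	buf_t	bp;
	int	error;

	bp = alloc_io_buf(devvp, 0);	/* may sleep for a free header */

	buf_setdataptr(bp, (uintptr_t)data);
	buf_setcount(bp, len);
	buf_setblkno(bp, blkno);
	buf_setlblkno(bp, blkno);
	buf_setflags(bp, B_READ);

	(void) VNOP_STRATEGY(bp);
	error = buf_biowait(bp);

	free_io_buf(bp);

	return (error);
}
#endif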
3281
3282
3283
3284 /*
3285 * If getnewbuf() calls bcleanbuf() on the same thread
3286 * there is a potential for stack overrun and deadlocks.
3287 * So we always hand off the work to a worker thread for completion
3288 */
3289 #include <mach/mach_types.h>
3290 #include <mach/memory_object_types.h>
3291 #include <kern/sched_prim.h>
3292
3293
3294 static void
3295 bcleanbuf_thread_init(void)
3296 {
3297 /* create worker thread */
3298 kernel_thread(kernel_task, bcleanbuf_thread);
3299 }
3300
3301 static void
3302 bcleanbuf_thread(void)
3303 {
3304 struct buf *bp;
3305 int error = 0;
3306 int loopcnt = 0;
3307
3308 for (;;) {
3309 lck_mtx_lock(buf_mtxp);
3310
3311 while (blaundrycnt == 0)
3312 (void)msleep((void *)&blaundrycnt, buf_mtxp, PRIBIO, "blaundry", 0);
3313
3314 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
3315 /*
3316 * Remove from the queue
3317 */
3318 bremfree_locked(bp);
3319 blaundrycnt--;
3320
3321 lck_mtx_unlock(buf_mtxp);
3322 /*
3323 * do the IO
3324 */
3325 error = bawrite_internal(bp, 0);
3326
3327 if (error) {
3328 lck_mtx_lock(buf_mtxp);
3329
3330 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
3331 blaundrycnt++;
3332
3333 lck_mtx_unlock(buf_mtxp);
3334
3335 if (loopcnt > 10) {
3336 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
3337 loopcnt = 0;
3338 } else {
3339 (void)thread_block(THREAD_CONTINUE_NULL);
3340 loopcnt++;
3341 }
3342 }
3343 }
3344 }
3345
3346
3347 static int
3348 brecover_data(buf_t bp)
3349 {
3350 int upl_offset;
3351 upl_t upl;
3352 upl_page_info_t *pl;
3353 kern_return_t kret;
3354 vnode_t vp = bp->b_vp;
3355 int upl_flags;
3356
3357
3358 if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
3359 goto dump_buffer;
3360
3361 upl_flags = UPL_PRECIOUS;
3362 if (! (buf_flags(bp) & B_READ)) {
3363 /*
3364 * "write" operation: let the UPL subsystem know
3365 * that we intend to modify the buffer cache pages we're
3366 * gathering.
3367 */
3368 upl_flags |= UPL_WILL_MODIFY;
3369 }
3370
3371 kret = ubc_create_upl(vp,
3372 ubc_blktooff(vp, bp->b_lblkno),
3373 bp->b_bufsize,
3374 &upl,
3375 &pl,
3376 upl_flags);
3377 if (kret != KERN_SUCCESS)
3378 panic("Failed to create UPL");
3379
3380 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
3381
3382 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
3383 ubc_upl_abort(upl, 0);
3384 goto dump_buffer;
3385 }
3386 }
3387 bp->b_upl = upl;
3388
3389 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
3390
3391 if (kret != KERN_SUCCESS)
3392 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3393 return (1);
3394
3395 dump_buffer:
3396 bp->b_bufsize = 0;
3397 SET(bp->b_flags, B_INVAL);
3398 buf_brelse(bp);
3399
3400 return(0);
3401 }
3402
3403
3404
3405 /*
3406 * disabled for now
3407 */
3408
3409 #if FLUSH_QUEUES
3410
3411 #define NFLUSH 32
3412
3413 static int
3414 bp_cmp(void *a, void *b)
3415 {
3416 buf_t *bp_a = *(buf_t **)a,
3417 *bp_b = *(buf_t **)b;
3418 daddr64_t res;
3419
3420 // don't have to worry about negative block
3421 // numbers so this is ok to do.
3422 //
3423 res = (bp_a->b_blkno - bp_b->b_blkno);
3424
3425 return (int)res;
3426 }
3427
3428
3429 int
3430 bflushq(int whichq, mount_t mp)
3431 {
3432 buf_t bp, next;
3433 int i, buf_count;
3434 int total_writes = 0;
3435 static buf_t flush_table[NFLUSH];
3436
3437 if (whichq < 0 || whichq >= BQUEUES) {
3438 return (0);
3439 }
3440
3441 restart:
3442 lck_mtx_lock(buf_mtxp);
3443
3444 bp = TAILQ_FIRST(&bufqueues[whichq]);
3445
3446 for (buf_count = 0; bp; bp = next) {
3447 next = bp->b_freelist.tqe_next;
3448
3449 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
3450 continue;
3451 }
3452
3453 if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
3454
3455 bremfree_locked(bp);
3456 #ifdef JOE_DEBUG
3457 bp->b_owner = current_thread();
3458 bp->b_tag = 7;
3459 #endif
3460 SET(bp->b_lflags, BL_BUSY);
3461 flush_table[buf_count] = bp;
3462 buf_count++;
3463 total_writes++;
3464
3465 if (buf_count >= NFLUSH) {
3466 lck_mtx_unlock(buf_mtxp);
3467
3468 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3469
3470 for (i = 0; i < buf_count; i++) {
3471 buf_bawrite(flush_table[i]);
3472 }
3473 goto restart;
3474 }
3475 }
3476 }
3477 lck_mtx_unlock(buf_mtxp);
3478
3479 if (buf_count > 0) {
3480 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3481
3482 for (i = 0; i < buf_count; i++) {
3483 buf_bawrite(flush_table[i]);
3484 }
3485 }
3486
3487 return (total_writes);
3488 }
3489 #endif
3490
3491
3492 #if BALANCE_QUEUES
3493
3494 /* XXX move this to a separate file */
3495
3496 /*
3497 * NOTE: THIS CODE HAS NOT BEEN UPDATED
3498 * WITH RESPECT TO THE NEW LOCKING MODEL
3499 */
3500
3501
3502 /*
3503 * Dynamic Scaling of the Buffer Queues
3504 */
3505
3506 typedef long long blsize_t;
3507
3508 blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
3509 /* Global tunable limits */
3510 blsize_t nbufh; /* number of buffer headers */
3511 blsize_t nbuflow; /* minimum number of buffer headers required */
3512 blsize_t nbufhigh; /* maximum number of buffer headers allowed */
3513 blsize_t nbuftarget; /* preferred number of buffer headers */
3514
3515 /*
3516 * assertions:
3517 *
3518 * 1. 0 < nbuflow <= nbufh <= nbufhigh
3519 * 2. nbufhigh <= MAXNBUF
3520 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
3521 * 4. nbufh can not be set by sysctl().
3522 */
3523
3524 /* Per queue tunable limits */
3525
3526 struct bufqlim {
3527 blsize_t bl_nlow; /* minimum number of buffer headers required */
3528 blsize_t bl_num; /* number of buffer headers on the queue */
3529 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
3530 blsize_t bl_target; /* preferred number of buffer headers */
3531 long bl_stale; /* Seconds after which a buffer is considered stale */
3532 } bufqlim[BQUEUES];
3533
3534 /*
3535 * assertions:
3536 *
3537 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
3538 * 2. bl_nlhigh <= MAXNBUF
3539 * 3. bufqlim[BQ_META].bl_nlow != 0
3540 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
3541 * file system IO operations)
3542 * 5. bl_num can not be set by sysctl().
3543 * 6. bl_nlhigh <= nbufhigh
3544 */
3545
3546 /*
3547 * Rationale:
3548 * ----------
3549 * Defining blsize_t as long permits 2^31 buffer headers per queue,
3550 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
3551 *
3552 * These limits are exported by means of sysctl().
3553 * It was decided to define blsize_t as a 64 bit quantity.
3554 * This will make sure that we will not be required to change it
3555 * as long as we do not exceed 64 bit address space for the kernel.
3556 *
3557 * The low and high water parameters are initialized at compile time,
3558 * and boot arguments can be used to override them. sysctl() cannot
3559 * change those values. sysctl() can get all of the values
3560 * but can set only the target; num is the current level.
3561 *
3562 * Advantages of having a "bufqscan" thread doing the balancing are:
3563 * Keep enough bufs on BQ_EMPTY.
3564 * getnewbuf() by default will always select a buffer from BQ_EMPTY.
3565 * getnewbuf() performs best if a buffer was found there.
3566 * Also this minimizes the possibility of starting IO
3567 * from getnewbuf(). That's a performance win, too.
3568 *
3569 * Localize complex logic [balancing as well as time aging]
3570 * to balancebufq().
3571 *
3572 * Simplify getnewbuf() logic by elimination of time aging code.
3573 */
3574
3575 /*
3576 * Algorithm:
3577 * -----------
3578 * The goal of the dynamic scaling of the buffer queues is to keep
3579 * the size of the LRU close to bl_target. Buffers on a queue would
3580 * be time aged.
3581 *
3582 * There would be a thread which will be responsible for "balancing"
3583 * the buffer cache queues.
3584 *
3585 * The scan order would be: AGE, LRU, META, EMPTY.
3586 */
3587
3588 long bufqscanwait = 0;
3589
3590 static void bufqscan_thread();
3591 static int balancebufq(int q);
3592 static int btrimempty(int n);
3593 static __inline__ int initbufqscan(void);
3594 static __inline__ int nextbufq(int q);
3595 static void buqlimprt(int all);
3596
3597
3598 static __inline__ void
3599 bufqinc(int q)
3600 {
3601 if ((q < 0) || (q >= BQUEUES))
3602 return;
3603
3604 bufqlim[q].bl_num++;
3605 return;
3606 }
3607
3608 static __inline__ void
3609 bufqdec(int q)
3610 {
3611 if ((q < 0) || (q >= BQUEUES))
3612 return;
3613
3614 bufqlim[q].bl_num--;
3615 return;
3616 }
3617
3618 static void
3619 bufq_balance_thread_init()
3620 {
3621
3622 if (bufqscanwait++ == 0) {
3623
3624 /* Initialize globals */
3625 MAXNBUF = (sane_size / PAGE_SIZE);
3626 nbufh = nbuf;
3627 nbuflow = min(nbufh, 100);
3628 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
3629 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
3630 nbuftarget = max(nbuflow, nbuftarget);
3631 nbuftarget = min(nbufhigh, nbuftarget);
3632
3633 /*
3634 * Initialize the bufqlim
3635 */
3636
3637 /* LOCKED queue */
3638 bufqlim[BQ_LOCKED].bl_nlow = 0;
3639 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3640 bufqlim[BQ_LOCKED].bl_target = 0;
3641 bufqlim[BQ_LOCKED].bl_stale = 30;
3642
3643 /* LRU queue */
3644 bufqlim[BQ_LRU].bl_nlow = 0;
3645 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
3646 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
3647 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
3648
3649 /* AGE queue */
3650 bufqlim[BQ_AGE].bl_nlow = 0;
3651 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
3652 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
3653 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
3654
3655 /* EMPTY queue */
3656 bufqlim[BQ_EMPTY].bl_nlow = 0;
3657 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
3658 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
3659 bufqlim[BQ_EMPTY].bl_stale = 600000;
3660
3661 /* META queue */
3662 bufqlim[BQ_META].bl_nlow = 0;
3663 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
3664 bufqlim[BQ_META].bl_target = nbuftarget/4;
3665 bufqlim[BQ_META].bl_stale = META_IS_STALE;
3666
3667 /* LAUNDRY queue */
3668 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
3669 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
3670 bufqlim[BQ_LAUNDRY].bl_target = 0;
3671 bufqlim[BQ_LAUNDRY].bl_stale = 30;
3672
3673 buqlimprt(1);
3674 }
3675
3676 /* create worker thread */
3677 kernel_thread(kernel_task, bufqscan_thread);
3678 }
3679
3680 /* The workloop for the buffer balancing thread */
3681 static void
3682 bufqscan_thread()
3683 {
3684 int moretodo = 0;
3685
3686 for(;;) {
3687 do {
3688 int q; /* buffer queue to process */
3689
3690 q = initbufqscan();
3691 for (; q; ) {
3692 moretodo |= balancebufq(q);
3693 q = nextbufq(q);
3694 }
3695 } while (moretodo);
3696
3697 #if DIAGNOSTIC
3698 vfs_bufstats();
3699 buqlimprt(0);
3700 #endif
3701 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
3702 moretodo = 0;
3703 }
3704 }
3705
3706 /* Seed for the buffer queue balancing */
3707 static __inline__ int
3708 initbufqscan()
3709 {
3710 /* Start with AGE queue */
3711 return (BQ_AGE);
3712 }
3713
3714 /* Pick next buffer queue to balance */
3715 static __inline__ int
3716 nextbufq(int q)
3717 {
3718 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
3719
3720 q++;
3721 q %= sizeof(order);
3722 return (order[q]);
3723 }
3724
3725 /* function to balance the buffer queues */
3726 static int
3727 balancebufq(int q)
3728 {
3729 int moretodo = 0;
3730 int s = splbio();
3731 int n, t;
3732
3733 /* reject invalid q */
3734 if ((q < 0) || (q >= BQUEUES))
3735 goto out;
3736
3737 /* LOCKED or LAUNDRY queue MUST not be balanced */
3738 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
3739 goto out;
3740
3741 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
3742
3743 /* If queue has less than target nothing more to do */
3744 if (n < 0)
3745 goto out;
3746
3747 if ( n > 8 ) {
3748 /* Balance only a small amount (12.5%) at a time */
3749 n >>= 3;
3750 }
3751
3752 /* EMPTY queue needs special handling */
3753 if (q == BQ_EMPTY) {
3754 moretodo |= btrimempty(n);
3755 goto out;
3756 }
3757
3758 t = buf_timestamp();
3759
3760 for (; n > 0; n--) {
3761 struct buf *bp = bufqueues[q].tqh_first;
3762 if (!bp)
3763 break;
3764
3765 /* check if it's stale */
3766 if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
3767 if (bcleanbuf(bp)) {
3768 /* buf_bawrite() issued, bp not ready */
3769 moretodo = 1;
3770 } else {
3771 /* release the cleaned buffer to BQ_EMPTY */
3772 SET(bp->b_flags, B_INVAL);
3773 buf_brelse(bp);
3774 }
3775 } else
3776 break;
3777 }
3778
3779 out:
3780 splx(s);
3781 return (moretodo);
3782 }
3783
3784 static int
3785 btrimempty(int n)
3786 {
3787 /*
3788 * When struct bufs are allocated dynamically, this would
3789 * reclaim up to 'n' of them from the empty queue.
3790 */
3791
3792 return (0);
3793 }
3794
3795 static void
3796 buqlimprt(int all)
3797 {
3798 int i;
3799 static char *bname[BQUEUES] =
3800 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3801
3802 if (all)
3803 for (i = 0; i < BQUEUES; i++) {
3804 printf("%s : ", bname[i]);
3805 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
3806 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3807 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
3808 printf("target = %ld, ", (long)bufqlim[i].bl_target);
3809 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
3810 }
3811 else
3812 for (i = 0; i < BQUEUES; i++) {
3813 printf("%s : ", bname[i]);
3814 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3815 }
3816 }
3817
3818 #endif
3819
3820