[apple/xnu.git] / bsd / vfs / vfs_bio.c
1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
24 /*-
25 * Copyright (c) 1994 Christopher G. Demetriou
26 * Copyright (c) 1982, 1986, 1989, 1993
27 * The Regents of the University of California. All rights reserved.
28 * (c) UNIX System Laboratories, Inc.
29 * All or some portions of this file are derived from material licensed
30 * to the University of California by American Telephone and Telegraph
31 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
32 * the permission of UNIX System Laboratories, Inc.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 * 3. All advertising materials mentioning features or use of this software
43 * must display the following acknowledgement:
44 * This product includes software developed by the University of
45 * California, Berkeley and its contributors.
46 * 4. Neither the name of the University nor the names of its contributors
47 * may be used to endorse or promote products derived from this software
48 * without specific prior written permission.
49 *
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE.
61 *
62 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
63 */
64
65 /*
66 * Some references:
67 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
68 * Leffler, et al.: The Design and Implementation of the 4.3BSD
69 * UNIX Operating System (Addison-Wesley, 1989)
70 */
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/proc_internal.h>
75 #include <sys/buf_internal.h>
76 #include <sys/vnode_internal.h>
77 #include <sys/mount_internal.h>
78 #include <sys/trace.h>
79 #include <sys/malloc.h>
80 #include <sys/resourcevar.h>
81 #include <miscfs/specfs/specdev.h>
82 #include <sys/ubc.h>
83 #include <sys/kauth.h>
84 #if DIAGNOSTIC
85 #include <kern/assert.h>
86 #endif /* DIAGNOSTIC */
87 #include <kern/task.h>
88 #include <kern/zalloc.h>
89 #include <kern/lock.h>
90
91 #include <vm/vm_kern.h>
92
93 #include <sys/kdebug.h>
94 #include <machine/spl.h>
95
96 #if BALANCE_QUEUES
97 static __inline__ void bufqinc(int q);
98 static __inline__ void bufqdec(int q);
99 #endif
100
101 static int bcleanbuf(buf_t bp);
102 static int brecover_data(buf_t bp);
103 static boolean_t incore(vnode_t vp, daddr64_t blkno);
104 static buf_t incore_locked(vnode_t vp, daddr64_t blkno);
105 /* timeout is in msecs */
106 static buf_t getnewbuf(int slpflag, int slptimeo, int *queue);
107 static void bremfree_locked(buf_t bp);
108 static void buf_reassign(buf_t bp, vnode_t newvp);
109 static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
110 static int buf_iterprepare(vnode_t vp, struct buflists *, int flags);
111 static void buf_itercomplete(vnode_t vp, struct buflists *, int flags);
112
113 __private_extern__ int bdwrite_internal(buf_t, int);
114
115 /* zone allocated buffer headers */
116 static void bufzoneinit(void);
117 static void bcleanbuf_thread_init(void);
118 static void bcleanbuf_thread(void);
119
120 static zone_t buf_hdr_zone;
121 static int buf_hdr_count;
122
123
124 /*
125 * Definitions for the buffer hash lists.
126 */
127 #define BUFHASH(dvp, lbn) \
128 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
129 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
130 u_long bufhash;
131
132 /* Definitions for the buffer stats. */
133 struct bufstats bufstats;
134
135 /* Number of delayed write buffers */
136 int nbdwrite = 0;
137 int blaundrycnt = 0;
138
139
140 static TAILQ_HEAD(ioqueue, buf) iobufqueue;
141 static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
142 static int needbuffer;
143 static int need_iobuffer;
144
145 static lck_grp_t *buf_mtx_grp;
146 static lck_attr_t *buf_mtx_attr;
147 static lck_grp_attr_t *buf_mtx_grp_attr;
148 static lck_mtx_t *iobuffer_mtxp;
149 static lck_mtx_t *buf_mtxp;
150
151 static __inline__ int
152 buf_timestamp(void)
153 {
154 struct timeval t;
155 microuptime(&t);
156 return (t.tv_sec);
157 }
158
159 /*
160 * Insq/Remq for the buffer free lists.
161 */
162 #if BALANCE_QUEUES
163 #define binsheadfree(bp, dp, whichq) do { \
164 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
165 bufqinc((whichq)); \
166 (bp)->b_whichq = whichq; \
167 (bp)->b_timestamp = buf_timestamp(); \
168 } while (0)
169
170 #define binstailfree(bp, dp, whichq) do { \
171 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
172 bufqinc((whichq)); \
173 (bp)->b_whichq = whichq; \
174 (bp)->b_timestamp = buf_timestamp(); \
175 } while (0)
176 #else
177 #define binsheadfree(bp, dp, whichq) do { \
178 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
179 (bp)->b_whichq = whichq; \
180 (bp)->b_timestamp = buf_timestamp(); \
181 } while (0)
182
183 #define binstailfree(bp, dp, whichq) do { \
184 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
185 (bp)->b_whichq = whichq; \
186 (bp)->b_timestamp = buf_timestamp(); \
187 } while (0)
188 #endif
189
190
191 #define BHASHENTCHECK(bp) \
192 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
193 panic("%x: b_hash.le_prev is not deadbeef", (bp));
194
195 #define BLISTNONE(bp) \
196 (bp)->b_hash.le_next = (struct buf *)0; \
197 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
198
199 /*
200 * Insq/Remq for the vnode usage lists.
201 */
202 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
203 #define bufremvn(bp) { \
204 LIST_REMOVE(bp, b_vnbufs); \
205 (bp)->b_vnbufs.le_next = NOLIST; \
206 }
207
208 /*
209 * Time in seconds before a buffer on a list is
210 * considered as a stale buffer
211 */
212 #define LRU_IS_STALE 120 /* default value for the LRU */
213 #define AGE_IS_STALE 60 /* default value for the AGE */
214 #define META_IS_STALE 180 /* default value for the BQ_META */
215
216 int lru_is_stale = LRU_IS_STALE;
217 int age_is_stale = AGE_IS_STALE;
218 int meta_is_stale = META_IS_STALE;
219
220
221
222 /* LIST_INSERT_HEAD() with assertions */
223 static __inline__ void
224 blistenterhead(struct bufhashhdr * head, buf_t bp)
225 {
226 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
227 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
228 (head)->lh_first = bp;
229 bp->b_hash.le_prev = &(head)->lh_first;
230 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
231 panic("blistenterhead: le_prev is deadbeef");
232 }
233
234 static __inline__ void
235 binshash(buf_t bp, struct bufhashhdr *dp)
236 {
237 buf_t nbp;
238
239 BHASHENTCHECK(bp);
240
241 nbp = dp->lh_first;
242 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
243 if(nbp == bp)
244 panic("buf already in hashlist");
245 }
246
247 blistenterhead(dp, bp);
248 }
249
250 static __inline__ void
251 bremhash(buf_t bp)
252 {
253 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
254 panic("bremhash le_prev is deadbeef");
255 if (bp->b_hash.le_next == bp)
256 panic("bremhash: next points to self");
257
258 if (bp->b_hash.le_next != NULL)
259 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
260 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
261 }
262
263
264
265
266 int
267 buf_valid(buf_t bp) {
268
269 if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
270 return 1;
271 return 0;
272 }
273
274 int
275 buf_fromcache(buf_t bp) {
276
277 if ( (bp->b_flags & B_CACHE) )
278 return 1;
279 return 0;
280 }
281
282 void
283 buf_markinvalid(buf_t bp) {
284
285 SET(bp->b_flags, B_INVAL);
286 }
287
288 void
289 buf_markdelayed(buf_t bp) {
290
291 SET(bp->b_flags, B_DELWRI);
292 buf_reassign(bp, bp->b_vp);
293 }
294
295 void
296 buf_markeintr(buf_t bp) {
297
298 SET(bp->b_flags, B_EINTR);
299 }
300
301 void
302 buf_markaged(buf_t bp) {
303
304 SET(bp->b_flags, B_AGE);
305 }
306
307 errno_t
308 buf_error(buf_t bp) {
309
310 return (bp->b_error);
311 }
312
313 void
314 buf_seterror(buf_t bp, errno_t error) {
315
316 if ((bp->b_error = error))
317 SET(bp->b_flags, B_ERROR);
318 else
319 CLR(bp->b_flags, B_ERROR);
320 }
321
322 void
323 buf_setflags(buf_t bp, int32_t flags) {
324
325 SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
326 }
327
328 void
329 buf_clearflags(buf_t bp, int32_t flags) {
330
331 CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
332 }
333
334 int32_t
335 buf_flags(buf_t bp) {
336
337 return ((bp->b_flags & BUF_X_RDFLAGS));
338 }
339
340 void
341 buf_reset(buf_t bp, int32_t io_flags) {
342
343 CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE));
344 SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
345
346 bp->b_error = 0;
347 }
348
349 uint32_t
350 buf_count(buf_t bp) {
351
352 return (bp->b_bcount);
353 }
354
355 void
356 buf_setcount(buf_t bp, uint32_t bcount) {
357
358 bp->b_bcount = bcount;
359 }
360
361 uint32_t
362 buf_size(buf_t bp) {
363
364 return (bp->b_bufsize);
365 }
366
367 void
368 buf_setsize(buf_t bp, uint32_t bufsize) {
369
370 bp->b_bufsize = bufsize;
371 }
372
373 uint32_t
374 buf_resid(buf_t bp) {
375
376 return (bp->b_resid);
377 }
378
379 void
380 buf_setresid(buf_t bp, uint32_t resid) {
381
382 bp->b_resid = resid;
383 }
384
385 uint32_t
386 buf_dirtyoff(buf_t bp) {
387
388 return (bp->b_dirtyoff);
389 }
390
391 uint32_t
392 buf_dirtyend(buf_t bp) {
393
394 return (bp->b_dirtyend);
395 }
396
397 void
398 buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
399
400 bp->b_dirtyoff = dirtyoff;
401 }
402
403 void
404 buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
405
406 bp->b_dirtyend = dirtyend;
407 }
408
409 uintptr_t
410 buf_dataptr(buf_t bp) {
411
412 return (bp->b_datap);
413 }
414
415 void
416 buf_setdataptr(buf_t bp, uintptr_t data) {
417
418 bp->b_datap = data;
419 }
420
421 vnode_t
422 buf_vnode(buf_t bp) {
423
424 return (bp->b_vp);
425 }
426
427 void
428 buf_setvnode(buf_t bp, vnode_t vp) {
429
430 bp->b_vp = vp;
431 }
432
433
434 void *
435 buf_callback(buf_t bp)
436 {
437 if ( !(bp->b_lflags & BL_IOBUF) )
438 return ((void *) NULL);
439 if ( !(bp->b_flags & B_CALL) )
440 return ((void *) NULL);
441
442 return ((void *)bp->b_iodone);
443 }
444
445
446 errno_t
447 buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
448 {
449
450 if ( !(bp->b_lflags & BL_IOBUF) )
451 return (EINVAL);
452
453 if (callback)
454 bp->b_flags |= (B_CALL | B_ASYNC);
455 else
456 bp->b_flags &= ~B_CALL;
457 bp->b_transaction = transaction;
458 bp->b_iodone = callback;
459
460 return (0);
461 }
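
/*
 * Minimal usage sketch for buf_setcallback(): a caller that allocates a
 * private I/O buffer with buf_alloc() can request asynchronous completion
 * notification.  The names my_iodone, my_transaction, tr, devvp, data,
 * blkno and io_size below are hypothetical caller-side names, not part of
 * this file.
 *
 *	static void
 *	my_iodone(buf_t bp, void *arg)
 *	{
 *		struct my_transaction *tr = arg;
 *
 *		tr->error = buf_error(bp);
 *		buf_free(bp);
 *		wakeup(tr);
 *	}
 *
 *	buf_t bp = buf_alloc(devvp);
 *
 *	buf_reset(bp, B_READ);
 *	buf_setdataptr(bp, (uintptr_t)data);
 *	buf_setcount(bp, io_size);
 *	buf_setblkno(bp, blkno);
 *	buf_setcallback(bp, my_iodone, tr);	sets B_CALL and B_ASYNC
 *	VNOP_STRATEGY(bp);			my_iodone runs at completion
 */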
462
463 errno_t
464 buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
465 {
466
467 if ( !(bp->b_lflags & BL_IOBUF) )
468 return (EINVAL);
469
470 if (upl)
471 bp->b_flags |= B_CLUSTER;
472 else
473 bp->b_flags &= ~B_CLUSTER;
474 bp->b_upl = upl;
475 bp->b_uploffset = offset;
476
477 return (0);
478 }
479
480 buf_t
481 buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
482 {
483 buf_t io_bp;
484
485 if (io_offset < 0 || io_size < 0)
486 return (NULL);
487
488 if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
489 return (NULL);
490
491 if (bp->b_flags & B_CLUSTER) {
492 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
493 return (NULL);
494
495 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
496 return (NULL);
497 }
498 io_bp = alloc_io_buf(bp->b_vp, 0);
499
500 io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_ASYNC | B_READ);
501
502 if (iodone) {
503 io_bp->b_transaction = arg;
504 io_bp->b_iodone = iodone;
505 io_bp->b_flags |= B_CALL;
506 }
507 if (bp->b_flags & B_CLUSTER) {
508 io_bp->b_upl = bp->b_upl;
509 io_bp->b_uploffset = bp->b_uploffset + io_offset;
510 } else {
511 io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
512 }
513 io_bp->b_bcount = io_size;
514
515 return (io_bp);
516 }
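
/*
 * Sketch of a typical buf_clone() use: carving a page-aligned piece out of
 * a larger transfer so it can be issued and completed independently.  The
 * names my_piece_done, tr, io_offset, io_size and io_blkno are hypothetical
 * caller values; the clone is an iobuf, so the completion routine would
 * normally hand it back with buf_free().
 *
 *	buf_t io_bp;
 *
 *	io_bp = buf_clone(bp, io_offset, io_size, my_piece_done, tr);
 *	if (io_bp == NULL)
 *		return (EINVAL);		offset/size failed the checks above
 *	buf_setblkno(io_bp, io_blkno);
 *	VNOP_STRATEGY(io_bp);
 */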
517
518
519
520 void
521 buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
522 void **old_iodone, void **old_transaction)
523 {
524 if (old_iodone)
525 *old_iodone = (void *)(bp->b_iodone);
526 if (old_transaction)
527 *old_transaction = (void *)(bp->b_transaction);
528
529 bp->b_transaction = transaction;
530 bp->b_iodone = filter;
531 bp->b_flags |= B_FILTER;
532 }
533
534
535 daddr64_t
536 buf_blkno(buf_t bp) {
537
538 return (bp->b_blkno);
539 }
540
541 daddr64_t
542 buf_lblkno(buf_t bp) {
543
544 return (bp->b_lblkno);
545 }
546
547 void
548 buf_setblkno(buf_t bp, daddr64_t blkno) {
549
550 bp->b_blkno = blkno;
551 }
552
553 void
554 buf_setlblkno(buf_t bp, daddr64_t lblkno) {
555
556 bp->b_lblkno = lblkno;
557 }
558
559 dev_t
560 buf_device(buf_t bp) {
561
562 return (bp->b_dev);
563 }
564
565 errno_t
566 buf_setdevice(buf_t bp, vnode_t vp) {
567
568 if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
569 return EINVAL;
570 bp->b_dev = vp->v_rdev;
571
572 return 0;
573 }
574
575
576 void *
577 buf_drvdata(buf_t bp) {
578
579 return (bp->b_drvdata);
580 }
581
582 void
583 buf_setdrvdata(buf_t bp, void *drvdata) {
584
585 bp->b_drvdata = drvdata;
586 }
587
588 void *
589 buf_fsprivate(buf_t bp) {
590
591 return (bp->b_fsprivate);
592 }
593
594 void
595 buf_setfsprivate(buf_t bp, void *fsprivate) {
596
597 bp->b_fsprivate = fsprivate;
598 }
599
600 ucred_t
601 buf_rcred(buf_t bp) {
602
603 return (bp->b_rcred);
604 }
605
606 ucred_t
607 buf_wcred(buf_t bp) {
608
609 return (bp->b_wcred);
610 }
611
612 void *
613 buf_upl(buf_t bp) {
614
615 return (bp->b_upl);
616 }
617
618 uint32_t
619 buf_uploffset(buf_t bp) {
620
621 return ((uint32_t)(bp->b_uploffset));
622 }
623
624 proc_t
625 buf_proc(buf_t bp) {
626
627 return (bp->b_proc);
628 }
629
630
631 errno_t
632 buf_map(buf_t bp, caddr_t *io_addr)
633 {
634 buf_t real_bp;
635 vm_offset_t vaddr;
636 kern_return_t kret;
637
638 if ( !(bp->b_flags & B_CLUSTER)) {
639 *io_addr = (caddr_t)bp->b_datap;
640 return (0);
641 }
642 real_bp = (buf_t)(bp->b_real_bp);
643
644 if (real_bp && real_bp->b_datap) {
645 /*
646 * b_real_bp is only valid if B_CLUSTER is SET
647 * if it's non-zero, then someone did a cluster_bp call;
648 * if the backing physical pages were already mapped
649 * in before the call to cluster_bp (non-zero b_datap),
650 * then we just use that mapping
651 */
652 *io_addr = (caddr_t)real_bp->b_datap;
653 return (0);
654 }
655 kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */
656
657 if (kret != KERN_SUCCESS) {
658 *io_addr = 0;
659
660 return(ENOMEM);
661 }
662 vaddr += bp->b_uploffset;
663
664 *io_addr = (caddr_t)vaddr;
665
666 return (0);
667 }
668
669 errno_t
670 buf_unmap(buf_t bp)
671 {
672 buf_t real_bp;
673 kern_return_t kret;
674
675 if ( !(bp->b_flags & B_CLUSTER))
676 return (0);
677 /*
678 * see buf_map for the explanation
679 */
680 real_bp = (buf_t)(bp->b_real_bp);
681
682 if (real_bp && real_bp->b_datap)
683 return (0);
684
685 if (bp->b_lflags & BL_IOBUF) {
686 /*
687 * when we commit these pages, we'll hit
688 * it with UPL_COMMIT_INACTIVE which
689 * will clear the reference bit that got
690 * turned on when we touched the mapping
691 */
692 bp->b_flags |= B_AGE;
693 }
694 kret = ubc_upl_unmap(bp->b_upl);
695
696 if (kret != KERN_SUCCESS)
697 return (EINVAL);
698 return (0);
699 }
700
701
702 void
703 buf_clear(buf_t bp) {
704 caddr_t baddr;
705
706 if (buf_map(bp, &baddr) == 0) {
707 bzero(baddr, bp->b_bcount);
708 buf_unmap(bp);
709 }
710 bp->b_resid = 0;
711 }
712
713
714
715 /*
716 * Read or write a buffer that is not contiguous on disk.
717 * buffer is marked done/error at the conclusion
718 */
719 static int
720 buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
721 {
722 vnode_t vp = buf_vnode(bp);
723 buf_t io_bp; /* For reading or writing a single block */
724 int io_direction;
725 int io_resid;
726 size_t io_contig_bytes;
727 daddr64_t io_blkno;
728 int error = 0;
729 int bmap_flags;
730
731 /*
732 * save our starting point... the bp was already mapped
733 * in buf_strategy before we got called
734 * no sense doing it again.
735 */
736 io_blkno = bp->b_blkno;
737 /*
738 * Make sure we redo this mapping for the next I/O
739 * i.e. this can never be a 'permanent' mapping
740 */
741 bp->b_blkno = bp->b_lblkno;
742
743 /*
744 * Get an io buffer to do the deblocking
745 */
746 io_bp = alloc_io_buf(devvp, 0);
747
748 io_bp->b_lblkno = bp->b_lblkno;
749 io_bp->b_datap = bp->b_datap;
750 io_resid = bp->b_bcount;
751 io_direction = bp->b_flags & B_READ;
752 io_contig_bytes = contig_bytes;
753
754 if (bp->b_flags & B_READ)
755 bmap_flags = VNODE_READ;
756 else
757 bmap_flags = VNODE_WRITE;
758
759 for (;;) {
760 if (io_blkno == -1)
761 /*
762 * this is unexpected, but we'll allow for it
763 */
764 bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
765 else {
766 io_bp->b_bcount = io_contig_bytes;
767 io_bp->b_bufsize = io_contig_bytes;
768 io_bp->b_resid = io_contig_bytes;
769 io_bp->b_blkno = io_blkno;
770
771 buf_reset(io_bp, io_direction);
772 /*
773 * Call the device to do the I/O and wait for it
774 */
775 if ((error = VNOP_STRATEGY(io_bp)))
776 break;
777 if ((error = (int)buf_biowait(io_bp)))
778 break;
779 if (io_bp->b_resid) {
780 io_resid -= (io_contig_bytes - io_bp->b_resid);
781 break;
782 }
783 }
784 if ((io_resid -= io_contig_bytes) == 0)
785 break;
786 f_offset += io_contig_bytes;
787 io_bp->b_datap += io_contig_bytes;
788
789 /*
790 * Map the current position to a physical block number
791 */
792 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
793 break;
794 }
795 buf_free(io_bp);
796
797 if (error)
798 buf_seterror(bp, error);
799 bp->b_resid = io_resid;
800 /*
801 * This I/O is now complete
802 */
803 buf_biodone(bp);
804
805 return error;
806 }
807
808
809 /*
810 * struct vnop_strategy_args {
811 * struct buf *a_bp;
812 * } *ap;
813 */
814 errno_t
815 buf_strategy(vnode_t devvp, void *ap)
816 {
817 buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp;
818 vnode_t vp = bp->b_vp;
819 int bmap_flags;
820 errno_t error;
821
822 if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
823 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
824 /*
825 * associate the physical device
826 * with this buf_t even if we don't
827 * end up issuing the I/O...
828 */
829 bp->b_dev = devvp->v_rdev;
830
831 if (bp->b_flags & B_READ)
832 bmap_flags = VNODE_READ;
833 else
834 bmap_flags = VNODE_WRITE;
835
836 if ( !(bp->b_flags & B_CLUSTER)) {
837
838 if ( (bp->b_upl) ) {
839 /*
840 * we have a UPL associated with this bp
841 * go through cluster_bp which knows how
842 * to deal with filesystem block sizes
843 * that aren't equal to the page size
844 */
845 return (cluster_bp(bp));
846 }
847 if (bp->b_blkno == bp->b_lblkno) {
848 off_t f_offset;
849 size_t contig_bytes;
850
851 if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
852 buf_seterror(bp, error);
853 buf_biodone(bp);
854
855 return (error);
856 }
857 if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
858 buf_seterror(bp, error);
859 buf_biodone(bp);
860
861 return (error);
862 }
863 if (bp->b_blkno == -1)
864 buf_clear(bp);
865 else if ((long)contig_bytes < bp->b_bcount)
866 return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
867 }
868 if (bp->b_blkno == -1) {
869 buf_biodone(bp);
870 return (0);
871 }
872 }
873 /*
874 * we can issue the I/O because...
875 * either B_CLUSTER is set which
876 * means that the I/O is properly set
877 * up to be a multiple of the page size, or
878 * we were able to successfully set up the
879 * physical block mapping
880 */
881 return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap));
882 }
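
/*
 * Sketch of how a filesystem's strategy entry point typically funnels into
 * buf_strategy(): it supplies the backing device vnode and passes the
 * argument block straight through.  MYFS_DEVVP() stands in for however the
 * filesystem locates its device vnode; it is not a real macro.
 *
 *	int
 *	myfs_vnop_strategy(struct vnop_strategy_args *ap)
 *	{
 *		buf_t	bp = ap->a_bp;
 *		vnode_t	devvp = MYFS_DEVVP(buf_vnode(bp));
 *
 *		return (buf_strategy(devvp, ap));
 *	}
 */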
883
884
885
886 buf_t
887 buf_alloc(vnode_t vp)
888 {
889 return(alloc_io_buf(vp, 0));
890 }
891
892 void
893 buf_free(buf_t bp) {
894
895 free_io_buf(bp);
896 }
897
898
899
900 void
901 buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) {
902 buf_t bp;
903 int retval;
904 struct buflists local_iterblkhd;
905 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
906
907 if (flags & BUF_SKIP_LOCKED)
908 lock_flags |= BAC_SKIP_LOCKED;
909 if (flags & BUF_SKIP_NONLOCKED)
910 lock_flags |= BAC_SKIP_NONLOCKED;
911
912 lck_mtx_lock(buf_mtxp);
913
914 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
915 lck_mtx_unlock(buf_mtxp);
916 return;
917 }
918 while (!LIST_EMPTY(&local_iterblkhd)) {
919 bp = LIST_FIRST(&local_iterblkhd);
920 LIST_REMOVE(bp, b_vnbufs);
921 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
922
923 if (buf_acquire_locked(bp, lock_flags, 0, 0))
924 continue;
925
926 lck_mtx_unlock(buf_mtxp);
927
928 retval = callout(bp, arg);
929
930 switch (retval) {
931 case BUF_RETURNED:
932 buf_brelse(bp);
933 break;
934 case BUF_CLAIMED:
935 break;
936 case BUF_RETURNED_DONE:
937 buf_brelse(bp);
938 lck_mtx_lock(buf_mtxp);
939 goto out;
940 case BUF_CLAIMED_DONE:
941 lck_mtx_lock(buf_mtxp);
942 goto out;
943 }
944 lck_mtx_lock(buf_mtxp);
945 }
946 out:
947 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
948
949 lck_mtx_unlock(buf_mtxp);
950 }
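
/*
 * The callout handed to buf_iterate() owns each buffer it is given and
 * reports what it did with it via its return value.  A minimal sketch
 * (my_flush_callout is a hypothetical name):
 *
 *	static int
 *	my_flush_callout(buf_t bp, void *arg)
 *	{
 *		if (buf_valid(bp)) {
 *			(void) buf_bawrite(bp);		the write path releases bp
 *			return (BUF_CLAIMED);		so don't brelse it here
 *		}
 *		return (BUF_RETURNED);			buf_iterate() will brelse it
 *	}
 *
 *	buf_iterate(vp, my_flush_callout, BUF_SKIP_LOCKED, NULL);
 */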
951
952
953 /*
954 * Flush out and invalidate all buffers associated with a vnode.
955 */
956 int
957 buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
958 {
959 buf_t bp;
960 int error = 0;
961 int must_rescan = 1;
962 struct buflists local_iterblkhd;
963
964 lck_mtx_lock(buf_mtxp);
965
966 for (;;) {
967 if (must_rescan == 0)
968 /*
969 * the lists may not be empty, but all that's left at this
970 * point are metadata or B_LOCKED buffers which are being
971 * skipped... we know this because we made it through both
972 * the clean and dirty lists without dropping buf_mtxp...
973 * each time we drop buf_mtxp we bump "must_rescan"
974 */
975 break;
976 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
977 break;
978 must_rescan = 0;
979 /*
980 * iterate the clean list
981 */
982 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
983 goto try_dirty_list;
984 }
985 while (!LIST_EMPTY(&local_iterblkhd)) {
986 bp = LIST_FIRST(&local_iterblkhd);
987
988 LIST_REMOVE(bp, b_vnbufs);
989 LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
990
991 /*
992 * some filesystems distinguish meta data blocks with a negative logical block #
993 */
994 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
995 continue;
996
997 if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
998 if (error == EDEADLK)
999 /*
1000 * this buffer was marked B_LOCKED...
1001 * we didn't drop buf_mtxp, so we
1002 * don't need to rescan
1003 */
1004 continue;
1005 if (error == EAGAIN) {
1006 /*
1007 * found a busy buffer... we blocked and
1008 * dropped buf_mtxp, so we're going to
1009 * need to rescan after this pass is completed
1010 */
1011 must_rescan++;
1012 continue;
1013 }
1014 /*
1015 * got some kind of 'real' error out of the msleep
1016 * in buf_acquire_locked, terminate the scan and return the error
1017 */
1018 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1019
1020 lck_mtx_unlock(buf_mtxp);
1021 return (error);
1022 }
1023 lck_mtx_unlock(buf_mtxp);
1024
1025 SET(bp->b_flags, B_INVAL);
1026 buf_brelse(bp);
1027
1028 lck_mtx_lock(buf_mtxp);
1029
1030 /*
1031 * by dropping buf_mtxp, we allow new
1032 * buffers to be added to the vnode list(s)
1033 * we'll have to rescan at least once more
1034 * if the queues aren't empty
1035 */
1036 must_rescan++;
1037 }
1038 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1039
1040 try_dirty_list:
1041 /*
1042 * Now iterate on dirty blks
1043 */
1044 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1045 continue;
1046 }
1047 while (!LIST_EMPTY(&local_iterblkhd)) {
1048 bp = LIST_FIRST(&local_iterblkhd);
1049
1050 LIST_REMOVE(bp, b_vnbufs);
1051 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1052
1053 /*
1054 * some filesystems distinguish meta data blocks with a negative logical block #
1055 */
1056 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1057 continue;
1058
1059 if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
1060 if (error == EDEADLK)
1061 /*
1062 * this buffer was marked B_LOCKED...
1063 * we didn't drop buf_mtxp, so we
1064 * don't need to rescan
1065 */
1066 continue;
1067 if (error == EAGAIN) {
1068 /*
1069 * found a busy buffer... we blocked and
1070 * dropped buf_mtxp, so we're going to
1071 * need to rescan after this pass is completed
1072 */
1073 must_rescan++;
1074 continue;
1075 }
1076 /*
1077 * got some kind of 'real' error out of the msleep
1078 * in buf_acquire_locked, terminate the scan and return the error
1079 */
1080 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1081
1082 lck_mtx_unlock(buf_mtxp);
1083 return (error);
1084 }
1085 lck_mtx_unlock(buf_mtxp);
1086
1087 SET(bp->b_flags, B_INVAL);
1088
1089 if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1090 (void) VNOP_BWRITE(bp);
1091 else
1092 buf_brelse(bp);
1093
1094 lck_mtx_lock(buf_mtxp);
1095 /*
1096 * by dropping buf_mtxp, we allow new
1097 * buffers to be added to the vnode list(s)
1098 * we'll have to rescan at least once more
1099 * if the queues aren't empty
1100 */
1101 must_rescan++;
1102 }
1103 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1104 }
1105 lck_mtx_unlock(buf_mtxp);
1106
1107 return (0);
1108 }
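
/*
 * Typical call, e.g. when a vnode is being recycled or its cached data is
 * about to be superseded: push any delayed writes and toss everything else.
 *
 *	error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
 */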
1109
1110 void
1111 buf_flushdirtyblks(vnode_t vp, int wait, int flags, char *msg) {
1112 buf_t bp;
1113 int writes_issued = 0;
1114 errno_t error;
1115 int busy = 0;
1116 struct buflists local_iterblkhd;
1117 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1118
1119 if (flags & BUF_SKIP_LOCKED)
1120 lock_flags |= BAC_SKIP_LOCKED;
1121 if (flags & BUF_SKIP_NONLOCKED)
1122 lock_flags |= BAC_SKIP_NONLOCKED;
1123 loop:
1124 lck_mtx_lock(buf_mtxp);
1125
1126 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
1127 while (!LIST_EMPTY(&local_iterblkhd)) {
1128 bp = LIST_FIRST(&local_iterblkhd);
1129 LIST_REMOVE(bp, b_vnbufs);
1130 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1131
1132 if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY)
1133 busy++;
1134 if (error)
1135 continue;
1136 lck_mtx_unlock(buf_mtxp);
1137
1138 bp->b_flags &= ~B_LOCKED;
1139
1140 /*
1141 * Wait for I/O associated with indirect blocks to complete,
1142 * since there is no way to quickly wait for them below.
1143 */
1144 if ((bp->b_vp == vp) || (wait == 0))
1145 (void) buf_bawrite(bp);
1146 else
1147 (void) VNOP_BWRITE(bp);
1148 writes_issued++;
1149
1150 lck_mtx_lock(buf_mtxp);
1151 }
1152 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1153 }
1154 lck_mtx_unlock(buf_mtxp);
1155
1156 if (wait) {
1157 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1158
1159 if (vp->v_dirtyblkhd.lh_first && busy) {
1160 /*
1161 * we had one or more BUSY buffers on
1162 * the dirtyblock list... most likely
1163 * these are due to delayed writes that
1164 * were moved to the bclean queue but
1165 * have not yet been 'written'.
1166 * if we issued some writes on the
1167 * previous pass, we try again immediately;
1168 * if we didn't, we'll sleep for some time
1169 * to allow the state to change...
1170 */
1171 if (writes_issued == 0) {
1172 (void)tsleep((caddr_t)&vp->v_numoutput,
1173 PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1174 }
1175 writes_issued = 0;
1176 busy = 0;
1177
1178 goto loop;
1179 }
1180 }
1181 }
1182
1183
1184 /*
1185 * called with buf_mtxp held...
1186 * this lock protects the queue manipulation
1187 */
1188 static int
1189 buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1190 {
1191 struct buflists * listheadp;
1192
1193 if (flags & VBI_DIRTY)
1194 listheadp = &vp->v_dirtyblkhd;
1195 else
1196 listheadp = &vp->v_cleanblkhd;
1197
1198 while (vp->v_iterblkflags & VBI_ITER) {
1199 vp->v_iterblkflags |= VBI_ITERWANT;
1200 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", 0);
1201 }
1202 if (LIST_EMPTY(listheadp)) {
1203 LIST_INIT(iterheadp);
1204 return(EINVAL);
1205 }
1206 vp->v_iterblkflags |= VBI_ITER;
1207
1208 iterheadp->lh_first = listheadp->lh_first;
1209 listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1210 LIST_INIT(listheadp);
1211
1212 return(0);
1213 }
1214
1215 /*
1216 * called with buf_mtxp held...
1217 * this lock protects the queue manipulation
1218 */
1219 static void
1220 buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1221 {
1222 struct buflists * listheadp;
1223 buf_t bp;
1224
1225 if (flags & VBI_DIRTY)
1226 listheadp = &vp->v_dirtyblkhd;
1227 else
1228 listheadp = &vp->v_cleanblkhd;
1229
1230 while (!LIST_EMPTY(iterheadp)) {
1231 bp = LIST_FIRST(iterheadp);
1232 LIST_REMOVE(bp, b_vnbufs);
1233 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1234 }
1235 vp->v_iterblkflags &= ~VBI_ITER;
1236
1237 if (vp->v_iterblkflags & VBI_ITERWANT) {
1238 vp->v_iterblkflags &= ~VBI_ITERWANT;
1239 wakeup(&vp->v_iterblkflags);
1240 }
1241 }
1242
1243
1244 static void
1245 bremfree_locked(buf_t bp)
1246 {
1247 struct bqueues *dp = NULL;
1248 int whichq = -1;
1249
1250 /*
1251 * We only calculate the head of the freelist when removing
1252 * the last element of the list as that is the only time that
1253 * it is needed (e.g. to reset the tail pointer).
1254 *
1255 * NB: This makes an assumption about how tailq's are implemented.
1256 */
1257 if (bp->b_freelist.tqe_next == NULL) {
1258 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1259 if (dp->tqh_last == &bp->b_freelist.tqe_next)
1260 break;
1261 if (dp == &bufqueues[BQUEUES])
1262 panic("bremfree: lost tail");
1263 }
1264 TAILQ_REMOVE(dp, bp, b_freelist);
1265 whichq = bp->b_whichq;
1266 #if BALANCE_QUEUES
1267 bufqdec(whichq);
1268 #endif
1269 bp->b_whichq = -1;
1270 bp->b_timestamp = 0;
1271 }
1272
1273 /*
1274 * Associate a buffer with a vnode.
1275 */
1276 static void
1277 bgetvp(vnode_t vp, buf_t bp)
1278 {
1279
1280 if (bp->b_vp != vp)
1281 panic("bgetvp: not free");
1282
1283 if (vp->v_type == VBLK || vp->v_type == VCHR)
1284 bp->b_dev = vp->v_rdev;
1285 else
1286 bp->b_dev = NODEV;
1287 /*
1288 * Insert onto list for new vnode.
1289 */
1290 lck_mtx_lock(buf_mtxp);
1291 bufinsvn(bp, &vp->v_cleanblkhd);
1292 lck_mtx_unlock(buf_mtxp);
1293 }
1294
1295 /*
1296 * Disassociate a buffer from a vnode.
1297 */
1298 static void
1299 brelvp(buf_t bp)
1300 {
1301 vnode_t vp;
1302
1303 if ((vp = bp->b_vp) == (vnode_t)NULL)
1304 panic("brelvp: NULL vp");
1305 /*
1306 * Delete from old vnode list, if on one.
1307 */
1308 lck_mtx_lock(buf_mtxp);
1309 if (bp->b_vnbufs.le_next != NOLIST)
1310 bufremvn(bp);
1311 lck_mtx_unlock(buf_mtxp);
1312
1313 bp->b_vp = (vnode_t)NULL;
1314 }
1315
1316 /*
1317 * Reassign a buffer from one vnode to another.
1318 * Used to assign file specific control information
1319 * (indirect blocks) to the vnode to which they belong.
1320 */
1321 static void
1322 buf_reassign(buf_t bp, vnode_t newvp)
1323 {
1324 register struct buflists *listheadp;
1325
1326 if (newvp == NULL) {
1327 printf("buf_reassign: NULL");
1328 return;
1329 }
1330 lck_mtx_lock(buf_mtxp);
1331
1332 /*
1333 * Delete from old vnode list, if on one.
1334 */
1335 if (bp->b_vnbufs.le_next != NOLIST)
1336 bufremvn(bp);
1337 /*
1338 * If dirty, put on list of dirty buffers;
1339 * otherwise insert onto list of clean buffers.
1340 */
1341 if (ISSET(bp->b_flags, B_DELWRI))
1342 listheadp = &newvp->v_dirtyblkhd;
1343 else
1344 listheadp = &newvp->v_cleanblkhd;
1345 bufinsvn(bp, listheadp);
1346
1347 lck_mtx_unlock(buf_mtxp);
1348 }
1349
1350 static __inline__ void
1351 bufhdrinit(buf_t bp)
1352 {
1353 bzero((char *)bp, sizeof *bp);
1354 bp->b_dev = NODEV;
1355 bp->b_rcred = NOCRED;
1356 bp->b_wcred = NOCRED;
1357 bp->b_vnbufs.le_next = NOLIST;
1358 bp->b_flags = B_INVAL;
1359
1360 return;
1361 }
1362
1363 /*
1364 * Initialize buffers and hash links for buffers.
1365 */
1366 __private_extern__ void
1367 bufinit()
1368 {
1369 buf_t bp;
1370 struct bqueues *dp;
1371 int i;
1372 int metabuf;
1373 long whichq;
1374
1375 /* Initialize the buffer queues ('freelists') and the hash table */
1376 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1377 TAILQ_INIT(dp);
1378 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
1379
1380 metabuf = nbuf/8; /* reserved for meta buf */
1381
1382 /* Initialize the buffer headers */
1383 for (i = 0; i < nbuf; i++) {
1384 bp = &buf[i];
1385 bufhdrinit(bp);
1386
1387 /*
1388 * metabuf buffer headers on the meta-data list and
1389 * rest of the buffer headers on the empty list
1390 */
1391 if (--metabuf)
1392 whichq = BQ_META;
1393 else
1394 whichq = BQ_EMPTY;
1395
1396 BLISTNONE(bp);
1397 dp = &bufqueues[whichq];
1398 binsheadfree(bp, dp, whichq);
1399 binshash(bp, &invalhash);
1400 }
1401
1402 for (; i < nbuf + niobuf; i++) {
1403 bp = &buf[i];
1404 bufhdrinit(bp);
1405 binsheadfree(bp, &iobufqueue, -1);
1406 }
1407
1408 /*
1409 * allocate lock group attribute and group
1410 */
1411 buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1412 //lck_grp_attr_setstat(buf_mtx_grp_attr);
1413 buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1414
1415 /*
1416 * allocate the lock attribute
1417 */
1418 buf_mtx_attr = lck_attr_alloc_init();
1419 //lck_attr_setdebug(buf_mtx_attr);
1420
1421 /*
1422 * allocate and initialize mutex's for the buffer and iobuffer pools
1423 */
1424 buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1425 iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1426
1427 if (iobuffer_mtxp == NULL)
1428 panic("couldn't create iobuffer mutex");
1429
1430 if (buf_mtxp == NULL)
1431 panic("couldn't create buf mutex");
1432
1433 /*
1434 * allocate and initialize cluster specific global locks...
1435 */
1436 cluster_init();
1437
1438 printf("using %d buffer headers and %d cluster IO buffer headers\n",
1439 nbuf, niobuf);
1440
1441 /* Set up zones used by the buffer cache */
1442 bufzoneinit();
1443
1444 /* start the bcleanbuf() thread */
1445 bcleanbuf_thread_init();
1446
1447 #if BALANCE_QUEUES
1448 {
1449 static void bufq_balance_thread_init();
1450 /* create a thread to do dynamic buffer queue balancing */
1451 bufq_balance_thread_init();
1452 }
1453 #endif /* BALANCE_QUEUES */
1454 }
1455
1456 static struct buf *
1457 bio_doread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, int async, int queuetype)
1458 {
1459 buf_t bp;
1460
1461 bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
1462
1463 /*
1464 * If buffer does not have data valid, start a read.
1465 * Note that if buffer is B_INVAL, buf_getblk() won't return it.
1466 * Therefore, it's valid if its I/O has completed or been delayed.
1467 */
1468 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
1469 struct proc *p;
1470
1471 p = current_proc();
1472
1473 /* Start I/O for the buffer (keeping credentials). */
1474 SET(bp->b_flags, B_READ | async);
1475 if (cred != NOCRED && bp->b_rcred == NOCRED) {
1476 kauth_cred_ref(cred);
1477 bp->b_rcred = cred;
1478 }
1479
1480 VNOP_STRATEGY(bp);
1481
1482 trace(TR_BREADMISS, pack(vp, size), blkno);
1483
1484 /* Pay for the read. */
1485 if (p && p->p_stats)
1486 p->p_stats->p_ru.ru_inblock++; /* XXX */
1487
1488 if (async) {
1489 /*
1490 * since we asked for an ASYNC I/O
1491 * the biodone will do the brelse
1492 * we don't want to pass back a bp
1493 * that we don't 'own'
1494 */
1495 bp = NULL;
1496 }
1497 } else if (async) {
1498 buf_brelse(bp);
1499 bp = NULL;
1500 }
1501
1502 trace(TR_BREADHIT, pack(vp, size), blkno);
1503
1504 return (bp);
1505 }
1506
1507 /*
1508 * Perform the reads for buf_breadn() and buf_meta_breadn().
1509 * Trivial modification to the breada algorithm presented in Bach (p.55).
1510 */
1511 static errno_t
1512 do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
1513 int nrablks, ucred_t cred, buf_t *bpp, int queuetype)
1514 {
1515 buf_t bp;
1516 int i;
1517
1518 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
1519
1520 /*
1521 * For each of the read-ahead blocks, start a read, if necessary.
1522 */
1523 for (i = 0; i < nrablks; i++) {
1524 /* If it's in the cache, just go on to next one. */
1525 if (incore(vp, rablks[i]))
1526 continue;
1527
1528 /* Get a buffer for the read-ahead block */
1529 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
1530 }
1531
1532 /* Otherwise, we had to start a read for it; wait until it's valid. */
1533 return (buf_biowait(bp));
1534 }
1535
1536
1537 /*
1538 * Read a disk block.
1539 * This algorithm described in Bach (p.54).
1540 */
1541 errno_t
1542 buf_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1543 {
1544 buf_t bp;
1545
1546 /* Get buffer for block. */
1547 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
1548
1549 /* Wait for the read to complete, and return result. */
1550 return (buf_biowait(bp));
1551 }
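
/*
 * Minimal buf_bread() usage sketch (lbn and blksize are hypothetical caller
 * values).  Note that a buffer is returned even on error and still has to
 * be released.
 *
 *	buf_t	bp;
 *	errno_t	error;
 *
 *	if ((error = buf_bread(vp, lbn, blksize, NOCRED, &bp))) {
 *		buf_brelse(bp);
 *		return (error);
 *	}
 *	copy data out via buf_dataptr(bp) / buf_count(bp), then:
 *	buf_brelse(bp);
 */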
1552
1553 /*
1554 * Read a disk block. [bread() for meta-data]
1555 * This algorithm described in Bach (p.54).
1556 */
1557 errno_t
1558 buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1559 {
1560 buf_t bp;
1561
1562 /* Get buffer for block. */
1563 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
1564
1565 /* Wait for the read to complete, and return result. */
1566 return (buf_biowait(bp));
1567 }
1568
1569 /*
1570 * Read-ahead multiple disk blocks. The first is sync, the rest async.
1571 */
1572 errno_t
1573 buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1574 {
1575 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
1576 }
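
/*
 * Read-ahead sketch: fetch lbn synchronously and prime the cache with the
 * next two blocks (lbn and blksize are hypothetical caller values).
 *
 *	daddr64_t rablks[2] = { lbn + 1, lbn + 2 };
 *	int	  rasizes[2] = { blksize, blksize };
 *
 *	error = buf_breadn(vp, lbn, blksize, rablks, rasizes, 2, NOCRED, &bp);
 */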
1577
1578 /*
1579 * Read-ahead multiple disk blocks. The first is sync, the rest async.
1580 * [buf_breadn() for meta-data]
1581 */
1582 errno_t
1583 buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1584 {
1585 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
1586 }
1587
1588 /*
1589 * Block write. Described in Bach (p.56)
1590 */
1591 errno_t
1592 buf_bwrite(buf_t bp)
1593 {
1594 int sync, wasdelayed;
1595 errno_t rv;
1596 proc_t p = current_proc();
1597 vnode_t vp = bp->b_vp;
1598
1599 if (bp->b_datap == 0) {
1600 if (brecover_data(bp) == 0)
1601 return (0);
1602 }
1603 /* Remember buffer type, to switch on it later. */
1604 sync = !ISSET(bp->b_flags, B_ASYNC);
1605 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
1606 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
1607
1608 if (wasdelayed)
1609 OSAddAtomic(-1, &nbdwrite);
1610
1611 if (!sync) {
1612 /*
1613 * If not synchronous, pay for the I/O operation and make
1614 * sure the buf is on the correct vnode queue. We have
1615 * to do this now, because if we don't, the vnode may not
1616 * be properly notified that its I/O has completed.
1617 */
1618 if (wasdelayed)
1619 buf_reassign(bp, vp);
1620 else
1621 if (p && p->p_stats)
1622 p->p_stats->p_ru.ru_oublock++; /* XXX */
1623 }
1624 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
1625
1626 /* Initiate disk write. Make sure the appropriate party is charged. */
1627
1628 OSAddAtomic(1, &vp->v_numoutput);
1629
1630 VNOP_STRATEGY(bp);
1631
1632 if (sync) {
1633 /*
1634 * If I/O was synchronous, wait for it to complete.
1635 */
1636 rv = buf_biowait(bp);
1637
1638 /*
1639 * Pay for the I/O operation, if it's not been paid for, and
1640 * make sure it's on the correct vnode queue. (async operations
1641 * were paid for above.)
1642 */
1643 if (wasdelayed)
1644 buf_reassign(bp, vp);
1645 else
1646 if (p && p->p_stats)
1647 p->p_stats->p_ru.ru_oublock++; /* XXX */
1648
1649 /* Release the buffer. */
1650 // XXXdbg - only if the unused bit is set
1651 if (!ISSET(bp->b_flags, B_NORELSE)) {
1652 buf_brelse(bp);
1653 } else {
1654 CLR(bp->b_flags, B_NORELSE);
1655 }
1656
1657 return (rv);
1658 } else {
1659 return (0);
1660 }
1661 }
1662
1663 int
1664 vn_bwrite(ap)
1665 struct vnop_bwrite_args *ap;
1666 {
1667 return (buf_bwrite(ap->a_bp));
1668 }
1669
1670 /*
1671 * Delayed write.
1672 *
1673 * The buffer is marked dirty, but is not queued for I/O.
1674 * This routine should be used when the buffer is expected
1675 * to be modified again soon, typically a small write that
1676 * partially fills a buffer.
1677 *
1678 * NB: magnetic tapes cannot be delayed; they must be
1679 * written in the order that the writes are requested.
1680 *
1681 * Described in Leffler, et al. (pp. 208-213).
1682 *
1683 * Note: With the ability to allocate additional buffer
1684 * headers, we can get into the situation where "too" many
1685 * buf_bdwrite()s can create a situation where the kernel can create
1686 * buffers faster than the disks can service. Doing a buf_bawrite() in
1687 * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
1688 */
1689 __private_extern__ int
1690 bdwrite_internal(buf_t bp, int return_error)
1691 {
1692 proc_t p = current_proc();
1693 vnode_t vp = bp->b_vp;
1694
1695 /*
1696 * If the block hasn't been seen before:
1697 * (1) Mark it as having been seen,
1698 * (2) Charge for the write.
1699 * (3) Make sure it's on its vnode's correct block list,
1700 */
1701 if (!ISSET(bp->b_flags, B_DELWRI)) {
1702 SET(bp->b_flags, B_DELWRI);
1703 if (p && p->p_stats)
1704 p->p_stats->p_ru.ru_oublock++; /* XXX */
1705 OSAddAtomic(1, &nbdwrite);
1706 buf_reassign(bp, vp);
1707 }
1708
1709 /* If this is a tape block, write the block now. */
1710 if (ISSET(bp->b_flags, B_TAPE)) {
1711 VNOP_BWRITE(bp);
1712 return (0);
1713 }
1714
1715 /*
1716 * if we're not LOCKED, but the total number of delayed writes
1717 * has climbed above 75% of the total buffers in the system,
1718 * return an error if the caller has indicated that it can
1719 * handle one in this case; otherwise schedule the I/O now.
1720 * this is done to prevent us from allocating tons of extra
1721 * buffers when dealing with virtual disks (i.e. DiskImages),
1722 * because additional buffers are dynamically allocated to prevent
1723 * deadlocks from occurring
1724 *
1725 * however, can't do a buf_bawrite() if the LOCKED bit is set because the
1726 * buffer is part of a transaction and can't go to disk until
1727 * the LOCKED bit is cleared.
1728 */
1729 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
1730 if (return_error)
1731 return (EAGAIN);
1732 /*
1733 * If the vnode has "too many" write operations in progress
1734 * wait for them to finish the IO
1735 */
1736 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (char *)"buf_bdwrite");
1737
1738 return (buf_bawrite(bp));
1739 }
1740
1741 /* Otherwise, the "write" is done, so mark and release the buffer. */
1742 SET(bp->b_flags, B_DONE);
1743 buf_brelse(bp);
1744 return (0);
1745 }
1746
1747 errno_t
1748 buf_bdwrite(buf_t bp)
1749 {
1750 return (bdwrite_internal(bp, 0));
1751 }
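
/*
 * Delayed-write sketch, the read/modify/buf_bdwrite() pattern described
 * above (off, len, src, lbn and blksize are hypothetical caller values):
 *
 *	if ((error = buf_bread(vp, lbn, blksize, NOCRED, &bp))) {
 *		buf_brelse(bp);
 *		return (error);
 *	}
 *	bcopy(src, (char *)buf_dataptr(bp) + off, len);
 *	buf_bdwrite(bp);	marks B_DELWRI and releases bp; no I/O yet
 */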
1752
1753
1754 /*
1755 * Asynchronous block write; just an asynchronous buf_bwrite().
1756 *
1757 * Note: With the ability to allocate additional buffer
1758 * headers, we can get into the situation where "too" many
1759 * buf_bawrite()s can create a situation where the kernel can create
1760 * buffers faster than the disks can service.
1761 * We limit the number of "in flight" writes a vnode can have to
1762 * avoid this.
1763 */
1764 static int
1765 bawrite_internal(buf_t bp, int throttle)
1766 {
1767 vnode_t vp = bp->b_vp;
1768
1769 if (vp) {
1770 if (throttle)
1771 /*
1772 * If the vnode has "too many" write operations in progress
1773 * wait for them to finish the IO
1774 */
1775 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
1776 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
1777 /*
1778 * return to the caller and
1779 * let him decide what to do
1780 */
1781 return (EWOULDBLOCK);
1782 }
1783 SET(bp->b_flags, B_ASYNC);
1784
1785 return (VNOP_BWRITE(bp));
1786 }
1787
1788 errno_t
1789 buf_bawrite(buf_t bp)
1790 {
1791 return (bawrite_internal(bp, 1));
1792 }
1793
1794
1795 /*
1796 * Release a buffer on to the free lists.
1797 * Described in Bach (p. 46).
1798 */
1799 void
1800 buf_brelse(buf_t bp)
1801 {
1802 struct bqueues *bufq;
1803 long whichq;
1804 upl_t upl;
1805 int need_wakeup = 0;
1806 int need_bp_wakeup = 0;
1807
1808
1809 if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
1810 panic("buf_brelse: bad buffer = %x\n", bp);
1811
1812 #ifdef JOE_DEBUG
1813 bp->b_stackbrelse[0] = __builtin_return_address(0);
1814 bp->b_stackbrelse[1] = __builtin_return_address(1);
1815 bp->b_stackbrelse[2] = __builtin_return_address(2);
1816 bp->b_stackbrelse[3] = __builtin_return_address(3);
1817 bp->b_stackbrelse[4] = __builtin_return_address(4);
1818 bp->b_stackbrelse[5] = __builtin_return_address(5);
1819
1820 bp->b_lastbrelse = current_thread();
1821 bp->b_tag = 0;
1822 #endif
1823 if (bp->b_lflags & BL_IOBUF) {
1824 free_io_buf(bp);
1825 return;
1826 }
1827
1828 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
1829 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_datap,
1830 bp->b_flags, 0);
1831
1832 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1833
1834 /*
1835 * if we're invalidating a buffer that has the B_FILTER bit
1836 * set then call the b_iodone function so it gets cleaned
1837 * up properly.
1838 *
1839 * the HFS journal code depends on this
1840 */
1841 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
1842 if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */
1843 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
1844 void *arg = (void *)bp->b_transaction;
1845
1846 CLR(bp->b_flags, B_FILTER); /* but note callout done */
1847 bp->b_iodone = NULL;
1848 bp->b_transaction = NULL;
1849
1850 if (iodone_func == NULL) {
1851 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
1852 }
1853 (*iodone_func)(bp, arg);
1854 }
1855 }
1856 /*
1857 * I/O is done. Cleanup the UPL state
1858 */
1859 upl = bp->b_upl;
1860
1861 if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
1862 kern_return_t kret;
1863 int upl_flags;
1864
1865 if ( (upl == NULL) ) {
1866 if ( !ISSET(bp->b_flags, B_INVAL)) {
1867 kret = ubc_create_upl(bp->b_vp,
1868 ubc_blktooff(bp->b_vp, bp->b_lblkno),
1869 bp->b_bufsize,
1870 &upl,
1871 NULL,
1872 UPL_PRECIOUS);
1873
1874 if (kret != KERN_SUCCESS)
1875 panic("brelse: Failed to create UPL");
1876 #ifdef UPL_DEBUG
1877 upl_ubc_alias_set(upl, bp, 5);
1878 #endif /* UPL_DEBUG */
1879 }
1880 } else {
1881 if (bp->b_datap) {
1882 kret = ubc_upl_unmap(upl);
1883
1884 if (kret != KERN_SUCCESS)
1885 panic("ubc_upl_unmap failed");
1886 bp->b_datap = (uintptr_t)NULL;
1887 }
1888 }
1889 if (upl) {
1890 if (bp->b_flags & (B_ERROR | B_INVAL)) {
1891 if (bp->b_flags & (B_READ | B_INVAL))
1892 upl_flags = UPL_ABORT_DUMP_PAGES;
1893 else
1894 upl_flags = 0;
1895
1896 ubc_upl_abort(upl, upl_flags);
1897 } else {
1898 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
1899 upl_flags = UPL_COMMIT_SET_DIRTY ;
1900 else
1901 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
1902
1903 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
1904 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1905 }
1906 bp->b_upl = NULL;
1907 }
1908 } else {
1909 if ( (upl) )
1910 panic("brelse: UPL set for non VREG; vp=%x", bp->b_vp);
1911 }
1912
1913 /*
1914 * If it's locked, don't report an error; try again later.
1915 */
1916 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
1917 CLR(bp->b_flags, B_ERROR);
1918 /*
1919 * If it's not cacheable, or an error, mark it invalid.
1920 */
1921 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
1922 SET(bp->b_flags, B_INVAL);
1923
1924 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
1925 /*
1926 * If it's invalid or empty, dissociate it from its vnode
1927 * and put on the head of the appropriate queue.
1928 */
1929 if (bp->b_vp)
1930 brelvp(bp);
1931
1932 if (ISSET(bp->b_flags, B_DELWRI))
1933 OSAddAtomic(-1, &nbdwrite);
1934
1935 CLR(bp->b_flags, (B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE));
1936 /*
1937 * Determine which queue the buffer should be on, then put it there.
1938 */
1939 if (bp->b_bufsize <= 0)
1940 whichq = BQ_EMPTY; /* no data */
1941 else if (ISSET(bp->b_flags, B_META))
1942 whichq = BQ_META; /* meta-data */
1943 else
1944 whichq = BQ_AGE; /* invalid data */
1945 bufq = &bufqueues[whichq];
1946
1947 lck_mtx_lock(buf_mtxp);
1948
1949 binsheadfree(bp, bufq, whichq);
1950 } else {
1951 /*
1952 * It has valid data. Put it on the end of the appropriate
1953 * queue, so that it'll stick around for as long as possible.
1954 */
1955 if (ISSET(bp->b_flags, B_LOCKED))
1956 whichq = BQ_LOCKED; /* locked in core */
1957 else if (ISSET(bp->b_flags, B_META))
1958 whichq = BQ_META; /* meta-data */
1959 else if (ISSET(bp->b_flags, B_AGE))
1960 whichq = BQ_AGE; /* stale but valid data */
1961 else
1962 whichq = BQ_LRU; /* valid data */
1963 bufq = &bufqueues[whichq];
1964
1965 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
1966
1967 lck_mtx_lock(buf_mtxp);
1968
1969 binstailfree(bp, bufq, whichq);
1970 }
1971 if (needbuffer) {
1972 /*
1973 * needbuffer is a global
1974 * we're currently using buf_mtxp to protect it
1975 * delay doing the actual wakeup until after
1976 * we drop buf_mtxp
1977 */
1978 needbuffer = 0;
1979 need_wakeup = 1;
1980 }
1981 if (ISSET(bp->b_lflags, BL_WANTED)) {
1982 /*
1983 * delay the actual wakeup until after we
1984 * clear BL_BUSY and we've dropped buf_mtxp
1985 */
1986 need_bp_wakeup = 1;
1987 }
1988 /*
1989 * Unlock the buffer.
1990 */
1991 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
1992
1993 lck_mtx_unlock(buf_mtxp);
1994
1995 if (need_wakeup) {
1996 /*
1997 * Wake up any processes waiting for any buffer to become free.
1998 */
1999 wakeup(&needbuffer);
2000 }
2001 if (need_bp_wakeup) {
2002 /*
2003 * Wake up any processes waiting for _this_ buffer to become free.
2004 */
2005 wakeup(bp);
2006 }
2007 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2008 (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
2009 }
2010
2011 /*
2012 * Determine if a block is in the cache.
2013 * Just look on what would be its hash chain. If it's there, return
2014 * a pointer to it, unless it's marked invalid. If it's marked invalid,
2015 * we normally don't return the buffer, unless the caller explicitly
2016 * wants us to.
2017 */
2018 static boolean_t
2019 incore(vnode_t vp, daddr64_t blkno)
2020 {
2021 boolean_t retval;
2022
2023 lck_mtx_lock(buf_mtxp);
2024
2025 if (incore_locked(vp, blkno))
2026 retval = TRUE;
2027 else
2028 retval = FALSE;
2029 lck_mtx_unlock(buf_mtxp);
2030
2031 return (retval);
2032 }
2033
2034
2035 static buf_t
2036 incore_locked(vnode_t vp, daddr64_t blkno)
2037 {
2038 struct buf *bp;
2039
2040 bp = BUFHASH(vp, blkno)->lh_first;
2041
2042 /* Search hash chain */
2043 for (; bp != NULL; bp = bp->b_hash.le_next) {
2044 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2045 !ISSET(bp->b_flags, B_INVAL)) {
2046 return (bp);
2047 }
2048 }
2049 return (0);
2050 }
2051
2052
2053 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2054 /*
2055 * Get a block of requested size that is associated with
2056 * a given vnode and block offset. If it is found in the
2057 * block cache, mark it as having been found, make it busy
2058 * and return it. Otherwise, return an empty block of the
2059 * correct size. It is up to the caller to ensure that the
2060 * cached blocks be of the correct size.
2061 */
2062 buf_t
2063 buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2064 {
2065 buf_t bp;
2066 int err;
2067 upl_t upl;
2068 upl_page_info_t *pl;
2069 kern_return_t kret;
2070 int ret_only_valid;
2071 struct timespec ts;
2072 int upl_flags;
2073
2074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2075 (int)(blkno * PAGE_SIZE), size, operation, 0, 0);
2076
2077 ret_only_valid = operation & BLK_ONLYVALID;
2078 operation &= ~BLK_ONLYVALID;
2079 start:
2080 lck_mtx_lock(buf_mtxp);
2081 start_locked:
2082 if ((bp = incore_locked(vp, blkno))) {
2083 /*
2084 * Found in the Buffer Cache
2085 */
2086 if (ISSET(bp->b_lflags, BL_BUSY)) {
2087 /*
2088 * but is busy
2089 */
2090 switch (operation) {
2091 case BLK_READ:
2092 case BLK_WRITE:
2093 case BLK_META:
2094 SET(bp->b_lflags, BL_WANTED);
2095 bufstats.bufs_busyincore++;
2096
2097 /*
2098 * don't retake the mutex after being awakened...
2099 * the time out is in msecs
2100 */
2101 ts.tv_sec = (slptimeo/1000);
2102 ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000;
2103
2104 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2105
2106 /*
2107 * Callers who call with PCATCH or timeout are
2108 * willing to deal with the NULL pointer
2109 */
2110 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2111 return (NULL);
2112 goto start;
2113 /*NOTREACHED*/
2114 break;
2115
2116 default:
2117 /*
2118 * unknown operation requested
2119 */
2120 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2121 /*NOTREACHED*/
2122 break;
2123 }
2124 } else {
2125 /*
2126 * buffer in core and not busy
2127 */
2128 if ( (bp->b_upl) )
2129 panic("buffer has UPL, but not marked BUSY: %x", bp);
2130 SET(bp->b_lflags, BL_BUSY);
2131 SET(bp->b_flags, B_CACHE);
2132 #ifdef JOE_DEBUG
2133 bp->b_owner = current_thread();
2134 bp->b_tag = 1;
2135 #endif
2136 bremfree_locked(bp);
2137 bufstats.bufs_incore++;
2138
2139 lck_mtx_unlock(buf_mtxp);
2140
2141 if ( !ret_only_valid)
2142 allocbuf(bp, size);
2143
2144 upl_flags = 0;
2145 switch (operation) {
2146 case BLK_WRITE:
2147 /*
2148 * "write" operation: let the UPL subsystem
2149 * know that we intend to modify the buffer
2150 * cache pages we're gathering.
2151 */
2152 upl_flags |= UPL_WILL_MODIFY;
2153 case BLK_READ:
2154 upl_flags |= UPL_PRECIOUS;
2155 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2156 kret = ubc_create_upl(vp,
2157 ubc_blktooff(vp, bp->b_lblkno),
2158 bp->b_bufsize,
2159 &upl,
2160 &pl,
2161 upl_flags);
2162 if (kret != KERN_SUCCESS)
2163 panic("Failed to create UPL");
2164
2165 bp->b_upl = upl;
2166
2167 if (upl_valid_page(pl, 0)) {
2168 if (upl_dirty_page(pl, 0))
2169 SET(bp->b_flags, B_WASDIRTY);
2170 else
2171 CLR(bp->b_flags, B_WASDIRTY);
2172 } else
2173 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
2174
2175 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2176
2177 if (kret != KERN_SUCCESS)
2178 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2179 }
2180 break;
2181
2182 case BLK_META:
2183 /*
2184 * VM is not involved in IO for the meta data
2185 * buffer already has valid data
2186 */
2187 break;
2188
2189 default:
2190 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
2191 /*NOTREACHED*/
2192 break;
2193 }
2194 }
2195 } else { /* not incore() */
2196 int queue = BQ_EMPTY; /* Start with no preference */
2197
2198 if (ret_only_valid) {
2199 lck_mtx_unlock(buf_mtxp);
2200 return (NULL);
2201 }
2202
2203 if ((UBCINVALID(vp)) || !(UBCINFOEXISTS(vp)))
2204 operation = BLK_META;
2205
2206 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
2207 goto start_locked;
2208
2209 /*
2210 * getnewbuf may block for a number of different reasons...
2211 * if it does, it's then possible for someone else to
2212 * create a buffer for the same block and insert it into
2213 * the hash... if we see it incore at this point we dump
2214 * the buffer we were working on and start over
2215 */
2216 if (incore_locked(vp, blkno)) {
2217 SET(bp->b_flags, B_INVAL);
2218 binshash(bp, &invalhash);
2219
2220 lck_mtx_unlock(buf_mtxp);
2221
2222 buf_brelse(bp);
2223 goto start;
2224 }
2225 /*
2226 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
2227 * CALLED! BE CAREFUL.
2228 */
2229
2230 /*
2231 * mark the buffer as B_META if indicated
2232 * so that when the buffer is released it will go to the META queue
2233 */
2234 if (operation == BLK_META)
2235 SET(bp->b_flags, B_META);
2236
2237 bp->b_blkno = bp->b_lblkno = blkno;
2238 bp->b_vp = vp;
2239
2240 /*
2241 * Insert in the hash so that incore() can find it
2242 */
2243 binshash(bp, BUFHASH(vp, blkno));
2244
2245 lck_mtx_unlock(buf_mtxp);
2246
2247 bgetvp(vp, bp);
2248
2249 allocbuf(bp, size);
2250
2251 upl_flags = 0;
2252 switch (operation) {
2253 case BLK_META:
2254 /*
2255 * buffer data is invalid...
2256 *
2257 * I don't want to have to retake buf_mtxp,
2258 * so the miss and vmhits counters are done
2259 * with Atomic updates... all other counters
2260 * in bufstats are protected with either
2261 * buf_mtxp or iobuffer_mtxp
2262 */
2263 OSAddAtomic(1, &bufstats.bufs_miss);
2264 break;
2265
2266 case BLK_WRITE:
2267 /*
2268 * "write" operation: let the UPL subsystem know
2269 * that we intend to modify the buffer cache pages
2270 * we're gathering.
2271 */
2272 upl_flags |= UPL_WILL_MODIFY;
2273 case BLK_READ:
2274 { off_t f_offset;
2275 size_t contig_bytes;
2276 int bmap_flags;
2277
2278 if ( (bp->b_upl) )
2279 panic("bp already has UPL: %x",bp);
2280
2281 f_offset = ubc_blktooff(vp, blkno);
2282
2283 upl_flags |= UPL_PRECIOUS;
2284 kret = ubc_create_upl(vp,
2285 f_offset,
2286 bp->b_bufsize,
2287 &upl,
2288 &pl,
2289 upl_flags);
2290
2291 if (kret != KERN_SUCCESS)
2292 panic("Failed to create UPL");
2293 #ifdef UPL_DEBUG
2294 upl_ubc_alias_set(upl, bp, 4);
2295 #endif /* UPL_DEBUG */
2296 bp->b_upl = upl;
2297
2298 if (upl_valid_page(pl, 0)) {
2299
2300 if (operation == BLK_READ)
2301 bmap_flags = VNODE_READ;
2302 else
2303 bmap_flags = VNODE_WRITE;
2304
2305 SET(bp->b_flags, B_CACHE | B_DONE);
2306
2307 OSAddAtomic(1, &bufstats.bufs_vmhits);
2308
2309 bp->b_validoff = 0;
2310 bp->b_dirtyoff = 0;
2311
2312 if (upl_dirty_page(pl, 0)) {
2313 /* page is dirty */
2314 SET(bp->b_flags, B_WASDIRTY);
2315
2316 bp->b_validend = bp->b_bcount;
2317 bp->b_dirtyend = bp->b_bcount;
2318 } else {
2319 /* page is clean */
2320 bp->b_validend = bp->b_bcount;
2321 bp->b_dirtyend = 0;
2322 }
2323 /*
2324 * try to recreate the physical block number associated with
2325 * this buffer...
2326 */
2327 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
2328 panic("getblk: VNOP_BLOCKMAP failed");
2329 /*
2330 * if the extent represented by this buffer
2331 * is not completely physically contiguous on
2332 * disk, then we can't cache the physical mapping
2333 * in the buffer header
2334 */
2335 if ((long)contig_bytes < bp->b_bcount)
2336 bp->b_blkno = bp->b_lblkno;
2337 } else {
2338 OSAddAtomic(1, &bufstats.bufs_miss);
2339 }
2340 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2341
2342 if (kret != KERN_SUCCESS)
2343 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2344 break;
2345 }
2346 default:
2347 panic("getblk: paging or unknown operation - %x", operation);
2348 /*NOTREACHED*/
2349 break;
2350 }
2351 }
2352 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
2353 (int)bp, (int)bp->b_datap, bp->b_flags, 3, 0);
2354
2355 #ifdef JOE_DEBUG
2356 bp->b_stackgetblk[0] = __builtin_return_address(0);
2357 bp->b_stackgetblk[1] = __builtin_return_address(1);
2358 bp->b_stackgetblk[2] = __builtin_return_address(2);
2359 bp->b_stackgetblk[3] = __builtin_return_address(3);
2360 bp->b_stackgetblk[4] = __builtin_return_address(4);
2361 bp->b_stackgetblk[5] = __builtin_return_address(5);
2362 #endif
2363 return (bp);
2364 }
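
/*
 * Illustrative sketch (hypothetical caller, not part of the original source):
 * a typical metadata read path obtains a busy buffer with buf_getblk(),
 * issues the read only if the cached copy isn't already valid, and releases
 * the buffer when done.  The function name and block parameters below are
 * assumptions for illustration only.
 */
#if 0
static int
example_meta_read(vnode_t vp, daddr64_t blkno, int blksize)
{
	buf_t	bp;
	int	error = 0;

	/* find the block in the cache, or get an empty buffer for it */
	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_META);

	if ( !ISSET(bp->b_flags, B_CACHE)) {
		/* contents not valid... go to disk for them */
		SET(bp->b_flags, B_READ);
		VNOP_STRATEGY(bp);
		error = buf_biowait(bp);
	}
	/* done with the buffer... hand it back to the cache */
	buf_brelse(bp);

	return (error);
}
#endif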
2365
2366 /*
2367 * Get an empty, disassociated buffer of given size.
2368 */
2369 buf_t
2370 buf_geteblk(size)
2371 int size;
2372 {
2373 buf_t bp;
2374 int queue = BQ_EMPTY;
2375
2376 lck_mtx_lock(buf_mtxp);
2377
2378 while ((bp = getnewbuf(0, 0, &queue)) == 0)
2379 ;
2380 SET(bp->b_flags, (B_META|B_INVAL));
2381
2382 #if DIAGNOSTIC
2383 assert(queue == BQ_EMPTY);
2384 #endif /* DIAGNOSTIC */
2385 /* XXX need to implement logic to deal with other queues */
2386
2387 binshash(bp, &invalhash);
2388 bufstats.bufs_eblk++;
2389
2390 lck_mtx_unlock(buf_mtxp);
2391
2392 allocbuf(bp, size);
2393
2394 return (bp);
2395 }
2396
2397 /*
2398 * Zones for the meta data buffers
2399 */
2400
2401 #define MINMETA 512
2402 #define MAXMETA 4096
2403
2404 struct meta_zone_entry {
2405 zone_t mz_zone;
2406 vm_size_t mz_size;
2407 vm_size_t mz_max;
2408 char *mz_name;
2409 };
2410
2411 struct meta_zone_entry meta_zones[] = {
2412 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2413 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
2414 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
2415 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2416 {NULL, 0, 0, "" } /* End */
2417 };
2418
2419 /*
2420 * Initialize the meta data zones
2421 */
2422 static void
2423 bufzoneinit(void)
2424 {
2425 int i;
2426
2427 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2428 meta_zones[i].mz_zone =
2429 zinit(meta_zones[i].mz_size,
2430 meta_zones[i].mz_max,
2431 PAGE_SIZE,
2432 meta_zones[i].mz_name);
2433 }
2434 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2435 }
2436
2437 static __inline__ zone_t
2438 getbufzone(size_t size)
2439 {
2440 int i;
2441
2442 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2443 panic("getbufzone: incorrect size = %d", size);
2444
2445 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2446 if (meta_zones[i].mz_size >= size)
2447 break;
2448 }
2449
2450 return (meta_zones[i].mz_zone);
2451 }
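
/*
 * Illustrative sketch (not part of the original source): the lookup above
 * returns the first zone whose element size covers the request, so, for
 * example, a 1536 byte request is satisfied from the "buf.2048" zone.
 * The sizes used below are assumed examples.
 */
#if 0
static void
example_bufzone_mapping(void)
{
	assert(getbufzone(512)  == meta_zones[0].mz_zone);	/* "buf.512"  */
	assert(getbufzone(1024) == meta_zones[1].mz_zone);	/* "buf.1024" */
	assert(getbufzone(1536) == meta_zones[2].mz_zone);	/* "buf.2048" */
	assert(getbufzone(4096) == meta_zones[3].mz_zone);	/* "buf.4096" */
}
#endif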
2452
2453 /*
2454 * With UBC, there is no need to expand / shrink the file data
2455 * buffer. The VM uses the same pages, hence no waste.
2456 * All the file data buffers can have one size.
2457 * In fact expand / shrink would be an expensive operation.
2458 *
2459 * The only exception to this is meta-data buffers. Most of the
2460 * meta data operations are smaller than PAGE_SIZE. Having the
2461 * meta-data buffers grow and shrink as needed optimizes use
2462 * of the kernel wired memory.
2463 */
2464
2465 int
2466 allocbuf(buf_t bp, int size)
2467 {
2468 vm_size_t desired_size;
2469
2470 desired_size = roundup(size, CLBYTES);
2471
2472 if (desired_size < PAGE_SIZE)
2473 desired_size = PAGE_SIZE;
2474 if (desired_size > MAXBSIZE)
2475 panic("allocbuf: buffer larger than MAXBSIZE requested");
2476
2477 if (ISSET(bp->b_flags, B_META)) {
2478 zone_t zprev, z;
2479 int nsize = roundup(size, MINMETA);
2480
2481 if (bp->b_datap) {
2482 vm_offset_t elem = (vm_offset_t)bp->b_datap;
2483
2484 if (ISSET(bp->b_flags, B_ZALLOC)) {
2485 if (bp->b_bufsize < nsize) {
2486 /* reallocate to a bigger size */
2487
2488 zprev = getbufzone(bp->b_bufsize);
2489 if (nsize <= MAXMETA) {
2490 desired_size = nsize;
2491 z = getbufzone(nsize);
2492 bp->b_datap = (uintptr_t)zalloc(z);
2493 } else {
2494 bp->b_datap = (uintptr_t)NULL;
2495 kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2496 CLR(bp->b_flags, B_ZALLOC);
2497 }
2498 bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2499 zfree(zprev, (void *)elem);
2500 } else {
2501 desired_size = bp->b_bufsize;
2502 }
2503
2504 } else {
2505 if ((vm_size_t)bp->b_bufsize < desired_size) {
2506 /* reallocate to a bigger size */
2507 bp->b_datap = (uintptr_t)NULL;
2508 kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2509 bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2510 kmem_free(kernel_map, elem, bp->b_bufsize);
2511 } else {
2512 desired_size = bp->b_bufsize;
2513 }
2514 }
2515 } else {
2516 /* new allocation */
2517 if (nsize <= MAXMETA) {
2518 desired_size = nsize;
2519 z = getbufzone(nsize);
2520 bp->b_datap = (uintptr_t)zalloc(z);
2521 SET(bp->b_flags, B_ZALLOC);
2522 } else
2523 kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2524 }
2525 }
2526 bp->b_bufsize = desired_size;
2527 bp->b_bcount = size;
2528
2529 return (0);
2530 }
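
/*
 * Worked example (assumed sizes): growing a B_META, B_ZALLOC buffer from
 * b_bufsize == 1024 to a requested size of 1536 rounds the request to a
 * MINMETA multiple (1536), allocates a new element from the "buf.2048"
 * zone (the first zone that covers 1536 bytes), copies the old 1024 bytes
 * over, frees the old "buf.1024" element, and leaves b_bufsize == b_bcount
 * == 1536.
 */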
2531
2532 /*
2533 * Get a new buffer from one of the free lists.
2534 *
2535 * A request for a queue is passed in. The queue from which the buffer was
2536 * taken is returned. Out of range queue requests get BQ_EMPTY. A request for
2537 * BQUEUES means no preference; use heuristics in that case.
2538 * The heuristics are as follows:
2539 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
2540 * If none are available, block till one is made available.
2541 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps
2542 * and pick the most stale buffer.
2543 * If the buffer found was marked for delayed write, start the async write
2544 * and restart the search.
2545 * Initialize the fields and disassociate the buffer from the vnode.
2546 * Remove the buffer from the hash. Return the buffer and the queue
2547 * on which it was found.
2548 *
2549 * buf_mtxp is held upon entry
2550 * returns with buf_mtxp locked
2551 */
2552
2553 static buf_t
2554 getnewbuf(int slpflag, int slptimeo, int * queue)
2555 {
2556 buf_t bp;
2557 buf_t lru_bp;
2558 buf_t age_bp;
2559 buf_t meta_bp;
2560 int age_time, lru_time, bp_time, meta_time;
2561 int req = *queue; /* save it for restarts */
2562 struct timespec ts;
2563
2564 start:
2565 /*
2566 * invalid request gets empty queue
2567 */
2568 if ((*queue > BQUEUES) || (*queue < 0)
2569 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
2570 *queue = BQ_EMPTY;
2571
2572 /*
2573 * (*queue == BQUEUES) means no preference
2574 */
2575 if (*queue != BQUEUES) {
2576 /* Try for the requested queue first */
2577 bp = bufqueues[*queue].tqh_first;
2578 if (bp)
2579 goto found;
2580 }
2581
2582 /* Unable to use requested queue */
2583 age_bp = bufqueues[BQ_AGE].tqh_first;
2584 lru_bp = bufqueues[BQ_LRU].tqh_first;
2585 meta_bp = bufqueues[BQ_META].tqh_first;
2586
2587 if (!age_bp && !lru_bp && !meta_bp) {
2588 /*
2589 * Unavailable on AGE or LRU or META queues
2590 * Try the empty list first
2591 */
2592 bp = bufqueues[BQ_EMPTY].tqh_first;
2593 if (bp) {
2594 *queue = BQ_EMPTY;
2595 goto found;
2596 }
2597 lck_mtx_unlock(buf_mtxp);
2598
2599 /* Create a new temporary buffer header */
2600 bp = (struct buf *)zalloc(buf_hdr_zone);
2601
2602 lck_mtx_lock(buf_mtxp);
2603
2604 if (bp) {
2605 bufhdrinit(bp);
2606 BLISTNONE(bp);
2607 binshash(bp, &invalhash);
2608 SET(bp->b_flags, B_HDRALLOC);
2609 *queue = BQ_EMPTY;
2610 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2611 buf_hdr_count++;
2612 goto found;
2613 }
2614 bufstats.bufs_sleeps++;
2615
2616 /* wait for a free buffer of any kind */
2617 needbuffer = 1;
2618 /* the timeout is in msecs */
2619 ts.tv_sec = (slptimeo/1000);
2620 /* the hz value is 100, which leads to 10ms ticks */
2621 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
2622 msleep(&needbuffer, buf_mtxp, slpflag|(PRIBIO+1), (char *)"getnewbuf", &ts);
2623
2624 return (0);
2625 }
2626
2627 /* Buffer available either on AGE or LRU or META */
2628 bp = NULL;
2629 *queue = -1;
2630
2631 /* Buffer available either on AGE or LRU */
2632 if (!age_bp) {
2633 bp = lru_bp;
2634 *queue = BQ_LRU;
2635 } else if (!lru_bp) {
2636 bp = age_bp;
2637 *queue = BQ_AGE;
2638 } else { /* buffer available on both AGE and LRU */
2639 int t = buf_timestamp();
2640
2641 age_time = t - age_bp->b_timestamp;
2642 lru_time = t - lru_bp->b_timestamp;
2643 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
2644 bp = age_bp;
2645 *queue = BQ_AGE;
2646 /*
2647 * we should probably re-timestamp everything in the
2648 * queues at this point with the current time
2649 */
2650 } else {
2651 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
2652 bp = lru_bp;
2653 *queue = BQ_LRU;
2654 } else {
2655 bp = age_bp;
2656 *queue = BQ_AGE;
2657 }
2658 }
2659 }
2660
2661 if (!bp) { /* Neither on AGE nor on LRU */
2662 bp = meta_bp;
2663 *queue = BQ_META;
2664 } else if (meta_bp) {
2665 int t = buf_timestamp();
2666
2667 bp_time = t - bp->b_timestamp;
2668 meta_time = t - meta_bp->b_timestamp;
2669
2670 if (!(bp_time < 0) && !(meta_time < 0)) {
2671 /* time not set backwards */
2672 int bp_is_stale;
2673 bp_is_stale = (*queue == BQ_LRU) ?
2674 lru_is_stale : age_is_stale;
2675
2676 if ((meta_time >= meta_is_stale) &&
2677 (bp_time < bp_is_stale)) {
2678 bp = meta_bp;
2679 *queue = BQ_META;
2680 }
2681 }
2682 }
2683 found:
2684 if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
2685 panic("getnewbuf: bp @ 0x%x is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
2686
2687 /* Clean it */
2688 if (bcleanbuf(bp)) {
2689 /*
2690 * moved to the laundry thread, buffer not ready
2691 */
2692 *queue = req;
2693 goto start;
2694 }
2695 return (bp);
2696 }
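
/*
 * Worked example (illustrative thresholds, not the actual tunables): assume
 * lru_is_stale == 120s and age_is_stale == 60s.  If the LRU head is 150s old
 * and the AGE head is only 30s old, the LRU buffer is taken; if both heads
 * are past their staleness thresholds, the AGE head is preferred.
 */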
2697
2698
2699 /*
2700 * Clean a buffer.
2701 * Returns 0 if the buffer is ready to use,
2702 * Returns 1 if issued a buf_bawrite() to indicate
2703 * that the buffer is not ready.
2704 *
2705 * buf_mtxp is held upon entry
2706 * returns with buf_mtxp locked
2707 */
2708 static int
2709 bcleanbuf(buf_t bp)
2710 {
2711 ucred_t cred;
2712
2713
2714 /* Remove from the queue */
2715 bremfree_locked(bp);
2716
2717 /* Buffer is no longer on free lists. */
2718 SET(bp->b_lflags, BL_BUSY);
2719 #ifdef JOE_DEBUG
2720 bp->b_owner = current_thread();
2721 bp->b_tag = 2;
2722 #endif
2723 /*
2724 * If buffer was a delayed write, start the IO by queuing
2725 * it on the LAUNDRY queue, and return 1
2726 */
2727 if (ISSET(bp->b_flags, B_DELWRI)) {
2728 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2729 blaundrycnt++;
2730
2731 lck_mtx_unlock(buf_mtxp);
2732
2733 wakeup(&blaundrycnt);
2734 /* and give it a chance to run */
2735 (void)thread_block(THREAD_CONTINUE_NULL);
2736
2737 lck_mtx_lock(buf_mtxp);
2738 return (1);
2739 }
2740 bremhash(bp);
2741
2742 lck_mtx_unlock(buf_mtxp);
2743
2744 BLISTNONE(bp);
2745 /*
2746 * disassociate us from our vnode, if we had one...
2747 */
2748 if (bp->b_vp)
2749 brelvp(bp);
2750
2751 if (ISSET(bp->b_flags, B_META)) {
2752 vm_offset_t elem;
2753
2754 elem = (vm_offset_t)bp->b_datap;
2755 bp->b_datap = (uintptr_t)0xdeadbeef;
2756
2757 if (ISSET(bp->b_flags, B_ZALLOC)) {
2758 zone_t z;
2759
2760 z = getbufzone(bp->b_bufsize);
2761 zfree(z, (void *)elem);
2762 } else
2763 kmem_free(kernel_map, elem, bp->b_bufsize);
2764 }
2765
2766 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2767
2768 /* clear out various other fields */
2769 bp->b_bufsize = 0;
2770 bp->b_datap = (uintptr_t)NULL;
2771 bp->b_upl = (void *)NULL;
2772 /*
2773 * preserve the state of whether this buffer
2774 * was allocated on the fly or not...
2775 * the only other flag that should be set at
2776 * this point is BL_BUSY...
2777 */
2778 #ifdef JOE_DEBUG
2779 bp->b_owner = current_thread();
2780 bp->b_tag = 3;
2781 #endif
2782 bp->b_lflags = BL_BUSY;
2783 bp->b_flags = (bp->b_flags & B_HDRALLOC);
2784 bp->b_dev = NODEV;
2785 bp->b_blkno = bp->b_lblkno = 0;
2786 bp->b_iodone = NULL;
2787 bp->b_error = 0;
2788 bp->b_resid = 0;
2789 bp->b_bcount = 0;
2790 bp->b_dirtyoff = bp->b_dirtyend = 0;
2791 bp->b_validoff = bp->b_validend = 0;
2792
2793 /* nuke any credentials we were holding */
2794 cred = bp->b_rcred;
2795 if (cred != NOCRED) {
2796 bp->b_rcred = NOCRED;
2797 kauth_cred_rele(cred);
2798 }
2799 cred = bp->b_wcred;
2800 if (cred != NOCRED) {
2801 bp->b_wcred = NOCRED;
2802 kauth_cred_rele(cred);
2803 }
2804 lck_mtx_lock(buf_mtxp);
2805
2806 return (0);
2807 }
2808
2809
2810
2811 errno_t
2812 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
2813 {
2814 buf_t bp;
2815 errno_t error;
2816
2817 lck_mtx_lock(buf_mtxp);
2818 relook:
2819 if ((bp = incore_locked(vp, lblkno)) == (struct buf *)0) {
2820 lck_mtx_unlock(buf_mtxp);
2821 return (0);
2822 }
2823 if (ISSET(bp->b_lflags, BL_BUSY)) {
2824 if ( !ISSET(flags, BUF_WAIT)) {
2825 lck_mtx_unlock(buf_mtxp);
2826 return (EBUSY);
2827 }
2828 SET(bp->b_lflags, BL_WANTED);
2829
2830 error = msleep((caddr_t)bp, buf_mtxp, (PRIBIO + 1), (char *)"buf_invalblkno", 0);
2831
2832 if (error)
2833 return (error);
2834 goto relook;
2835 }
2836 bremfree_locked(bp);
2837 SET(bp->b_lflags, BL_BUSY);
2838 SET(bp->b_flags, B_INVAL);
2839 #ifdef JOE_DEBUG
2840 bp->b_owner = current_thread();
2841 bp->b_tag = 4;
2842 #endif
2843 lck_mtx_unlock(buf_mtxp);
2844 buf_brelse(bp);
2845
2846 return (0);
2847 }
2848
2849
2850 void
2851 buf_drop(buf_t bp)
2852 {
2853 int need_wakeup = 0;
2854
2855 lck_mtx_lock(buf_mtxp);
2856
2857 if (ISSET(bp->b_lflags, BL_WANTED)) {
2858 /*
2859 * delay the actual wakeup until after we
2860 * clear BL_BUSY and we've dropped buf_mtxp
2861 */
2862 need_wakeup = 1;
2863 }
2864 /*
2865 * Unlock the buffer.
2866 */
2867 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2868
2869 lck_mtx_unlock(buf_mtxp);
2870
2871 if (need_wakeup) {
2872 /*
2873 * Wake up any processes waiting for _this_ buffer to become free.
2874 */
2875 wakeup(bp);
2876 }
2877 }
2878
2879
2880 errno_t
2881 buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
2882 errno_t error;
2883
2884 lck_mtx_lock(buf_mtxp);
2885
2886 error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
2887
2888 lck_mtx_unlock(buf_mtxp);
2889
2890 return (error);
2891 }
2892
2893
2894 static errno_t
2895 buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
2896 {
2897 errno_t error;
2898 struct timespec ts;
2899
2900 if (ISSET(bp->b_flags, B_LOCKED)) {
2901 if ((flags & BAC_SKIP_LOCKED))
2902 return (EDEADLK);
2903 } else {
2904 if ((flags & BAC_SKIP_NONLOCKED))
2905 return (EDEADLK);
2906 }
2907 if (ISSET(bp->b_lflags, BL_BUSY)) {
2908 /*
2909 * since the mutex_lock may block, the buffer
2910 * may become BUSY, so we need to
2911 * recheck for a NOWAIT request
2912 */
2913 if (flags & BAC_NOWAIT)
2914 return (EBUSY);
2915 SET(bp->b_lflags, BL_WANTED);
2916
2917 /* the hz value is 100; which leads to 10ms */
2918 ts.tv_sec = (slptimeo/100);
2919 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
2920 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), (char *)"buf_acquire", &ts);
2921
2922 if (error)
2923 return (error);
2924 return (EAGAIN);
2925 }
2926 if (flags & BAC_REMOVE)
2927 bremfree_locked(bp);
2928 SET(bp->b_lflags, BL_BUSY);
2929 #ifdef JOE_DEBUG
2930 bp->b_owner = current_thread();
2931 bp->b_tag = 5;
2932 #endif
2933 return (0);
2934 }
2935
2936
2937 /*
2938 * Wait for operations on the buffer to complete.
2939 * When they do, extract and return the I/O's error value.
2940 */
2941 errno_t
2942 buf_biowait(buf_t bp)
2943 {
2944 lck_mtx_lock(buf_mtxp);
2945
2946 while (!ISSET(bp->b_flags, B_DONE))
2947 (void) msleep(bp, buf_mtxp, (PRIBIO+1), (char *)"buf_biowait", 0);
2948
2949 lck_mtx_unlock(buf_mtxp);
2950
2951 /* check for interruption of I/O (e.g. via NFS), then errors. */
2952 if (ISSET(bp->b_flags, B_EINTR)) {
2953 CLR(bp->b_flags, B_EINTR);
2954 return (EINTR);
2955 } else if (ISSET(bp->b_flags, B_ERROR))
2956 return (bp->b_error ? bp->b_error : EIO);
2957 else
2958 return (0);
2959 }
2960
2961 /*
2962 * Mark I/O complete on a buffer.
2963 *
2964 * If a callback has been requested, e.g. the pageout
2965 * daemon, do so. Otherwise, awaken waiting processes.
2966 *
2967 * [ Leffler, et al., says on p.247:
2968 * "This routine wakes up the blocked process, frees the buffer
2969 * for an asynchronous write, or, for a request by the pagedaemon
2970 * process, invokes a procedure specified in the buffer structure" ]
2971 *
2972 * In real life, the pagedaemon (or other system processes) wants
2973 * to do async stuff too, and doesn't want the buffer buf_brelse()'d.
2974 * (for swap pager, that puts swap buffers on the free lists (!!!),
2975 * for the vn device, that puts malloc'd buffers on the free lists!)
2976 */
2977 extern struct timeval priority_IO_timestamp_for_root;
2978 extern int hard_throttle_on_root;
2979
2980 void
2981 buf_biodone(buf_t bp)
2982 {
2983 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
2984 (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
2985
2986 if (ISSET(bp->b_flags, B_DONE))
2987 panic("biodone already");
2988
2989 if (kdebug_enable) {
2990 int code = DKIO_DONE;
2991
2992 if (bp->b_flags & B_READ)
2993 code |= DKIO_READ;
2994 if (bp->b_flags & B_ASYNC)
2995 code |= DKIO_ASYNC;
2996
2997 if (bp->b_flags & B_META)
2998 code |= DKIO_META;
2999 else if (bp->b_flags & B_PAGEIO)
3000 code |= DKIO_PAGING;
3001
3002 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3003 (unsigned int)bp, (unsigned int)bp->b_vp,
3004 bp->b_resid, bp->b_error, 0);
3005 }
3006 if ((bp->b_vp != NULLVP) &&
3007 ((bp->b_flags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
3008 (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
3009 microuptime(&priority_IO_timestamp_for_root);
3010 hard_throttle_on_root = 0;
3011 }
3012 /*
3013 * I/O was done, so don't believe
3014 * the DIRTY state from VM anymore
3015 */
3016 CLR(bp->b_flags, B_WASDIRTY);
3017
3018 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3019 /*
3020 * wake up any writers blocked
3021 * on throttle or waiting for I/O
3022 * to drain
3023 */
3024 vnode_writedone(bp->b_vp);
3025
3026 if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */
3027 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
3028 void *arg = (void *)bp->b_transaction;
3029 int callout = ISSET(bp->b_flags, B_CALL);
3030
3031 CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */
3032 bp->b_iodone = NULL;
3033 bp->b_transaction = NULL;
3034
3035 if (iodone_func == NULL) {
3036 panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
3037 } else {
3038 if (callout)
3039 SET(bp->b_flags, B_DONE); /* note that it's done */
3040 (*iodone_func)(bp, arg);
3041 }
3042 if (callout)
3043 /*
3044 * assumes that the call back function takes
3045 * ownership of the bp and deals with releasing it if necessary
3046 */
3047 goto biodone_done;
3048 /*
3049 * in this case the call back function is acting
3050 * strictly as a filter... it does not take
3051 * ownership of the bp and is expecting us
3052 * to finish cleaning up... this is currently used
3053 * by the HFS journaling code
3054 */
3055 }
3056 if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
3057 SET(bp->b_flags, B_DONE); /* note that it's done */
3058
3059 buf_brelse(bp);
3060 } else { /* or just wakeup the buffer */
3061 /*
3062 * by taking the mutex, we serialize
3063 * the buf owner calling buf_biowait so that we'll
3064 * only see him in one of 2 states...
3065 * state 1: B_DONE wasn't set and he's
3066 * blocked in msleep
3067 * state 2: he's blocked trying to take the
3068 * mutex before looking at B_DONE
3069 * BL_WANTED is cleared in case anyone else
3070 * is blocked waiting for the buffer... note
3071 * that we haven't cleared BL_BUSY yet, so if
3072 * they do get to run, they're going to re-set
3073 * BL_WANTED and go back to sleep
3074 */
3075 lck_mtx_lock(buf_mtxp);
3076
3077 CLR(bp->b_lflags, BL_WANTED);
3078 SET(bp->b_flags, B_DONE); /* note that it's done */
3079
3080 lck_mtx_unlock(buf_mtxp);
3081
3082 wakeup(bp);
3083 }
3084 biodone_done:
3085 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3086 (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
3087 }
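
/*
 * Illustrative sketch (hypothetical caller, not part of the original source):
 * a consumer that doesn't want to block in buf_biowait() can register a
 * completion routine; buf_biodone() will then hand the buffer to it instead
 * of waking a waiter.  "example_iodone" is an assumed name and the example
 * assumes the buf_setcallback() KPI.
 */
#if 0
static void
example_iodone(buf_t bp, void *transaction)
{
	/* a B_CALL callback owns the buffer... release it when finished */
	buf_brelse(bp);
}

static void
example_async_io(buf_t bp)
{
	/* register the completion routine (sets B_CALL and b_iodone) */
	buf_setcallback(bp, example_iodone, NULL);

	VNOP_STRATEGY(bp);
	/* no buf_biowait() here... completion is delivered to example_iodone() */
}
#endif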
3088
3089 /*
3090 * Return a count of buffers on the "locked" queue.
3091 */
3092 int
3093 count_lock_queue(void)
3094 {
3095 buf_t bp;
3096 int n = 0;
3097
3098 lck_mtx_lock(buf_mtxp);
3099
3100 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3101 bp = bp->b_freelist.tqe_next)
3102 n++;
3103 lck_mtx_unlock(buf_mtxp);
3104
3105 return (n);
3106 }
3107
3108 /*
3109 * Return a count of 'busy' buffers. Used at the time of shutdown.
3110 */
3111 int
3112 count_busy_buffers(void)
3113 {
3114 buf_t bp;
3115 int nbusy = 0;
3116
3117 for (bp = &buf[nbuf]; --bp >= buf; )
3118 if (!ISSET(bp->b_flags, B_INVAL) && ISSET(bp->b_lflags, BL_BUSY))
3119 nbusy++;
3120 return (nbusy);
3121 }
3122
3123 #if DIAGNOSTIC
3124 /*
3125 * Print out statistics on the current allocation of the buffer pool.
3126 * Can be enabled to print out on every ``sync'' by setting "syncprt"
3127 * in vfs_syscalls.c using sysctl.
3128 */
3129 void
3130 vfs_bufstats()
3131 {
3132 int i, j, count;
3133 register struct buf *bp;
3134 register struct bqueues *dp;
3135 int counts[MAXBSIZE/CLBYTES+1];
3136 static char *bname[BQUEUES] =
3137 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3138
3139 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
3140 count = 0;
3141 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3142 counts[j] = 0;
3143
3144 lck_mtx_lock(buf_mtxp);
3145
3146 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
3147 counts[bp->b_bufsize/CLBYTES]++;
3148 count++;
3149 }
3150 lck_mtx_unlock(buf_mtxp);
3151
3152 printf("%s: total-%d", bname[i], count);
3153 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3154 if (counts[j] != 0)
3155 printf(", %d-%d", j * CLBYTES, counts[j]);
3156 printf("\n");
3157 }
3158 }
3159 #endif /* DIAGNOSTIC */
3160
3161 #define NRESERVEDIOBUFS 64
3162
3163
3164 buf_t
3165 alloc_io_buf(vnode_t vp, int priv)
3166 {
3167 buf_t bp;
3168
3169 lck_mtx_lock(iobuffer_mtxp);
3170
3171 while (((niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
3172 (bp = iobufqueue.tqh_first) == NULL) {
3173 bufstats.bufs_iobufsleeps++;
3174
3175 need_iobuffer = 1;
3176 (void) msleep(&need_iobuffer, iobuffer_mtxp, (PRIBIO+1), (const char *)"alloc_io_buf", 0);
3177 }
3178 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
3179
3180 bufstats.bufs_iobufinuse++;
3181 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
3182 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
3183
3184 lck_mtx_unlock(iobuffer_mtxp);
3185
3186 /*
3187 * initialize various fields
3188 * we don't need to hold the mutex since the buffer
3189 * is now private... the vp should have a reference
3190 * on it and is not protected by this mutex in any event
3191 */
3192 bp->b_timestamp = 0;
3193 bp->b_proc = NULL;
3194
3195 bp->b_datap = 0;
3196 bp->b_flags = 0;
3197 bp->b_lflags = BL_BUSY | BL_IOBUF;
3198 bp->b_blkno = bp->b_lblkno = 0;
3199 #ifdef JOE_DEBUG
3200 bp->b_owner = current_thread();
3201 bp->b_tag = 6;
3202 #endif
3203 bp->b_iodone = NULL;
3204 bp->b_error = 0;
3205 bp->b_resid = 0;
3206 bp->b_bcount = 0;
3207 bp->b_bufsize = 0;
3208 bp->b_upl = NULL;
3209 bp->b_vp = vp;
3210
3211 if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
3212 bp->b_dev = vp->v_rdev;
3213 else
3214 bp->b_dev = NODEV;
3215
3216 return (bp);
3217 }
3218
3219
3220 void
3221 free_io_buf(buf_t bp)
3222 {
3223 int need_wakeup = 0;
3224
3225 /*
3226 * put buffer back on the head of the iobufqueue
3227 */
3228 bp->b_vp = NULL;
3229 bp->b_flags = B_INVAL;
3230
3231 lck_mtx_lock(iobuffer_mtxp);
3232
3233 binsheadfree(bp, &iobufqueue, -1);
3234
3235 if (need_iobuffer) {
3236 /*
3237 * Wake up any processes waiting because they need an io buffer
3238 *
3239 * do the wakeup after we drop the mutex... it's possible that the
3240 * wakeup will be superfluous if need_iobuffer gets set again and
3241 * another thread runs this path, but it's highly unlikely, doesn't
3242 * hurt, and it means we don't hold up I/O progress if the wakeup blocks
3243 * trying to grab a task related lock...
3244 */
3245 need_iobuffer = 0;
3246 need_wakeup = 1;
3247 }
3248 bufstats.bufs_iobufinuse--;
3249
3250 lck_mtx_unlock(iobuffer_mtxp);
3251
3252 if (need_wakeup)
3253 wakeup(&need_iobuffer);
3254 }
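
/*
 * Illustrative sketch (hypothetical caller, not part of the original source):
 * an I/O buffer header is borrowed for the duration of a private transfer
 * and must be handed back with free_io_buf().  The field setup shown is an
 * assumption, not the actual cluster-layer code.
 */
#if 0
static int
example_io_buf_use(vnode_t vp, daddr64_t blkno, uintptr_t datap, int nbytes)
{
	buf_t	bp;
	int	error;

	bp = alloc_io_buf(vp, 0);	/* may sleep until a header is free */

	bp->b_blkno = bp->b_lblkno = blkno;
	bp->b_datap = datap;
	bp->b_bcount = nbytes;
	bp->b_bufsize = nbytes;
	SET(bp->b_flags, B_READ);

	VNOP_STRATEGY(bp);
	error = buf_biowait(bp);

	free_io_buf(bp);		/* give the header back and wake any waiters */

	return (error);
}
#endif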
3255
3256
3257
3258 /*
3259 * If getnewbuf() calls bcleanbuf() on the same thread
3260 * there is a potential for stack overrun and deadlocks.
3261 * So we always hand off the work to a worker thread for completion
3262 */
3263 #include <mach/mach_types.h>
3264 #include <mach/memory_object_types.h>
3265 #include <kern/sched_prim.h>
3266
3267
3268 static void
3269 bcleanbuf_thread_init(void)
3270 {
3271 /* create worker thread */
3272 kernel_thread(kernel_task, bcleanbuf_thread);
3273 }
3274
3275 static void
3276 bcleanbuf_thread(void)
3277 {
3278 struct buf *bp;
3279 int error = 0;
3280 int loopcnt = 0;
3281
3282 for (;;) {
3283 lck_mtx_lock(buf_mtxp);
3284
3285 while (blaundrycnt == 0)
3286 (void)msleep((void *)&blaundrycnt, buf_mtxp, PRIBIO, "blaundry", 0);
3287
3288 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
3289 /*
3290 * Remove from the queue
3291 */
3292 bremfree_locked(bp);
3293 blaundrycnt--;
3294
3295 lck_mtx_unlock(buf_mtxp);
3296 /*
3297 * do the IO
3298 */
3299 error = bawrite_internal(bp, 0);
3300
3301 if (error) {
3302 lck_mtx_lock(buf_mtxp);
3303
3304 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
3305 blaundrycnt++;
3306
3307 lck_mtx_unlock(buf_mtxp);
3308
3309 if (loopcnt > 10) {
3310 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
3311 loopcnt = 0;
3312 } else {
3313 (void)thread_block(THREAD_CONTINUE_NULL);
3314 loopcnt++;
3315 }
3316 }
3317 }
3318 }
3319
3320
3321 static int
3322 brecover_data(buf_t bp)
3323 {
3324 int upl_offset;
3325 upl_t upl;
3326 upl_page_info_t *pl;
3327 kern_return_t kret;
3328 vnode_t vp = bp->b_vp;
3329 int upl_flags;
3330
3331
3332 if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
3333 goto dump_buffer;
3334
3335 upl_flags = UPL_PRECIOUS;
3336 if (! (buf_flags(bp) & B_READ)) {
3337 /*
3338 * "write" operation: let the UPL subsystem know
3339 * that we intend to modify the buffer cache pages we're
3340 * gathering.
3341 */
3342 upl_flags |= UPL_WILL_MODIFY;
3343 }
3344
3345 kret = ubc_create_upl(vp,
3346 ubc_blktooff(vp, bp->b_lblkno),
3347 bp->b_bufsize,
3348 &upl,
3349 &pl,
3350 upl_flags);
3351 if (kret != KERN_SUCCESS)
3352 panic("Failed to create UPL");
3353
3354 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
3355
3356 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
3357 ubc_upl_abort(upl, 0);
3358 goto dump_buffer;
3359 }
3360 }
3361 bp->b_upl = upl;
3362
3363 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
3364
3365 if (kret != KERN_SUCCESS)
3366 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3367 return (1);
3368
3369 dump_buffer:
3370 bp->b_bufsize = 0;
3371 SET(bp->b_flags, B_INVAL);
3372 buf_brelse(bp);
3373
3374 return(0);
3375 }
3376
3377
3378
3379 /*
3380 * disabled for now
3381 */
3382
3383 #if FLUSH_QUEUES
3384
3385 #define NFLUSH 32
3386
3387 static int
3388 bp_cmp(void *a, void *b)
3389 {
3390 buf_t *bp_a = *(buf_t **)a,
3391 *bp_b = *(buf_t **)b;
3392 daddr64_t res;
3393
3394 // don't have to worry about negative block
3395 // numbers so this is ok to do.
3396 //
3397 res = (bp_a->b_blkno - bp_b->b_blkno);
3398
3399 return (int)res;
3400 }
3401
3402
3403 int
3404 bflushq(int whichq, mount_t mp)
3405 {
3406 buf_t bp, next;
3407 int i, buf_count;
3408 int total_writes = 0;
3409 static buf_t flush_table[NFLUSH];
3410
3411 if (whichq < 0 || whichq >= BQUEUES) {
3412 return (0);
3413 }
3414
3415 restart:
3416 lck_mtx_lock(buf_mtxp);
3417
3418 bp = TAILQ_FIRST(&bufqueues[whichq]);
3419
3420 for (buf_count = 0; bp; bp = next) {
3421 next = bp->b_freelist.tqe_next;
3422
3423 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
3424 continue;
3425 }
3426
3427 if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
3428
3429 bremfree_locked(bp);
3430 #ifdef JOE_DEBUG
3431 bp->b_owner = current_thread();
3432 bp->b_tag = 7;
3433 #endif
3434 SET(bp->b_lflags, BL_BUSY);
3435 flush_table[buf_count] = bp;
3436 buf_count++;
3437 total_writes++;
3438
3439 if (buf_count >= NFLUSH) {
3440 lck_mtx_unlock(buf_mtxp);
3441
3442 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3443
3444 for (i = 0; i < buf_count; i++) {
3445 buf_bawrite(flush_table[i]);
3446 }
3447 goto restart;
3448 }
3449 }
3450 }
3451 lck_mtx_unlock(buf_mtxp);
3452
3453 if (buf_count > 0) {
3454 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3455
3456 for (i = 0; i < buf_count; i++) {
3457 buf_bawrite(flush_table[i]);
3458 }
3459 }
3460
3461 return (total_writes);
3462 }
3463 #endif
3464
3465
3466 #if BALANCE_QUEUES
3467
3468 /* XXX move this to a separate file */
3469
3470 /*
3471 * NOTE: THIS CODE HAS NOT BEEN UPDATED
3472 * WITH RESPECT TO THE NEW LOCKING MODEL
3473 */
3474
3475
3476 /*
3477 * Dynamic Scaling of the Buffer Queues
3478 */
3479
3480 typedef long long blsize_t;
3481
3482 blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
3483 /* Global tunable limits */
3484 blsize_t nbufh; /* number of buffer headers */
3485 blsize_t nbuflow; /* minimum number of buffer headers required */
3486 blsize_t nbufhigh; /* maximum number of buffer headers allowed */
3487 blsize_t nbuftarget; /* preferred number of buffer headers */
3488
3489 /*
3490 * assertions:
3491 *
3492 * 1. 0 < nbuflow <= nbufh <= nbufhigh
3493 * 2. nbufhigh <= MAXNBUF
3494 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
3495 * 4. nbufh can not be set by sysctl().
3496 */
3497
3498 /* Per queue tunable limits */
3499
3500 struct bufqlim {
3501 blsize_t bl_nlow; /* minimum number of buffer headers required */
3502 blsize_t bl_num; /* number of buffer headers on the queue */
3503 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
3504 blsize_t bl_target; /* preferred number of buffer headers */
3505 long bl_stale; /* Seconds after which a buffer is considered stale */
3506 } bufqlim[BQUEUES];
3507
3508 /*
3509 * assertions:
3510 *
3511 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
3512 * 2. bl_nlhigh <= MAXNBUF
3513 * 3. bufqlim[BQ_META].bl_nlow != 0
3514 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
3515 * file system IO operations)
3516 * 5. bl_num can not be set by sysctl().
3517 * 6. bl_nlhigh <= nbufhigh
3518 */
3519
3520 /*
3521 * Rationale:
3522 * ----------
3523 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
3524 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
3525 *
3526 * These limits are exported by means of sysctl().
3527 * It was decided to define blsize_t as a 64 bit quantity.
3528 * This will make sure that we will not be required to change it
3529 * as long as we do not exceed 64 bit address space for the kernel.
3530 *
3531 * The low and high limit parameters are initialized at compile time,
3532 * and boot arguments can be used to override them. sysctl() does not
3533 * change those values; it can get all the values but can set only
3534 * the target. num is the current level.
3535 *
3536 * Advantages of having a "bufqscan" thread doing the balancing are:
3537 * It keeps enough bufs on BQ_EMPTY.
3538 * getnewbuf() by default will always select a buffer from BQ_EMPTY,
3539 * and getnewbuf() performs best if a buffer is found there.
3540 * Also, this minimizes the possibility of starting IO
3541 * from getnewbuf(). That's a performance win, too.
3542 *
3543 * Localize complex logic [balancing as well as time aging]
3544 * to balancebufq().
3545 *
3546 * Simplify getnewbuf() logic by elimination of time aging code.
3547 */
3548
3549 /*
3550 * Algorithm:
3551 * -----------
3552 * The goal of the dynamic scaling of the buffer queues is to keep
3553 * the size of the LRU close to bl_target. Buffers on a queue would
3554 * be time aged.
3555 *
3556 * There would be a thread which will be responsible for "balancing"
3557 * the buffer cache queues.
3558 *
3559 * The scan order would be: AGE, LRU, META, EMPTY.
3560 */
3561
3562 long bufqscanwait = 0;
3563
3564 static void bufqscan_thread();
3565 static int balancebufq(int q);
3566 static int btrimempty(int n);
3567 static __inline__ int initbufqscan(void);
3568 static __inline__ int nextbufq(int q);
3569 static void buqlimprt(int all);
3570
3571
3572 static __inline__ void
3573 bufqinc(int q)
3574 {
3575 if ((q < 0) || (q >= BQUEUES))
3576 return;
3577
3578 bufqlim[q].bl_num++;
3579 return;
3580 }
3581
3582 static __inline__ void
3583 bufqdec(int q)
3584 {
3585 if ((q < 0) || (q >= BQUEUES))
3586 return;
3587
3588 bufqlim[q].bl_num--;
3589 return;
3590 }
3591
3592 static void
3593 bufq_balance_thread_init()
3594 {
3595
3596 if (bufqscanwait++ == 0) {
3597
3598 /* Initialize globals */
3599 MAXNBUF = (sane_size / PAGE_SIZE);
3600 nbufh = nbuf;
3601 nbuflow = min(nbufh, 100);
3602 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
3603 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
3604 nbuftarget = max(nbuflow, nbuftarget);
3605 nbuftarget = min(nbufhigh, nbuftarget);
3606
3607 /*
3608 * Initialize the bufqlim
3609 */
3610
3611 /* LOCKED queue */
3612 bufqlim[BQ_LOCKED].bl_nlow = 0;
3613 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3614 bufqlim[BQ_LOCKED].bl_target = 0;
3615 bufqlim[BQ_LOCKED].bl_stale = 30;
3616
3617 /* LRU queue */
3618 bufqlim[BQ_LRU].bl_nlow = 0;
3619 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
3620 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
3621 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
3622
3623 /* AGE queue */
3624 bufqlim[BQ_AGE].bl_nlow = 0;
3625 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
3626 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
3627 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
3628
3629 /* EMPTY queue */
3630 bufqlim[BQ_EMPTY].bl_nlow = 0;
3631 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
3632 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
3633 bufqlim[BQ_EMPTY].bl_stale = 600000;
3634
3635 /* META queue */
3636 bufqlim[BQ_META].bl_nlow = 0;
3637 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
3638 bufqlim[BQ_META].bl_target = nbuftarget/4;
3639 bufqlim[BQ_META].bl_stale = META_IS_STALE;
3640
3641 /* LAUNDRY queue */
3642 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
3643 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
3644 bufqlim[BQ_LAUNDRY].bl_target = 0;
3645 bufqlim[BQ_LAUNDRY].bl_stale = 30;
3646
3647 buqlimprt(1);
3648 }
3649
3650 /* create worker thread */
3651 kernel_thread(kernel_task, bufqscan_thread);
3652 }
3653
3654 /* The workloop for the buffer balancing thread */
3655 static void
3656 bufqscan_thread()
3657 {
3658 int moretodo = 0;
3659
3660 for(;;) {
3661 do {
3662 int q; /* buffer queue to process */
3663
3664 q = initbufqscan();
3665 for (; q; ) {
3666 moretodo |= balancebufq(q);
3667 q = nextbufq(q);
3668 }
3669 } while (moretodo);
3670
3671 #if DIAGNOSTIC
3672 vfs_bufstats();
3673 buqlimprt(0);
3674 #endif
3675 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
3676 moretodo = 0;
3677 }
3678 }
3679
3680 /* Seed for the buffer queue balancing */
3681 static __inline__ int
3682 initbufqscan()
3683 {
3684 /* Start with AGE queue */
3685 return (BQ_AGE);
3686 }
3687
3688 /* Pick next buffer queue to balance */
3689 static __inline__ int
3690 nextbufq(int q)
3691 {
3692 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
3693
3694 q++;
3695 q %= sizeof(order);
3696 return (order[q]);
3697 }
3698
3699 /* function to balance the buffer queues */
3700 static int
3701 balancebufq(int q)
3702 {
3703 int moretodo = 0;
3704 int s = splbio();
3705 int n, t;
3706
3707 /* reject invalid q */
3708 if ((q < 0) || (q >= BQUEUES))
3709 goto out;
3710
3711 /* LOCKED or LAUNDRY queue MUST not be balanced */
3712 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
3713 goto out;
3714
3715 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
3716
3717 /* If queue has less than target nothing more to do */
3718 if (n < 0)
3719 goto out;
3720
3721 if ( n > 8 ) {
3722 /* Balance only a small amount (12.5%) at a time */
3723 n >>= 3;
3724 }
3725
3726 /* EMPTY queue needs special handling */
3727 if (q == BQ_EMPTY) {
3728 moretodo |= btrimempty(n);
3729 goto out;
3730 }
3731
3732 t = buf_timestamp();
3733
3734 for (; n > 0; n--) {
3735 struct buf *bp = bufqueues[q].tqh_first;
3736 if (!bp)
3737 break;
3738
3739 /* check if it's stale */
3740 if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
3741 if (bcleanbuf(bp)) {
3742 /* buf_bawrite() issued, bp not ready */
3743 moretodo = 1;
3744 } else {
3745 /* release the cleaned buffer to BQ_EMPTY */
3746 SET(bp->b_flags, B_INVAL);
3747 buf_brelse(bp);
3748 }
3749 } else
3750 break;
3751 }
3752
3753 out:
3754 splx(s);
3755 return (moretodo);
3756 }
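
/*
 * Worked example (assumed numbers): with bl_num == 200 and bl_target == 100,
 * n starts at 100; since n > 8 it is scaled down to n >> 3 == 12, so at most
 * 12 stale buffers are cleaned off this queue in a single balancing pass.
 */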
3757
3758 static int
3759 btrimempty(int n)
3760 {
3761 /*
3762 * When struct bufs are allocated dynamically, this would
3763 * reclaim up to 'n' struct bufs from the empty queue.
3764 */
3765
3766 return (0);
3767 }
3768
3769 static void
3770 buqlimprt(int all)
3771 {
3772 int i;
3773 static char *bname[BQUEUES] =
3774 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3775
3776 if (all)
3777 for (i = 0; i < BQUEUES; i++) {
3778 printf("%s : ", bname[i]);
3779 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
3780 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3781 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
3782 printf("target = %ld, ", (long)bufqlim[i].bl_target);
3783 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
3784 }
3785 else
3786 for (i = 0; i < BQUEUES; i++) {
3787 printf("%s : ", bname[i]);
3788 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3789 }
3790 }
3791
3792 #endif
3793
3794