]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_bio.c
xnu-792.6.22.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_bio.c
1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*-
24 * Copyright (c) 1994 Christopher G. Demetriou
25 * Copyright (c) 1982, 1986, 1989, 1993
26 * The Regents of the University of California. All rights reserved.
27 * (c) UNIX System Laboratories, Inc.
28 * All or some portions of this file are derived from material licensed
29 * to the University of California by American Telephone and Telegraph
30 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
31 * the permission of UNIX System Laboratories, Inc.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
62 */
63
64 /*
65 * Some references:
66 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
67 * Leffler, et al.: The Design and Implementation of the 4.3BSD
68 * UNIX Operating System (Addison-Wesley, 1989)
69 */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/proc_internal.h>
74 #include <sys/buf_internal.h>
75 #include <sys/vnode_internal.h>
76 #include <sys/mount_internal.h>
77 #include <sys/trace.h>
78 #include <sys/malloc.h>
79 #include <sys/resourcevar.h>
80 #include <miscfs/specfs/specdev.h>
81 #include <sys/ubc.h>
82 #include <sys/kauth.h>
83 #if DIAGNOSTIC
84 #include <kern/assert.h>
85 #endif /* DIAGNOSTIC */
86 #include <kern/task.h>
87 #include <kern/zalloc.h>
88 #include <kern/lock.h>
89
90 #include <vm/vm_kern.h>
91
92 #include <sys/kdebug.h>
93 #include <machine/spl.h>
94
/*
 * Forward declarations for helpers that are private to this file.
 * bufqinc()/bufqdec() only exist when BALANCE_QUEUES is configured.
 */
95 #if BALANCE_QUEUES
96 static __inline__ void bufqinc(int q);
97 static __inline__ void bufqdec(int q);
98 #endif
99
100 static int bcleanbuf(buf_t bp);
101 static int brecover_data(buf_t bp);
102 static boolean_t incore(vnode_t vp, daddr64_t blkno);
103 static buf_t incore_locked(vnode_t vp, daddr64_t blkno);
104 /* timeout is in msecs */
105 static buf_t getnewbuf(int slpflag, int slptimeo, int *queue);
106 static void bremfree_locked(buf_t bp);
107 static void buf_reassign(buf_t bp, vnode_t newvp);
108 static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
109 static int buf_iterprepare(vnode_t vp, struct buflists *, int flags);
110 static void buf_itercomplete(vnode_t vp, struct buflists *, int flags);
111
/* visible to the rest of the kernel, but not exported (__private_extern__) */
112 __private_extern__ int bdwrite_internal(buf_t, int);
113
114 /* zone allocated buffer headers */
115 static void bufzoneinit(void);
116 static void bcleanbuf_thread_init(void);
117 static void bcleanbuf_thread(void);
118
/* NOTE(review): presumably the zone backing buf headers and a count of them -- confirm against bufzoneinit() */
119 static zone_t buf_hdr_zone;
120 static int buf_hdr_count;
121
122
123 /*
124 * Definitions for the buffer hash lists.
 *
 * BUFHASH(dvp, lbn) yields the address of the hash chain head for a
 * (vnode, logical block) pair: the vnode pointer divided by the size
 * of the vnode structure is added to the block number and the sum is
 * masked with 'bufhash' to index 'bufhashtbl'.
 * NOTE(review): this relies on 'bufhash' being a (power of two - 1)
 * mask -- confirm where the table is allocated.
125 */
126 #define BUFHASH(dvp, lbn) \
127 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
128 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
129 u_long bufhash;
130
131 /* Definitions for the buffer stats. */
132 struct bufstats bufstats;
133
134 /* Number of delayed write buffers */
135 int nbdwrite = 0;
136 int blaundrycnt = 0;
137
138
/*
 * bufqueues[] holds the BQUEUES buffer free lists that bremfree_locked()
 * and the binsheadfree()/binstailfree() macros operate on.
 * NOTE(review): iobufqueue, needbuffer and need_iobuffer are not used in
 * the visible part of this file -- presumably the I/O buffer pool and
 * its "waiter present" flags; confirm.
 */
139 static TAILQ_HEAD(ioqueue, buf) iobufqueue;
140 static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
141 static int needbuffer;
142 static int need_iobuffer;
143
/*
 * Lock group/attributes and the two mutexes used by this file.
 * buf_mtxp is held around the per-vnode buffer list and free queue
 * manipulation (see buf_iterprepare/buf_itercomplete).
 */
144 static lck_grp_t *buf_mtx_grp;
145 static lck_attr_t *buf_mtx_attr;
146 static lck_grp_attr_t *buf_mtx_grp_attr;
147 static lck_mtx_t *iobuffer_mtxp;
148 static lck_mtx_t *buf_mtxp;
149
150 static __inline__ int
151 buf_timestamp(void)
152 {
153 struct timeval t;
154 microuptime(&t);
155 return (t.tv_sec);
156 }
157
/*
 * Insq/Remq for the buffer free lists.
 *
 * binsheadfree() links bp at the head of free queue dp, binstailfree()
 * links it at the tail.  Both record the queue index in b_whichq and
 * stamp the buffer with buf_timestamp().  When BALANCE_QUEUES is
 * configured, bufqinc() is additionally called for the queue index.
 */
#if BALANCE_QUEUES
#define BUFQ_COUNT_INSERT(whichq)	bufqinc((whichq))
#else
#define BUFQ_COUNT_INSERT(whichq)	((void)0)
#endif

#define binsheadfree(bp, dp, whichq)	do { \
	TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
	BUFQ_COUNT_INSERT(whichq); \
	(bp)->b_whichq = whichq; \
	(bp)->b_timestamp = buf_timestamp(); \
} while (0)

#define binstailfree(bp, dp, whichq)	do { \
	TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
	BUFQ_COUNT_INSERT(whichq); \
	(bp)->b_whichq = whichq; \
	(bp)->b_timestamp = buf_timestamp(); \
} while (0)
188
189
/*
 * Hash chain sanity helpers.  A buffer that is on no hash chain carries
 * the sentinel 0xdeadbeef in b_hash.le_prev.
 *
 *	BHASHENTCHECK(bp)	panic unless bp currently carries the sentinel
 *	BLISTNONE(bp)		mark bp as being on no hash chain
 *
 * Both expand to a single do { } while (0) statement so that they can be
 * used safely as the body of an unbraced if/else; callers supply the
 * trailing semicolon.  The pointer is printed with %p rather than %x.
 */
#define BHASHENTCHECK(bp)	do { \
	if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
		panic("%p: b_hash.le_prev is not deadbeef", (bp)); \
} while (0)

#define BLISTNONE(bp)	do { \
	(bp)->b_hash.le_next = (struct buf *)0; \
	(bp)->b_hash.le_prev = (struct buf **)0xdeadbeef; \
} while (0)
197
/*
 * Insq/Remq for the vnode usage lists.
 *
 * bufinsvn() links bp at the head of the vnode buffer list dp.
 * bufremvn() unlinks bp from whichever vnode list it is on and marks it
 * as being on no list (b_vnbufs.le_next == NOLIST).  It is wrapped in
 * do { } while (0) so it acts as one statement, matching the free list
 * macros above; callers supply the trailing semicolon.
 */
#define bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define bufremvn(bp)	do { \
	LIST_REMOVE(bp, b_vnbufs); \
	(bp)->b_vnbufs.le_next = NOLIST; \
} while (0)
206
207 /*
208 * Time in seconds before a buffer on a list is
209 * considered as a stale buffer
210 */
211 #define LRU_IS_STALE 120 /* default value for the LRU */
212 #define AGE_IS_STALE 60 /* default value for the AGE */
213 #define META_IS_STALE 180 /* default value for the BQ_META */
214
/* run-time thresholds, initialised from the defaults above */
215 int lru_is_stale = LRU_IS_STALE;
216 int age_is_stale = AGE_IS_STALE;
217 int meta_is_stale = META_IS_STALE;
218
219
220
221 /* LIST_INSERT_HEAD() with assertions */
222 static __inline__ void
223 blistenterhead(struct bufhashhdr * head, buf_t bp)
224 {
225 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
226 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
227 (head)->lh_first = bp;
228 bp->b_hash.le_prev = &(head)->lh_first;
229 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
230 panic("blistenterhead: le_prev is deadbeef");
231 }
232
233 static __inline__ void
234 binshash(buf_t bp, struct bufhashhdr *dp)
235 {
236 buf_t nbp;
237
238 BHASHENTCHECK(bp);
239
240 nbp = dp->lh_first;
241 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
242 if(nbp == bp)
243 panic("buf already in hashlist");
244 }
245
246 blistenterhead(dp, bp);
247 }
248
249 static __inline__ void
250 bremhash(buf_t bp)
251 {
252 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
253 panic("bremhash le_prev is deadbeef");
254 if (bp->b_hash.le_next == bp)
255 panic("bremhash: next points to self");
256
257 if (bp->b_hash.le_next != NULL)
258 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
259 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
260 }
261
262
263
264
265 int
266 buf_valid(buf_t bp) {
267
268 if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
269 return 1;
270 return 0;
271 }
272
273 int
274 buf_fromcache(buf_t bp) {
275
276 if ( (bp->b_flags & B_CACHE) )
277 return 1;
278 return 0;
279 }
280
281 void
282 buf_markinvalid(buf_t bp) {
283
284 SET(bp->b_flags, B_INVAL);
285 }
286
287 void
288 buf_markdelayed(buf_t bp) {
289
290 SET(bp->b_flags, B_DELWRI);
291 buf_reassign(bp, bp->b_vp);
292 }
293
294 void
295 buf_markeintr(buf_t bp) {
296
297 SET(bp->b_flags, B_EINTR);
298 }
299
300 void
301 buf_markaged(buf_t bp) {
302
303 SET(bp->b_flags, B_AGE);
304 }
305
306 errno_t
307 buf_error(buf_t bp) {
308
309 return (bp->b_error);
310 }
311
312 void
313 buf_seterror(buf_t bp, errno_t error) {
314
315 if ((bp->b_error = error))
316 SET(bp->b_flags, B_ERROR);
317 else
318 CLR(bp->b_flags, B_ERROR);
319 }
320
321 void
322 buf_setflags(buf_t bp, int32_t flags) {
323
324 SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
325 }
326
327 void
328 buf_clearflags(buf_t bp, int32_t flags) {
329
330 CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
331 }
332
333 int32_t
334 buf_flags(buf_t bp) {
335
336 return ((bp->b_flags & BUF_X_RDFLAGS));
337 }
338
339 void
340 buf_reset(buf_t bp, int32_t io_flags) {
341
342 CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE));
343 SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
344
345 bp->b_error = 0;
346 }
347
348 uint32_t
349 buf_count(buf_t bp) {
350
351 return (bp->b_bcount);
352 }
353
354 void
355 buf_setcount(buf_t bp, uint32_t bcount) {
356
357 bp->b_bcount = bcount;
358 }
359
360 uint32_t
361 buf_size(buf_t bp) {
362
363 return (bp->b_bufsize);
364 }
365
366 void
367 buf_setsize(buf_t bp, uint32_t bufsize) {
368
369 bp->b_bufsize = bufsize;
370 }
371
372 uint32_t
373 buf_resid(buf_t bp) {
374
375 return (bp->b_resid);
376 }
377
378 void
379 buf_setresid(buf_t bp, uint32_t resid) {
380
381 bp->b_resid = resid;
382 }
383
384 uint32_t
385 buf_dirtyoff(buf_t bp) {
386
387 return (bp->b_dirtyoff);
388 }
389
390 uint32_t
391 buf_dirtyend(buf_t bp) {
392
393 return (bp->b_dirtyend);
394 }
395
396 void
397 buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
398
399 bp->b_dirtyoff = dirtyoff;
400 }
401
402 void
403 buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
404
405 bp->b_dirtyend = dirtyend;
406 }
407
408 uintptr_t
409 buf_dataptr(buf_t bp) {
410
411 return (bp->b_datap);
412 }
413
414 void
415 buf_setdataptr(buf_t bp, uintptr_t data) {
416
417 bp->b_datap = data;
418 }
419
420 vnode_t
421 buf_vnode(buf_t bp) {
422
423 return (bp->b_vp);
424 }
425
426 void
427 buf_setvnode(buf_t bp, vnode_t vp) {
428
429 bp->b_vp = vp;
430 }
431
432
433 void *
434 buf_callback(buf_t bp)
435 {
436 if ( !(bp->b_lflags & BL_IOBUF) )
437 return ((void *) NULL);
438 if ( !(bp->b_flags & B_CALL) )
439 return ((void *) NULL);
440
441 return ((void *)bp->b_iodone);
442 }
443
444
445 errno_t
446 buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
447 {
448
449 if ( !(bp->b_lflags & BL_IOBUF) )
450 return (EINVAL);
451
452 if (callback)
453 bp->b_flags |= (B_CALL | B_ASYNC);
454 else
455 bp->b_flags &= ~B_CALL;
456 bp->b_transaction = transaction;
457 bp->b_iodone = callback;
458
459 return (0);
460 }
461
462 errno_t
463 buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
464 {
465
466 if ( !(bp->b_lflags & BL_IOBUF) )
467 return (EINVAL);
468
469 if (upl)
470 bp->b_flags |= B_CLUSTER;
471 else
472 bp->b_flags &= ~B_CLUSTER;
473 bp->b_upl = upl;
474 bp->b_uploffset = offset;
475
476 return (0);
477 }
478
479 buf_t
480 buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
481 {
482 buf_t io_bp;
483
484 if (io_offset < 0 || io_size < 0)
485 return (NULL);
486
487 if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
488 return (NULL);
489
490 if (bp->b_flags & B_CLUSTER) {
491 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
492 return (NULL);
493
494 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
495 return (NULL);
496 }
497 io_bp = alloc_io_buf(bp->b_vp, 0);
498
499 io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_ASYNC | B_READ);
500
501 if (iodone) {
502 io_bp->b_transaction = arg;
503 io_bp->b_iodone = iodone;
504 io_bp->b_flags |= B_CALL;
505 }
506 if (bp->b_flags & B_CLUSTER) {
507 io_bp->b_upl = bp->b_upl;
508 io_bp->b_uploffset = bp->b_uploffset + io_offset;
509 } else {
510 io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
511 }
512 io_bp->b_bcount = io_size;
513
514 return (io_bp);
515 }
516
517
518
519 void
520 buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
521 void **old_iodone, void **old_transaction)
522 {
523 if (old_iodone)
524 *old_iodone = (void *)(bp->b_iodone);
525 if (old_transaction)
526 *old_transaction = (void *)(bp->b_transaction);
527
528 bp->b_transaction = transaction;
529 bp->b_iodone = filter;
530 bp->b_flags |= B_FILTER;
531 }
532
533
534 daddr64_t
535 buf_blkno(buf_t bp) {
536
537 return (bp->b_blkno);
538 }
539
540 daddr64_t
541 buf_lblkno(buf_t bp) {
542
543 return (bp->b_lblkno);
544 }
545
546 void
547 buf_setblkno(buf_t bp, daddr64_t blkno) {
548
549 bp->b_blkno = blkno;
550 }
551
552 void
553 buf_setlblkno(buf_t bp, daddr64_t lblkno) {
554
555 bp->b_lblkno = lblkno;
556 }
557
558 dev_t
559 buf_device(buf_t bp) {
560
561 return (bp->b_dev);
562 }
563
564 errno_t
565 buf_setdevice(buf_t bp, vnode_t vp) {
566
567 if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
568 return EINVAL;
569 bp->b_dev = vp->v_rdev;
570
571 return 0;
572 }
573
574
575 void *
576 buf_drvdata(buf_t bp) {
577
578 return (bp->b_drvdata);
579 }
580
581 void
582 buf_setdrvdata(buf_t bp, void *drvdata) {
583
584 bp->b_drvdata = drvdata;
585 }
586
587 void *
588 buf_fsprivate(buf_t bp) {
589
590 return (bp->b_fsprivate);
591 }
592
593 void
594 buf_setfsprivate(buf_t bp, void *fsprivate) {
595
596 bp->b_fsprivate = fsprivate;
597 }
598
599 ucred_t
600 buf_rcred(buf_t bp) {
601
602 return (bp->b_rcred);
603 }
604
605 ucred_t
606 buf_wcred(buf_t bp) {
607
608 return (bp->b_wcred);
609 }
610
611 void *
612 buf_upl(buf_t bp) {
613
614 return (bp->b_upl);
615 }
616
617 uint32_t
618 buf_uploffset(buf_t bp) {
619
620 return ((uint32_t)(bp->b_uploffset));
621 }
622
623 proc_t
624 buf_proc(buf_t bp) {
625
626 return (bp->b_proc);
627 }
628
629
630 errno_t
631 buf_map(buf_t bp, caddr_t *io_addr)
632 {
633 buf_t real_bp;
634 vm_offset_t vaddr;
635 kern_return_t kret;
636
637 if ( !(bp->b_flags & B_CLUSTER)) {
638 *io_addr = (caddr_t)bp->b_datap;
639 return (0);
640 }
641 real_bp = (buf_t)(bp->b_real_bp);
642
643 if (real_bp && real_bp->b_datap) {
644 /*
645 * b_real_bp is only valid if B_CLUSTER is SET
646 * if it's non-zero, than someone did a cluster_bp call
647 * if the backing physical pages were already mapped
648 * in before the call to cluster_bp (non-zero b_datap),
649 * than we just use that mapping
650 */
651 *io_addr = (caddr_t)real_bp->b_datap;
652 return (0);
653 }
654 kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */
655
656 if (kret != KERN_SUCCESS) {
657 *io_addr = 0;
658
659 return(ENOMEM);
660 }
661 vaddr += bp->b_uploffset;
662
663 *io_addr = (caddr_t)vaddr;
664
665 return (0);
666 }
667
668 errno_t
669 buf_unmap(buf_t bp)
670 {
671 buf_t real_bp;
672 kern_return_t kret;
673
674 if ( !(bp->b_flags & B_CLUSTER))
675 return (0);
676 /*
677 * see buf_map for the explanation
678 */
679 real_bp = (buf_t)(bp->b_real_bp);
680
681 if (real_bp && real_bp->b_datap)
682 return (0);
683
684 if (bp->b_lflags & BL_IOBUF) {
685 /*
686 * when we commit these pages, we'll hit
687 * it with UPL_COMMIT_INACTIVE which
688 * will clear the reference bit that got
689 * turned on when we touched the mapping
690 */
691 bp->b_flags |= B_AGE;
692 }
693 kret = ubc_upl_unmap(bp->b_upl);
694
695 if (kret != KERN_SUCCESS)
696 return (EINVAL);
697 return (0);
698 }
699
700
701 void
702 buf_clear(buf_t bp) {
703 caddr_t baddr;
704
705 if (buf_map(bp, &baddr) == 0) {
706 bzero(baddr, bp->b_bcount);
707 buf_unmap(bp);
708 }
709 bp->b_resid = 0;
710 }
711
712
713
714 /*
715 * Read or write a buffer that is not contiguous on disk.
716 * buffer is marked done/error at the conclusion
717 */
718 static int
719 buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
720 {
721 vnode_t vp = buf_vnode(bp);
722 buf_t io_bp; /* For reading or writing a single block */
723 int io_direction;
724 int io_resid;
725 size_t io_contig_bytes;
726 daddr64_t io_blkno;
727 int error = 0;
728 int bmap_flags;
729
730 /*
731 * save our starting point... the bp was already mapped
732 * in buf_strategy before we got called
733 * no sense doing it again.
734 */
735 io_blkno = bp->b_blkno;
736 /*
737 * Make sure we redo this mapping for the next I/O
738 * i.e. this can never be a 'permanent' mapping
739 */
740 bp->b_blkno = bp->b_lblkno;
741
742 /*
743 * Get an io buffer to do the deblocking
744 */
745 io_bp = alloc_io_buf(devvp, 0);
746
747 io_bp->b_lblkno = bp->b_lblkno;
748 io_bp->b_datap = bp->b_datap;
749 io_resid = bp->b_bcount;
750 io_direction = bp->b_flags & B_READ;
751 io_contig_bytes = contig_bytes;
752
753 if (bp->b_flags & B_READ)
754 bmap_flags = VNODE_READ;
755 else
756 bmap_flags = VNODE_WRITE;
757
758 for (;;) {
759 if (io_blkno == -1)
760 /*
761 * this is unexepected, but we'll allow for it
762 */
763 bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
764 else {
765 io_bp->b_bcount = io_contig_bytes;
766 io_bp->b_bufsize = io_contig_bytes;
767 io_bp->b_resid = io_contig_bytes;
768 io_bp->b_blkno = io_blkno;
769
770 buf_reset(io_bp, io_direction);
771 /*
772 * Call the device to do the I/O and wait for it
773 */
774 if ((error = VNOP_STRATEGY(io_bp)))
775 break;
776 if ((error = (int)buf_biowait(io_bp)))
777 break;
778 if (io_bp->b_resid) {
779 io_resid -= (io_contig_bytes - io_bp->b_resid);
780 break;
781 }
782 }
783 if ((io_resid -= io_contig_bytes) == 0)
784 break;
785 f_offset += io_contig_bytes;
786 io_bp->b_datap += io_contig_bytes;
787
788 /*
789 * Map the current position to a physical block number
790 */
791 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
792 break;
793 }
794 buf_free(io_bp);
795
796 if (error)
797 buf_seterror(bp, error);
798 bp->b_resid = io_resid;
799 /*
800 * This I/O is now complete
801 */
802 buf_biodone(bp);
803
804 return error;
805 }
806
807
808 /*
809 * struct vnop_strategy_args {
810 * struct buf *a_bp;
811 * } *ap;
812 */
813 errno_t
814 buf_strategy(vnode_t devvp, void *ap)
815 {
816 buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp;
817 vnode_t vp = bp->b_vp;
818 int bmap_flags;
819 errno_t error;
820
821 if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
822 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
823 /*
824 * associate the physical device with
825 * with this buf_t even if we don't
826 * end up issuing the I/O...
827 */
828 bp->b_dev = devvp->v_rdev;
829
830 if (bp->b_flags & B_READ)
831 bmap_flags = VNODE_READ;
832 else
833 bmap_flags = VNODE_WRITE;
834
835 if ( !(bp->b_flags & B_CLUSTER)) {
836
837 if ( (bp->b_upl) ) {
838 /*
839 * we have a UPL associated with this bp
840 * go through cluster_bp which knows how
841 * to deal with filesystem block sizes
842 * that aren't equal to the page size
843 */
844 return (cluster_bp(bp));
845 }
846 if (bp->b_blkno == bp->b_lblkno) {
847 off_t f_offset;
848 size_t contig_bytes;
849
850 if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
851 buf_seterror(bp, error);
852 buf_biodone(bp);
853
854 return (error);
855 }
856 if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
857 buf_seterror(bp, error);
858 buf_biodone(bp);
859
860 return (error);
861 }
862 if (bp->b_blkno == -1)
863 buf_clear(bp);
864 else if ((long)contig_bytes < bp->b_bcount)
865 return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
866 }
867 if (bp->b_blkno == -1) {
868 buf_biodone(bp);
869 return (0);
870 }
871 }
872 /*
873 * we can issue the I/O because...
874 * either B_CLUSTER is set which
875 * means that the I/O is properly set
876 * up to be a multiple of the page size, or
877 * we were able to successfully set up the
878 * phsyical block mapping
879 */
880 return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap));
881 }
882
883
884
885 buf_t
886 buf_alloc(vnode_t vp)
887 {
888 return(alloc_io_buf(vp, 0));
889 }
890
891 void
892 buf_free(buf_t bp) {
893
894 free_io_buf(bp);
895 }
896
897
898
899 void
900 buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) {
901 buf_t bp;
902 int retval;
903 struct buflists local_iterblkhd;
904 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
905
906 if (flags & BUF_SKIP_LOCKED)
907 lock_flags |= BAC_SKIP_LOCKED;
908 if (flags & BUF_SKIP_NONLOCKED)
909 lock_flags |= BAC_SKIP_NONLOCKED;
910
911 lck_mtx_lock(buf_mtxp);
912
913 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
914 lck_mtx_unlock(buf_mtxp);
915 return;
916 }
917 while (!LIST_EMPTY(&local_iterblkhd)) {
918 bp = LIST_FIRST(&local_iterblkhd);
919 LIST_REMOVE(bp, b_vnbufs);
920 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
921
922 if (buf_acquire_locked(bp, lock_flags, 0, 0))
923 continue;
924
925 lck_mtx_unlock(buf_mtxp);
926
927 retval = callout(bp, arg);
928
929 switch (retval) {
930 case BUF_RETURNED:
931 buf_brelse(bp);
932 break;
933 case BUF_CLAIMED:
934 break;
935 case BUF_RETURNED_DONE:
936 buf_brelse(bp);
937 lck_mtx_lock(buf_mtxp);
938 goto out;
939 case BUF_CLAIMED_DONE:
940 lck_mtx_lock(buf_mtxp);
941 goto out;
942 }
943 lck_mtx_lock(buf_mtxp);
944 }
945 out:
946 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
947
948 lck_mtx_unlock(buf_mtxp);
949 }
950
951
952 /*
953 * Flush out and invalidate all buffers associated with a vnode.
954 */
955 int
956 buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
957 {
958 buf_t bp;
959 int error = 0;
960 int must_rescan = 1;
961 struct buflists local_iterblkhd;
962
963 lck_mtx_lock(buf_mtxp);
964
965 for (;;) {
966 if (must_rescan == 0)
967 /*
968 * the lists may not be empty, but all that's left at this
969 * point are metadata or B_LOCKED buffers which are being
970 * skipped... we know this because we made it through both
971 * the clean and dirty lists without dropping buf_mtxp...
972 * each time we drop buf_mtxp we bump "must_rescan"
973 */
974 break;
975 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
976 break;
977 must_rescan = 0;
978 /*
979 * iterate the clean list
980 */
981 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
982 goto try_dirty_list;
983 }
984 while (!LIST_EMPTY(&local_iterblkhd)) {
985 bp = LIST_FIRST(&local_iterblkhd);
986
987 LIST_REMOVE(bp, b_vnbufs);
988 LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
989
990 /*
991 * some filesystems distinguish meta data blocks with a negative logical block #
992 */
993 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
994 continue;
995
996 if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
997 if (error == EDEADLK)
998 /*
999 * this buffer was marked B_LOCKED...
1000 * we didn't drop buf_mtxp, so we
1001 * we don't need to rescan
1002 */
1003 continue;
1004 if (error == EAGAIN) {
1005 /*
1006 * found a busy buffer... we blocked and
1007 * dropped buf_mtxp, so we're going to
1008 * need to rescan after this pass is completed
1009 */
1010 must_rescan++;
1011 continue;
1012 }
1013 /*
1014 * got some kind of 'real' error out of the msleep
1015 * in buf_acquire_locked, terminate the scan and return the error
1016 */
1017 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1018
1019 lck_mtx_unlock(buf_mtxp);
1020 return (error);
1021 }
1022 lck_mtx_unlock(buf_mtxp);
1023
1024 SET(bp->b_flags, B_INVAL);
1025 buf_brelse(bp);
1026
1027 lck_mtx_lock(buf_mtxp);
1028
1029 /*
1030 * by dropping buf_mtxp, we allow new
1031 * buffers to be added to the vnode list(s)
1032 * we'll have to rescan at least once more
1033 * if the queues aren't empty
1034 */
1035 must_rescan++;
1036 }
1037 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1038
1039 try_dirty_list:
1040 /*
1041 * Now iterate on dirty blks
1042 */
1043 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1044 continue;
1045 }
1046 while (!LIST_EMPTY(&local_iterblkhd)) {
1047 bp = LIST_FIRST(&local_iterblkhd);
1048
1049 LIST_REMOVE(bp, b_vnbufs);
1050 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1051
1052 /*
1053 * some filesystems distinguish meta data blocks with a negative logical block #
1054 */
1055 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1056 continue;
1057
1058 if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
1059 if (error == EDEADLK)
1060 /*
1061 * this buffer was marked B_LOCKED...
1062 * we didn't drop buf_mtxp, so we
1063 * we don't need to rescan
1064 */
1065 continue;
1066 if (error == EAGAIN) {
1067 /*
1068 * found a busy buffer... we blocked and
1069 * dropped buf_mtxp, so we're going to
1070 * need to rescan after this pass is completed
1071 */
1072 must_rescan++;
1073 continue;
1074 }
1075 /*
1076 * got some kind of 'real' error out of the msleep
1077 * in buf_acquire_locked, terminate the scan and return the error
1078 */
1079 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1080
1081 lck_mtx_unlock(buf_mtxp);
1082 return (error);
1083 }
1084 lck_mtx_unlock(buf_mtxp);
1085
1086 SET(bp->b_flags, B_INVAL);
1087
1088 if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1089 (void) VNOP_BWRITE(bp);
1090 else
1091 buf_brelse(bp);
1092
1093 lck_mtx_lock(buf_mtxp);
1094 /*
1095 * by dropping buf_mtxp, we allow new
1096 * buffers to be added to the vnode list(s)
1097 * we'll have to rescan at least once more
1098 * if the queues aren't empty
1099 */
1100 must_rescan++;
1101 }
1102 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1103 }
1104 lck_mtx_unlock(buf_mtxp);
1105
1106 return (0);
1107 }
1108
1109 void
1110 buf_flushdirtyblks(vnode_t vp, int wait, int flags, char *msg) {
1111 buf_t bp;
1112 int writes_issued = 0;
1113 errno_t error;
1114 int busy = 0;
1115 struct buflists local_iterblkhd;
1116 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1117
1118 if (flags & BUF_SKIP_LOCKED)
1119 lock_flags |= BAC_SKIP_LOCKED;
1120 if (flags & BUF_SKIP_NONLOCKED)
1121 lock_flags |= BAC_SKIP_NONLOCKED;
1122 loop:
1123 lck_mtx_lock(buf_mtxp);
1124
1125 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
1126 while (!LIST_EMPTY(&local_iterblkhd)) {
1127 bp = LIST_FIRST(&local_iterblkhd);
1128 LIST_REMOVE(bp, b_vnbufs);
1129 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1130
1131 if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY)
1132 busy++;
1133 if (error)
1134 continue;
1135 lck_mtx_unlock(buf_mtxp);
1136
1137 bp->b_flags &= ~B_LOCKED;
1138
1139 /*
1140 * Wait for I/O associated with indirect blocks to complete,
1141 * since there is no way to quickly wait for them below.
1142 */
1143 if ((bp->b_vp == vp) || (wait == 0))
1144 (void) buf_bawrite(bp);
1145 else
1146 (void) VNOP_BWRITE(bp);
1147 writes_issued++;
1148
1149 lck_mtx_lock(buf_mtxp);
1150 }
1151 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1152 }
1153 lck_mtx_unlock(buf_mtxp);
1154
1155 if (wait) {
1156 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1157
1158 if (vp->v_dirtyblkhd.lh_first && busy) {
1159 /*
1160 * we had one or more BUSY buffers on
1161 * the dirtyblock list... most likely
1162 * these are due to delayed writes that
1163 * were moved to the bclean queue but
1164 * have not yet been 'written'.
1165 * if we issued some writes on the
1166 * previous pass, we try again immediately
1167 * if we didn't, we'll sleep for some time
1168 * to allow the state to change...
1169 */
1170 if (writes_issued == 0) {
1171 (void)tsleep((caddr_t)&vp->v_numoutput,
1172 PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1173 }
1174 writes_issued = 0;
1175 busy = 0;
1176
1177 goto loop;
1178 }
1179 }
1180 }
1181
1182
1183 /*
1184 * called with buf_mtxp held...
1185 * this lock protects the queue manipulation
1186 */
1187 static int
1188 buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1189 {
1190 struct buflists * listheadp;
1191
1192 if (flags & VBI_DIRTY)
1193 listheadp = &vp->v_dirtyblkhd;
1194 else
1195 listheadp = &vp->v_cleanblkhd;
1196
1197 while (vp->v_iterblkflags & VBI_ITER) {
1198 vp->v_iterblkflags |= VBI_ITERWANT;
1199 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", 0);
1200 }
1201 if (LIST_EMPTY(listheadp)) {
1202 LIST_INIT(iterheadp);
1203 return(EINVAL);
1204 }
1205 vp->v_iterblkflags |= VBI_ITER;
1206
1207 iterheadp->lh_first = listheadp->lh_first;
1208 listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1209 LIST_INIT(listheadp);
1210
1211 return(0);
1212 }
1213
1214 /*
1215 * called with buf_mtxp held...
1216 * this lock protects the queue manipulation
1217 */
1218 static void
1219 buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1220 {
1221 struct buflists * listheadp;
1222 buf_t bp;
1223
1224 if (flags & VBI_DIRTY)
1225 listheadp = &vp->v_dirtyblkhd;
1226 else
1227 listheadp = &vp->v_cleanblkhd;
1228
1229 while (!LIST_EMPTY(iterheadp)) {
1230 bp = LIST_FIRST(iterheadp);
1231 LIST_REMOVE(bp, b_vnbufs);
1232 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1233 }
1234 vp->v_iterblkflags &= ~VBI_ITER;
1235
1236 if (vp->v_iterblkflags & VBI_ITERWANT) {
1237 vp->v_iterblkflags &= ~VBI_ITERWANT;
1238 wakeup(&vp->v_iterblkflags);
1239 }
1240 }
1241
1242
1243 static void
1244 bremfree_locked(buf_t bp)
1245 {
1246 struct bqueues *dp = NULL;
1247 int whichq = -1;
1248
1249 /*
1250 * We only calculate the head of the freelist when removing
1251 * the last element of the list as that is the only time that
1252 * it is needed (e.g. to reset the tail pointer).
1253 *
1254 * NB: This makes an assumption about how tailq's are implemented.
1255 */
1256 if (bp->b_freelist.tqe_next == NULL) {
1257 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1258 if (dp->tqh_last == &bp->b_freelist.tqe_next)
1259 break;
1260 if (dp == &bufqueues[BQUEUES])
1261 panic("bremfree: lost tail");
1262 }
1263 TAILQ_REMOVE(dp, bp, b_freelist);
1264 whichq = bp->b_whichq;
1265 #if BALANCE_QUEUES
1266 bufqdec(whichq);
1267 #endif
1268 bp->b_whichq = -1;
1269 bp->b_timestamp = 0;
1270 }
1271
1272 /*
1273 * Associate a buffer with a vnode.
1274 */
1275 static void
1276 bgetvp(vnode_t vp, buf_t bp)
1277 {
1278
1279 if (bp->b_vp != vp)
1280 panic("bgetvp: not free");
1281
1282 if (vp->v_type == VBLK || vp->v_type == VCHR)
1283 bp->b_dev = vp->v_rdev;
1284 else
1285 bp->b_dev = NODEV;
1286 /*
1287 * Insert onto list for new vnode.
1288 */
1289 lck_mtx_lock(buf_mtxp);
1290 bufinsvn(bp, &vp->v_cleanblkhd);
1291 lck_mtx_unlock(buf_mtxp);
1292 }
1293
1294 /*
1295 * Disassociate a buffer from a vnode.
1296 */
1297 static void
1298 brelvp(buf_t bp)
1299 {
1300 vnode_t vp;
1301
1302 if ((vp = bp->b_vp) == (vnode_t)NULL)
1303 panic("brelvp: NULL vp");
1304 /*
1305 * Delete from old vnode list, if on one.
1306 */
1307 lck_mtx_lock(buf_mtxp);
1308 if (bp->b_vnbufs.le_next != NOLIST)
1309 bufremvn(bp);
1310 lck_mtx_unlock(buf_mtxp);
1311
1312 bp->b_vp = (vnode_t)NULL;
1313 }
1314
1315 /*
1316 * Reassign a buffer from one vnode to another.
1317 * Used to assign file specific control information
1318 * (indirect blocks) to the vnode to which they belong.
1319 */
1320 static void
1321 buf_reassign(buf_t bp, vnode_t newvp)
1322 {
1323 register struct buflists *listheadp;
1324
1325 if (newvp == NULL) {
1326 printf("buf_reassign: NULL");
1327 return;
1328 }
1329 lck_mtx_lock(buf_mtxp);
1330
1331 /*
1332 * Delete from old vnode list, if on one.
1333 */
1334 if (bp->b_vnbufs.le_next != NOLIST)
1335 bufremvn(bp);
1336 /*
1337 * If dirty, put on list of dirty buffers;
1338 * otherwise insert onto list of clean buffers.
1339 */
1340 if (ISSET(bp->b_flags, B_DELWRI))
1341 listheadp = &newvp->v_dirtyblkhd;
1342 else
1343 listheadp = &newvp->v_cleanblkhd;
1344 bufinsvn(bp, listheadp);
1345
1346 lck_mtx_unlock(buf_mtxp);
1347 }
1348
1349 static __inline__ void
1350 bufhdrinit(buf_t bp)
1351 {
1352 bzero((char *)bp, sizeof *bp);
1353 bp->b_dev = NODEV;
1354 bp->b_rcred = NOCRED;
1355 bp->b_wcred = NOCRED;
1356 bp->b_vnbufs.le_next = NOLIST;
1357 bp->b_flags = B_INVAL;
1358
1359 return;
1360 }
1361
1362 /*
1363 * Initialize buffers and hash links for buffers.
1364 */
1365 __private_extern__ void
1366 bufinit()
1367 {
1368 buf_t bp;
1369 struct bqueues *dp;
1370 int i;
1371 int metabuf;
1372 long whichq;
1373
1374 /* Initialize the buffer queues ('freelists') and the hash table */
1375 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1376 TAILQ_INIT(dp);
1377 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
1378
1379 metabuf = nbuf/8; /* reserved for meta buf */
1380
1381 /* Initialize the buffer headers */
1382 for (i = 0; i < nbuf; i++) {
1383 bp = &buf[i];
1384 bufhdrinit(bp);
1385
1386 /*
1387 * metabuf buffer headers on the meta-data list and
1388 * rest of the buffer headers on the empty list
1389 */
1390 if (--metabuf)
1391 whichq = BQ_META;
1392 else
1393 whichq = BQ_EMPTY;
1394
1395 BLISTNONE(bp);
1396 dp = &bufqueues[whichq];
1397 binsheadfree(bp, dp, whichq);
1398 binshash(bp, &invalhash);
1399 }
1400
1401 for (; i < nbuf + niobuf; i++) {
1402 bp = &buf[i];
1403 bufhdrinit(bp);
1404 binsheadfree(bp, &iobufqueue, -1);
1405 }
1406
1407 /*
1408 * allocate lock group attribute and group
1409 */
1410 buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1411 //lck_grp_attr_setstat(buf_mtx_grp_attr);
1412 buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1413
1414 /*
1415 * allocate the lock attribute
1416 */
1417 buf_mtx_attr = lck_attr_alloc_init();
1418 //lck_attr_setdebug(buf_mtx_attr);
1419
1420 /*
1421 * allocate and initialize mutex's for the buffer and iobuffer pools
1422 */
1423 buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1424 iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1425
1426 if (iobuffer_mtxp == NULL)
1427 panic("couldn't create iobuffer mutex");
1428
1429 if (buf_mtxp == NULL)
1430 panic("couldn't create buf mutex");
1431
1432 /*
1433 * allocate and initialize cluster specific global locks...
1434 */
1435 cluster_init();
1436
1437 printf("using %d buffer headers and %d cluster IO buffer headers\n",
1438 nbuf, niobuf);
1439
1440 /* Set up zones used by the buffer cache */
1441 bufzoneinit();
1442
1443 /* start the bcleanbuf() thread */
1444 bcleanbuf_thread_init();
1445
1446 #if BALANCE_QUEUES
1447 {
1448 static void bufq_balance_thread_init();
1449 /* create a thread to do dynamic buffer queue balancing */
1450 bufq_balance_thread_init();
1451 }
1452 #endif /* notyet */
1453 }
1454
1455 static struct buf *
1456 bio_doread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, int async, int queuetype)
1457 {
1458 buf_t bp;
1459
1460 bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
1461
1462 /*
1463 * If buffer does not have data valid, start a read.
1464 * Note that if buffer is B_INVAL, buf_getblk() won't return it.
1465 * Therefore, it's valid if it's I/O has completed or been delayed.
1466 */
1467 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
1468 struct proc *p;
1469
1470 p = current_proc();
1471
1472 /* Start I/O for the buffer (keeping credentials). */
1473 SET(bp->b_flags, B_READ | async);
1474 if (cred != NOCRED && bp->b_rcred == NOCRED) {
1475 kauth_cred_ref(cred);
1476 bp->b_rcred = cred;
1477 }
1478
1479 VNOP_STRATEGY(bp);
1480
1481 trace(TR_BREADMISS, pack(vp, size), blkno);
1482
1483 /* Pay for the read. */
1484 if (p && p->p_stats)
1485 p->p_stats->p_ru.ru_inblock++; /* XXX */
1486
1487 if (async) {
1488 /*
1489 * since we asked for an ASYNC I/O
1490 * the biodone will do the brelse
1491 * we don't want to pass back a bp
1492 * that we don't 'own'
1493 */
1494 bp = NULL;
1495 }
1496 } else if (async) {
1497 buf_brelse(bp);
1498 bp = NULL;
1499 }
1500
1501 trace(TR_BREADHIT, pack(vp, size), blkno);
1502
1503 return (bp);
1504 }
1505
1506 /*
1507 * Perform the reads for buf_breadn() and buf_meta_breadn().
1508 * Trivial modification to the breada algorithm presented in Bach (p.55).
1509 */
1510 static errno_t
1511 do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
1512 int nrablks, ucred_t cred, buf_t *bpp, int queuetype)
1513 {
1514 buf_t bp;
1515 int i;
1516
1517 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
1518
1519 /*
1520 * For each of the read-ahead blocks, start a read, if necessary.
1521 */
1522 for (i = 0; i < nrablks; i++) {
1523 /* If it's in the cache, just go on to next one. */
1524 if (incore(vp, rablks[i]))
1525 continue;
1526
1527 /* Get a buffer for the read-ahead block */
1528 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
1529 }
1530
1531 /* Otherwise, we had to start a read for it; wait until it's valid. */
1532 return (buf_biowait(bp));
1533 }
1534
1535
1536 /*
1537 * Read a disk block.
1538 * This algorithm described in Bach (p.54).
1539 */
1540 errno_t
1541 buf_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1542 {
1543 buf_t bp;
1544
1545 /* Get buffer for block. */
1546 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
1547
1548 /* Wait for the read to complete, and return result. */
1549 return (buf_biowait(bp));
1550 }
1551
1552 /*
1553 * Read a disk block. [bread() for meta-data]
1554 * This algorithm described in Bach (p.54).
1555 */
1556 errno_t
1557 buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
1558 {
1559 buf_t bp;
1560
1561 /* Get buffer for block. */
1562 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
1563
1564 /* Wait for the read to complete, and return result. */
1565 return (buf_biowait(bp));
1566 }
1567
1568 /*
1569 * Read-ahead multiple disk blocks. The first is sync, the rest async.
1570 */
1571 errno_t
1572 buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1573 {
1574 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
1575 }
1576
1577 /*
1578 * Read-ahead multiple disk blocks. The first is sync, the rest async.
1579 * [buf_breadn() for meta-data]
1580 */
1581 errno_t
1582 buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, ucred_t cred, buf_t *bpp)
1583 {
1584 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
1585 }
1586
1587 /*
1588 * Block write. Described in Bach (p.56)
1589 */
1590 errno_t
1591 buf_bwrite(buf_t bp)
1592 {
1593 int sync, wasdelayed;
1594 errno_t rv;
1595 proc_t p = current_proc();
1596 vnode_t vp = bp->b_vp;
1597
1598 if (bp->b_datap == 0) {
1599 if (brecover_data(bp) == 0)
1600 return (0);
1601 }
1602 /* Remember buffer type, to switch on it later. */
1603 sync = !ISSET(bp->b_flags, B_ASYNC);
1604 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
1605 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
1606
1607 if (wasdelayed)
1608 OSAddAtomic(-1, &nbdwrite);
1609
1610 if (!sync) {
1611 /*
1612 * If not synchronous, pay for the I/O operation and make
1613 * sure the buf is on the correct vnode queue. We have
1614 * to do this now, because if we don't, the vnode may not
1615 * be properly notified that its I/O has completed.
1616 */
1617 if (wasdelayed)
1618 buf_reassign(bp, vp);
1619 else
1620 if (p && p->p_stats)
1621 p->p_stats->p_ru.ru_oublock++; /* XXX */
1622 }
1623 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
1624
1625 /* Initiate disk write. Make sure the appropriate party is charged. */
1626
1627 OSAddAtomic(1, &vp->v_numoutput);
1628
1629 VNOP_STRATEGY(bp);
1630
1631 if (sync) {
1632 /*
1633 * If I/O was synchronous, wait for it to complete.
1634 */
1635 rv = buf_biowait(bp);
1636
1637 /*
1638 * Pay for the I/O operation, if it's not been paid for, and
1639 * make sure it's on the correct vnode queue. (async operatings
1640 * were payed for above.)
1641 */
1642 if (wasdelayed)
1643 buf_reassign(bp, vp);
1644 else
1645 if (p && p->p_stats)
1646 p->p_stats->p_ru.ru_oublock++; /* XXX */
1647
1648 /* Release the buffer. */
1649 // XXXdbg - only if the unused bit is set
1650 if (!ISSET(bp->b_flags, B_NORELSE)) {
1651 buf_brelse(bp);
1652 } else {
1653 CLR(bp->b_flags, B_NORELSE);
1654 }
1655
1656 return (rv);
1657 } else {
1658 return (0);
1659 }
1660 }
1661
1662 int
1663 vn_bwrite(ap)
1664 struct vnop_bwrite_args *ap;
1665 {
1666 return (buf_bwrite(ap->a_bp));
1667 }
1668
1669 /*
1670 * Delayed write.
1671 *
1672 * The buffer is marked dirty, but is not queued for I/O.
1673 * This routine should be used when the buffer is expected
1674 * to be modified again soon, typically a small write that
1675 * partially fills a buffer.
1676 *
1677 * NB: magnetic tapes cannot be delayed; they must be
1678 * written in the order that the writes are requested.
1679 *
1680 * Described in Leffler, et al. (pp. 208-213).
1681 *
1682 * Note: With the abilitty to allocate additional buffer
1683 * headers, we can get in to the situation where "too" many
1684 * buf_bdwrite()s can create situation where the kernel can create
1685 * buffers faster than the disks can service. Doing a buf_bawrite() in
1686 * cases were we have "too many" outstanding buf_bdwrite()s avoids that.
1687 */
1688 __private_extern__ int
1689 bdwrite_internal(buf_t bp, int return_error)
1690 {
1691 proc_t p = current_proc();
1692 vnode_t vp = bp->b_vp;
1693
1694 /*
1695 * If the block hasn't been seen before:
1696 * (1) Mark it as having been seen,
1697 * (2) Charge for the write.
1698 * (3) Make sure it's on its vnode's correct block list,
1699 */
1700 if (!ISSET(bp->b_flags, B_DELWRI)) {
1701 SET(bp->b_flags, B_DELWRI);
1702 if (p && p->p_stats)
1703 p->p_stats->p_ru.ru_oublock++; /* XXX */
1704 OSAddAtomic(1, &nbdwrite);
1705 buf_reassign(bp, vp);
1706 }
1707
1708 /* If this is a tape block, write it the block now. */
1709 if (ISSET(bp->b_flags, B_TAPE)) {
1710 VNOP_BWRITE(bp);
1711 return (0);
1712 }
1713
1714 /*
1715 * if we're not LOCKED, but the total number of delayed writes
1716 * has climbed above 75% of the total buffers in the system
1717 * return an error if the caller has indicated that it can
1718 * handle one in this case, otherwise schedule the I/O now
1719 * this is done to prevent us from allocating tons of extra
1720 * buffers when dealing with virtual disks (i.e. DiskImages),
1721 * because additional buffers are dynamically allocated to prevent
1722 * deadlocks from occurring
1723 *
1724 * however, can't do a buf_bawrite() if the LOCKED bit is set because the
1725 * buffer is part of a transaction and can't go to disk until
1726 * the LOCKED bit is cleared.
1727 */
1728 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
1729 if (return_error)
1730 return (EAGAIN);
1731 /*
1732 * If the vnode has "too many" write operations in progress
1733 * wait for them to finish the IO
1734 */
1735 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (char *)"buf_bdwrite");
1736
1737 return (buf_bawrite(bp));
1738 }
1739
1740 /* Otherwise, the "write" is done, so mark and release the buffer. */
1741 SET(bp->b_flags, B_DONE);
1742 buf_brelse(bp);
1743 return (0);
1744 }
1745
1746 errno_t
1747 buf_bdwrite(buf_t bp)
1748 {
1749 return (bdwrite_internal(bp, 0));
1750 }
1751
1752
1753 /*
1754 * Asynchronous block write; just an asynchronous buf_bwrite().
1755 *
1756 * Note: With the abilitty to allocate additional buffer
1757 * headers, we can get in to the situation where "too" many
1758 * buf_bawrite()s can create situation where the kernel can create
1759 * buffers faster than the disks can service.
1760 * We limit the number of "in flight" writes a vnode can have to
1761 * avoid this.
1762 */
1763 static int
1764 bawrite_internal(buf_t bp, int throttle)
1765 {
1766 vnode_t vp = bp->b_vp;
1767
1768 if (vp) {
1769 if (throttle)
1770 /*
1771 * If the vnode has "too many" write operations in progress
1772 * wait for them to finish the IO
1773 */
1774 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
1775 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
1776 /*
1777 * return to the caller and
1778 * let him decide what to do
1779 */
1780 return (EWOULDBLOCK);
1781 }
1782 SET(bp->b_flags, B_ASYNC);
1783
1784 return (VNOP_BWRITE(bp));
1785 }
1786
1787 errno_t
1788 buf_bawrite(buf_t bp)
1789 {
1790 return (bawrite_internal(bp, 1));
1791 }
1792
1793
1794 /*
1795 * Release a buffer on to the free lists.
1796 * Described in Bach (p. 46).
1797 */
1798 void
1799 buf_brelse(buf_t bp)
1800 {
1801 struct bqueues *bufq;
1802 long whichq;
1803 upl_t upl;
1804 int need_wakeup = 0;
1805 int need_bp_wakeup = 0;
1806
1807
1808 if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
1809 panic("buf_brelse: bad buffer = %x\n", bp);
1810
1811 #ifdef JOE_DEBUG
1812 bp->b_stackbrelse[0] = __builtin_return_address(0);
1813 bp->b_stackbrelse[1] = __builtin_return_address(1);
1814 bp->b_stackbrelse[2] = __builtin_return_address(2);
1815 bp->b_stackbrelse[3] = __builtin_return_address(3);
1816 bp->b_stackbrelse[4] = __builtin_return_address(4);
1817 bp->b_stackbrelse[5] = __builtin_return_address(5);
1818
1819 bp->b_lastbrelse = current_thread();
1820 bp->b_tag = 0;
1821 #endif
1822 if (bp->b_lflags & BL_IOBUF) {
1823 free_io_buf(bp);
1824 return;
1825 }
1826
1827 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
1828 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_datap,
1829 bp->b_flags, 0);
1830
1831 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1832
1833 /*
1834 * if we're invalidating a buffer that has the B_FILTER bit
1835 * set then call the b_iodone function so it gets cleaned
1836 * up properly.
1837 *
1838 * the HFS journal code depends on this
1839 */
1840 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
1841 if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */
1842 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
1843 void *arg = (void *)bp->b_transaction;
1844
1845 CLR(bp->b_flags, B_FILTER); /* but note callout done */
1846 bp->b_iodone = NULL;
1847 bp->b_transaction = NULL;
1848
1849 if (iodone_func == NULL) {
1850 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
1851 }
1852 (*iodone_func)(bp, arg);
1853 }
1854 }
1855 /*
1856 * I/O is done. Cleanup the UPL state
1857 */
1858 upl = bp->b_upl;
1859
1860 if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
1861 kern_return_t kret;
1862 int upl_flags;
1863
1864 if ( (upl == NULL) ) {
1865 if ( !ISSET(bp->b_flags, B_INVAL)) {
1866 kret = ubc_create_upl(bp->b_vp,
1867 ubc_blktooff(bp->b_vp, bp->b_lblkno),
1868 bp->b_bufsize,
1869 &upl,
1870 NULL,
1871 UPL_PRECIOUS);
1872
1873 if (kret != KERN_SUCCESS)
1874 panic("brelse: Failed to create UPL");
1875 #ifdef UPL_DEBUG
1876 upl_ubc_alias_set(upl, bp, 5);
1877 #endif /* UPL_DEBUG */
1878 }
1879 } else {
1880 if (bp->b_datap) {
1881 kret = ubc_upl_unmap(upl);
1882
1883 if (kret != KERN_SUCCESS)
1884 panic("ubc_upl_unmap failed");
1885 bp->b_datap = (uintptr_t)NULL;
1886 }
1887 }
1888 if (upl) {
1889 if (bp->b_flags & (B_ERROR | B_INVAL)) {
1890 if (bp->b_flags & (B_READ | B_INVAL))
1891 upl_flags = UPL_ABORT_DUMP_PAGES;
1892 else
1893 upl_flags = 0;
1894
1895 ubc_upl_abort(upl, upl_flags);
1896 } else {
1897 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
1898 upl_flags = UPL_COMMIT_SET_DIRTY ;
1899 else
1900 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
1901
1902 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
1903 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1904 }
1905 bp->b_upl = NULL;
1906 }
1907 } else {
1908 if ( (upl) )
1909 panic("brelse: UPL set for non VREG; vp=%x", bp->b_vp);
1910 }
1911
1912 /*
1913 * If it's locked, don't report an error; try again later.
1914 */
1915 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
1916 CLR(bp->b_flags, B_ERROR);
1917 /*
1918 * If it's not cacheable, or an error, mark it invalid.
1919 */
1920 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
1921 SET(bp->b_flags, B_INVAL);
1922
1923 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
1924 /*
1925 * If it's invalid or empty, dissociate it from its vnode
1926 * and put on the head of the appropriate queue.
1927 */
1928 if (bp->b_vp)
1929 brelvp(bp);
1930
1931 if (ISSET(bp->b_flags, B_DELWRI))
1932 OSAddAtomic(-1, &nbdwrite);
1933
1934 CLR(bp->b_flags, (B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE));
1935 /*
1936 * Determine which queue the buffer should be on, then put it there.
1937 */
1938 if (bp->b_bufsize <= 0)
1939 whichq = BQ_EMPTY; /* no data */
1940 else if (ISSET(bp->b_flags, B_META))
1941 whichq = BQ_META; /* meta-data */
1942 else
1943 whichq = BQ_AGE; /* invalid data */
1944 bufq = &bufqueues[whichq];
1945
1946 lck_mtx_lock(buf_mtxp);
1947
1948 binsheadfree(bp, bufq, whichq);
1949 } else {
1950 /*
1951 * It has valid data. Put it on the end of the appropriate
1952 * queue, so that it'll stick around for as long as possible.
1953 */
1954 if (ISSET(bp->b_flags, B_LOCKED))
1955 whichq = BQ_LOCKED; /* locked in core */
1956 else if (ISSET(bp->b_flags, B_META))
1957 whichq = BQ_META; /* meta-data */
1958 else if (ISSET(bp->b_flags, B_AGE))
1959 whichq = BQ_AGE; /* stale but valid data */
1960 else
1961 whichq = BQ_LRU; /* valid data */
1962 bufq = &bufqueues[whichq];
1963
1964 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
1965
1966 lck_mtx_lock(buf_mtxp);
1967
1968 binstailfree(bp, bufq, whichq);
1969 }
1970 if (needbuffer) {
1971 /*
1972 * needbuffer is a global
1973 * we're currently using buf_mtxp to protect it
1974 * delay doing the actual wakeup until after
1975 * we drop buf_mtxp
1976 */
1977 needbuffer = 0;
1978 need_wakeup = 1;
1979 }
1980 if (ISSET(bp->b_lflags, BL_WANTED)) {
1981 /*
1982 * delay the actual wakeup until after we
1983 * clear BL_BUSY and we've dropped buf_mtxp
1984 */
1985 need_bp_wakeup = 1;
1986 }
1987 /*
1988 * Unlock the buffer.
1989 */
1990 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
1991
1992 lck_mtx_unlock(buf_mtxp);
1993
1994 if (need_wakeup) {
1995 /*
1996 * Wake up any processes waiting for any buffer to become free.
1997 */
1998 wakeup(&needbuffer);
1999 }
2000 if (need_bp_wakeup) {
2001 /*
2002 * Wake up any proceeses waiting for _this_ buffer to become free.
2003 */
2004 wakeup(bp);
2005 }
2006 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2007 (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
2008 }
2009
2010 /*
2011 * Determine if a block is in the cache.
2012 * Just look on what would be its hash chain. If it's there, return
2013 * a pointer to it, unless it's marked invalid. If it's marked invalid,
2014 * we normally don't return the buffer, unless the caller explicitly
2015 * wants us to.
2016 */
2017 static boolean_t
2018 incore(vnode_t vp, daddr64_t blkno)
2019 {
2020 boolean_t retval;
2021
2022 lck_mtx_lock(buf_mtxp);
2023
2024 if (incore_locked(vp, blkno))
2025 retval = TRUE;
2026 else
2027 retval = FALSE;
2028 lck_mtx_unlock(buf_mtxp);
2029
2030 return (retval);
2031 }
2032
2033
2034 static buf_t
2035 incore_locked(vnode_t vp, daddr64_t blkno)
2036 {
2037 struct buf *bp;
2038
2039 bp = BUFHASH(vp, blkno)->lh_first;
2040
2041 /* Search hash chain */
2042 for (; bp != NULL; bp = bp->b_hash.le_next) {
2043 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2044 !ISSET(bp->b_flags, B_INVAL)) {
2045 return (bp);
2046 }
2047 }
2048 return (0);
2049 }
2050
2051
2052 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2053 /*
2054 * Get a block of requested size that is associated with
2055 * a given vnode and block offset. If it is found in the
2056 * block cache, mark it as having been found, make it busy
2057 * and return it. Otherwise, return an empty block of the
2058 * correct size. It is up to the caller to insure that the
2059 * cached blocks be of the correct size.
2060 */
2061 buf_t
2062 buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2063 {
2064 buf_t bp;
2065 int err;
2066 upl_t upl;
2067 upl_page_info_t *pl;
2068 kern_return_t kret;
2069 int ret_only_valid;
2070 struct timespec ts;
2071 int upl_flags;
2072
2073 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2074 (int)(blkno * PAGE_SIZE), size, operation, 0, 0);
2075
2076 ret_only_valid = operation & BLK_ONLYVALID;
2077 operation &= ~BLK_ONLYVALID;
2078 start:
2079 lck_mtx_lock(buf_mtxp);
2080 start_locked:
2081 if ((bp = incore_locked(vp, blkno))) {
2082 /*
2083 * Found in the Buffer Cache
2084 */
2085 if (ISSET(bp->b_lflags, BL_BUSY)) {
2086 /*
2087 * but is busy
2088 */
2089 switch (operation) {
2090 case BLK_READ:
2091 case BLK_WRITE:
2092 case BLK_META:
2093 SET(bp->b_lflags, BL_WANTED);
2094 bufstats.bufs_busyincore++;
2095
2096 /*
2097 * don't retake the mutex after being awakened...
2098 * the time out is in msecs
2099 */
2100 ts.tv_sec = (slptimeo/1000);
2101 ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000;
2102
2103 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2104
2105 /*
2106 * Callers who call with PCATCH or timeout are
2107 * willing to deal with the NULL pointer
2108 */
2109 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2110 return (NULL);
2111 goto start;
2112 /*NOTREACHED*/
2113 break;
2114
2115 default:
2116 /*
2117 * unknown operation requested
2118 */
2119 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2120 /*NOTREACHED*/
2121 break;
2122 }
2123 } else {
2124 /*
2125 * buffer in core and not busy
2126 */
2127 if ( (bp->b_upl) )
2128 panic("buffer has UPL, but not marked BUSY: %x", bp);
2129 SET(bp->b_lflags, BL_BUSY);
2130 SET(bp->b_flags, B_CACHE);
2131 #ifdef JOE_DEBUG
2132 bp->b_owner = current_thread();
2133 bp->b_tag = 1;
2134 #endif
2135 bremfree_locked(bp);
2136 bufstats.bufs_incore++;
2137
2138 lck_mtx_unlock(buf_mtxp);
2139
2140 if ( !ret_only_valid)
2141 allocbuf(bp, size);
2142
2143 upl_flags = 0;
2144 switch (operation) {
2145 case BLK_WRITE:
2146 /*
2147 * "write" operation: let the UPL subsystem
2148 * know that we intend to modify the buffer
2149 * cache pages we're gathering.
2150 */
2151 upl_flags |= UPL_WILL_MODIFY;
2152 case BLK_READ:
2153 upl_flags |= UPL_PRECIOUS;
2154 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2155 kret = ubc_create_upl(vp,
2156 ubc_blktooff(vp, bp->b_lblkno),
2157 bp->b_bufsize,
2158 &upl,
2159 &pl,
2160 upl_flags);
2161 if (kret != KERN_SUCCESS)
2162 panic("Failed to create UPL");
2163
2164 bp->b_upl = upl;
2165
2166 if (upl_valid_page(pl, 0)) {
2167 if (upl_dirty_page(pl, 0))
2168 SET(bp->b_flags, B_WASDIRTY);
2169 else
2170 CLR(bp->b_flags, B_WASDIRTY);
2171 } else
2172 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
2173
2174 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2175
2176 if (kret != KERN_SUCCESS)
2177 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2178 }
2179 break;
2180
2181 case BLK_META:
2182 /*
2183 * VM is not involved in IO for the meta data
2184 * buffer already has valid data
2185 */
2186 break;
2187
2188 default:
2189 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
2190 /*NOTREACHED*/
2191 break;
2192 }
2193 }
2194 } else { /* not incore() */
2195 int queue = BQ_EMPTY; /* Start with no preference */
2196
2197 if (ret_only_valid) {
2198 lck_mtx_unlock(buf_mtxp);
2199 return (NULL);
2200 }
2201
2202 if ((UBCINVALID(vp)) || !(UBCINFOEXISTS(vp)))
2203 operation = BLK_META;
2204
2205 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
2206 goto start_locked;
2207
2208 /*
2209 * getnewbuf may block for a number of different reasons...
2210 * if it does, it's then possible for someone else to
2211 * create a buffer for the same block and insert it into
2212 * the hash... if we see it incore at this point we dump
2213 * the buffer we were working on and start over
2214 */
2215 if (incore_locked(vp, blkno)) {
2216 SET(bp->b_flags, B_INVAL);
2217 binshash(bp, &invalhash);
2218
2219 lck_mtx_unlock(buf_mtxp);
2220
2221 buf_brelse(bp);
2222 goto start;
2223 }
2224 /*
2225 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
2226 * CALLED! BE CAREFUL.
2227 */
2228
2229 /*
2230 * mark the buffer as B_META if indicated
2231 * so that when buffer is released it will goto META queue
2232 */
2233 if (operation == BLK_META)
2234 SET(bp->b_flags, B_META);
2235
2236 bp->b_blkno = bp->b_lblkno = blkno;
2237 bp->b_vp = vp;
2238
2239 /*
2240 * Insert in the hash so that incore() can find it
2241 */
2242 binshash(bp, BUFHASH(vp, blkno));
2243
2244 lck_mtx_unlock(buf_mtxp);
2245
2246 bgetvp(vp, bp);
2247
2248 allocbuf(bp, size);
2249
2250 upl_flags = 0;
2251 switch (operation) {
2252 case BLK_META:
2253 /*
2254 * buffer data is invalid...
2255 *
2256 * I don't want to have to retake buf_mtxp,
2257 * so the miss and vmhits counters are done
2258 * with Atomic updates... all other counters
2259 * in bufstats are protected with either
2260 * buf_mtxp or iobuffer_mtxp
2261 */
2262 OSAddAtomic(1, &bufstats.bufs_miss);
2263 break;
2264
2265 case BLK_WRITE:
2266 /*
2267 * "write" operation: let the UPL subsystem know
2268 * that we intend to modify the buffer cache pages
2269 * we're gathering.
2270 */
2271 upl_flags |= UPL_WILL_MODIFY;
2272 case BLK_READ:
2273 { off_t f_offset;
2274 size_t contig_bytes;
2275 int bmap_flags;
2276
2277 if ( (bp->b_upl) )
2278 panic("bp already has UPL: %x",bp);
2279
2280 f_offset = ubc_blktooff(vp, blkno);
2281
2282 upl_flags |= UPL_PRECIOUS;
2283 kret = ubc_create_upl(vp,
2284 f_offset,
2285 bp->b_bufsize,
2286 &upl,
2287 &pl,
2288 upl_flags);
2289
2290 if (kret != KERN_SUCCESS)
2291 panic("Failed to create UPL");
2292 #ifdef UPL_DEBUG
2293 upl_ubc_alias_set(upl, bp, 4);
2294 #endif /* UPL_DEBUG */
2295 bp->b_upl = upl;
2296
2297 if (upl_valid_page(pl, 0)) {
2298
2299 if (operation == BLK_READ)
2300 bmap_flags = VNODE_READ;
2301 else
2302 bmap_flags = VNODE_WRITE;
2303
2304 SET(bp->b_flags, B_CACHE | B_DONE);
2305
2306 OSAddAtomic(1, &bufstats.bufs_vmhits);
2307
2308 bp->b_validoff = 0;
2309 bp->b_dirtyoff = 0;
2310
2311 if (upl_dirty_page(pl, 0)) {
2312 /* page is dirty */
2313 SET(bp->b_flags, B_WASDIRTY);
2314
2315 bp->b_validend = bp->b_bcount;
2316 bp->b_dirtyend = bp->b_bcount;
2317 } else {
2318 /* page is clean */
2319 bp->b_validend = bp->b_bcount;
2320 bp->b_dirtyend = 0;
2321 }
2322 /*
2323 * try to recreate the physical block number associated with
2324 * this buffer...
2325 */
2326 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
2327 panic("getblk: VNOP_BLOCKMAP failed");
2328 /*
2329 * if the extent represented by this buffer
2330 * is not completely physically contiguous on
2331 * disk, than we can't cache the physical mapping
2332 * in the buffer header
2333 */
2334 if ((long)contig_bytes < bp->b_bcount)
2335 bp->b_blkno = bp->b_lblkno;
2336 } else {
2337 OSAddAtomic(1, &bufstats.bufs_miss);
2338 }
2339 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
2340
2341 if (kret != KERN_SUCCESS)
2342 panic("getblk: ubc_upl_map() failed with (%d)", kret);
2343 break;
2344 }
2345 default:
2346 panic("getblk: paging or unknown operation - %x", operation);
2347 /*NOTREACHED*/
2348 break;
2349 }
2350 }
2351 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
2352 (int)bp, (int)bp->b_datap, bp->b_flags, 3, 0);
2353
2354 #ifdef JOE_DEBUG
2355 bp->b_stackgetblk[0] = __builtin_return_address(0);
2356 bp->b_stackgetblk[1] = __builtin_return_address(1);
2357 bp->b_stackgetblk[2] = __builtin_return_address(2);
2358 bp->b_stackgetblk[3] = __builtin_return_address(3);
2359 bp->b_stackgetblk[4] = __builtin_return_address(4);
2360 bp->b_stackgetblk[5] = __builtin_return_address(5);
2361 #endif
2362 return (bp);
2363 }
2364
2365 /*
2366 * Get an empty, disassociated buffer of given size.
2367 */
2368 buf_t
2369 buf_geteblk(size)
2370 int size;
2371 {
2372 buf_t bp;
2373 int queue = BQ_EMPTY;
2374
2375 lck_mtx_lock(buf_mtxp);
2376
2377 while ((bp = getnewbuf(0, 0, &queue)) == 0)
2378 ;
2379 SET(bp->b_flags, (B_META|B_INVAL));
2380
2381 #if DIAGNOSTIC
2382 assert(queue == BQ_EMPTY);
2383 #endif /* DIAGNOSTIC */
2384 /* XXX need to implement logic to deal with other queues */
2385
2386 binshash(bp, &invalhash);
2387 bufstats.bufs_eblk++;
2388
2389 lck_mtx_unlock(buf_mtxp);
2390
2391 allocbuf(bp, size);
2392
2393 return (bp);
2394 }
2395
2396 /*
2397 * Zones for the meta data buffers
2398 */
2399
2400 #define MINMETA 512
2401 #define MAXMETA 4096
2402
2403 struct meta_zone_entry {
2404 zone_t mz_zone;
2405 vm_size_t mz_size;
2406 vm_size_t mz_max;
2407 char *mz_name;
2408 };
2409
2410 struct meta_zone_entry meta_zones[] = {
2411 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2412 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
2413 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
2414 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2415 {NULL, 0, 0, "" } /* End */
2416 };
2417
2418 /*
2419 * Initialize the meta data zones
2420 */
2421 static void
2422 bufzoneinit(void)
2423 {
2424 int i;
2425
2426 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2427 meta_zones[i].mz_zone =
2428 zinit(meta_zones[i].mz_size,
2429 meta_zones[i].mz_max,
2430 PAGE_SIZE,
2431 meta_zones[i].mz_name);
2432 }
2433 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2434 }
2435
2436 static __inline__ zone_t
2437 getbufzone(size_t size)
2438 {
2439 int i;
2440
2441 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2442 panic("getbufzone: incorect size = %d", size);
2443
2444 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2445 if (meta_zones[i].mz_size >= size)
2446 break;
2447 }
2448
2449 return (meta_zones[i].mz_zone);
2450 }
2451
2452 /*
2453 * With UBC, there is no need to expand / shrink the file data
2454 * buffer. The VM uses the same pages, hence no waste.
2455 * All the file data buffers can have one size.
2456 * In fact expand / shrink would be an expensive operation.
2457 *
2458 * Only exception to this is meta-data buffers. Most of the
2459 * meta data operations are smaller than PAGE_SIZE. Having the
2460 * meta-data buffers grow and shrink as needed, optimizes use
2461 * of the kernel wired memory.
2462 */
2463
2464 int
2465 allocbuf(buf_t bp, int size)
2466 {
2467 vm_size_t desired_size;
2468
2469 desired_size = roundup(size, CLBYTES);
2470
2471 if (desired_size < PAGE_SIZE)
2472 desired_size = PAGE_SIZE;
2473 if (desired_size > MAXBSIZE)
2474 panic("allocbuf: buffer larger than MAXBSIZE requested");
2475
2476 if (ISSET(bp->b_flags, B_META)) {
2477 zone_t zprev, z;
2478 int nsize = roundup(size, MINMETA);
2479
2480 if (bp->b_datap) {
2481 vm_offset_t elem = (vm_offset_t)bp->b_datap;
2482
2483 if (ISSET(bp->b_flags, B_ZALLOC)) {
2484 if (bp->b_bufsize < nsize) {
2485 /* reallocate to a bigger size */
2486
2487 zprev = getbufzone(bp->b_bufsize);
2488 if (nsize <= MAXMETA) {
2489 desired_size = nsize;
2490 z = getbufzone(nsize);
2491 bp->b_datap = (uintptr_t)zalloc(z);
2492 } else {
2493 bp->b_datap = (uintptr_t)NULL;
2494 kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2495 CLR(bp->b_flags, B_ZALLOC);
2496 }
2497 bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2498 zfree(zprev, (void *)elem);
2499 } else {
2500 desired_size = bp->b_bufsize;
2501 }
2502
2503 } else {
2504 if ((vm_size_t)bp->b_bufsize < desired_size) {
2505 /* reallocate to a bigger size */
2506 bp->b_datap = (uintptr_t)NULL;
2507 kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2508 bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
2509 kmem_free(kernel_map, elem, bp->b_bufsize);
2510 } else {
2511 desired_size = bp->b_bufsize;
2512 }
2513 }
2514 } else {
2515 /* new allocation */
2516 if (nsize <= MAXMETA) {
2517 desired_size = nsize;
2518 z = getbufzone(nsize);
2519 bp->b_datap = (uintptr_t)zalloc(z);
2520 SET(bp->b_flags, B_ZALLOC);
2521 } else
2522 kmem_alloc(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
2523 }
2524 }
2525 bp->b_bufsize = desired_size;
2526 bp->b_bcount = size;
2527
2528 return (0);
2529 }
2530
2531 /*
2532 * Get a new buffer from one of the free lists.
2533 *
2534 * Request for a queue is passes in. The queue from which the buffer was taken
2535 * from is returned. Out of range queue requests get BQ_EMPTY. Request for
2536 * BQUEUE means no preference. Use heuristics in that case.
2537 * Heuristics is as follows:
2538 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
2539 * If none available block till one is made available.
2540 * If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
2541 * Pick the most stale buffer.
2542 * If found buffer was marked delayed write, start the async. write
2543 * and restart the search.
2544 * Initialize the fields and disassociate the buffer from the vnode.
2545 * Remove the buffer from the hash. Return the buffer and the queue
2546 * on which it was found.
2547 *
2548 * buf_mtxp is held upon entry
2549 * returns with buf_mtxp locked
2550 */
2551
2552 static buf_t
2553 getnewbuf(int slpflag, int slptimeo, int * queue)
2554 {
2555 buf_t bp;
2556 buf_t lru_bp;
2557 buf_t age_bp;
2558 buf_t meta_bp;
2559 int age_time, lru_time, bp_time, meta_time;
2560 int req = *queue; /* save it for restarts */
2561 struct timespec ts;
2562
2563 start:
2564 /*
2565 * invalid request gets empty queue
2566 */
2567 if ((*queue > BQUEUES) || (*queue < 0)
2568 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
2569 *queue = BQ_EMPTY;
2570
2571 /*
2572 * (*queue == BQUEUES) means no preference
2573 */
2574 if (*queue != BQUEUES) {
2575 /* Try for the requested queue first */
2576 bp = bufqueues[*queue].tqh_first;
2577 if (bp)
2578 goto found;
2579 }
2580
2581 /* Unable to use requested queue */
2582 age_bp = bufqueues[BQ_AGE].tqh_first;
2583 lru_bp = bufqueues[BQ_LRU].tqh_first;
2584 meta_bp = bufqueues[BQ_META].tqh_first;
2585
2586 if (!age_bp && !lru_bp && !meta_bp) {
2587 /*
2588 * Unavailble on AGE or LRU or META queues
2589 * Try the empty list first
2590 */
2591 bp = bufqueues[BQ_EMPTY].tqh_first;
2592 if (bp) {
2593 *queue = BQ_EMPTY;
2594 goto found;
2595 }
2596 lck_mtx_unlock(buf_mtxp);
2597
2598 /* Create a new temporary buffer header */
2599 bp = (struct buf *)zalloc(buf_hdr_zone);
2600
2601 lck_mtx_lock(buf_mtxp);
2602
2603 if (bp) {
2604 bufhdrinit(bp);
2605 BLISTNONE(bp);
2606 binshash(bp, &invalhash);
2607 SET(bp->b_flags, B_HDRALLOC);
2608 *queue = BQ_EMPTY;
2609 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2610 buf_hdr_count++;
2611 goto found;
2612 }
2613 bufstats.bufs_sleeps++;
2614
2615 /* wait for a free buffer of any kind */
2616 needbuffer = 1;
2617 /* hz value is 100 */
2618 ts.tv_sec = (slptimeo/1000);
2619 /* the hz value is 100; which leads to 10ms */
2620 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
2621 msleep(&needbuffer, buf_mtxp, slpflag|(PRIBIO+1), (char *)"getnewbuf", &ts);
2622
2623 return (0);
2624 }
2625
2626 /* Buffer available either on AGE or LRU or META */
2627 bp = NULL;
2628 *queue = -1;
2629
2630 /* Buffer available either on AGE or LRU */
2631 if (!age_bp) {
2632 bp = lru_bp;
2633 *queue = BQ_LRU;
2634 } else if (!lru_bp) {
2635 bp = age_bp;
2636 *queue = BQ_AGE;
2637 } else { /* buffer available on both AGE and LRU */
2638 int t = buf_timestamp();
2639
2640 age_time = t - age_bp->b_timestamp;
2641 lru_time = t - lru_bp->b_timestamp;
2642 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
2643 bp = age_bp;
2644 *queue = BQ_AGE;
2645 /*
2646 * we should probably re-timestamp eveything in the
2647 * queues at this point with the current time
2648 */
2649 } else {
2650 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
2651 bp = lru_bp;
2652 *queue = BQ_LRU;
2653 } else {
2654 bp = age_bp;
2655 *queue = BQ_AGE;
2656 }
2657 }
2658 }
2659
2660 if (!bp) { /* Neither on AGE nor on LRU */
2661 bp = meta_bp;
2662 *queue = BQ_META;
2663 } else if (meta_bp) {
2664 int t = buf_timestamp();
2665
2666 bp_time = t - bp->b_timestamp;
2667 meta_time = t - meta_bp->b_timestamp;
2668
2669 if (!(bp_time < 0) && !(meta_time < 0)) {
2670 /* time not set backwards */
2671 int bp_is_stale;
2672 bp_is_stale = (*queue == BQ_LRU) ?
2673 lru_is_stale : age_is_stale;
2674
2675 if ((meta_time >= meta_is_stale) &&
2676 (bp_time < bp_is_stale)) {
2677 bp = meta_bp;
2678 *queue = BQ_META;
2679 }
2680 }
2681 }
2682 found:
2683 if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
2684 panic("getnewbuf: bp @ 0x%x is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
2685
2686 /* Clean it */
2687 if (bcleanbuf(bp)) {
2688 /*
2689 * moved to the laundry thread, buffer not ready
2690 */
2691 *queue = req;
2692 goto start;
2693 }
2694 return (bp);
2695 }
2696
2697
2698 /*
2699 * Clean a buffer.
2700 * Returns 0 is buffer is ready to use,
2701 * Returns 1 if issued a buf_bawrite() to indicate
2702 * that the buffer is not ready.
2703 *
2704 * buf_mtxp is held upon entry
2705 * returns with buf_mtxp locked
2706 */
2707 static int
2708 bcleanbuf(buf_t bp)
2709 {
2710 ucred_t cred;
2711
2712
2713 /* Remove from the queue */
2714 bremfree_locked(bp);
2715
2716 /* Buffer is no longer on free lists. */
2717 SET(bp->b_lflags, BL_BUSY);
2718 #ifdef JOE_DEBUG
2719 bp->b_owner = current_thread();
2720 bp->b_tag = 2;
2721 #endif
2722 /*
2723 * If buffer was a delayed write, start the IO by queuing
2724 * it on the LAUNDRY queue, and return 1
2725 */
2726 if (ISSET(bp->b_flags, B_DELWRI)) {
2727 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2728 blaundrycnt++;
2729
2730 lck_mtx_unlock(buf_mtxp);
2731
2732 wakeup(&blaundrycnt);
2733 /* and give it a chance to run */
2734 (void)thread_block(THREAD_CONTINUE_NULL);
2735
2736 lck_mtx_lock(buf_mtxp);
2737 return (1);
2738 }
2739 bremhash(bp);
2740
2741 lck_mtx_unlock(buf_mtxp);
2742
2743 BLISTNONE(bp);
2744 /*
2745 * disassociate us from our vnode, if we had one...
2746 */
2747 if (bp->b_vp)
2748 brelvp(bp);
2749
2750 if (ISSET(bp->b_flags, B_META)) {
2751 vm_offset_t elem;
2752
2753 elem = (vm_offset_t)bp->b_datap;
2754 bp->b_datap = (uintptr_t)0xdeadbeef;
2755
2756 if (ISSET(bp->b_flags, B_ZALLOC)) {
2757 zone_t z;
2758
2759 z = getbufzone(bp->b_bufsize);
2760 zfree(z, (void *)elem);
2761 } else
2762 kmem_free(kernel_map, elem, bp->b_bufsize);
2763 }
2764
2765 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2766
2767 /* clear out various other fields */
2768 bp->b_bufsize = 0;
2769 bp->b_datap = (uintptr_t)NULL;
2770 bp->b_upl = (void *)NULL;
2771 /*
2772 * preserve the state of whether this buffer
2773 * was allocated on the fly or not...
2774 * the only other flag that should be set at
2775 * this point is BL_BUSY...
2776 */
2777 #ifdef JOE_DEBUG
2778 bp->b_owner = current_thread();
2779 bp->b_tag = 3;
2780 #endif
2781 bp->b_lflags = BL_BUSY;
2782 bp->b_flags = (bp->b_flags & B_HDRALLOC);
2783 bp->b_dev = NODEV;
2784 bp->b_blkno = bp->b_lblkno = 0;
2785 bp->b_iodone = NULL;
2786 bp->b_error = 0;
2787 bp->b_resid = 0;
2788 bp->b_bcount = 0;
2789 bp->b_dirtyoff = bp->b_dirtyend = 0;
2790 bp->b_validoff = bp->b_validend = 0;
2791
2792 /* nuke any credentials we were holding */
2793 cred = bp->b_rcred;
2794 if (cred != NOCRED) {
2795 bp->b_rcred = NOCRED;
2796 kauth_cred_rele(cred);
2797 }
2798 cred = bp->b_wcred;
2799 if (cred != NOCRED) {
2800 bp->b_wcred = NOCRED;
2801 kauth_cred_rele(cred);
2802 }
2803 lck_mtx_lock(buf_mtxp);
2804
2805 return (0);
2806 }
2807
2808
2809
2810 errno_t
2811 buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
2812 {
2813 buf_t bp;
2814 errno_t error;
2815
2816 lck_mtx_lock(buf_mtxp);
2817 relook:
2818 if ((bp = incore_locked(vp, lblkno)) == (struct buf *)0) {
2819 lck_mtx_unlock(buf_mtxp);
2820 return (0);
2821 }
2822 if (ISSET(bp->b_lflags, BL_BUSY)) {
2823 if ( !ISSET(flags, BUF_WAIT)) {
2824 lck_mtx_unlock(buf_mtxp);
2825 return (EBUSY);
2826 }
2827 SET(bp->b_lflags, BL_WANTED);
2828
2829 error = msleep((caddr_t)bp, buf_mtxp, (PRIBIO + 1), (char *)"buf_invalblkno", 0);
2830
2831 if (error)
2832 return (error);
2833 goto relook;
2834 }
2835 bremfree_locked(bp);
2836 SET(bp->b_lflags, BL_BUSY);
2837 SET(bp->b_flags, B_INVAL);
2838 #ifdef JOE_DEBUG
2839 bp->b_owner = current_thread();
2840 bp->b_tag = 4;
2841 #endif
2842 lck_mtx_unlock(buf_mtxp);
2843 buf_brelse(bp);
2844
2845 return (0);
2846 }
2847
2848
2849 void
2850 buf_drop(buf_t bp)
2851 {
2852 int need_wakeup = 0;
2853
2854 lck_mtx_lock(buf_mtxp);
2855
2856 if (ISSET(bp->b_lflags, BL_WANTED)) {
2857 /*
2858 * delay the actual wakeup until after we
2859 * clear BL_BUSY and we've dropped buf_mtxp
2860 */
2861 need_wakeup = 1;
2862 }
2863 /*
2864 * Unlock the buffer.
2865 */
2866 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2867
2868 lck_mtx_unlock(buf_mtxp);
2869
2870 if (need_wakeup) {
2871 /*
2872 * Wake up any proceeses waiting for _this_ buffer to become free.
2873 */
2874 wakeup(bp);
2875 }
2876 }
2877
2878
2879 errno_t
2880 buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
2881 errno_t error;
2882
2883 lck_mtx_lock(buf_mtxp);
2884
2885 error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
2886
2887 lck_mtx_unlock(buf_mtxp);
2888
2889 return (error);
2890 }
2891
2892
2893 static errno_t
2894 buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
2895 {
2896 errno_t error;
2897 struct timespec ts;
2898
2899 if (ISSET(bp->b_flags, B_LOCKED)) {
2900 if ((flags & BAC_SKIP_LOCKED))
2901 return (EDEADLK);
2902 } else {
2903 if ((flags & BAC_SKIP_NONLOCKED))
2904 return (EDEADLK);
2905 }
2906 if (ISSET(bp->b_lflags, BL_BUSY)) {
2907 /*
2908 * since the mutex_lock may block, the buffer
2909 * may become BUSY, so we need to
2910 * recheck for a NOWAIT request
2911 */
2912 if (flags & BAC_NOWAIT)
2913 return (EBUSY);
2914 SET(bp->b_lflags, BL_WANTED);
2915
2916 /* the hz value is 100; which leads to 10ms */
2917 ts.tv_sec = (slptimeo/100);
2918 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
2919 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), (char *)"buf_acquire", &ts);
2920
2921 if (error)
2922 return (error);
2923 return (EAGAIN);
2924 }
2925 if (flags & BAC_REMOVE)
2926 bremfree_locked(bp);
2927 SET(bp->b_lflags, BL_BUSY);
2928 #ifdef JOE_DEBUG
2929 bp->b_owner = current_thread();
2930 bp->b_tag = 5;
2931 #endif
2932 return (0);
2933 }
2934
2935
2936 /*
2937 * Wait for operations on the buffer to complete.
2938 * When they do, extract and return the I/O's error value.
2939 */
2940 errno_t
2941 buf_biowait(buf_t bp)
2942 {
2943 lck_mtx_lock(buf_mtxp);
2944
2945 while (!ISSET(bp->b_flags, B_DONE))
2946 (void) msleep(bp, buf_mtxp, (PRIBIO+1), (char *)"buf_biowait", 0);
2947
2948 lck_mtx_unlock(buf_mtxp);
2949
2950 /* check for interruption of I/O (e.g. via NFS), then errors. */
2951 if (ISSET(bp->b_flags, B_EINTR)) {
2952 CLR(bp->b_flags, B_EINTR);
2953 return (EINTR);
2954 } else if (ISSET(bp->b_flags, B_ERROR))
2955 return (bp->b_error ? bp->b_error : EIO);
2956 else
2957 return (0);
2958 }
2959
2960 /*
2961 * Mark I/O complete on a buffer.
2962 *
2963 * If a callback has been requested, e.g. the pageout
2964 * daemon, do so. Otherwise, awaken waiting processes.
2965 *
2966 * [ Leffler, et al., says on p.247:
2967 * "This routine wakes up the blocked process, frees the buffer
2968 * for an asynchronous write, or, for a request by the pagedaemon
2969 * process, invokes a procedure specified in the buffer structure" ]
2970 *
2971 * In real life, the pagedaemon (or other system processes) wants
2972 * to do async stuff to, and doesn't want the buffer buf_brelse()'d.
2973 * (for swap pager, that puts swap buffers on the free lists (!!!),
2974 * for the vn device, that puts malloc'd buffers on the free lists!)
2975 */
2976 extern struct timeval priority_IO_timestamp_for_root;
2977 extern int hard_throttle_on_root;
2978
2979 void
2980 buf_biodone(buf_t bp)
2981 {
2982 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
2983 (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
2984
2985 if (ISSET(bp->b_flags, B_DONE))
2986 panic("biodone already");
2987
2988 if (kdebug_enable) {
2989 int code = DKIO_DONE;
2990
2991 if (bp->b_flags & B_READ)
2992 code |= DKIO_READ;
2993 if (bp->b_flags & B_ASYNC)
2994 code |= DKIO_ASYNC;
2995
2996 if (bp->b_flags & B_META)
2997 code |= DKIO_META;
2998 else if (bp->b_flags & B_PAGEIO)
2999 code |= DKIO_PAGING;
3000
3001 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3002 (unsigned int)bp, (unsigned int)bp->b_vp,
3003 bp->b_resid, bp->b_error, 0);
3004 }
3005 if ((bp->b_vp != NULLVP) &&
3006 ((bp->b_flags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
3007 (bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
3008 microuptime(&priority_IO_timestamp_for_root);
3009 hard_throttle_on_root = 0;
3010 }
3011 /*
3012 * I/O was done, so don't believe
3013 * the DIRTY state from VM anymore
3014 */
3015 CLR(bp->b_flags, B_WASDIRTY);
3016
3017 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3018 /*
3019 * wake up any writer's blocked
3020 * on throttle or waiting for I/O
3021 * to drain
3022 */
3023 vnode_writedone(bp->b_vp);
3024
3025 if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */
3026 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
3027 void *arg = (void *)bp->b_transaction;
3028 int callout = ISSET(bp->b_flags, B_CALL);
3029
3030 CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */
3031 bp->b_iodone = NULL;
3032 bp->b_transaction = NULL;
3033
3034 if (iodone_func == NULL) {
3035 panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
3036 } else {
3037 if (callout)
3038 SET(bp->b_flags, B_DONE); /* note that it's done */
3039 (*iodone_func)(bp, arg);
3040 }
3041 if (callout)
3042 /*
3043 * assumes that the call back function takes
3044 * ownership of the bp and deals with releasing it if necessary
3045 */
3046 goto biodone_done;
3047 /*
3048 * in this case the call back function is acting
3049 * strictly as a filter... it does not take
3050 * ownership of the bp and is expecting us
3051 * to finish cleaning up... this is currently used
3052 * by the HFS journaling code
3053 */
3054 }
3055 if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
3056 SET(bp->b_flags, B_DONE); /* note that it's done */
3057
3058 buf_brelse(bp);
3059 } else { /* or just wakeup the buffer */
3060 /*
3061 * by taking the mutex, we serialize
3062 * the buf owner calling buf_biowait so that we'll
3063 * only see him in one of 2 states...
3064 * state 1: B_DONE wasn't set and he's
3065 * blocked in msleep
3066 * state 2: he's blocked trying to take the
3067 * mutex before looking at B_DONE
3068 * BL_WANTED is cleared in case anyone else
3069 * is blocked waiting for the buffer... note
3070 * that we haven't cleared B_BUSY yet, so if
3071 * they do get to run, their going to re-set
3072 * BL_WANTED and go back to sleep
3073 */
3074 lck_mtx_lock(buf_mtxp);
3075
3076 CLR(bp->b_lflags, BL_WANTED);
3077 SET(bp->b_flags, B_DONE); /* note that it's done */
3078
3079 lck_mtx_unlock(buf_mtxp);
3080
3081 wakeup(bp);
3082 }
3083 biodone_done:
3084 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3085 (int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
3086 }
3087
3088 /*
3089 * Return a count of buffers on the "locked" queue.
3090 */
3091 int
3092 count_lock_queue(void)
3093 {
3094 buf_t bp;
3095 int n = 0;
3096
3097 lck_mtx_lock(buf_mtxp);
3098
3099 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3100 bp = bp->b_freelist.tqe_next)
3101 n++;
3102 lck_mtx_unlock(buf_mtxp);
3103
3104 return (n);
3105 }
3106
3107 /*
3108 * Return a count of 'busy' buffers. Used at the time of shutdown.
3109 */
3110 int
3111 count_busy_buffers(void)
3112 {
3113 buf_t bp;
3114 int nbusy = 0;
3115
3116 for (bp = &buf[nbuf]; --bp >= buf; )
3117 if (!ISSET(bp->b_flags, B_INVAL) && ISSET(bp->b_lflags, BL_BUSY))
3118 nbusy++;
3119 return (nbusy);
3120 }
3121
3122 #if DIAGNOSTIC
3123 /*
3124 * Print out statistics on the current allocation of the buffer pool.
3125 * Can be enabled to print out on every ``sync'' by setting "syncprt"
3126 * in vfs_syscalls.c using sysctl.
3127 */
3128 void
3129 vfs_bufstats()
3130 {
3131 int i, j, count;
3132 register struct buf *bp;
3133 register struct bqueues *dp;
3134 int counts[MAXBSIZE/CLBYTES+1];
3135 static char *bname[BQUEUES] =
3136 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3137
3138 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
3139 count = 0;
3140 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3141 counts[j] = 0;
3142
3143 lck_mtx_lock(buf_mtxp);
3144
3145 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
3146 counts[bp->b_bufsize/CLBYTES]++;
3147 count++;
3148 }
3149 lck_mtx_unlock(buf_mtxp);
3150
3151 printf("%s: total-%d", bname[i], count);
3152 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
3153 if (counts[j] != 0)
3154 printf(", %d-%d", j * CLBYTES, counts[j]);
3155 printf("\n");
3156 }
3157 }
3158 #endif /* DIAGNOSTIC */
3159
3160 #define NRESERVEDIOBUFS 64
3161
3162
3163 buf_t
3164 alloc_io_buf(vnode_t vp, int priv)
3165 {
3166 buf_t bp;
3167
3168 lck_mtx_lock(iobuffer_mtxp);
3169
3170 while (((niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
3171 (bp = iobufqueue.tqh_first) == NULL) {
3172 bufstats.bufs_iobufsleeps++;
3173
3174 need_iobuffer = 1;
3175 (void) msleep(&need_iobuffer, iobuffer_mtxp, (PRIBIO+1), (const char *)"alloc_io_buf", 0);
3176 }
3177 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
3178
3179 bufstats.bufs_iobufinuse++;
3180 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
3181 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
3182
3183 lck_mtx_unlock(iobuffer_mtxp);
3184
3185 /*
3186 * initialize various fields
3187 * we don't need to hold the mutex since the buffer
3188 * is now private... the vp should have a reference
3189 * on it and is not protected by this mutex in any event
3190 */
3191 bp->b_timestamp = 0;
3192 bp->b_proc = NULL;
3193
3194 bp->b_datap = 0;
3195 bp->b_flags = 0;
3196 bp->b_lflags = BL_BUSY | BL_IOBUF;
3197 bp->b_blkno = bp->b_lblkno = 0;
3198 #ifdef JOE_DEBUG
3199 bp->b_owner = current_thread();
3200 bp->b_tag = 6;
3201 #endif
3202 bp->b_iodone = NULL;
3203 bp->b_error = 0;
3204 bp->b_resid = 0;
3205 bp->b_bcount = 0;
3206 bp->b_bufsize = 0;
3207 bp->b_upl = NULL;
3208 bp->b_vp = vp;
3209
3210 if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
3211 bp->b_dev = vp->v_rdev;
3212 else
3213 bp->b_dev = NODEV;
3214
3215 return (bp);
3216 }
3217
3218
3219 void
3220 free_io_buf(buf_t bp)
3221 {
3222 int need_wakeup = 0;
3223
3224 /*
3225 * put buffer back on the head of the iobufqueue
3226 */
3227 bp->b_vp = NULL;
3228 bp->b_flags = B_INVAL;
3229
3230 lck_mtx_lock(iobuffer_mtxp);
3231
3232 binsheadfree(bp, &iobufqueue, -1);
3233
3234 if (need_iobuffer) {
3235 /*
3236 * Wake up any processes waiting because they need an io buffer
3237 *
3238 * do the wakeup after we drop the mutex... it's possible that the
3239 * wakeup will be superfluous if need_iobuffer gets set again and
3240 * another thread runs this path, but it's highly unlikely, doesn't
3241 * hurt, and it means we don't hold up I/O progress if the wakeup blocks
3242 * trying to grab a task related lock...
3243 */
3244 need_iobuffer = 0;
3245 need_wakeup = 1;
3246 }
3247 bufstats.bufs_iobufinuse--;
3248
3249 lck_mtx_unlock(iobuffer_mtxp);
3250
3251 if (need_wakeup)
3252 wakeup(&need_iobuffer);
3253 }
3254
3255
3256
3257 /*
3258 * If getnewbuf() calls bcleanbuf() on the same thread
3259 * there is a potential for stack overrun and deadlocks.
3260 * So we always handoff the work to a worker thread for completion
3261 */
3262 #include <mach/mach_types.h>
3263 #include <mach/memory_object_types.h>
3264 #include <kern/sched_prim.h>
3265
3266
3267 static void
3268 bcleanbuf_thread_init(void)
3269 {
3270 /* create worker thread */
3271 kernel_thread(kernel_task, bcleanbuf_thread);
3272 }
3273
3274 static void
3275 bcleanbuf_thread(void)
3276 {
3277 struct buf *bp;
3278 int error = 0;
3279 int loopcnt = 0;
3280
3281 for (;;) {
3282 lck_mtx_lock(buf_mtxp);
3283
3284 while (blaundrycnt == 0)
3285 (void)msleep((void *)&blaundrycnt, buf_mtxp, PRIBIO, "blaundry", 0);
3286
3287 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
3288 /*
3289 * Remove from the queue
3290 */
3291 bremfree_locked(bp);
3292 blaundrycnt--;
3293
3294 lck_mtx_unlock(buf_mtxp);
3295 /*
3296 * do the IO
3297 */
3298 error = bawrite_internal(bp, 0);
3299
3300 if (error) {
3301 lck_mtx_lock(buf_mtxp);
3302
3303 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
3304 blaundrycnt++;
3305
3306 lck_mtx_unlock(buf_mtxp);
3307
3308 if (loopcnt > 10) {
3309 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
3310 loopcnt = 0;
3311 } else {
3312 (void)thread_block(THREAD_CONTINUE_NULL);
3313 loopcnt++;
3314 }
3315 }
3316 }
3317 }
3318
3319
3320 static int
3321 brecover_data(buf_t bp)
3322 {
3323 int upl_offset;
3324 upl_t upl;
3325 upl_page_info_t *pl;
3326 kern_return_t kret;
3327 vnode_t vp = bp->b_vp;
3328 int upl_flags;
3329
3330
3331 if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
3332 goto dump_buffer;
3333
3334 upl_flags = UPL_PRECIOUS;
3335 if (! (buf_flags(bp) & B_READ)) {
3336 /*
3337 * "write" operation: let the UPL subsystem know
3338 * that we intend to modify the buffer cache pages we're
3339 * gathering.
3340 */
3341 upl_flags |= UPL_WILL_MODIFY;
3342 }
3343
3344 kret = ubc_create_upl(vp,
3345 ubc_blktooff(vp, bp->b_lblkno),
3346 bp->b_bufsize,
3347 &upl,
3348 &pl,
3349 upl_flags);
3350 if (kret != KERN_SUCCESS)
3351 panic("Failed to create UPL");
3352
3353 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
3354
3355 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
3356 ubc_upl_abort(upl, 0);
3357 goto dump_buffer;
3358 }
3359 }
3360 bp->b_upl = upl;
3361
3362 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
3363
3364 if (kret != KERN_SUCCESS)
3365 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3366 return (1);
3367
3368 dump_buffer:
3369 bp->b_bufsize = 0;
3370 SET(bp->b_flags, B_INVAL);
3371 buf_brelse(bp);
3372
3373 return(0);
3374 }
3375
3376
3377
3378 /*
3379 * disabled for now
3380 */
3381
3382 #if FLUSH_QUEUES
3383
3384 #define NFLUSH 32
3385
3386 static int
3387 bp_cmp(void *a, void *b)
3388 {
3389 buf_t *bp_a = *(buf_t **)a,
3390 *bp_b = *(buf_t **)b;
3391 daddr64_t res;
3392
3393 // don't have to worry about negative block
3394 // numbers so this is ok to do.
3395 //
3396 res = (bp_a->b_blkno - bp_b->b_blkno);
3397
3398 return (int)res;
3399 }
3400
3401
3402 int
3403 bflushq(int whichq, mount_t mp)
3404 {
3405 buf_t bp, next;
3406 int i, buf_count;
3407 int total_writes = 0;
3408 static buf_t flush_table[NFLUSH];
3409
3410 if (whichq < 0 || whichq >= BQUEUES) {
3411 return (0);
3412 }
3413
3414 restart:
3415 lck_mtx_lock(buf_mtxp);
3416
3417 bp = TAILQ_FIRST(&bufqueues[whichq]);
3418
3419 for (buf_count = 0; bp; bp = next) {
3420 next = bp->b_freelist.tqe_next;
3421
3422 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
3423 continue;
3424 }
3425
3426 if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
3427
3428 bremfree_locked(bp);
3429 #ifdef JOE_DEBUG
3430 bp->b_owner = current_thread();
3431 bp->b_tag = 7;
3432 #endif
3433 SET(bp->b_lflags, BL_BUSY);
3434 flush_table[buf_count] = bp;
3435 buf_count++;
3436 total_writes++;
3437
3438 if (buf_count >= NFLUSH) {
3439 lck_mtx_unlock(buf_mtxp);
3440
3441 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3442
3443 for (i = 0; i < buf_count; i++) {
3444 buf_bawrite(flush_table[i]);
3445 }
3446 goto restart;
3447 }
3448 }
3449 }
3450 lck_mtx_unlock(buf_mtxp);
3451
3452 if (buf_count > 0) {
3453 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
3454
3455 for (i = 0; i < buf_count; i++) {
3456 buf_bawrite(flush_table[i]);
3457 }
3458 }
3459
3460 return (total_writes);
3461 }
3462 #endif
3463
3464
3465 #if BALANCE_QUEUES
3466
3467 /* XXX move this to a separate file */
3468
3469 /*
3470 * NOTE: THIS CODE HAS NOT BEEN UPDATED
3471 * WITH RESPECT TO THE NEW LOCKING MODEL
3472 */
3473
3474
3475 /*
3476 * Dynamic Scaling of the Buffer Queues
3477 */
3478
/* Counter type for buffer headers */
typedef long long blsize_t;

/* Upper bound for nbufhigh; initialize to (sane_size / PAGE_SIZE) */
blsize_t	MAXNBUF;

/* Global tunable limits */
blsize_t	nbufh;		/* number of buffer headers */
blsize_t	nbuflow;	/* minimum number of buffer headers required */
blsize_t	nbufhigh;	/* maximum number of buffer headers allowed */
blsize_t	nbuftarget;	/* preferred number of buffer headers */
3487
3488 /*
3489 * assertions:
3490 *
3491 * 1. 0 < nbuflow <= nbufh <= nbufhigh
3492 * 2. nbufhigh <= MAXNBUF
3493 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
3494 * 4. nbufh can not be set by sysctl().
3495 */
3496
3497 /* Per queue tunable limits */
3498
3499 struct bufqlim {
3500 blsize_t bl_nlow; /* minimum number of buffer headers required */
3501 blsize_t bl_num; /* number of buffer headers on the queue */
3502 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
3503 blsize_t bl_target; /* preferred number of buffer headers */
3504 long bl_stale; /* Seconds after which a buffer is considered stale */
3505 } bufqlim[BQUEUES];
3506
3507 /*
3508 * assertions:
3509 *
3510 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
3511 * 2. bl_nlhigh <= MAXNBUF
3512 * 3. bufqlim[BQ_META].bl_nlow != 0
3513 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
3514 * file system IO operations)
3515 * 5. bl_num can not be set by sysctl().
3516 * 6. bl_nlhigh <= nbufhigh
3517 */
3518
3519 /*
3520 * Rationale:
3521 * ----------
3522 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
3523 * which can describe (2^31 * PAGE_SIZE) memory per queue.
3524 *
3525 * These limits are exported by means of sysctl().
3526 * It was decided to define blsize_t as a 64 bit quantity.
3527 * This will make sure that we will not be required to change it
3528 * as long as we do not exceed 64 bit address space for the kernel.
3529 *
3530 * low and high numbers parameters initialized at compile time
3531 * and boot arguments can be used to override them. sysctl()
3532 * would not change the value. sysctl() can get all the values
3533 * but can set only target. num is the current level.
3534 *
3535 * Advantages of having a "bufqscan" thread doing the balancing are,
3536 * Keep enough bufs on BQ_EMPTY.
3537 * getnewbuf() by default will always select a buffer from the BQ_EMPTY.
3538 * getnewbuf() performs best if a buffer was found there.
3539 * Also this minimizes the possibility of starting IO
3540 * from getnewbuf(). That's a performance win, too.
3541 *
3542 * Localize complex logic [balancing as well as time aging]
3543 * to balancebufq().
3544 *
3545 * Simplify getnewbuf() logic by elimination of time aging code.
3546 */
3547
3548 /*
3549 * Algorithm:
3550 * -----------
3551 * The goal of the dynamic scaling of the buffer queues is to keep
3552 * the size of the LRU close to bl_target. Buffers on a queue would
3553 * be time aged.
3554 *
3555 * There would be a thread which will be responsible for "balancing"
3556 * the buffer cache queues.
3557 *
3558 * The scan order would be: AGE, LRU, META, EMPTY.
3559 */
3560
/*
 * Counts calls to bufq_balance_thread_init() (only the first one
 * initializes the limits); its address is also the sleep channel
 * of the balancing thread.
 */
long bufqscanwait = 0;

/* Helpers of the buffer queue balancing code */
static void		bufqscan_thread();
static int		balancebufq(int q);
static int		btrimempty(int n);
static __inline__ int	initbufqscan(void);
static __inline__ int	nextbufq(int q);
static void		buqlimprt(int all);
3569
3570
3571 static __inline__ void
3572 bufqinc(int q)
3573 {
3574 if ((q < 0) || (q >= BQUEUES))
3575 return;
3576
3577 bufqlim[q].bl_num++;
3578 return;
3579 }
3580
3581 static __inline__ void
3582 bufqdec(int q)
3583 {
3584 if ((q < 0) || (q >= BQUEUES))
3585 return;
3586
3587 bufqlim[q].bl_num--;
3588 return;
3589 }
3590
3591 static void
3592 bufq_balance_thread_init()
3593 {
3594
3595 if (bufqscanwait++ == 0) {
3596
3597 /* Initalize globals */
3598 MAXNBUF = (sane_size / PAGE_SIZE);
3599 nbufh = nbuf;
3600 nbuflow = min(nbufh, 100);
3601 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
3602 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
3603 nbuftarget = max(nbuflow, nbuftarget);
3604 nbuftarget = min(nbufhigh, nbuftarget);
3605
3606 /*
3607 * Initialize the bufqlim
3608 */
3609
3610 /* LOCKED queue */
3611 bufqlim[BQ_LOCKED].bl_nlow = 0;
3612 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3613 bufqlim[BQ_LOCKED].bl_target = 0;
3614 bufqlim[BQ_LOCKED].bl_stale = 30;
3615
3616 /* LRU queue */
3617 bufqlim[BQ_LRU].bl_nlow = 0;
3618 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
3619 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
3620 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
3621
3622 /* AGE queue */
3623 bufqlim[BQ_AGE].bl_nlow = 0;
3624 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
3625 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
3626 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
3627
3628 /* EMPTY queue */
3629 bufqlim[BQ_EMPTY].bl_nlow = 0;
3630 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
3631 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
3632 bufqlim[BQ_EMPTY].bl_stale = 600000;
3633
3634 /* META queue */
3635 bufqlim[BQ_META].bl_nlow = 0;
3636 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
3637 bufqlim[BQ_META].bl_target = nbuftarget/4;
3638 bufqlim[BQ_META].bl_stale = META_IS_STALE;
3639
3640 /* LAUNDRY queue */
3641 bufqlim[BQ_LOCKED].bl_nlow = 0;
3642 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
3643 bufqlim[BQ_LOCKED].bl_target = 0;
3644 bufqlim[BQ_LOCKED].bl_stale = 30;
3645
3646 buqlimprt(1);
3647 }
3648
3649 /* create worker thread */
3650 kernel_thread(kernel_task, bufqscan_thread);
3651 }
3652
3653 /* The workloop for the buffer balancing thread */
3654 static void
3655 bufqscan_thread()
3656 {
3657 int moretodo = 0;
3658
3659 for(;;) {
3660 do {
3661 int q; /* buffer queue to process */
3662
3663 q = initbufqscan();
3664 for (; q; ) {
3665 moretodo |= balancebufq(q);
3666 q = nextbufq(q);
3667 }
3668 } while (moretodo);
3669
3670 #if DIAGNOSTIC
3671 vfs_bufstats();
3672 buqlimprt(0);
3673 #endif
3674 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
3675 moretodo = 0;
3676 }
3677 }
3678
3679 /* Seed for the buffer queue balancing */
3680 static __inline__ int
3681 initbufqscan()
3682 {
3683 /* Start with AGE queue */
3684 return (BQ_AGE);
3685 }
3686
3687 /* Pick next buffer queue to balance */
3688 static __inline__ int
3689 nextbufq(int q)
3690 {
3691 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
3692
3693 q++;
3694 q %= sizeof(order);
3695 return (order[q]);
3696 }
3697
3698 /* function to balance the buffer queues */
3699 static int
3700 balancebufq(int q)
3701 {
3702 int moretodo = 0;
3703 int s = splbio();
3704 int n, t;
3705
3706 /* reject invalid q */
3707 if ((q < 0) || (q >= BQUEUES))
3708 goto out;
3709
3710 /* LOCKED or LAUNDRY queue MUST not be balanced */
3711 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
3712 goto out;
3713
3714 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
3715
3716 /* If queue has less than target nothing more to do */
3717 if (n < 0)
3718 goto out;
3719
3720 if ( n > 8 ) {
3721 /* Balance only a small amount (12.5%) at a time */
3722 n >>= 3;
3723 }
3724
3725 /* EMPTY queue needs special handling */
3726 if (q == BQ_EMPTY) {
3727 moretodo |= btrimempty(n);
3728 goto out;
3729 }
3730
3731 t = buf_timestamp():
3732
3733 for (; n > 0; n--) {
3734 struct buf *bp = bufqueues[q].tqh_first;
3735 if (!bp)
3736 break;
3737
3738 /* check if it's stale */
3739 if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
3740 if (bcleanbuf(bp)) {
3741 /* buf_bawrite() issued, bp not ready */
3742 moretodo = 1;
3743 } else {
3744 /* release the cleaned buffer to BQ_EMPTY */
3745 SET(bp->b_flags, B_INVAL);
3746 buf_brelse(bp);
3747 }
3748 } else
3749 break;
3750 }
3751
3752 out:
3753 splx(s);
3754 return (moretodo);
3755 }
3756
/*
 * Trim the EMPTY queue.
 *
 * Placeholder: once struct buf are allocated dynamically, this would
 * reclaim up to 'n' struct buf from the empty queue.  For now it does
 * nothing and always returns 0.
 */
static int
btrimempty(int n)
{
	int moretodo = 0;

	(void)n;	/* nothing to reclaim yet */

	return moretodo;
}
3767
3768 static void
3769 buqlimprt(int all)
3770 {
3771 int i;
3772 static char *bname[BQUEUES] =
3773 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
3774
3775 if (all)
3776 for (i = 0; i < BQUEUES; i++) {
3777 printf("%s : ", bname[i]);
3778 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
3779 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3780 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
3781 printf("target = %ld, ", (long)bufqlim[i].bl_target);
3782 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
3783 }
3784 else
3785 for (i = 0; i < BQUEUES; i++) {
3786 printf("%s : ", bname[i]);
3787 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
3788 }
3789 }
3790
3791 #endif
3792
3793