/*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <vm/vm_pageout.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>

#include <sys/kdebug.h>
#define CL_COMMIT	0x04
#define CL_PAGEOUT	0x10
#define CL_NOZERO	0x80
#define CL_PAGEIN	0x100
#define CL_DEV_MEMORY	0x200
#define CL_PRESERVE	0x400
#define CL_THROTTLE	0x800
struct clios {
	u_int	io_completed;	/* amount of io that has currently completed */
	u_int	io_issued;	/* amount of io that was successfully issued */
	int	io_error;	/* error code of first error encountered */
	int	io_wanted;	/* someone is sleeping waiting for a change in state */
};
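/*
 * A 'struct clios' is supplied by callers that stream multiple async
 * cluster_io requests (see cluster_nocopy_write below): the issuer adds to
 * io_issued for each I/O it hands off, cluster_iodone adds to io_completed
 * and wakes anyone sleeping on io_wanted, and the issuer tsleeps until
 * io_issued == io_completed before reporting io_error back to its caller.
 */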
static void cluster_zero(upl_t upl, vm_offset_t   upl_offset,
		int size, struct buf *bp);
static int cluster_read_x(struct vnode *vp, struct uio *uio,
		off_t filesize, int devblocksize, int flags);
static int cluster_write_x(struct vnode *vp, struct uio *uio,
		off_t oldEOF, off_t newEOF, off_t headOff,
		off_t tailOff, int devblocksize, int flags);
static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
		off_t filesize, int devblocksize, int flags);
static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
		off_t newEOF, int devblocksize, int flags);
static int cluster_phys_read(struct vnode *vp, struct uio *uio,
		off_t filesize, int devblocksize, int flags);
static int cluster_phys_write(struct vnode *vp, struct uio *uio,
		off_t newEOF, int devblocksize, int flags);
static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
		addr64_t usr_paddr, int xsize, int devblocksize, int flags);
static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
static int cluster_try_push(struct vnode *vp, off_t EOF, int can_delay, int push_all);

static int sparse_cluster_switch(struct vnode *vp, off_t EOF);
static int sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all);
static int sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
static kern_return_t vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int	ubc_page_op_with_control __P((memory_object_control_t, off_t, int, ppnum_t *, int *));
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define ASYNC_THROTTLE		18
#define HARD_THROTTLE_MAXCNT	1
#define HARD_THROTTLE_MAXSIZE	(64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;
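/*
 * hard_throttle_on_root and priority_IO_timestamp_for_root drive the
 * "hard throttle" below: while the root device has seen priority I/O
 * within hard_throttle_maxelapsed (300 msec), competing cluster I/O is
 * clamped to HARD_THROTTLE_MAXSIZE bytes and HARD_THROTTLE_MAXCNT
 * outstanding requests.  (The timestamp is presumably refreshed by the
 * priority-I/O issuer elsewhere in the kernel.)
 */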
static int
cluster_hard_throttle_on(vp)
	struct vnode *vp;
{
	static struct timeval hard_throttle_maxelapsed = { 0, 300000 };

	if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
		struct timeval elapsed;

		if (hard_throttle_on_root)
			return(1);

		microuptime(&elapsed);
		timevalsub(&elapsed, &priority_IO_timestamp_for_root);

		if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
			return(1);
	}
	return(0);
}
static int
cluster_iodone(bp)
	struct buf *bp;
{
	struct buf	*cbp_head;
	struct buf	*cbp_next;
	struct clios	*iostate;

	cbp_head = (struct buf *)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
		/*
		 * all I/O requests that are part of this transaction
		 * have to complete before we can process it
		 */
		if ( !(cbp->b_flags & B_DONE)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);

			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp         = cbp_head;
	upl_offset  = cbp->b_uploffset;
	upl         = cbp->b_pagelist;
	b_flags     = cbp->b_flags;
	real_bp     = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate     = (struct clios *)cbp->b_iostate;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		free_io_buf(cbp);

		cbp = cbp_next;
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
		vp->v_flag &= ~VTHROTTLED;
		wakeup((caddr_t)&vp->v_numoutput);
	}
	if (iostate) {
		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			wakeup((caddr_t)&iostate->io_wanted);
		}
	}
	if ((b_flags & B_NEED_IODONE) && real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		biodone(real_bp);
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (b_flags & B_COMMIT_UPL) {
		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error || (b_flags & B_NOCACHE)) {
			int upl_abort_code;

			if ((b_flags & B_PAGEOUT) && (error != ENXIO))	/* transient error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (b_flags & B_PGIN)
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
					    upl_abort_code);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     0x80000000|upl_abort_code, 0);
		} else {
			int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if (b_flags & B_PHYS) {
				if (b_flags & B_READ)
					upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
			} else if ( !(b_flags & B_PAGEOUT))
				upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;

			if (b_flags & B_PAGEOUT)
				upl_commit_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
					     upl_commit_flags);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     upl_commit_flags, 0);
		}
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			     (int)upl, upl_offset, 0, error, 0);
	}
	return (error);
}
static void
cluster_zero(upl, upl_offset, size, bp)
	upl_t         upl;
	vm_offset_t   upl_offset;
	int           size;
	struct buf   *bp;
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, (int)bp, 0, 0);

	if (bp == NULL || bp->b_data == NULL) {
		pl = ubc_upl_pageinfo(upl);

		while (size) {
			page_index  = upl_offset / PAGE_SIZE;
			page_offset = upl_offset & PAGE_MASK;

			zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
			zero_cnt  = min(PAGE_SIZE - page_offset, size);

			bzero_phys(zero_addr, zero_cnt);

			size       -= zero_cnt;
			upl_offset += zero_cnt;
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_data + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}
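/*
 * cluster_zero takes two paths: if the buf carries a kernel mapping
 * (b_data), the range is simply bzero'd; otherwise the UPL's page list is
 * walked and each physical page is zeroed with bzero_phys, so no mapping
 * of the pages is ever required.
 */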
static int
cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
	struct vnode *vp;
	upl_t         upl;
	vm_offset_t   upl_offset;
	off_t         f_offset;
	int           non_rounded_size;
	int           devblocksize;
	int           flags;
	struct buf   *real_bp;
	struct clios *iostate;
{
	struct buf   *cbp_head = 0;
	struct buf   *cbp_tail = 0;

	if (devblocksize)
		size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
	else
		size = non_rounded_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
		     (int)f_offset, size, upl_offset, flags, 0);

	if (flags & CL_READ) {
		io_flags = (B_VECTORLIST | B_READ);

		vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
	} else {
		io_flags = (B_VECTORLIST | B_WRITEINPROG);

		vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
	}
	/*
	 * make sure the maximum iosize is at least the size of a page
	 * and that it is a multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
			if (max_iosize > HARD_THROTTLE_MAXSIZE)
				max_iosize = HARD_THROTTLE_MAXSIZE;
			async_throttle = HARD_THROTTLE_MAXCNT;
		} else
			async_throttle = ASYNC_THROTTLE;
	}
	if (flags & CL_DUMP)
		io_flags |= B_NOCACHE;
	if (flags & CL_PAGEIN)
		io_flags |= B_PGIN;
	if (flags & CL_PAGEOUT)
		io_flags |= B_PAGEOUT;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_PRESERVE)
		io_flags |= B_PHYS;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file)
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
	while (size) {
		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
			if (error == EOPNOTSUPP)
				panic("VOP_CMAP Unimplemented");
			break;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)blkno, io_size, zero_offset, 0);

		if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
			if (flags & CL_PAGEOUT) {
				error = EINVAL;
				break;
			}
			/*
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (it could
			 * be mapped in a "hole" and require allocation
			 * before the I/O)
			 */
			ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);

			if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
				error = EINVAL;
				break;
			}
			f_offset   += PAGE_SIZE_64;
			upl_offset += PAGE_SIZE;
			size       -= PAGE_SIZE;
			continue;
		}
		lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * currently, can't deal with reading 'holes' in file
			 */
			if ((long)blkno == -1) {
				error = EINVAL;
				break;
			}
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && (long)blkno == -1) {
			int bytes_to_zero;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (zero_offset && io_size == size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * than 'zero_offset' will be non-zero
				 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * than we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head)
				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				pg_count = (io_size - pg_offset) / PAGE_SIZE;
			else {
				/*
				 * no pending I/O to pick up that first page
				 * so, we have to make sure it gets committed
				 * set the pg_offset to 0 so that the upl_commit_range
				 * starts with this page
				 */
				pg_count = (io_size + pg_offset) / PAGE_SIZE;
				pg_offset = 0;
			}
			if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
				/*
				 * if we're done with the request for this UPL
				 * then we have to make sure to commit the last page
				 * even if we only partially zero-filled it
				 */
				pg_count++;

			if (pg_count) {
				if (pg_offset)
					pg_resid = PAGE_SIZE - pg_offset;
				else
					pg_resid = 0;

				if (flags & CL_COMMIT)
					ubc_upl_commit_range(upl,
							(upl_offset + pg_resid) & ~PAGE_MASK,
							pg_count * PAGE_SIZE,
							UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;

			if (cbp_head && pg_count)
				goto start_io;
			continue;

		} else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
			real_bp->b_blkno = blkno;
		}
		if (pg_count > max_vectors) {
			io_size -= (pg_count - max_vectors) * PAGE_SIZE;

			if (io_size < 0) {
				io_size  = PAGE_SIZE - pg_offset;
				pg_count = 1;
			} else
				pg_count = max_vectors;
		}
		if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV))
			/*
			 * if we're not targeting a virtual device i.e. a disk image
			 * it's safe to dip into the reserve pool since real devices
			 * can complete this I/O request without requiring additional
			 * bufs from the alloc_io_buf pool
			 */
			priv = 1;
		else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
			/*
			 * Throttle the speculative IO
			 */
			priv = 0;
		else
			priv = 1;

		cbp = alloc_io_buf(vp, priv);

		if (flags & CL_PAGEOUT) {
			for (i = 0; i < pg_count; i++) {
				if (bp = incore(vp, lblkno + i)) {
					if (!ISSET(bp->b_flags, B_BUSY)) {
						bremfree(bp);
						SET(bp->b_flags, (B_BUSY | B_INVAL));
						brelse(bp);
					} else
						panic("BUSY bp found in cluster_io");
				}
			}
		}
		if (flags & CL_ASYNC) {
			cbp->b_flags |= (B_CALL | B_ASYNC);
			cbp->b_iodone = (void *)cluster_iodone;
		}
		cbp->b_flags |= io_flags;

		cbp->b_lblkno = lblkno;
		cbp->b_blkno  = blkno;
		cbp->b_bcount = io_size;
		cbp->b_pagelist  = upl;
		cbp->b_uploffset = upl_offset;
		cbp->b_trans_next = (struct buf *)0;

		if (cbp->b_iostate = (void *)iostate)
			/*
			 * caller wants to track the state of this
			 * io... bump the amount issued against this stream
			 */
			iostate->io_issued += io_size;

		if (flags & CL_READ)
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
				     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
		else
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
				     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;
		}
		(struct buf *)(cbp->b_trans_head) = cbp_head;
		buf_count++;

		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;

		if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
			/*
			 * if we have no more I/O to issue or
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and it's either an ASYNC request or
			 * we've already accumulated more than 8 I/O's into
			 * this transaction and it's not an I/O directed to
			 * special DEVICE memory
			 * then go ahead and issue the I/O
			 */
start_io:
			if (real_bp) {
				cbp_head->b_flags |= B_NEED_IODONE;
				cbp_head->b_real_bp = real_bp;
			} else
				cbp_head->b_real_bp = (struct buf *)NULL;

			if (size == 0) {
				/*
				 * we're about to issue the last I/O for this upl
				 * if this was a read to the eof and the eof doesn't
				 * finish on a page boundary, than we need to zero-fill
				 * the rest of the page....
				 */
				cbp_head->b_validend = zero_offset;
			} else
				cbp_head->b_validend = 0;

			if (flags & CL_THROTTLE) {
				while (vp->v_numoutput >= async_throttle) {
					vp->v_flag |= VTHROTTLED;
					tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_io", 0);
				}
			}
			for (cbp = cbp_head; cbp;) {
				struct buf * cbp_next;

				if (io_flags & B_WRITEINPROG)
					cbp->b_vp->v_numoutput++;

				cbp_next = cbp->b_trans_next;

				(void) VOP_STRATEGY(cbp);
				cbp = cbp_next;
			}
			if ( !(flags & CL_ASYNC)) {
				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
					biowait(cbp);

				if (error = cluster_iodone(cbp_head)) {
					if ((flags & CL_PAGEOUT) && (error == ENXIO))
						retval = 0;	/* drop the error */
					else
						retval = error;
					error  = 0;
				}
			}
			cbp_head = (struct buf *)0;
			cbp_tail = (struct buf *)0;

			buf_count = 0;
		}
	}
	if (error) {
		int abort_size;

		io_size = 0;

		for (cbp = cbp_head; cbp;) {
			struct buf * cbp_next;

			upl_offset -= cbp->b_bcount;
			size       += cbp->b_bcount;
			io_size    += cbp->b_bcount;

			cbp_next = cbp->b_trans_next;
			free_io_buf(cbp);
			cbp = cbp_next;
		}
		if (iostate) {
			/*
			 * update the error condition for this stream
			 * since we never really issued the io
			 * just go ahead and adjust it back
			 */
			if (iostate->io_error == 0)
				iostate->io_error = error;
			iostate->io_issued -= io_size;

			if (iostate->io_wanted) {
				/*
				 * someone is waiting for the state of
				 * this io stream to change
				 */
				iostate->io_wanted = 0;
				wakeup((caddr_t)&iostate->io_wanted);
			}
		}
		pg_offset  = upl_offset & PAGE_MASK;
		abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (flags & CL_COMMIT) {
			int upl_abort_code;

			if (flags & CL_PRESERVE) {
				ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
						     UPL_COMMIT_FREE_ON_EMPTY);
			} else {
				if ((flags & CL_PAGEOUT) && (error != ENXIO))	/* transient error */
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
				else if (flags & CL_PAGEIN)
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
				else
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

				ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
						    upl_abort_code);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
				     (int)upl, upl_offset - pg_offset, abort_size, error, 0);
		}
		if (real_bp) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error  = error;

			biodone(real_bp);
		}
		if (retval == 0)
			retval = error;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
		     (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}
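/*
 * Typical usage (see cluster_nocopy_write below): a caller builds a UPL
 * covering the user buffer, then issues
 *
 *	error = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
 *			   CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE,
 *			   (struct buf *)0, &iostate);
 *
 * and later drains 'iostate' (io_issued vs io_completed) before returning.
 * With CL_COMMIT set, responsibility for committing or aborting the UPL
 * pages rests with cluster_io and cluster_iodone.
 */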
static int
cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
	struct vnode *vp;
	off_t         f_offset;
	u_int         size;
	off_t         filesize;
	int           devblocksize;
{
	int           pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return(0);
	}
	if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
		size = (MAX_UPL_TRANSFER * PAGE_SIZE);

	size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;

	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read(vp, filesize, f_offset, size, devblocksize);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}
static void
cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
	struct vnode *vp;
	daddr_t       b_lblkno;
	daddr_t       e_lblkno;
	off_t         filesize;
	int           devblocksize;
{
	int           size_of_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     b_lblkno, e_lblkno, vp->v_lastr, 0, 0);

	if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
		return;
	}
	if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
				 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
		vp->v_ralen = 0;
		vp->v_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);

		return;
	}
	if (e_lblkno < vp->v_maxra) {
		if ((vp->v_maxra - e_lblkno) > (MAX_UPL_TRANSFER / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
			return;
		}
	}
	r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
	f_offset = (off_t)r_lblkno * PAGE_SIZE_64;

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		vp->v_ralen = vp->v_ralen ? min(MAX_UPL_TRANSFER, vp->v_ralen << 1) : 1;

		if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
			vp->v_ralen = min(MAX_UPL_TRANSFER, (e_lblkno + 1) - b_lblkno);

		size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);

		if (size_of_prefetch)
			vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     vp->v_ralen, vp->v_maxra, vp->v_lastr, 4, 0);
}
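/*
 * Read-ahead bookkeeping lives in the vnode: v_lastr is the last logical
 * (page-sized) block read, v_maxra the last block already prefetched, and
 * v_ralen the current window in pages.  The window doubles on each
 * sequential hit, capped at MAX_UPL_TRANSFER, and is reset whenever the
 * access pattern stops looking sequential.
 */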
int
cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
	struct vnode *vp;
	upl_t         upl;
	vm_offset_t   upl_offset;
	off_t         f_offset;
	int           size;
	off_t         filesize;
	int           devblocksize;
	int           flags;
{
	if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
		/*
		 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
		 * then we don't want to enforce this throttle... if we do, we can
		 * potentially deadlock since we're stalling the pageout thread at a time
		 * when the disk image might need additional memory (which won't be available
		 * if the pageout thread can't run)... instead we'll just depend on the throttle
		 * that the pageout thread now has in place to deal with external files
		 */
		local_flags = CL_PAGEOUT;
	else
		local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (size <= 0)
		return (EINVAL);

	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EROFS);
	}
	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	   (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
					    UPL_ABORT_FREE_ON_EMPTY);
	}
	vp->v_flag |= VHASBEENPAGED;

	return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
			   local_flags, (struct buf *)0, (struct clios *)0));
}
int
cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
	struct vnode *vp;
	upl_t         upl;
	vm_offset_t   upl_offset;
	off_t         f_offset;
	int           size;
	off_t         filesize;
	int           devblocksize;
	int           flags;
{
	int           local_flags = 0;

	if (upl == NULL || size < 0)
		panic("cluster_pagein: NULL upl passed in");

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	   (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size && (local_flags & CL_COMMIT))
		ubc_upl_abort_range(upl, upl_offset + rounded_size,
				    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
			    local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);

	if (retval == 0) {
		int b_lblkno;
		int e_lblkno;

		b_lblkno = (int)(f_offset / PAGE_SIZE_64);
		e_lblkno = (int)
			((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);

		if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
			/*
			 * we haven't read the last page in of the file yet
			 * so let's try to read ahead if we're in
			 * a sequential access pattern
			 */
			cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
		}
		vp->v_lastr = e_lblkno;
	}
	return (retval);
}
int
cluster_bp(bp)
	struct buf *bp;
{
	off_t  f_offset;
	int    flags;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
		     (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (bp->b_pagelist == (upl_t) 0)
		panic("cluster_bp: can't handle NULL upl yet\n");
	if (bp->b_flags & B_READ)
		flags = CL_ASYNC | CL_READ;
	else
		flags = CL_ASYNC;

	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

	return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
}
int
cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
	struct vnode *vp;
	struct uio   *uio;
	off_t         oldEOF;
	off_t         newEOF;
	off_t         headOff;
	off_t         tailOff;
	int           devblocksize;
	int           flags;
{
	if (vp->v_flag & VHASBEENPAGED) {
		/*
		 * this vnode had pages cleaned to it by
		 * the pager which indicates that either
		 * it's not very 'hot', or the system is
		 * being overwhelmed by a lot of dirty
		 * data being delayed in the VM cache...
		 * in either event, we'll push our remaining
		 * delayed data at this point...  this will
		 * be more efficient than paging out 1 page at
		 * a time, and will also act as a throttle
		 * by delaying this client from writing any
		 * more data until all his delayed data has
		 * at least been queued to the underlying driver.
		 */
		cluster_push(vp);

		vp->v_flag &= ~VHASBEENPAGED;
	}
	if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE)) {
		/*
		 * go do a write through the cache if one of the following is true....
		 *   NOCACHE is not true
		 *   there is no uio structure or it doesn't target USERSPACE
		 */
		return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
	}

	while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0) {
		/*
		 * we know we have a resid, so this is safe
		 * skip over any empty vectors
		 */
		iov = uio->uio_iov;

		while (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			iov = uio->uio_iov;
		}
		upl_size  = PAGE_SIZE;
		upl_flags = UPL_QUERY_OBJECT_TYPE;

		if ((vm_map_get_upl(current_map(),
				    (vm_offset_t)iov->iov_base & ~PAGE_MASK,
				    &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
			/*
			 * the user app must have passed in an invalid address
			 */
			return (EFAULT);
		}

		/*
		 * We check every vector target but if it is physically
		 * contiguous space, we skip the sanity checks.
		 */
		if (upl_flags & UPL_PHYS_CONTIG) {
			if (flags & IO_HEADZEROFILL) {
				flags &= ~IO_HEADZEROFILL;

				if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
					return(retval);
			}

			retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);

			if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL)) {
				return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL));
			}
		}
		else if ((uio->uio_resid < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
			/*
			 * we're here because we don't have a physically contiguous target buffer
			 * go do a write through the cache if one of the following is true....
			 *   the total xfer size is less than a page...
			 *   we're being asked to ZEROFILL either the head or the tail of the I/O...
			 */
			return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
		}
		else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK)) {
			if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK)) {
				/*
				 * Bring the file offset write up to a pagesize boundary
				 * this will also bring the base address to a page boundary
				 * since they both are currently on the same offset within a page
				 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
				 * so the computed clip_size must always be less than the current uio_resid
				 */
				clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));

				/*
				 * Fake the resid going into the cluster_write_x call
				 * and restore it on the way out.
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			} else {
				/*
				 * can't get both the file offset and the buffer offset aligned to a page boundary
				 * so fire an I/O through the cache for this entire vector
				 */
				clip_size = iov->iov_len;
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			}
		} else {
			/*
			 * If we come in here, we know the offset into
			 * the file is on a pagesize boundary and the
			 * target buffer address is also on a page boundary
			 */
			max_io_size = newEOF - uio->uio_offset;
			clip_size = uio->uio_resid;
			if (iov->iov_len < clip_size)
				clip_size = iov->iov_len;
			if (max_io_size < clip_size)
				clip_size = max_io_size;

			if (clip_size < PAGE_SIZE) {
				/*
				 * Take care of tail end of write in this vector
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			} else {
				/* round clip_size down to a multiple of pagesize */
				clip_size = clip_size & ~(PAGE_MASK);
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
				if ((retval == 0) && uio->uio_resid)
					retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			}
		}
	}
	return(retval);
}
static int
cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
	struct vnode *vp;
	struct uio   *uio;
	off_t         newEOF;
	int           devblocksize;
	int           flags;
{
	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset;
	int		 upl_needed_size;
	int		 force_data_sync;
	struct clios	 iostate;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
		     (int)uio->uio_offset, (int)uio->uio_resid,
		     (int)newEOF, devblocksize, 0);

	/*
	 * When we enter this routine, we know
	 *  -- the offset into the file is on a pagesize boundary
	 *  -- the resid is a page multiple
	 *  -- the resid will not exceed iov_len
	 */
	cluster_try_push(vp, newEOF, 0, 1);

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	iov = uio->uio_iov;

	while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
		io_size = uio->uio_resid;

		if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			io_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			pages_in_pl = 0;
			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
				    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			kret = vm_map_get_upl(current_map(),
					      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
					      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
					     0, 0, 0, kret, 0);
				/*
				 * cluster_nocopy_write: failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_writes;
			}
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			pages_in_pl = upl_size / PAGE_SIZE;

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			/*
			 * didn't get all the pages back that we
			 * needed... release this upl and try again
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				     i, pages_in_pl, upl_size, kret, 0);
			/*
			 * for some reason, we couldn't acquire a hold on all
			 * the pages needed in the user's address space
			 *
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_writes;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size != upl_needed_size)
			io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);

		if (io_size == 0) {
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_writes;
		}
		/*
		 * Now look for pages already in the cache
		 * and throw them away.
		 * uio->uio_offset is page aligned within the file
		 * io_size is a multiple of PAGE_SIZE
		 */
		ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);

		/*
		 * we want push out these writes asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding writes
		 * wait until some complete before issuing the next
		 */
		while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
		}
		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);

			goto wait_for_writes;
		}
		io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
			     (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

		error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				   io_size, devblocksize, io_flag, (struct buf *)0, &iostate);

		iov->iov_len    -= io_size;
		iov->iov_base   += io_size;
		uio->uio_resid  -= io_size;
		uio->uio_offset += io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
			     (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
	}

wait_for_writes:
	/*
	 * make sure all async writes issued as part of this stream
	 * have completed before we return
	 */
	while (iostate.io_issued != iostate.io_completed) {
		iostate.io_wanted = 1;
		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
	}
	if (iostate.io_error)
		error = iostate.io_error;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);

	return (error);
}
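/*
 * The stream above never allows more than 2 * MAX_UPL_TRANSFER * PAGE_SIZE
 * bytes of async writes to be outstanding at once: the issuer sleeps on
 * iostate.io_wanted and cluster_iodone wakes it as completions catch up,
 * which bounds wired memory while still overlapping UPL preparation with
 * the I/O already in flight.
 */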
static int
cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
	struct vnode *vp;
	struct uio   *uio;
	off_t         newEOF;
	int           devblocksize;
	int           flags;
{
	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset;
	int		 upl_needed_size;

	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 *  -- the vector target address is physically contiguous
	 */
	cluster_try_push(vp, newEOF, 0, 1);

	iov = uio->uio_iov;
	io_size = iov->iov_len;
	upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
	upl_needed_size = upl_offset + io_size;

	pages_in_pl = 0;
	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
		    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	kret = vm_map_get_upl(current_map(),
			      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
			      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * cluster_phys_write: failed to get pagelist
		 * note: return kret here
		 */
		return(EINVAL);
	}
	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 * This is a failure in the physical memory case.
	 */
	if (upl_size < upl_needed_size) {
		kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
		return(EINVAL);
	}
	pl = ubc_upl_pageinfo(upl);

	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		int head_size;

		head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);

		if (error) {
			ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

			return(EINVAL);
		}
		upl_offset += head_size;
		src_paddr  += head_size;
		io_size    -= head_size;
	}
	tail_size = io_size & (devblocksize - 1);
	io_size  -= tail_size;

	if (io_size) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				   io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
	}
	if (error == 0) {
		/*
		 * The cluster_io write completed successfully,
		 * update the uio structure
		 */
		uio->uio_resid  -= io_size;
		iov->iov_len    -= io_size;
		iov->iov_base   += io_size;
		uio->uio_offset += io_size;
		src_paddr       += io_size;

		if (tail_size)
			error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
	}
	/*
	 * just release our hold on the physically contiguous
	 * region without changing any state
	 */
	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

	return (error);
}
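/*
 * Note the split above: any portion of a physically contiguous write that
 * is not aligned to devblocksize (the head before the first full device
 * block and the tail after the last one) is pushed through
 * cluster_align_phys_io, while the device-block-aligned middle goes to
 * cluster_io as a single CL_DEV_MEMORY ("one giant page") request.
 */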
1604 cluster_write_x(vp
, uio
, oldEOF
, newEOF
, headOff
, tailOff
, devblocksize
, flags
)
1614 upl_page_info_t
*pl
;
1616 vm_offset_t upl_offset
;
1630 long long total_size
;
1633 long long zero_cnt1
;
1635 daddr_t start_blkno
;
1641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_START
,
1642 (int)uio
->uio_offset
, uio
->uio_resid
, (int)oldEOF
, (int)newEOF
, 0);
1644 uio_resid
= uio
->uio_resid
;
1646 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_START
,
1647 0, 0, (int)oldEOF
, (int)newEOF
, 0);
1654 if (flags
& IO_HEADZEROFILL
) {
1656 * some filesystems (HFS is one) don't support unallocated holes within a file...
1657 * so we zero fill the intervening space between the old EOF and the offset
1658 * where the next chunk of real data begins.... ftruncate will also use this
1659 * routine to zero fill to the new EOF when growing a file... in this case, the
1660 * uio structure will not be provided
1663 if (headOff
< uio
->uio_offset
) {
1664 zero_cnt
= uio
->uio_offset
- headOff
;
1667 } else if (headOff
< newEOF
) {
1668 zero_cnt
= newEOF
- headOff
;
1672 if (flags
& IO_TAILZEROFILL
) {
1674 zero_off1
= uio
->uio_offset
+ uio
->uio_resid
;
1676 if (zero_off1
< tailOff
)
1677 zero_cnt1
= tailOff
- zero_off1
;
1680 if (zero_cnt
== 0 && uio
== (struct uio
*) 0) {
1681 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_END
,
1682 retval
, 0, 0, 0, 0);
1686 while ((total_size
= (uio_resid
+ zero_cnt
+ zero_cnt1
)) && retval
== 0) {
1688 * for this iteration of the loop, figure out where our starting point is
1691 start_offset
= (int)(zero_off
& PAGE_MASK_64
);
1692 upl_f_offset
= zero_off
- start_offset
;
1693 } else if (uio_resid
) {
1694 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
1695 upl_f_offset
= uio
->uio_offset
- start_offset
;
1697 start_offset
= (int)(zero_off1
& PAGE_MASK_64
);
1698 upl_f_offset
= zero_off1
- start_offset
;
1700 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 46)) | DBG_FUNC_NONE
,
1701 (int)zero_off
, (int)zero_cnt
, (int)zero_off1
, (int)zero_cnt1
, 0);
1703 if (total_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1704 total_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
1706 start_blkno
= (daddr_t
)(upl_f_offset
/ PAGE_SIZE_64
);
1708 if (uio
&& !(vp
->v_flag
& VNOCACHE_DATA
) &&
1709 (flags
& (IO_SYNC
| IO_HEADZEROFILL
| IO_TAILZEROFILL
)) == 0) {
1711 * assumption... total_size <= uio_resid
1712 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1714 if ((start_offset
+ total_size
) > (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1715 total_size
-= start_offset
;
1716 xfer_resid
= total_size
;
1718 retval
= cluster_copy_ubc_data(vp
, uio
, &xfer_resid
, 1);
1723 uio_resid
-= (total_size
- xfer_resid
);
1724 total_size
= xfer_resid
;
1725 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
1726 upl_f_offset
= uio
->uio_offset
- start_offset
;
1728 if (total_size
== 0) {
1731 * the write did not finish on a page boundary
1732 * which will leave upl_f_offset pointing to the
1733 * beginning of the last page written instead of
1734 * the page beyond it... bump it in this case
1735 * so that the cluster code records the last page
1738 upl_f_offset
+= PAGE_SIZE_64
;
1746 * compute the size of the upl needed to encompass
1747 * the requested write... limit each call to cluster_io
1748 * to the maximum UPL size... cluster_io will clip if
1749 * this exceeds the maximum io_size for the device,
1750 * make sure to account for
1751 * a starting offset that's not page aligned
1753 upl_size
= (start_offset
+ total_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
1755 if (upl_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1756 upl_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
1758 pages_in_upl
= upl_size
/ PAGE_SIZE
;
1759 io_size
= upl_size
- start_offset
;
1761 if ((long long)io_size
> total_size
)
1762 io_size
= total_size
;
1764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, io_size
, total_size
, 0, 0);
1767 kret
= ubc_create_upl(vp
,
1773 if (kret
!= KERN_SUCCESS
)
1774 panic("cluster_write: failed to get pagelist");
1776 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
,
1777 (int)upl
, (int)upl_f_offset
, start_offset
, 0, 0);
1779 if (start_offset
&& !upl_valid_page(pl
, 0)) {
1783 * we're starting in the middle of the first page of the upl
1784 * and the page isn't currently valid, so we're going to have
1785 * to read it in first... this is a synchronous operation
1787 read_size
= PAGE_SIZE
;
1789 if ((upl_f_offset
+ read_size
) > newEOF
)
1790 read_size
= newEOF
- upl_f_offset
;
1792 retval
= cluster_io(vp
, upl
, 0, upl_f_offset
, read_size
, devblocksize
,
1793 CL_READ
, (struct buf
*)0, (struct clios
*)0);
1796 * we had an error during the read which causes us to abort
1797 * the current cluster_write request... before we do, we need
1798 * to release the rest of the pages in the upl without modifying
1799 * there state and mark the failed page in error
1801 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
);
1802 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
1804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1805 (int)upl
, 0, 0, retval
, 0);
1809 if ((start_offset
== 0 || upl_size
> PAGE_SIZE
) && ((start_offset
+ io_size
) & PAGE_MASK
)) {
1811 * the last offset we're writing to in this upl does not end on a page
1812 * boundary... if it's not beyond the old EOF, then we'll also need to
1813 * pre-read this page in if it isn't already valid
1815 upl_offset
= upl_size
- PAGE_SIZE
;
1817 if ((upl_f_offset
+ start_offset
+ io_size
) < oldEOF
&&
1818 !upl_valid_page(pl
, upl_offset
/ PAGE_SIZE
)) {
1821 read_size
= PAGE_SIZE
;
1823 if ((upl_f_offset
+ upl_offset
+ read_size
) > newEOF
)
1824 read_size
= newEOF
- (upl_f_offset
+ upl_offset
);
1826 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, read_size
, devblocksize
,
1827 CL_READ
, (struct buf
*)0, (struct clios
*)0);
1830 * we had an error during the read which causes us to abort
1831 * the current cluster_write request... before we do, we
1832 * need to release the rest of the pages in the upl without
1833 * modifying there state and mark the failed page in error
1835 ubc_upl_abort_range(upl
, upl_offset
, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
);
1836 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
1838 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1839 (int)upl
, 0, 0, retval
, 0);
1844 xfer_resid
= io_size
;
1845 io_offset
= start_offset
;
1847 while (zero_cnt
&& xfer_resid
) {
1849 if (zero_cnt
< (long long)xfer_resid
)
1850 bytes_to_zero
= zero_cnt
;
1852 bytes_to_zero
= xfer_resid
;
1854 if ( !(flags
& (IO_NOZEROVALID
| IO_NOZERODIRTY
))) {
1855 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1859 bytes_to_zero
= min(bytes_to_zero
, PAGE_SIZE
- (int)(zero_off
& PAGE_MASK_64
));
1860 zero_pg_index
= (int)((zero_off
- upl_f_offset
) / PAGE_SIZE_64
);
1862 if ( !upl_valid_page(pl
, zero_pg_index
)) {
1863 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1865 } else if ((flags
& (IO_NOZERODIRTY
| IO_NOZEROVALID
)) == IO_NOZERODIRTY
&&
1866 !upl_dirty_page(pl
, zero_pg_index
)) {
1867 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1870 xfer_resid
-= bytes_to_zero
;
1871 zero_cnt
-= bytes_to_zero
;
1872 zero_off
+= bytes_to_zero
;
1873 io_offset
+= bytes_to_zero
;
1875 if (xfer_resid
&& uio_resid
) {
1876 bytes_to_move
= min(uio_resid
, xfer_resid
);
1878 retval
= cluster_copy_upl_data(uio
, upl
, io_offset
, bytes_to_move
);
1882 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
1884 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1885 (int)upl
, 0, 0, retval
, 0);
1887 uio_resid
-= bytes_to_move
;
1888 xfer_resid
-= bytes_to_move
;
1889 io_offset
+= bytes_to_move
;
1892 while (xfer_resid
&& zero_cnt1
&& retval
== 0) {
1894 if (zero_cnt1
< (long long)xfer_resid
)
1895 bytes_to_zero
= zero_cnt1
;
1897 bytes_to_zero
= xfer_resid
;
1899 if ( !(flags
& (IO_NOZEROVALID
| IO_NOZERODIRTY
))) {
1900 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1904 bytes_to_zero
= min(bytes_to_zero
, PAGE_SIZE
- (int)(zero_off1
& PAGE_MASK_64
));
1905 zero_pg_index
= (int)((zero_off1
- upl_f_offset
) / PAGE_SIZE_64
);
1907 if ( !upl_valid_page(pl
, zero_pg_index
)) {
1908 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1909 } else if ((flags
& (IO_NOZERODIRTY
| IO_NOZEROVALID
)) == IO_NOZERODIRTY
&&
1910 !upl_dirty_page(pl
, zero_pg_index
)) {
1911 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1914 xfer_resid
-= bytes_to_zero
;
1915 zero_cnt1
-= bytes_to_zero
;
1916 zero_off1
+= bytes_to_zero
;
1917 io_offset
+= bytes_to_zero
;
1924 io_size
+= start_offset
;
1926 if ((upl_f_offset
+ io_size
) >= newEOF
&& io_size
< upl_size
) {
1928 * if we're extending the file with this write
1929 * we'll zero fill the rest of the page so that
1930 * if the file gets extended again in such a way as to leave a
1931 * hole starting at this EOF, we'll have zero's in the correct spot
1933 cluster_zero(upl
, io_size
, upl_size
- io_size
, NULL
);
1935 if (flags
& IO_SYNC
)
1937 * if the IO_SYNC flag is set than we need to
1938 * bypass any clusters and immediately issue
1944 * calculate the last logical block number
1945 * that this delayed I/O encompassed
1947 last_blkno
= (upl_f_offset
+ (off_t
)upl_size
) / PAGE_SIZE_64
;
1949 if (vp
->v_flag
& VHASDIRTY
) {
1951 if ( !(vp
->v_flag
& VNOCACHE_DATA
)) {
1953 * we've fallen into the sparse
1954 * cluster method of delaying dirty pages
1955 * first, we need to release the upl if we hold one
1956 * since pages in it may be present in the sparse cluster map
1957 * and may span 2 separate buckets there... if they do and
1958 * we happen to have to flush a bucket to make room and it intersects
1959 * this upl, a deadlock may result on page BUSY
1962 ubc_upl_commit_range(upl
, 0, upl_size
,
1963 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
1965 sparse_cluster_add(vp
, newEOF
, start_blkno
, last_blkno
);
1970 * must have done cached writes that fell into
1971 * the sparse cluster mechanism... we've switched
1972 * to uncached writes on the file, so go ahead
1973 * and push whatever's in the sparse map
1974 * and switch back to normal clustering
1976 * see the comment above concerning a possible deadlock...
1979 ubc_upl_commit_range(upl
, 0, upl_size
,
1980 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
1982 * setting upl_size to 0 keeps us from committing a
1983 * second time in the start_new_cluster path
1987 sparse_cluster_push(vp
, ubc_getsize(vp
), 1);
1990 * no clusters of either type present at this point
1991 * so just go directly to start_new_cluster since
1992 * we know we need to delay this I/O since we've
1993 * already released the pages back into the cache
1994 * to avoid the deadlock with sparse_cluster_push
1996 goto start_new_cluster
;
2000 if (vp
->v_clen
== 0)
2002 * no clusters currently present
2004 goto start_new_cluster
;
2006 for (cl_index
= 0; cl_index
< vp
->v_clen
; cl_index
++) {
2008 * check each cluster that we currently hold
2009 * try to merge some or all of this write into
2010 * one or more of the existing clusters... if
2011 * any portion of the write remains, start a
2014 if (start_blkno
>= vp
->v_clusters
[cl_index
].start_pg
) {
2016 * the current write starts at or after the current cluster
2018 if (last_blkno
<= (vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
)) {
2020 * we have a write that fits entirely
2021 * within the existing cluster limits
2023 if (last_blkno
> vp
->v_clusters
[cl_index
].last_pg
)
2025 * update our idea of where the cluster ends
2027 vp
->v_clusters
[cl_index
].last_pg
= last_blkno
;
2030 if (start_blkno
< (vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
)) {
2032 * we have a write that starts in the middle of the current cluster
2033 * but extends beyond the cluster's limit... we know this because
2034 * of the previous checks
2035 * we'll extend the current cluster to the max
2036 * and update the start_blkno for the current write to reflect that
2037 * the head of it was absorbed into this cluster...
2038 * note that we'll always have a leftover tail in this case since
2039 * full absorbtion would have occurred in the clause above
2041 vp
->v_clusters
[cl_index
].last_pg
= vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
;
2044 int start_pg_in_upl
;
2046 start_pg_in_upl
= upl_f_offset
/ PAGE_SIZE_64
;
2048 if (start_pg_in_upl
< vp
->v_clusters
[cl_index
].last_pg
) {
2049 intersection
= (vp
->v_clusters
[cl_index
].last_pg
- start_pg_in_upl
) * PAGE_SIZE
;
2051 ubc_upl_commit_range(upl
, upl_offset
, intersection
,
2052 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2053 upl_f_offset
+= intersection
;
2054 upl_offset
+= intersection
;
2055 upl_size
-= intersection
;
2058 start_blkno
= vp
->v_clusters
[cl_index
].last_pg
;
2061 * we come here for the case where the current write starts
2062 * beyond the limit of the existing cluster or we have a leftover
2063 * tail after a partial absorbtion
2065 * in either case, we'll check the remaining clusters before
2066 * starting a new one
2070 * the current write starts in front of the cluster we're currently considering
2072 if ((vp
->v_clusters
[cl_index
].last_pg
- start_blkno
) <= MAX_UPL_TRANSFER
) {
2074 * we can just merge the new request into
2075 * this cluster and leave it in the cache
2076 * since the resulting cluster is still
2077 * less than the maximum allowable size
2079 vp
->v_clusters
[cl_index
].start_pg
= start_blkno
;
2081 if (last_blkno
> vp
->v_clusters
[cl_index
].last_pg
) {
2083 * the current write completely
2084 * envelops the existing cluster and since
2085 * each write is limited to at most MAX_UPL_TRANSFER bytes
2086 * we can just use the start and last blocknos of the write
2087 * to generate the cluster limits
2089 vp
->v_clusters
[cl_index
].last_pg
= last_blkno
;
2095 * if we were to combine this write with the current cluster
2096 * we would exceed the cluster size limit.... so,
2097 * let's see if there's any overlap of the new I/O with
2098 * the cluster we're currently considering... in fact, we'll
2099 * stretch the cluster out to it's full limit and see if we
2100 * get an intersection with the current write
2103 if (last_blkno
> vp
->v_clusters
[cl_index
].last_pg
- MAX_UPL_TRANSFER
) {
2105 * the current write extends into the proposed cluster
2106 * clip the length of the current write after first combining it's
2107 * tail with the newly shaped cluster
2109 vp
->v_clusters
[cl_index
].start_pg
= vp
->v_clusters
[cl_index
].last_pg
- MAX_UPL_TRANSFER
;
2112 intersection
= (last_blkno
- vp
->v_clusters
[cl_index
].start_pg
) * PAGE_SIZE
;
2114 if (intersection
> upl_size
)
2116 * because the current write may consist of a number of pages found in the cache
2117 * which are not part of the UPL, we may have an intersection that exceeds
2118 * the size of the UPL that is also part of this write
2120 intersection
= upl_size
;
2122 ubc_upl_commit_range(upl
, upl_offset
+ (upl_size
- intersection
), intersection
,
2123 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2124 upl_size
-= intersection
;
2126 last_blkno
= vp
->v_clusters
[cl_index
].start_pg
;
2129 * if we get here, there was no way to merge
2130 * any portion of this write with this cluster
2131 * or we could only merge part of it which
2132 * will leave a tail...
2133 * we'll check the remaining clusters before starting a new one
2137 if (cl_index
< vp
->v_clen
)
2139 * we found an existing cluster(s) that we
2140 * could entirely merge this I/O into
2144 if (vp
->v_clen
< MAX_CLUSTERS
&& !(vp
->v_flag
& VNOCACHE_DATA
))
2146 * we didn't find an existing cluster to
2147 * merge into, but there's room to start
2150 goto start_new_cluster
;
2153 * no exisitng cluster to merge with and no
2154 * room to start a new one... we'll try
2155 * pushing one of the existing ones... if none of
2156 * them are able to be pushed, we'll switch
2157 * to the sparse cluster mechanism
2158 * cluster_try_push updates v_clen to the
2159 * number of remaining clusters... and
2160 * returns the number of currently unused clusters
2162 if (vp
->v_flag
& VNOCACHE_DATA
)
2167 if (cluster_try_push(vp
, newEOF
, can_delay
, 0) == 0) {
			/*
			 * no more room in the normal cluster mechanism
			 * so let's switch to the more expansive but expensive
			 * sparse mechanism....
			 * first, we need to release the upl if we hold one
			 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
			 * and may span 2 separate buckets there... if they do and
			 * we happen to have to flush a bucket to make room and it intersects
			 * this upl, a deadlock may result on page BUSY
			 */
2179 ubc_upl_commit_range(upl
, upl_offset
, upl_size
,
2180 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2182 sparse_cluster_switch(vp
, newEOF
);
2183 sparse_cluster_add(vp
, newEOF
, start_blkno
, last_blkno
);
			/*
			 * we pushed one cluster successfully, so we must be sequentially writing this file
			 * otherwise, we would have failed and fallen into the sparse cluster support
			 * so let's take the opportunity to push out additional clusters as long as we
			 * remain below the throttle... this will give us better I/O locality if we're
			 * in a copy loop (i.e. we won't jump back and forth between the read and write points)
			 * however, we don't want to push so much out that the write throttle kicks in and
			 * hangs this thread up until some of the I/O completes...
			 */
2196 while (vp
->v_clen
&& (vp
->v_numoutput
<= (ASYNC_THROTTLE
/ 2)))
2197 cluster_try_push(vp
, newEOF
, 0, 0);
2200 if (vp
->v_clen
== 0)
2201 vp
->v_ciosiz
= devblocksize
;
2203 vp
->v_clusters
[vp
->v_clen
].start_pg
= start_blkno
;
2204 vp
->v_clusters
[vp
->v_clen
].last_pg
= last_blkno
;
2209 ubc_upl_commit_range(upl
, upl_offset
, upl_size
,
2210 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
		/*
		 * in order to maintain some semblance of coherency with mapped writes
		 * we need to write the cluster back out as a multiple of the PAGESIZE
		 * unless the cluster encompasses the last page of the file... in this
		 * case we'll round out to the nearest device block boundary
		 */
		if ((upl_f_offset + io_size) > newEOF) {
			io_size = newEOF - upl_f_offset;
			io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
		}
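		/*
		 * e.g. with a 512-byte devblocksize, an io_size of 0x1234 becomes
		 * (0x1234 + 0x1ff) & ~0x1ff == 0x1400, so the tail of the file is
		 * still written out in whole device blocks rather than a partial one
		 */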
2226 if (flags
& IO_SYNC
)
2227 io_flags
= CL_THROTTLE
| CL_COMMIT
| CL_AGE
;
2229 io_flags
= CL_THROTTLE
| CL_COMMIT
| CL_AGE
| CL_ASYNC
;
2231 if (vp
->v_flag
& VNOCACHE_DATA
)
2232 io_flags
|= CL_DUMP
;
2234 retval
= cluster_io(vp
, upl
, 0, upl_f_offset
, io_size
, devblocksize
,
2235 io_flags
, (struct buf
*)0, (struct clios
*)0);
2238 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_END
,
2239 retval
, 0, uio_resid
, 0, 0);
2245 cluster_read(vp
, uio
, filesize
, devblocksize
, flags
)
2262 if (!((vp
->v_flag
& VNOCACHE_DATA
) && (uio
->uio_segflg
== UIO_USERSPACE
)))
2265 * go do a read through the cache if one of the following is true....
2266 * NOCACHE is not true
2267 * the uio request doesn't target USERSPACE
2269 return (cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
));
2272 while (uio
->uio_resid
&& uio
->uio_offset
< filesize
&& retval
== 0)
		/*
		 * we know we have a resid, so this is safe
		 * skip over any empty vectors
		 */
2280 while (iov
->iov_len
== 0) {
2285 upl_size
= PAGE_SIZE
;
2286 upl_flags
= UPL_QUERY_OBJECT_TYPE
;
2288 if ((vm_map_get_upl(current_map(),
2289 (vm_offset_t
)iov
->iov_base
& ~PAGE_MASK
,
2290 &upl_size
, &upl
, NULL
, NULL
, &upl_flags
, 0)) != KERN_SUCCESS
)
2293 * the user app must have passed in an invalid address
2299 * We check every vector target but if it is physically
2300 * contiguous space, we skip the sanity checks.
2302 if (upl_flags
& UPL_PHYS_CONTIG
)
2304 retval
= cluster_phys_read(vp
, uio
, filesize
, devblocksize
, flags
);
2306 else if (uio
->uio_resid
< PAGE_SIZE
)
			/*
			 * we're here because we don't have a physically contiguous target buffer
			 * go do a read through the cache if
			 * the total xfer size is less than a page...
			 */
2313 return (cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
));
2315 else if (((int)uio
->uio_offset
& PAGE_MASK
) || ((int)iov
->iov_base
& PAGE_MASK
))
2317 if (((int)uio
->uio_offset
& PAGE_MASK
) == ((int)iov
->iov_base
& PAGE_MASK
))
				/*
				 * Bring the file offset read up to a pagesize boundary
				 * this will also bring the base address to a page boundary
				 * since they both are currently on the same offset within a page
				 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
				 * so the computed clip_size must always be less than the current uio_resid
				 */
				clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
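				/*
				 * e.g. with 4K pages, a uio_offset of 0x1a200 yields
				 * clip_size == 0x1000 - 0x200 == 0xe00, which carries both
				 * the file offset and iov_base to the next page boundary
				 * on this pass
				 */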
2329 * Fake the resid going into the cluster_read_x call
2330 * and restore it on the way out.
2332 prev_resid
= uio
->uio_resid
;
2333 uio
->uio_resid
= clip_size
;
2334 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2335 uio
->uio_resid
= prev_resid
- (clip_size
- uio
->uio_resid
);
2340 * can't get both the file offset and the buffer offset aligned to a page boundary
2341 * so fire an I/O through the cache for this entire vector
2343 clip_size
= iov
->iov_len
;
2344 prev_resid
= uio
->uio_resid
;
2345 uio
->uio_resid
= clip_size
;
2346 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2347 uio
->uio_resid
= prev_resid
- (clip_size
- uio
->uio_resid
);
2353 * If we come in here, we know the offset into
2354 * the file is on a pagesize boundary
2357 max_io_size
= filesize
- uio
->uio_offset
;
2358 clip_size
= uio
->uio_resid
;
2359 if (iov
->iov_len
< clip_size
)
2360 clip_size
= iov
->iov_len
;
2361 if (max_io_size
< clip_size
)
2362 clip_size
= (int)max_io_size
;
2364 if (clip_size
< PAGE_SIZE
)
2367 * Take care of the tail end of the read in this vector.
2369 prev_resid
= uio
->uio_resid
;
2370 uio
->uio_resid
= clip_size
;
2371 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2372 uio
->uio_resid
= prev_resid
- (clip_size
- uio
->uio_resid
);
2376 /* round clip_size down to a multiple of pagesize */
2377 clip_size
= clip_size
& ~(PAGE_MASK
);
2378 prev_resid
= uio
->uio_resid
;
2379 uio
->uio_resid
= clip_size
;
2380 retval
= cluster_nocopy_read(vp
, uio
, filesize
, devblocksize
, flags
);
2381 if ((retval
==0) && uio
->uio_resid
)
2382 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2383 uio
->uio_resid
= prev_resid
- (clip_size
- uio
->uio_resid
);
2392 cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
)
2399 upl_page_info_t
*pl
;
2401 vm_offset_t upl_offset
;
2410 off_t last_ioread_offset
;
2411 off_t last_request_offset
;
2412 u_int size_of_prefetch
;
2419 struct clios iostate
;
2420 u_int max_rd_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
2421 u_int rd_ahead_enabled
= 1;
2422 u_int prefetch_enabled
= 1;
2425 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_START
,
2426 (int)uio
->uio_offset
, uio
->uio_resid
, (int)filesize
, devblocksize
, 0);
2428 if (cluster_hard_throttle_on(vp
)) {
2429 rd_ahead_enabled
= 0;
2430 prefetch_enabled
= 0;
2432 max_rd_size
= HARD_THROTTLE_MAXSIZE
;
2434 if (vp
->v_flag
& (VRAOFF
|VNOCACHE_DATA
))
2435 rd_ahead_enabled
= 0;
2437 last_request_offset
= uio
->uio_offset
+ uio
->uio_resid
;
2439 if (last_request_offset
> filesize
)
2440 last_request_offset
= filesize
;
2441 b_lblkno
= (u_int
)(uio
->uio_offset
/ PAGE_SIZE_64
);
2442 e_lblkno
= (u_int
)((last_request_offset
- 1) / PAGE_SIZE_64
);
2444 if (vp
->v_ralen
&& (vp
->v_lastr
== b_lblkno
|| (vp
->v_lastr
+ 1) == b_lblkno
)) {
		/*
		 * determine if we already have a read-ahead in the pipe courtesy of the
		 * last read systemcall that was issued...
		 * if so, pick up its extent to determine where we should start
		 * with respect to any read-ahead that might be necessary to
		 * garner all the data needed to complete this read systemcall
		 */
		last_ioread_offset = (vp->v_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
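		/*
		 * v_maxra is the last page already read ahead, so the offset
		 * computed above points at the first byte that has not yet been
		 * asked for from the disk
		 */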
2454 if (last_ioread_offset
< uio
->uio_offset
)
2455 last_ioread_offset
= (off_t
)0;
2456 else if (last_ioread_offset
> last_request_offset
)
2457 last_ioread_offset
= last_request_offset
;
2459 last_ioread_offset
= (off_t
)0;
2461 while (uio
->uio_resid
&& uio
->uio_offset
< filesize
&& retval
== 0) {
2463 * compute the size of the upl needed to encompass
2464 * the requested read... limit each call to cluster_io
2465 * to the maximum UPL size... cluster_io will clip if
2466 * this exceeds the maximum io_size for the device,
2467 * make sure to account for
2468 * a starting offset that's not page aligned
2470 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
2471 upl_f_offset
= uio
->uio_offset
- (off_t
)start_offset
;
2472 max_size
= filesize
- uio
->uio_offset
;
2474 if ((off_t
)((unsigned int)uio
->uio_resid
) < max_size
)
2475 io_size
= uio
->uio_resid
;
2479 if (!(vp
->v_flag
& VNOCACHE_DATA
)) {
			/*
			 * if we keep finding the pages we need already in the cache, then
			 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
			 * to determine that we have all the pages we need... once we miss in
			 * the cache and have issued an I/O, then we'll assume that we're likely
			 * to continue to miss in the cache and it's to our advantage to try and prefetch
			 */
2492 if (last_request_offset
&& last_ioread_offset
&& (size_of_prefetch
= (last_request_offset
- last_ioread_offset
))) {
2493 if ((last_ioread_offset
- uio
->uio_offset
) <= max_rd_size
&& prefetch_enabled
) {
2495 * we've already issued I/O for this request and
2496 * there's still work to do and
2497 * our prefetch stream is running dry, so issue a
2498 * pre-fetch I/O... the I/O latency will overlap
2499 * with the copying of the data
2501 if (size_of_prefetch
> max_rd_size
)
2502 size_of_prefetch
= max_rd_size
;
2504 size_of_prefetch
= cluster_rd_prefetch(vp
, last_ioread_offset
, size_of_prefetch
, filesize
, devblocksize
);
2506 last_ioread_offset
+= (off_t
)(size_of_prefetch
* PAGE_SIZE
);
2508 if (last_ioread_offset
> last_request_offset
)
2509 last_ioread_offset
= last_request_offset
;
2513 * limit the size of the copy we're about to do so that
2514 * we can notice that our I/O pipe is running dry and
2515 * get the next I/O issued before it does go dry
2517 if (last_ioread_offset
&& io_size
> ((MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4))
2518 io_resid
= ((MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4);
2522 io_requested
= io_resid
;
2524 retval
= cluster_copy_ubc_data(vp
, uio
, &io_resid
, 0);
2526 io_size
-= (io_requested
- io_resid
);
2528 if (retval
|| io_resid
)
2530 * if we run into a real error or
2531 * a page that is not in the cache
2532 * we need to leave streaming mode
2536 if ((io_size
== 0 || last_ioread_offset
== last_request_offset
) && rd_ahead_enabled
) {
			/*
			 * we've already finished the I/O for this read request
			 * let's see if we should do a read-ahead
			 */
2541 cluster_rd_ahead(vp
, b_lblkno
, e_lblkno
, filesize
, devblocksize
);
2547 if (e_lblkno
< vp
->v_lastr
)
2549 vp
->v_lastr
= e_lblkno
;
2553 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
2554 upl_f_offset
= uio
->uio_offset
- (off_t
)start_offset
;
2555 max_size
= filesize
- uio
->uio_offset
;
2557 if (io_size
> max_rd_size
)
2558 io_size
= max_rd_size
;
2560 upl_size
= (start_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
2562 if (upl_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4)
2563 upl_size
= (MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4;
2564 pages_in_upl
= upl_size
/ PAGE_SIZE
;
2566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 33)) | DBG_FUNC_START
,
2567 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
2569 kret
= ubc_create_upl(vp
,
2575 if (kret
!= KERN_SUCCESS
)
2576 panic("cluster_read: failed to get pagelist");
2578 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 33)) | DBG_FUNC_END
,
2579 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
2582 * scan from the beginning of the upl looking for the first
2583 * non-valid page.... this will become the first page in
2584 * the request we're going to make to 'cluster_io'... if all
2585 * of the pages are valid, we won't call through to 'cluster_io'
2587 for (start_pg
= 0; start_pg
< pages_in_upl
; start_pg
++) {
2588 if (!upl_valid_page(pl
, start_pg
))
2593 * scan from the starting invalid page looking for a valid
2594 * page before the end of the upl is reached, if we
2595 * find one, then it will be the last page of the request to
2598 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
2599 if (upl_valid_page(pl
, last_pg
))
2602 iostate
.io_completed
= 0;
2603 iostate
.io_issued
= 0;
2604 iostate
.io_error
= 0;
2605 iostate
.io_wanted
= 0;
2607 if (start_pg
< last_pg
) {
2609 * we found a range of 'invalid' pages that must be filled
2610 * if the last page in this range is the last page of the file
2611 * we may have to clip the size of it to keep from reading past
2612 * the end of the last physical block associated with the file
2614 upl_offset
= start_pg
* PAGE_SIZE
;
2615 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
2617 if ((upl_f_offset
+ upl_offset
+ io_size
) > filesize
)
2618 io_size
= filesize
- (upl_f_offset
+ upl_offset
);
2621 * issue an asynchronous read to cluster_io
2624 error
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
,
2625 io_size
, devblocksize
, CL_READ
| CL_ASYNC
, (struct buf
*)0, &iostate
);
		/*
		 * if the read completed successfully, or there was no I/O request
		 * issued, then copy the data into user land via 'cluster_copy_upl_data'
		 * we'll first add on any 'valid'
		 * pages that were present in the upl when we acquired it.
		 */
2636 for (uio_last
= last_pg
; uio_last
< pages_in_upl
; uio_last
++) {
2637 if (!upl_valid_page(pl
, uio_last
))
2641 * compute size to transfer this round, if uio->uio_resid is
2642 * still non-zero after this attempt, we'll loop around and
2643 * set up for another I/O.
2645 val_size
= (uio_last
* PAGE_SIZE
) - start_offset
;
2647 if (val_size
> max_size
)
2648 val_size
= max_size
;
2650 if (val_size
> uio
->uio_resid
)
2651 val_size
= uio
->uio_resid
;
2653 if (last_ioread_offset
== 0)
2654 last_ioread_offset
= uio
->uio_offset
+ val_size
;
2656 if ((size_of_prefetch
= (last_request_offset
- last_ioread_offset
)) && prefetch_enabled
) {
2658 * if there's still I/O left to do for this request, and...
2659 * we're not in hard throttle mode, then issue a
2660 * pre-fetch I/O... the I/O latency will overlap
2661 * with the copying of the data
2663 size_of_prefetch
= cluster_rd_prefetch(vp
, last_ioread_offset
, size_of_prefetch
, filesize
, devblocksize
);
2665 last_ioread_offset
+= (off_t
)(size_of_prefetch
* PAGE_SIZE
);
2667 if (last_ioread_offset
> last_request_offset
)
2668 last_ioread_offset
= last_request_offset
;
2670 } else if ((uio
->uio_offset
+ val_size
) == last_request_offset
) {
2672 * this transfer will finish this request, so...
2673 * let's try to read ahead if we're in
2674 * a sequential access pattern and we haven't
2675 * explicitly disabled it
2677 if (rd_ahead_enabled
)
2678 cluster_rd_ahead(vp
, b_lblkno
, e_lblkno
, filesize
, devblocksize
);
2680 if (e_lblkno
< vp
->v_lastr
)
2682 vp
->v_lastr
= e_lblkno
;
2684 while (iostate
.io_issued
!= iostate
.io_completed
) {
2685 iostate
.io_wanted
= 1;
2686 tsleep((caddr_t
)&iostate
.io_wanted
, PRIBIO
+ 1, "cluster_read_x", 0);
2688 if (iostate
.io_error
)
2689 error
= iostate
.io_error
;
2691 retval
= cluster_copy_upl_data(uio
, upl
, start_offset
, val_size
);
2693 if (start_pg
< last_pg
) {
2695 * compute the range of pages that we actually issued an I/O for
2696 * and either commit them as valid if the I/O succeeded
2697 * or abort them if the I/O failed
2699 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
2701 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_START
,
2702 (int)upl
, start_pg
* PAGE_SIZE
, io_size
, error
, 0);
2704 if (error
|| (vp
->v_flag
& VNOCACHE_DATA
))
2705 ubc_upl_abort_range(upl
, start_pg
* PAGE_SIZE
, io_size
,
2706 UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
2708 ubc_upl_commit_range(upl
, start_pg
* PAGE_SIZE
, io_size
,
2709 UPL_COMMIT_CLEAR_DIRTY
|
2710 UPL_COMMIT_FREE_ON_EMPTY
|
2711 UPL_COMMIT_INACTIVATE
);
2713 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_END
,
2714 (int)upl
, start_pg
* PAGE_SIZE
, io_size
, error
, 0);
2716 if ((last_pg
- start_pg
) < pages_in_upl
) {
2721 * the set of pages that we issued an I/O for did not encompass
2722 * the entire upl... so just release these without modifying
2726 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
2728 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_START
,
2729 (int)upl
, -1, pages_in_upl
- (last_pg
- start_pg
), 0, 0);
2733 * we found some already valid pages at the beginning of
2734 * the upl commit these back to the inactive list with
2737 for (cur_pg
= 0; cur_pg
< start_pg
; cur_pg
++) {
2738 commit_flags
= UPL_COMMIT_FREE_ON_EMPTY
2739 | UPL_COMMIT_INACTIVATE
;
2741 if (upl_dirty_page(pl
, cur_pg
))
2742 commit_flags
|= UPL_COMMIT_SET_DIRTY
;
2744 if ( !(commit_flags
& UPL_COMMIT_SET_DIRTY
) && (vp
->v_flag
& VNOCACHE_DATA
))
2745 ubc_upl_abort_range(upl
, cur_pg
* PAGE_SIZE
, PAGE_SIZE
,
2746 UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
2748 ubc_upl_commit_range(upl
, cur_pg
* PAGE_SIZE
,
2749 PAGE_SIZE
, commit_flags
);
2752 if (last_pg
< uio_last
) {
2754 * we found some already valid pages immediately after the
2755 * pages we issued I/O for, commit these back to the
2756 * inactive list with reference cleared
2758 for (cur_pg
= last_pg
; cur_pg
< uio_last
; cur_pg
++) {
2759 commit_flags
= UPL_COMMIT_FREE_ON_EMPTY
2760 | UPL_COMMIT_INACTIVATE
;
2762 if (upl_dirty_page(pl
, cur_pg
))
2763 commit_flags
|= UPL_COMMIT_SET_DIRTY
;
2765 if ( !(commit_flags
& UPL_COMMIT_SET_DIRTY
) && (vp
->v_flag
& VNOCACHE_DATA
))
2766 ubc_upl_abort_range(upl
, cur_pg
* PAGE_SIZE
, PAGE_SIZE
,
2767 UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
2769 ubc_upl_commit_range(upl
, cur_pg
* PAGE_SIZE
,
2770 PAGE_SIZE
, commit_flags
);
2773 if (uio_last
< pages_in_upl
) {
2775 * there were some invalid pages beyond the valid pages
2776 * that we didn't issue an I/O for, just release them
2779 ubc_upl_abort_range(upl
, uio_last
* PAGE_SIZE
,
2780 (pages_in_upl
- uio_last
) * PAGE_SIZE
, UPL_ABORT_FREE_ON_EMPTY
);
2783 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_END
,
2784 (int)upl
, -1, -1, 0, 0);
2790 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_END
,
2791 (int)uio
->uio_offset
, uio
->uio_resid
, vp
->v_lastr
, retval
, 0);
2798 cluster_nocopy_read(vp
, uio
, filesize
, devblocksize
, flags
)
2806 upl_page_info_t
*pl
;
2807 vm_offset_t upl_offset
;
2811 int upl_needed_size
;
2817 int force_data_sync
;
2819 struct clios iostate
;
2820 u_int max_rd_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
2821 u_int max_rd_ahead
= MAX_UPL_TRANSFER
* PAGE_SIZE
* 2;
2824 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_START
,
2825 (int)uio
->uio_offset
, uio
->uio_resid
, (int)filesize
, devblocksize
, 0);
	/*
	 * When we enter this routine, we know
	 *  -- the offset into the file is on a pagesize boundary
	 *  -- the resid is a page multiple
	 *  -- the resid will not exceed iov_len
	 */
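	/*
	 * expressed as assertions (illustrative only, not compiled):
	 *
	 *	assert((uio->uio_offset & PAGE_MASK_64) == 0);
	 *	assert((uio->uio_resid & PAGE_MASK) == 0);
	 *	assert(uio->uio_resid <= uio->uio_iov->iov_len);
	 */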
2834 iostate
.io_completed
= 0;
2835 iostate
.io_issued
= 0;
2836 iostate
.io_error
= 0;
2837 iostate
.io_wanted
= 0;
2841 if (cluster_hard_throttle_on(vp
)) {
2842 max_rd_size
= HARD_THROTTLE_MAXSIZE
;
2843 max_rd_ahead
= HARD_THROTTLE_MAXSIZE
- 1;
2845 while (uio
->uio_resid
&& uio
->uio_offset
< filesize
&& retval
== 0) {
2847 max_io_size
= filesize
- uio
->uio_offset
;
2849 if (max_io_size
< (off_t
)((unsigned int)uio
->uio_resid
))
2850 io_size
= max_io_size
;
2852 io_size
= uio
->uio_resid
;
2855 * First look for pages already in the cache
2856 * and move them to user space.
2858 retval
= cluster_copy_ubc_data(vp
, uio
, &io_size
, 0);
2862 * we may have already spun some portion of this request
2863 * off as async requests... we need to wait for the I/O
2864 * to complete before returning
2866 goto wait_for_reads
;
2869 * If we are already finished with this read, then return
2873 * we may have already spun some portion of this request
2874 * off as async requests... we need to wait for the I/O
2875 * to complete before returning
2877 goto wait_for_reads
;
2879 max_io_size
= io_size
;
2881 if (max_io_size
> max_rd_size
)
2882 max_io_size
= max_rd_size
;
2886 ubc_range_op(vp
, uio
->uio_offset
, uio
->uio_offset
+ max_io_size
, UPL_ROP_ABSENT
, &io_size
);
2890 * we may have already spun some portion of this request
2891 * off as async requests... we need to wait for the I/O
2892 * to complete before returning
2894 goto wait_for_reads
;
2896 upl_offset
= (vm_offset_t
)iov
->iov_base
& PAGE_MASK
;
2897 upl_needed_size
= (upl_offset
+ io_size
+ (PAGE_SIZE
-1)) & ~PAGE_MASK
;
2899 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_START
,
2900 (int)upl_offset
, upl_needed_size
, (int)iov
->iov_base
, io_size
, 0);
2902 for (force_data_sync
= 0; force_data_sync
< 3; force_data_sync
++) {
2904 upl_size
= upl_needed_size
;
2905 upl_flags
= UPL_FILE_IO
| UPL_NO_SYNC
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
2907 kret
= vm_map_get_upl(current_map(),
2908 (vm_offset_t
)iov
->iov_base
& ~PAGE_MASK
,
2909 &upl_size
, &upl
, NULL
, &pages_in_pl
, &upl_flags
, force_data_sync
);
2911 if (kret
!= KERN_SUCCESS
) {
2912 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
2913 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
2915 * cluster_nocopy_read: failed to get pagelist
2917 * we may have already spun some portion of this request
2918 * off as async requests... we need to wait for the I/O
2919 * to complete before returning
2921 goto wait_for_reads
;
2923 pages_in_pl
= upl_size
/ PAGE_SIZE
;
2924 pl
= UPL_GET_INTERNAL_PAGE_LIST(upl
);
2926 for (i
= 0; i
< pages_in_pl
; i
++) {
2927 if (!upl_valid_page(pl
, i
))
2930 if (i
== pages_in_pl
)
2933 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
,
2934 UPL_ABORT_FREE_ON_EMPTY
);
2936 if (force_data_sync
>= 3) {
2937 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
2938 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
2940 goto wait_for_reads
;
2943 * Consider the possibility that upl_size wasn't satisfied.
2945 if (upl_size
!= upl_needed_size
)
2946 io_size
= (upl_size
- (int)upl_offset
) & ~PAGE_MASK
;
2949 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
,
2950 UPL_ABORT_FREE_ON_EMPTY
);
2951 goto wait_for_reads
;
2953 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
2954 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
2957 * request asynchronously so that we can overlap
2958 * the preparation of the next I/O
2959 * if there are already too many outstanding reads
2960 * wait until some have completed before issuing the next read
2962 while ((iostate
.io_issued
- iostate
.io_completed
) > max_rd_ahead
) {
2963 iostate
.io_wanted
= 1;
2964 tsleep((caddr_t
)&iostate
.io_wanted
, PRIBIO
+ 1, "cluster_nocopy_read", 0);
2966 if (iostate
.io_error
) {
2968 * one of the earlier reads we issued ran into a hard error
2969 * don't issue any more reads, cleanup the UPL
2970 * that was just created but not used, then
2971 * go wait for any other reads to complete before
2972 * returning the error to the caller
2974 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
,
2975 UPL_ABORT_FREE_ON_EMPTY
);
2977 goto wait_for_reads
;
2979 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 73)) | DBG_FUNC_START
,
2980 (int)upl
, (int)upl_offset
, (int)uio
->uio_offset
, io_size
, 0);
2982 retval
= cluster_io(vp
, upl
, upl_offset
, uio
->uio_offset
,
2983 io_size
, devblocksize
,
2984 CL_PRESERVE
| CL_COMMIT
| CL_READ
| CL_ASYNC
| CL_NOZERO
,
2985 (struct buf
*)0, &iostate
);
2988 * update the uio structure
2990 iov
->iov_base
+= io_size
;
2991 iov
->iov_len
-= io_size
;
2992 uio
->uio_resid
-= io_size
;
2993 uio
->uio_offset
+= io_size
;
2995 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 73)) | DBG_FUNC_END
,
2996 (int)upl
, (int)uio
->uio_offset
, (int)uio
->uio_resid
, retval
, 0);
3002 * make sure all async reads that are part of this stream
3003 * have completed before we return
3005 while (iostate
.io_issued
!= iostate
.io_completed
) {
3006 iostate
.io_wanted
= 1;
3007 tsleep((caddr_t
)&iostate
.io_wanted
, PRIBIO
+ 1, "cluster_nocopy_read", 0);
3009 if (iostate
.io_error
)
3010 retval
= iostate
.io_error
;
3012 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_END
,
3013 (int)uio
->uio_offset
, (int)uio
->uio_resid
, 6, retval
, 0);
3020 cluster_phys_read(vp
, uio
, filesize
, devblocksize
, flags
)
3027 upl_page_info_t
*pl
;
3029 vm_offset_t upl_offset
;
3035 int upl_needed_size
;
3040 struct clios iostate
;
3044 * When we enter this routine, we know
3045 * -- the resid will not exceed iov_len
3046 * -- the target address is physically contiguous
3051 max_size
= filesize
- uio
->uio_offset
;
3053 if (max_size
> (off_t
)((unsigned int)iov
->iov_len
))
3054 io_size
= iov
->iov_len
;
3058 upl_offset
= (vm_offset_t
)iov
->iov_base
& PAGE_MASK
;
3059 upl_needed_size
= upl_offset
+ io_size
;
3063 upl_size
= upl_needed_size
;
3064 upl_flags
= UPL_FILE_IO
| UPL_NO_SYNC
| UPL_CLEAN_IN_PLACE
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
3066 kret
= vm_map_get_upl(current_map(),
3067 (vm_offset_t
)iov
->iov_base
& ~PAGE_MASK
,
3068 &upl_size
, &upl
, NULL
, &pages_in_pl
, &upl_flags
, 0);
3070 if (kret
!= KERN_SUCCESS
) {
3072 * cluster_phys_read: failed to get pagelist
3076 if (upl_size
< upl_needed_size
) {
3078 * The upl_size wasn't satisfied.
3080 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
3084 pl
= ubc_upl_pageinfo(upl
);
3086 dst_paddr
= ((addr64_t
)upl_phys_page(pl
, 0) << 12) + ((addr64_t
)((u_int
)iov
->iov_base
& PAGE_MASK
));
3088 while (((uio
->uio_offset
& (devblocksize
- 1)) || io_size
< devblocksize
) && io_size
) {
3091 head_size
= devblocksize
- (int)(uio
->uio_offset
& (devblocksize
- 1));
3093 if (head_size
> io_size
)
3094 head_size
= io_size
;
3096 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, head_size
, devblocksize
, CL_READ
);
3099 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
3103 upl_offset
+= head_size
;
3104 dst_paddr
+= head_size
;
3105 io_size
-= head_size
;
3107 tail_size
= io_size
& (devblocksize
- 1);
3108 io_size
-= tail_size
;
3110 iostate
.io_completed
= 0;
3111 iostate
.io_issued
= 0;
3112 iostate
.io_error
= 0;
3113 iostate
.io_wanted
= 0;
3115 while (io_size
&& error
== 0) {
3118 if (io_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
3119 xsize
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
3123 * request asynchronously so that we can overlap
3124 * the preparation of the next I/O... we'll do
3125 * the commit after all the I/O has completed
3126 * since its all issued against the same UPL
3127 * if there are already too many outstanding reads
3128 * wait until some have completed before issuing the next
3130 while ((iostate
.io_issued
- iostate
.io_completed
) > (2 * MAX_UPL_TRANSFER
* PAGE_SIZE
)) {
3131 iostate
.io_wanted
= 1;
3132 tsleep((caddr_t
)&iostate
.io_wanted
, PRIBIO
+ 1, "cluster_phys_read", 0);
3135 error
= cluster_io(vp
, upl
, upl_offset
, uio
->uio_offset
, xsize
, 0,
3136 CL_READ
| CL_NOZERO
| CL_DEV_MEMORY
| CL_ASYNC
,
3137 (struct buf
*)0, &iostate
);
3139 * The cluster_io read was issued successfully,
3140 * update the uio structure
3143 uio
->uio_resid
-= xsize
;
3144 iov
->iov_len
-= xsize
;
3145 iov
->iov_base
+= xsize
;
3146 uio
->uio_offset
+= xsize
;
3148 upl_offset
+= xsize
;
3153 * make sure all async reads that are part of this stream
3154 * have completed before we proceed
3156 while (iostate
.io_issued
!= iostate
.io_completed
) {
3157 iostate
.io_wanted
= 1;
3158 tsleep((caddr_t
)&iostate
.io_wanted
, PRIBIO
+ 1, "cluster_phys_read", 0);
3160 if (iostate
.io_error
) {
3161 error
= iostate
.io_error
;
3163 if (error
== 0 && tail_size
)
3164 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, tail_size
, devblocksize
, CL_READ
);
3167 * just release our hold on the physically contiguous
3168 * region without changing any state
3170 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
3177 * generate advisory I/O's in the largest chunks possible
3178 * the completed pages will be released into the VM cache
3181 advisory_read(vp
, filesize
, f_offset
, resid
, devblocksize
)
3188 upl_page_info_t
*pl
;
3190 vm_offset_t upl_offset
;
3204 if (!UBCINFOEXISTS(vp
))
3207 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_START
,
3208 (int)f_offset
, resid
, (int)filesize
, devblocksize
, 0);
3210 while (resid
&& f_offset
< filesize
&& retval
== 0) {
3212 * compute the size of the upl needed to encompass
3213 * the requested read... limit each call to cluster_io
3214 * to the maximum UPL size... cluster_io will clip if
3215 * this exceeds the maximum io_size for the device,
3216 * make sure to account for
3217 * a starting offset that's not page aligned
3219 start_offset
= (int)(f_offset
& PAGE_MASK_64
);
3220 upl_f_offset
= f_offset
- (off_t
)start_offset
;
3221 max_size
= filesize
- f_offset
;
3223 if (resid
< max_size
)
3228 upl_size
= (start_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
3229 if (upl_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
3230 upl_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
3234 * return the number of contiguously present pages in the cache
3235 * starting at upl_f_offset within the file
3237 ubc_range_op(vp
, upl_f_offset
, upl_f_offset
+ upl_size
, UPL_ROP_PRESENT
, &skip_range
);
3241 * skip over pages already present in the cache
3243 io_size
= skip_range
- start_offset
;
3245 f_offset
+= io_size
;
3248 if (skip_range
== upl_size
)
3251 * have to issue some real I/O
3252 * at this point, we know it's starting on a page boundary
3253 * because we've skipped over at least the first page in the request
3256 upl_f_offset
+= skip_range
;
3257 upl_size
-= skip_range
;
3259 pages_in_upl
= upl_size
/ PAGE_SIZE
;
3261 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_START
,
3262 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
3264 kret
= ubc_create_upl(vp
,
3269 UPL_RET_ONLY_ABSENT
| UPL_SET_LITE
);
3270 if (kret
!= KERN_SUCCESS
)
3275 * before we start marching forward, we must make sure we end on
3276 * a present page, otherwise we will be working with a freed
3279 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
3280 if (upl_page_present(pl
, last_pg
))
3283 pages_in_upl
= last_pg
+ 1;
3286 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_END
,
3287 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
3290 for (last_pg
= 0; last_pg
< pages_in_upl
; ) {
3292 * scan from the beginning of the upl looking for the first
3293 * page that is present.... this will become the first page in
3294 * the request we're going to make to 'cluster_io'... if all
3295 * of the pages are absent, we won't call through to 'cluster_io'
3297 for (start_pg
= last_pg
; start_pg
< pages_in_upl
; start_pg
++) {
3298 if (upl_page_present(pl
, start_pg
))
3303 * scan from the starting present page looking for an absent
3304 * page before the end of the upl is reached, if we
3305 * find one, then it will terminate the range of pages being
3306 * presented to 'cluster_io'
3308 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
3309 if (!upl_page_present(pl
, last_pg
))
3313 if (last_pg
> start_pg
) {
3315 * we found a range of pages that must be filled
3316 * if the last page in this range is the last page of the file
3317 * we may have to clip the size of it to keep from reading past
3318 * the end of the last physical block associated with the file
3320 upl_offset
= start_pg
* PAGE_SIZE
;
3321 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
3323 if ((upl_f_offset
+ upl_offset
+ io_size
) > filesize
)
3324 io_size
= filesize
- (upl_f_offset
+ upl_offset
);
3327 * issue an asynchronous read to cluster_io
3329 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, io_size
, devblocksize
,
3330 CL_ASYNC
| CL_READ
| CL_COMMIT
| CL_AGE
, (struct buf
*)0, (struct clios
*)0);
3336 ubc_upl_abort(upl
, 0);
3338 io_size
= upl_size
- start_offset
;
3340 if (io_size
> resid
)
3342 f_offset
+= io_size
;
3346 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_END
,
3347 (int)f_offset
, resid
, retval
, 0, 0);
3359 if (!UBCINFOEXISTS(vp
) || (vp
->v_clen
== 0 && !(vp
->v_flag
& VHASDIRTY
)))
3362 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_START
,
3363 vp
->v_flag
& VHASDIRTY
, vp
->v_clen
, 0, 0, 0);
3365 if (vp
->v_flag
& VHASDIRTY
) {
3366 sparse_cluster_push(vp
, ubc_getsize(vp
), 1);
3371 retval
= cluster_try_push(vp
, ubc_getsize(vp
), 0, 1);
3373 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_END
,
3374 vp
->v_flag
& VHASDIRTY
, vp
->v_clen
, retval
, 0, 0);
3387 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 81)) | DBG_FUNC_START
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3389 if (vp
->v_flag
& VHASDIRTY
) {
3390 vfs_drt_control(&(vp
->v_scmap
), 0);
3392 vp
->v_flag
&= ~VHASDIRTY
;
3394 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 81)) | DBG_FUNC_END
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3399 cluster_try_push(vp
, EOF
, can_delay
, push_all
)
3411 struct v_cluster l_clusters
[MAX_CLUSTERS
];
3414 * make a local 'sorted' copy of the clusters
3415 * and clear vp->v_clen so that new clusters can
3418 for (cl_index
= 0; cl_index
< vp
->v_clen
; cl_index
++) {
3419 for (min_index
= -1, cl_index1
= 0; cl_index1
< vp
->v_clen
; cl_index1
++) {
3420 if (vp
->v_clusters
[cl_index1
].start_pg
== vp
->v_clusters
[cl_index1
].last_pg
)
3422 if (min_index
== -1)
3423 min_index
= cl_index1
;
3424 else if (vp
->v_clusters
[cl_index1
].start_pg
< vp
->v_clusters
[min_index
].start_pg
)
3425 min_index
= cl_index1
;
3427 if (min_index
== -1)
3429 l_clusters
[cl_index
].start_pg
= vp
->v_clusters
[min_index
].start_pg
;
3430 l_clusters
[cl_index
].last_pg
= vp
->v_clusters
[min_index
].last_pg
;
3432 vp
->v_clusters
[min_index
].start_pg
= vp
->v_clusters
[min_index
].last_pg
;
3437 if (can_delay
&& cl_len
== MAX_CLUSTERS
) {
		/*
		 * determine if we appear to be writing the file sequentially
		 * if not, by returning without having pushed any clusters
		 * we will cause this vnode to be pushed into the sparse cluster mechanism
		 * used for managing more random I/O patterns
		 *
		 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
		 * that's why we're in try_push with can_delay true...
		 *
		 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
		 * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above
		 * so we can just make a simple pass through, up to but not including the last one...
		 * note that last_pg is not inclusive, so it will be equal to the start_pg of the next cluster if they
		 * are adjacent
		 *
		 * we let the last one be partial as long as it was adjacent to the previous one...
		 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
		 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
		 */
3459 for (i
= 0; i
< MAX_CLUSTERS
- 1; i
++) {
3460 if ((l_clusters
[i
].last_pg
- l_clusters
[i
].start_pg
) != MAX_UPL_TRANSFER
)
3462 if (l_clusters
[i
].last_pg
!= l_clusters
[i
+1].start_pg
)
3466 for (cl_index
= 0; cl_index
< cl_len
; cl_index
++) {
		/*
		 * try to push each cluster in turn... cluster_push_x may not
		 * push the cluster if can_delay is TRUE and the cluster doesn't
		 * meet the criteria for an immediate push
		 */
3472 if (cluster_push_x(vp
, EOF
, l_clusters
[cl_index
].start_pg
, l_clusters
[cl_index
].last_pg
, can_delay
)) {
3473 l_clusters
[cl_index
].start_pg
= 0;
3474 l_clusters
[cl_index
].last_pg
= 0;
3483 if (cl_len
> cl_pushed
) {
3485 * we didn't push all of the clusters, so
3486 * lets try to merge them back in to the vnode
3488 if ((MAX_CLUSTERS
- vp
->v_clen
) < (cl_len
- cl_pushed
)) {
3490 * we picked up some new clusters while we were trying to
3491 * push the old ones (I don't think this can happen because
3492 * I'm holding the lock, but just in case)... the sum of the
3493 * leftovers plus the new cluster count exceeds our ability
3494 * to represent them, so switch to the sparse cluster mechanism
3498 * first collect the new clusters sitting in the vp
3500 sparse_cluster_switch(vp
, EOF
);
3502 for (cl_index
= 0, cl_index1
= 0; cl_index
< cl_len
; cl_index
++) {
3503 if (l_clusters
[cl_index
].start_pg
== l_clusters
[cl_index
].last_pg
)
3505 vp
->v_clusters
[cl_index1
].start_pg
= l_clusters
[cl_index
].start_pg
;
3506 vp
->v_clusters
[cl_index1
].last_pg
= l_clusters
[cl_index
].last_pg
;
3511 * update the cluster count
3513 vp
->v_clen
= cl_index1
;
3516 * and collect the original clusters that were moved into the
3517 * local storage for sorting purposes
3519 sparse_cluster_switch(vp
, EOF
);
3523 * we've got room to merge the leftovers back in
3524 * just append them starting at the next 'hole'
3525 * represented by vp->v_clen
3527 for (cl_index
= 0, cl_index1
= vp
->v_clen
; cl_index
< cl_len
; cl_index
++) {
3528 if (l_clusters
[cl_index
].start_pg
== l_clusters
[cl_index
].last_pg
)
3531 vp
->v_clusters
[cl_index1
].start_pg
= l_clusters
[cl_index
].start_pg
;
3532 vp
->v_clusters
[cl_index1
].last_pg
= l_clusters
[cl_index
].last_pg
;
3537 * update the cluster count
3539 vp
->v_clen
= cl_index1
;
3542 return(MAX_CLUSTERS
- vp
->v_clen
);
3548 cluster_push_x(vp
, EOF
, first
, last
, can_delay
)
3555 upl_page_info_t
*pl
;
3557 vm_offset_t upl_offset
;
3570 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_START
,
3571 vp
->v_clen
, first
, last
, EOF
, 0);
3573 if ((pages_in_upl
= last
- first
) == 0) {
3574 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 0, 0, 0, 0);
3578 upl_size
= pages_in_upl
* PAGE_SIZE
;
3579 upl_f_offset
= ((off_t
)first
) * PAGE_SIZE_64
;
3581 if (upl_f_offset
+ upl_size
>= EOF
) {
3583 if (upl_f_offset
>= EOF
) {
			/*
			 * must have truncated the file and missed
			 * clearing a dangling cluster (i.e. it's completely
			 * beyond the new EOF)
			 */
3589 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 1, 0, 0, 0);
3593 size
= EOF
- upl_f_offset
;
3595 upl_size
= (size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
3596 pages_in_upl
= upl_size
/ PAGE_SIZE
;
3600 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, size
, 0, 0, 0);
3602 if (vp
->v_flag
& VNOCACHE_DATA
)
3603 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
| UPL_WILL_BE_DUMPED
;
3605 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
;
3607 kret
= ubc_create_upl(vp
,
3613 if (kret
!= KERN_SUCCESS
)
3614 panic("cluster_push: failed to get pagelist");
3616 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
, (int)upl
, upl_f_offset
, 0, 0, 0);
3619 * since we only asked for the dirty pages back
3620 * it's possible that we may only get a few or even none, so...
3621 * before we start marching forward, we must make sure we know
3622 * where the last present page is in the UPL, otherwise we could
3623 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
3624 * employed by commit_range and abort_range.
3626 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
3627 if (upl_page_present(pl
, last_pg
))
3630 pages_in_upl
= last_pg
+ 1;
3632 if (pages_in_upl
== 0) {
3633 ubc_upl_abort(upl
, 0);
3635 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 2, 0, 0, 0);
3639 for (last_pg
= 0; last_pg
< pages_in_upl
; ) {
3641 * find the next dirty page in the UPL
3642 * this will become the first page in the
3643 * next I/O to generate
3645 for (start_pg
= last_pg
; start_pg
< pages_in_upl
; start_pg
++) {
3646 if (upl_dirty_page(pl
, start_pg
))
3648 if (upl_page_present(pl
, start_pg
))
3650 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
3651 * just release these unchanged since we're not going
3652 * to steal them or change their state
3654 ubc_upl_abort_range(upl
, start_pg
* PAGE_SIZE
, PAGE_SIZE
, UPL_ABORT_FREE_ON_EMPTY
);
3656 if (start_pg
>= pages_in_upl
)
3658 * done... no more dirty pages to push
3661 if (start_pg
> last_pg
)
3663 * skipped over some non-dirty pages
3665 size
-= ((start_pg
- last_pg
) * PAGE_SIZE
);
3668 * find a range of dirty pages to write
3670 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
3671 if (!upl_dirty_page(pl
, last_pg
))
3674 upl_offset
= start_pg
* PAGE_SIZE
;
3676 io_size
= min(size
, (last_pg
- start_pg
) * PAGE_SIZE
);
3678 if (vp
->v_flag
& VNOCACHE_DATA
)
3679 io_flags
= CL_THROTTLE
| CL_COMMIT
| CL_ASYNC
| CL_DUMP
;
3681 io_flags
= CL_THROTTLE
| CL_COMMIT
| CL_ASYNC
;
3683 cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, io_size
, vp
->v_ciosiz
, io_flags
, (struct buf
*)0, (struct clios
*)0);
3687 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 3, 0, 0, 0);
3694 sparse_cluster_switch(struct vnode
*vp
, off_t EOF
)
3698 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 78)) | DBG_FUNC_START
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3700 if ( !(vp
->v_flag
& VHASDIRTY
)) {
3701 vp
->v_flag
|= VHASDIRTY
;
3705 for (cl_index
= 0; cl_index
< vp
->v_clen
; cl_index
++) {
3710 for (start_pg
= vp
->v_clusters
[cl_index
].start_pg
; start_pg
< vp
->v_clusters
[cl_index
].last_pg
; start_pg
++) {
3712 if (ubc_page_op(vp
, (off_t
)(((off_t
)start_pg
) * PAGE_SIZE_64
), 0, 0, &flags
) == KERN_SUCCESS
) {
3713 if (flags
& UPL_POP_DIRTY
)
3714 sparse_cluster_add(vp
, EOF
, start_pg
, start_pg
+ 1);
3718 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 78)) | DBG_FUNC_END
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3723 sparse_cluster_push(struct vnode
*vp
, off_t EOF
, int push_all
)
3730 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 79)) | DBG_FUNC_START
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, push_all
, 0);
3733 vfs_drt_control(&(vp
->v_scmap
), 1);
3736 if (vfs_drt_get_cluster(&(vp
->v_scmap
), &offset
, &length
) != KERN_SUCCESS
) {
3737 vp
->v_flag
&= ~VHASDIRTY
;
3741 first
= (daddr_t
)(offset
/ PAGE_SIZE_64
);
3742 last
= (daddr_t
)((offset
+ length
) / PAGE_SIZE_64
);
3744 cluster_push_x(vp
, EOF
, first
, last
, 0);
3746 vp
->v_scdirty
-= (last
- first
);
3751 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 79)) | DBG_FUNC_END
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3756 sparse_cluster_add(struct vnode
*vp
, off_t EOF
, daddr_t first
, daddr_t last
)
3762 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 80)) | DBG_FUNC_START
, (int)vp
->v_scmap
, vp
->v_scdirty
, first
, last
, 0);
3764 offset
= (off_t
)first
* PAGE_SIZE_64
;
3765 length
= (last
- first
) * PAGE_SIZE
;
3767 while (vfs_drt_mark_pages(&(vp
->v_scmap
), offset
, length
, &new_dirty
) != KERN_SUCCESS
) {
3769 * no room left in the map
3770 * only a partial update was done
3771 * push out some pages and try again
3773 vp
->v_scdirty
+= new_dirty
;
3775 sparse_cluster_push(vp
, EOF
, 0);
3777 offset
+= (new_dirty
* PAGE_SIZE_64
);
3778 length
-= (new_dirty
* PAGE_SIZE
);
3780 vp
->v_scdirty
+= new_dirty
;
3782 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 80)) | DBG_FUNC_END
, (int)vp
, (int)vp
->v_scmap
, vp
->v_scdirty
, 0, 0);
3787 cluster_align_phys_io(struct vnode
*vp
, struct uio
*uio
, addr64_t usr_paddr
, int xsize
, int devblocksize
, int flags
)
3790 upl_page_info_t
*pl
;
3798 kret
= ubc_create_upl(vp
,
3799 uio
->uio_offset
& ~PAGE_MASK_64
,
3805 if (kret
!= KERN_SUCCESS
)
3808 if (!upl_valid_page(pl
, 0)) {
3810 * issue a synchronous read to cluster_io
3812 error
= cluster_io(vp
, upl
, 0, uio
->uio_offset
& ~PAGE_MASK_64
, PAGE_SIZE
, devblocksize
,
3813 CL_READ
, (struct buf
*)0, (struct clios
*)0);
3815 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
3820 ubc_paddr
= ((addr64_t
)upl_phys_page(pl
, 0) << 12) + (addr64_t
)(uio
->uio_offset
& PAGE_MASK_64
);
	/*
	 * NOTE:  There is no prototype for the following in BSD. It, and the definitions
	 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	 * osfmk/ppc/mappings.h.  They are not included here because there appears to be no
	 * way to do so without exporting them to kexts as well.
	 */
	if (flags & CL_READ)
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);	/* Copy physical to physical and flush the destination */
	else
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);	/* Copy physical to physical and flush the source */
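	/*
	 * in the numeric forms above, 2 | 1 selects a physical source and a
	 * physical sink (cppvPsrc | cppvPsnk), while 4 requests a flush of the
	 * destination (cppvFsnk) and 8 a flush of the source (cppvFsrc)
	 */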
3835 if ( !(flags
& CL_READ
) || (upl_valid_page(pl
, 0) && upl_dirty_page(pl
, 0))) {
3837 * issue a synchronous write to cluster_io
3839 error
= cluster_io(vp
, upl
, 0, uio
->uio_offset
& ~PAGE_MASK_64
, PAGE_SIZE
, devblocksize
,
3840 0, (struct buf
*)0, (struct clios
*)0);
3843 uio
->uio_offset
+= xsize
;
3844 iov
->iov_base
+= xsize
;
3845 iov
->iov_len
-= xsize
;
3846 uio
->uio_resid
-= xsize
;
3848 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
3856 cluster_copy_upl_data(struct uio
*uio
, upl_t upl
, int upl_offset
, int xsize
)
3863 upl_page_info_t
*pl
;
3864 boolean_t funnel_state
= FALSE
;
3867 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_START
,
3868 (int)uio
->uio_offset
, uio
->uio_resid
, upl_offset
, xsize
, 0);
3870 if (xsize
>= (16 * 1024))
3871 funnel_state
= thread_funnel_set(kernel_flock
, FALSE
);
3873 segflg
= uio
->uio_segflg
;
3878 case UIO_USERISPACE
:
3879 uio
->uio_segflg
= UIO_PHYS_USERSPACE
;
3883 uio
->uio_segflg
= UIO_PHYS_SYSSPACE
;
3886 pl
= ubc_upl_pageinfo(upl
);
3888 pg_index
= upl_offset
/ PAGE_SIZE
;
3889 pg_offset
= upl_offset
& PAGE_MASK
;
3890 csize
= min(PAGE_SIZE
- pg_offset
, xsize
);
3892 while (xsize
&& retval
== 0) {
3895 paddr
= ((addr64_t
)upl_phys_page(pl
, pg_index
) << 12) + pg_offset
;
3897 retval
= uiomove64(paddr
, csize
, uio
);
3902 csize
= min(PAGE_SIZE
, xsize
);
3904 uio
->uio_segflg
= segflg
;
3906 if (funnel_state
== TRUE
)
3907 thread_funnel_set(kernel_flock
, TRUE
);
3909 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_END
,
3910 (int)uio
->uio_offset
, uio
->uio_resid
, retval
, segflg
, 0);
3917 cluster_copy_ubc_data(struct vnode
*vp
, struct uio
*uio
, int *io_resid
, int mark_dirty
)
3925 memory_object_control_t control
;
3926 int op_flags
= UPL_POP_SET
| UPL_POP_BUSY
;
3927 boolean_t funnel_state
= FALSE
;
3930 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_START
,
3931 (int)uio
->uio_offset
, uio
->uio_resid
, 0, *io_resid
, 0);
3933 control
= ubc_getobject(vp
, UBC_FLAGS_NONE
);
3934 if (control
== MEMORY_OBJECT_CONTROL_NULL
) {
3935 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_END
,
3936 (int)uio
->uio_offset
, uio
->uio_resid
, retval
, 3, 0);
3941 op_flags
|= UPL_POP_DIRTY
;
3943 segflg
= uio
->uio_segflg
;
3948 case UIO_USERISPACE
:
3949 uio
->uio_segflg
= UIO_PHYS_USERSPACE
;
3953 uio
->uio_segflg
= UIO_PHYS_SYSSPACE
;
3956 io_size
= *io_resid
;
3957 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
3958 f_offset
= uio
->uio_offset
- start_offset
;
3959 xsize
= min(PAGE_SIZE
- start_offset
, io_size
);
3961 while (io_size
&& retval
== 0) {
3964 if (ubc_page_op_with_control(control
, f_offset
, op_flags
, &pgframe
, 0) != KERN_SUCCESS
)
3967 if (funnel_state
== FALSE
&& io_size
>= (16 * 1024))
3968 funnel_state
= thread_funnel_set(kernel_flock
, FALSE
);
3970 retval
= uiomove64((addr64_t
)(((addr64_t
)pgframe
<< 12) + start_offset
), xsize
, uio
);
3972 ubc_page_op_with_control(control
, f_offset
, UPL_POP_CLR
| UPL_POP_BUSY
, 0, 0);
3976 f_offset
= uio
->uio_offset
;
3977 xsize
= min(PAGE_SIZE
, io_size
);
3979 uio
->uio_segflg
= segflg
;
3980 *io_resid
= io_size
;
3982 if (funnel_state
== TRUE
)
3983 thread_funnel_set(kernel_flock
, TRUE
);
3985 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_END
,
3986 (int)uio
->uio_offset
, uio
->uio_resid
, retval
, 0x80000000 | segflg
, 0);
3993 is_file_clean(struct vnode
*vp
, off_t filesize
)
3997 int total_dirty
= 0;
3999 for (f_offset
= 0; f_offset
< filesize
; f_offset
+= PAGE_SIZE_64
) {
4000 if (ubc_page_op(vp
, f_offset
, 0, 0, &flags
) == KERN_SUCCESS
) {
4001 if (flags
& UPL_POP_DIRTY
) {
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */

/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES		256
/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK		(~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)
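/*
 * Worked example, assuming a 4096-byte PAGE_SIZE: one hashtable entry spans
 * DRT_BITVECTOR_PAGES * PAGE_SIZE == 256 * 4096 == 1 << 20 bytes, which is
 * why DRT_ADDRESS_MASK clears the low 20 bits of a file offset.
 */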
/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
	} while (0)
#define DRT_HASH_CLEAR(scm, i)										\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = 0;						\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
	} while (0)
/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
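/*
 * Sanity arithmetic: dhe_control (8 bytes) plus a 256-bit bitvector
 * (256 / 32 words * 4 bytes == 32 bytes) gives the 40-byte entry quoted
 * above; 23 * 40 == 920 fits the 1024-byte small allocation (104 bytes
 * spare) and 401 * 40 == 16040 fits the 16384-byte large one (344 spare).
 */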
/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */
#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
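/*
 * e.g. page 37 within an entry lives in word 37 / 32 == 1 at bit
 * 37 % 32 == 5, so DRT_HASH_SET_BIT ORs 0x20 into dhe_bitvector[1].
 */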
struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
	u_int32_t	dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};
/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};

#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)
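/*
 * e.g. with the small modulus, a 1 MB-aligned offset of 0x2500000 hashes to
 * 0x2500000 % 23; on a collision the search walks the ring one slot at a
 * time via DRT_HASH_NEXT until a vacant or matching slot is found.
 */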
/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82))	/* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83))	/* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84))	/* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85))	/* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86))	/* offset, length, ... */
								/* 1 (clean, no map) */
								/* 2 (map alloc fail) */
								/* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88))	/* modulus, buckets,
								 * lastclean, iskips */
static void		vfs_drt_sanity(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
			    u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
			    u_int64_t offset, int *indexp, int recursed);
static kern_return_t	vfs_drt_do_mark_pages(void **private, u_int64_t offset,
			    u_int length, int *setcountp, int dirty);
static void		vfs_drt_trace(struct vfs_drt_clustermap *cmap,
			    int code, int arg1, int arg2, int arg3, int arg4);
/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
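/*
 * With the current table sizes that jump is 23 -> 401 buckets, i.e. roughly
 * a factor of 17 per resize.
 */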
4207 static kern_return_t
4208 vfs_drt_alloc_map(struct vfs_drt_clustermap
**cmapp
)
4210 struct vfs_drt_clustermap
*cmap
, *ocmap
;
4213 int nsize
, i
, active_buckets
, index
, copycount
;
4220 * Decide on the size of the new map.
4222 if (ocmap
== NULL
) {
4223 nsize
= DRT_HASH_SMALL_MODULUS
;
4225 /* count the number of active buckets in the old map */
4227 for (i
= 0; i
< ocmap
->scm_modulus
; i
++) {
4228 if (!DRT_HASH_VACANT(ocmap
, i
) &&
4229 (DRT_HASH_GET_COUNT(ocmap
, i
) != 0))
4233 * If we're currently using the small allocation, check to
4234 * see whether we should grow to the large one.
4236 if (ocmap
->scm_modulus
== DRT_HASH_SMALL_MODULUS
) {
4237 /* if the ring is nearly full */
4238 if (active_buckets
> (DRT_HASH_SMALL_MODULUS
- 5)) {
4239 nsize
= DRT_HASH_LARGE_MODULUS
;
4241 nsize
= DRT_HASH_SMALL_MODULUS
;
4244 /* already using the large modulus */
4245 nsize
= DRT_HASH_LARGE_MODULUS
;
4247 * If the ring is completely full, there's
4248 * nothing useful for us to do. Behave as
4249 * though we had compacted into the new
4252 if (active_buckets
>= DRT_HASH_LARGE_MODULUS
)
4253 return(KERN_SUCCESS
);
4258 * Allocate and initialise the new map.
4261 kret
= kmem_alloc(kernel_map
, (vm_offset_t
*)&cmap
,
4262 (nsize
== DRT_HASH_SMALL_MODULUS
) ? DRT_SMALL_ALLOCATION
: DRT_LARGE_ALLOCATION
);
4263 if (kret
!= KERN_SUCCESS
)
4265 cmap
->scm_magic
= DRT_SCM_MAGIC
;
4266 cmap
->scm_modulus
= nsize
;
4267 cmap
->scm_buckets
= 0;
4268 cmap
->scm_lastclean
= 0;
4269 cmap
->scm_iskips
= 0;
4270 for (i
= 0; i
< cmap
->scm_modulus
; i
++) {
4271 DRT_HASH_CLEAR(cmap
, i
);
4272 DRT_HASH_VACATE(cmap
, i
);
4273 DRT_BITVECTOR_CLEAR(cmap
, i
);
4277 * If there's an old map, re-hash entries from it into the new map.
4280 if (ocmap
!= NULL
) {
4281 for (i
= 0; i
< ocmap
->scm_modulus
; i
++) {
4282 /* skip empty buckets */
4283 if (DRT_HASH_VACANT(ocmap
, i
) ||
4284 (DRT_HASH_GET_COUNT(ocmap
, i
) == 0))
4287 offset
= DRT_HASH_GET_ADDRESS(ocmap
, i
);
4288 kret
= vfs_drt_get_index(&cmap
, offset
, &index
, 1);
4289 if (kret
!= KERN_SUCCESS
) {
4290 /* XXX need to bail out gracefully here */
4291 panic("vfs_drt: new cluster map mysteriously too small");
4294 DRT_HASH_COPY(ocmap
, i
, cmap
, index
);
4299 /* log what we've done */
4300 vfs_drt_trace(cmap
, DRT_DEBUG_ALLOC
, copycount
, 0, 0, 0);
4303 * It's important to ensure that *cmapp always points to
4304 * a valid map, so we must overwrite it before freeing
4308 if (ocmap
!= NULL
) {
4309 /* emit stats into trace buffer */
4310 vfs_drt_trace(ocmap
, DRT_DEBUG_SCMDATA
,
4313 ocmap
->scm_lastclean
,
4316 vfs_drt_free_map(ocmap
);
4318 return(KERN_SUCCESS
);
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
	kmem_free(kernel_map, (vm_offset_t)cmap,
	    (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	return(KERN_SUCCESS);
}
/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
	int		index, i, tries;

	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* traverse the hashtable */
	for (i = 0; i < cmap->scm_modulus; i++) {

		/*
		 * If the slot is vacant, we can stop.
		 */
		if (DRT_HASH_VACANT(cmap, index))
			break;

		/*
		 * If the address matches our offset, we have success.
		 */
		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
			*indexp = index;
			return(KERN_SUCCESS);
		}

		/*
		 * Move to the next slot, try again.
		 */
		index = DRT_HASH_NEXT(cmap, index);
	}
	/*
	 * It's not there.
	 */
	return(KERN_FAILURE);
}
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t	kret;
	int		index, i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS)
		return(kret);

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/* slot vacant or unused? */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean)
				cmap->scm_lastclean = index;
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return(KERN_SUCCESS);
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed)
		return(KERN_FAILURE);
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return(kret);
}
/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void		**private,
	u_int64_t	offset,
	u_int		length,
	int		*setcountp,
	int		dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t	kret;
	int		i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL)
		*setcountp = 0;

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return(KERN_SUCCESS);
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return(kret);
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;	/* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL)
				*setcountp = setcount;
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return(kret);
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL)
		*setcountp = setcount;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return(KERN_SUCCESS);
}
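
/*
 * Worked example for the per-entry arithmetic above (values illustrative
 * only): if offset sits 3 pages past its entry's aligned base and 10 pages
 * of length remain, then pgoff = 3 and
 * pgcount = min(10, DRT_BITVECTOR_PAGES - 3), so one pass through the loop
 * touches at most the remainder of that entry's bitvector before offset
 * and length advance to the next entry.
 */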
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
{
	/* XXX size unused, drop from interface */
	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}
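
/*
 * Minimal illustrative call from a hypothetical caller (the names 'priv',
 * 'file_offset' and 'setcount' are examples, not part of this interface):
 * dirtying one page-aligned page and reading back how many pages became
 * newly dirty would look like
 *
 *	void	*priv = NULL;
 *	int	setcount;
 *
 *	(void)vfs_drt_mark_pages(&priv, file_offset, PAGE_SIZE, &setcount);
 *
 * On first use priv starts out NULL and the map is allocated on demand.
 */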
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t	offset;
	u_int		length;
	int		index, i, j, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
			continue;

		/* scan the bitfield for a string of bits */
		fs = -1;

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/* didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map");
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i))
				break;
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return(KERN_SUCCESS);
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * then free it
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		      cmap->scm_modulus,
		      cmap->scm_buckets,
		      cmap->scm_lastclean,
		      cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return(KERN_FAILURE);
}
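
/*
 * Illustrative drain loop for a hypothetical caller (the names 'priv',
 * 'offset', 'length' and the push step are examples only): dirty clusters
 * are typically consumed by calling vfs_drt_get_cluster() until it reports
 * KERN_FAILURE, at which point the private map has already been freed and
 * the stored pointer reset to NULL:
 *
 *	off_t	offset;
 *	u_int	length;
 *
 *	while (vfs_drt_get_cluster(&priv, &offset, &length) == KERN_SUCCESS) {
 *		// write back the pages in [offset, offset + length)
 *	}
 */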
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	switch (op_type) {
	case 0:
		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
			      cmap->scm_modulus,
			      cmap->scm_buckets,
			      cmap->scm_lastclean,
			      cmap->scm_iskips);

		vfs_drt_free_map(cmap);
		*cmapp = NULL;
		break;

	case 1:
		cmap->scm_lastclean = 0;
		break;
	}
	return(KERN_SUCCESS);
}
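
/*
 * Usage note (a reading of the switch above, not a guarantee about all
 * callers): op_type 0 discards the private map entirely, as in
 * vfs_drt_control(&priv, 0) when the backing object is going away, while
 * op_type 1 only rewinds the clean-scan hint so a subsequent walk starts
 * from the beginning of the table.
 */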
/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
static void
vfs_drt_trace(struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index))
			continue;

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i))
				bits_on++;
		}
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
			panic("bits_on = %d,  index = %d\n", bits_on, index);
	}
}