/*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

#include <vm/vm_pageout.h>

#include <sys/kdebug.h>
#define CL_COMMIT       0x04
#define CL_PAGEOUT      0x10
#define CL_NOZERO       0x80
#define CL_PAGEIN       0x100
#define CL_DEV_MEMORY   0x200
#define CL_PRESERVE     0x400
struct clios {
    u_int  io_completed;    /* amount of io that has currently completed */
    u_int  io_issued;       /* amount of io that was successfully issued */
    int    io_error;        /* error code of first error encountered */
    int    io_wanted;       /* someone is sleeping waiting for a change in state */
};
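
/*
 * Illustrative sketch (not part of the original source): a minimal model of
 * how the clios counters are meant to be read.  A stream of asynchronous
 * cluster I/Os bumps io_issued as requests go out and io_completed as they
 * finish; a thread that wants to bound the amount in flight, or to drain the
 * stream, only needs the difference between the two counters.  The helper
 * below is a hypothetical stand-in for that arithmetic; the real code does
 * the same comparison inline before deciding to sleep on io_wanted.
 */
static u_int
clios_bytes_in_flight(const struct clios *cl)
{
    /* io_issued only grows, and io_completed never passes it */
    return (cl->io_issued - cl->io_completed);
}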
static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
                int size, struct buf *bp);
static int  cluster_read_x(struct vnode *vp, struct uio *uio,
                off_t filesize, int devblocksize, int flags);
static int  cluster_write_x(struct vnode *vp, struct uio *uio,
                off_t oldEOF, off_t newEOF, off_t headOff,
                off_t tailOff, int devblocksize, int flags);
static int  cluster_nocopy_read(struct vnode *vp, struct uio *uio,
                off_t filesize, int devblocksize, int flags);
static int  cluster_nocopy_write(struct vnode *vp, struct uio *uio,
                off_t newEOF, int devblocksize, int flags);
static int  cluster_phys_read(struct vnode *vp, struct uio *uio,
                off_t filesize, int devblocksize, int flags);
static int  cluster_phys_write(struct vnode *vp, struct uio *uio,
                off_t newEOF, int devblocksize, int flags);
static int  cluster_align_phys_io(struct vnode *vp, struct uio *uio,
                vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
static int  cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
static int  cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define ASYNC_THROTTLE  9
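
/*
 * Illustrative sketch (not part of the original source): the throttle above
 * is used as a pair of water marks.  Writers sleep once a vnode has
 * ASYNC_THROTTLE or more async writes outstanding, and the I/O completion
 * path wakes them again only after v_numoutput has drained to a third of
 * that.  The two hypothetical predicates below just restate those
 * comparisons; the real code performs them inline around tsleep()/wakeup()
 * on &vp->v_numoutput.
 */
static int
cluster_should_throttle(int v_numoutput)
{
    return (v_numoutput >= ASYNC_THROTTLE);
}

static int
cluster_should_unthrottle(int v_numoutput)
{
    return (v_numoutput <= (ASYNC_THROTTLE / 3));
}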
cluster_iodone(bp)
    struct buf *bp;
{
    struct buf   *cbp_head;
    struct buf   *cbp_next;
    struct clios *iostate;

    cbp_head = (struct buf *)(bp->b_trans_head);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
        (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

    for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
        /*
         * all I/O requests that are part of this transaction
         * have to complete before we can process it
         */
        if ( !(cbp->b_flags & B_DONE)) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
        }
    }
    upl_offset  = cbp->b_uploffset;
    upl         = cbp->b_pagelist;
    b_flags     = cbp->b_flags;
    real_bp     = cbp->b_real_bp;
    zero_offset = cbp->b_validend;
    iostate     = (struct clios *)cbp->b_iostate;

    if (cbp->b_vectorcount > 1)
        _FREE(cbp->b_vectorlist, M_SEGMENT);

    if ((cbp->b_flags & B_ERROR) && error == 0)
        error = cbp->b_error;

    total_resid += cbp->b_resid;
    total_size  += cbp->b_bcount;

    cbp_next = cbp->b_trans_next;

    cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

    if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
        vp->v_flag &= ~VTHROTTLED;
        wakeup((caddr_t)&vp->v_numoutput);
    }
    /*
     * someone has issued multiple I/Os asynchronously
     * and is waiting for them to complete (streaming)
     */
    if (error && iostate->io_error == 0)
        iostate->io_error = error;

    iostate->io_completed += total_size;

    if (iostate->io_wanted) {
        /*
         * someone is waiting for the state of
         * this io stream to change
         */
        iostate->io_wanted = 0;
        wakeup((caddr_t)&iostate->io_wanted);
    }
    if ((b_flags & B_NEED_IODONE) && real_bp) {
        real_bp->b_flags |= B_ERROR;
        real_bp->b_error  = error;

        real_bp->b_resid  = total_resid;
    }
    if (error == 0 && total_resid)

    if (b_flags & B_COMMIT_UPL) {
        pg_offset   = upl_offset & PAGE_MASK;
        commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;

        if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
            if (b_flags & B_PHYS)
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
            else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
            else if (b_flags & B_PGIN)
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
            else
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

            ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
                upl_abort_code);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                (int)upl, upl_offset - pg_offset, commit_size,
                0x80000000|upl_abort_code, 0);
        } else {
            int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

            if (b_flags & B_PHYS)
                upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
            else if ( !(b_flags & B_PAGEOUT))
                upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;

            upl_commit_flags |= UPL_COMMIT_INACTIVATE;

            ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
                upl_commit_flags);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                (int)upl, upl_offset - pg_offset, commit_size,
                upl_commit_flags, 0);
        }
    } else {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
            (int)upl, upl_offset, 0, error, 0);
    }
}
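
/*
 * Illustrative sketch (not part of the original source): cluster_iodone
 * commits or aborts whole pages, so it rounds the transaction's byte count
 * up to a page multiple after folding in the offset of the I/O within its
 * first page.  The hypothetical helper below is just that arithmetic from
 * the commit path, written out on its own (4096 is assumed here as the page
 * size purely for the example).
 */
static unsigned int
sketch_commit_size(unsigned int upl_offset, unsigned int total_size)
{
    unsigned int page_size = 4096;                  /* assumed for the sketch */
    unsigned int pg_offset = upl_offset & (page_size - 1);

    /* same rounding as (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE */
    return ((((pg_offset + total_size) + (page_size - 1)) / page_size) * page_size);
}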
cluster_zero(upl, upl_offset, size, bp)
    upl_t        upl;
    vm_offset_t  upl_offset;
    int          size;
    struct buf  *bp;
{
    vm_offset_t  io_addr = 0;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
        upl_offset, size, (int)bp, 0, 0);

    if (bp == NULL || bp->b_data == NULL) {
        kret = ubc_upl_map(upl, &io_addr);

        if (kret != KERN_SUCCESS)
            panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
        if (io_addr == 0)
            panic("cluster_zero: ubc_upl_map() mapped 0");
    } else
        io_addr = (vm_offset_t)bp->b_data;

    bzero((caddr_t)(io_addr + upl_offset), size);

    kret = ubc_upl_unmap(upl);

    if (kret != KERN_SUCCESS)
        panic("cluster_zero: kernel_upl_unmap failed");
}
cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
    vm_offset_t   upl_offset;
    int           non_rounded_size;
    struct clios *iostate;
{
    struct buf   *cbp_head = 0;
    struct buf   *cbp_tail = 0;

    if (flags & CL_READ) {
        io_flags = (B_VECTORLIST | B_READ);

        vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
    } else {
        io_flags = (B_VECTORLIST | B_WRITEINPROG);

        vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
    }
    pl = ubc_upl_pageinfo(upl);

    io_flags |= B_NOCACHE;
    if (flags & CL_PAGEIN)
        io_flags |= B_PGIN;
    if (flags & CL_PAGEOUT)
        io_flags |= B_PAGEOUT;
    if (flags & CL_COMMIT)
        io_flags |= B_COMMIT_UPL;
    if (flags & CL_PRESERVE)
        io_flags |= B_PHYS;

    size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);

    size = non_rounded_size;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
        (int)f_offset, size, upl_offset, flags, 0);

    if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
        /*
         * then we are going to end up
         * with a page that we can't complete (the file size wasn't a multiple
         * of PAGE_SIZE and we're trying to read to the end of the file
         * so we'll go ahead and zero out the portion of the page we can't
         * read in from the file
         */
        zero_offset = upl_offset + non_rounded_size;
    }
    if (size > max_iosize)
        io_size = max_iosize;
    else
        io_size = size;

    if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
        if (error == EOPNOTSUPP)
            panic("VOP_CMAP Unimplemented");
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
        (int)f_offset, (int)blkno, io_size, zero_offset, 0);

    if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
        if (flags & CL_PAGEOUT) {
            /* Try paging out the page individually before
               giving up entirely and dumping it (it could
               be mapped in a "hole" and require allocation) */
            ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
            if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
            }
        }
        upl_offset += PAGE_SIZE_64;
        f_offset   += PAGE_SIZE_64;
        size       -= PAGE_SIZE_64;
    }
    lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
    /*
     * we have now figured out how much I/O we can do - this is in 'io_size'
     * pl_index represents the first page in the 'upl' that the I/O will occur for
     * pg_offset is the starting point in the first page for the I/O
     * pg_count is the number of full and partial pages that 'io_size' encompasses
     */
    pl_index  = upl_offset / PAGE_SIZE;
    pg_offset = upl_offset & PAGE_MASK;
    pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

    if (flags & CL_DEV_MEMORY) {
        /*
         * currently, can't deal with reading 'holes' in file
         */
        if ((long)blkno == -1) {
        }
        /*
         * treat physical requests as one 'giant' page
         */
    }
    if ((flags & CL_READ) && (long)blkno == -1) {
        /*
         * if we're reading and blkno == -1, then we've got a
         * 'hole' in the file that we need to deal with by zeroing
         * out the affected area in the upl
         */
        if (zero_offset && io_size == size) {
            /*
             * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
             * then 'zero_offset' will be non-zero
             * if the 'hole' returned by VOP_CMAP extends all the way to the eof
             * (indicated by the io_size finishing off the I/O request for this UPL)
             * then we're not going to issue an I/O for the
             * last page in this upl... we need to zero both the hole and the tail
             * of the page beyond the EOF, since the delayed zero-fill won't kick in
             */
            bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
        } else
            bytes_to_zero = io_size;

        cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

        /*
         * if there is a current I/O chain pending
         * then the first page of the group we just zero'd
         * will be handled by the I/O completion if the zero
         * fill started in the middle of the page
         */
        pg_count = (io_size - pg_offset) / PAGE_SIZE;
        /*
         * no pending I/O to pick up that first page
         * so, we have to make sure it gets committed
         * set the pg_offset to 0 so that the upl_commit_range
         * starts with this page
         */
        pg_count = (io_size + pg_offset) / PAGE_SIZE;

        if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
            /*
             * if we're done with the request for this UPL
             * then we have to make sure to commit the last page
             * even if we only partially zero-filled it
             */
        pg_resid = PAGE_SIZE - pg_offset;

        if (flags & CL_COMMIT)
            ubc_upl_commit_range(upl,
                (upl_offset + pg_resid) & ~PAGE_MASK,
                pg_count * PAGE_SIZE,
                UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);

        upl_offset += io_size;

        if (cbp_head && pg_count)

    } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
        real_bp->b_blkno = blkno;
    }
    if (pg_count > max_vectors) {
        io_size -= (pg_count - max_vectors) * PAGE_SIZE;

        io_size  = PAGE_SIZE - pg_offset;

        pg_count = max_vectors;
    }
    /*
     * we need to allocate space for the vector list
     */
    iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
                M_SEGMENT, M_NOWAIT);

    if (iovp == (struct iovec *) 0) {
        /*
         * if the allocation fails, then throttle down to a single page
         */
        io_size = PAGE_SIZE - pg_offset;
    }
    /* Throttle the speculative IO */
    if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
        priv = 0;
    else
        priv = 1;

    cbp = alloc_io_buf(vp, priv);

    /*
     * we use the io vector that's reserved in the buffer header
     * this insures we can always issue an I/O even in a low memory
     * condition that prevents the _MALLOC from succeeding... this
     * is necessary to prevent deadlocks with the pager
     */
    iovp = (struct iovec *)(&cbp->b_vects[0]);

    cbp->b_vectorlist  = (void *)iovp;
    cbp->b_vectorcount = pg_count;

    if (flags & CL_DEV_MEMORY) {
        iovp->iov_len  = io_size;
        iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);

        if (iovp->iov_base == (caddr_t) 0) {
        }
        iovp->iov_base += upl_offset;
    }
    for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
        psize = PAGE_SIZE - pg_offset;

        iovp->iov_len  = psize;
        iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);

        if (iovp->iov_base == (caddr_t) 0) {
            _FREE(cbp->b_vectorlist, M_SEGMENT);
        }
        iovp->iov_base += pg_offset;

        if (flags & CL_PAGEOUT) {
            if (bp = incore(vp, lblkno + i)) {
                if (!ISSET(bp->b_flags, B_BUSY)) {
                    SET(bp->b_flags, (B_BUSY | B_INVAL));
                } else
                    panic("BUSY bp found in cluster_io");
            }
        }
    }
    if (flags & CL_ASYNC) {
        cbp->b_flags |= (B_CALL | B_ASYNC);
        cbp->b_iodone = (void *)cluster_iodone;
    }
    cbp->b_flags |= io_flags;

    cbp->b_lblkno     = lblkno;
    cbp->b_blkno      = blkno;
    cbp->b_bcount     = io_size;
    cbp->b_pagelist   = upl;
    cbp->b_uploffset  = upl_offset;
    cbp->b_trans_next = (struct buf *)0;

    if (cbp->b_iostate = (void *)iostate)
        /*
         * caller wants to track the state of this
         * io... bump the amount issued against this stream
         */
        iostate->io_issued += io_size;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
        cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
        cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

    cbp_tail->b_trans_next = cbp;

    (struct buf *)(cbp->b_trans_head) = cbp_head;

    upl_offset += io_size;
    if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
        /*
         * if we have no more I/O to issue or
         * the current I/O we've prepared fully
         * completes the last page in this request
         * and it's either an ASYNC request or
         * we've already accumulated more than 8 I/O's into
         * this transaction and it's not an I/O directed to
         * special DEVICE memory
         * then go ahead and issue the I/O
         */
        cbp_head->b_flags |= B_NEED_IODONE;
        cbp_head->b_real_bp = real_bp;

        cbp_head->b_real_bp = (struct buf *)NULL;

        /*
         * we're about to issue the last I/O for this upl
         * if this was a read to the eof and the eof doesn't
         * finish on a page boundary, then we need to zero-fill
         * the rest of the page....
         */
        cbp_head->b_validend = zero_offset;

        cbp_head->b_validend = 0;

        for (cbp = cbp_head; cbp;) {
            struct buf * cbp_next;

            if (io_flags & B_WRITEINPROG)
                cbp->b_vp->v_numoutput++;

            cbp_next = cbp->b_trans_next;

            (void) VOP_STRATEGY(cbp);
        }
        if ( !(flags & CL_ASYNC)) {
            for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
                biowait(cbp);

            if (error = cluster_iodone(cbp_head)) {
                if ((flags & CL_PAGEOUT) && (error == ENXIO))
                    retval = 0;     /* drop the error */
            }
        }
        cbp_head = (struct buf *)0;
        cbp_tail = (struct buf *)0;
    }

    for (cbp = cbp_head; cbp;) {
        struct buf * cbp_next;

        if (cbp->b_vectorcount > 1)
            _FREE(cbp->b_vectorlist, M_SEGMENT);
        upl_offset -= cbp->b_bcount;
        size       += cbp->b_bcount;
        io_size    += cbp->b_bcount;

        cbp_next = cbp->b_trans_next;
    }
    /*
     * update the error condition for this stream
     * since we never really issued the io
     * just go ahead and adjust it back
     */
    if (iostate->io_error == 0)
        iostate->io_error = error;
    iostate->io_issued -= io_size;

    if (iostate->io_wanted) {
        /*
         * someone is waiting for the state of
         * this io stream to change
         */
        iostate->io_wanted = 0;
        wakeup((caddr_t)&iostate->io_wanted);
    }
    pg_offset  = upl_offset & PAGE_MASK;
    abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;

    if (flags & CL_COMMIT) {
        if (flags & CL_PRESERVE)
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
        else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
        else if (flags & CL_PAGEIN)
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
        else
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

        ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
            upl_abort_code);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
            (int)upl, upl_offset - pg_offset, abort_size, error, 0);
    }
    real_bp->b_flags |= B_ERROR;
    real_bp->b_error  = error;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
        (int)f_offset, size, upl_offset, retval, 0);
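
/*
 * Illustrative sketch (not part of the original source): cluster_io first
 * rounds the caller's transfer up to a device-block multiple, then derives
 * the page geometry it will hand to the UPL machinery: the index of the
 * first page touched, the offset within that page, and the number of full
 * or partial pages covered.  The helper below restates that arithmetic with
 * hypothetical names; page_size stands in for PAGE_SIZE.
 */
static void
sketch_io_geometry(unsigned int upl_offset, unsigned int io_size,
        unsigned int devblocksize, unsigned int page_size,
        unsigned int *pl_index, unsigned int *pg_offset,
        unsigned int *pg_count, unsigned int *rounded_size)
{
    *rounded_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
    *pl_index     = upl_offset / page_size;
    *pg_offset    = upl_offset & (page_size - 1);
    *pg_count     = (io_size + *pg_offset + (page_size - 1)) / page_size;
}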
cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
{
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
        (int)f_offset, size, (int)filesize, 0, 0);

    if (f_offset >= filesize) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
            (int)f_offset, 0, 0, 0, 0);
    }
    if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
        size = MAX_UPL_TRANSFER * PAGE_SIZE;

    size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);

    if ((off_t)size > (filesize - f_offset))
        size = filesize - f_offset;

    pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

    for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
        if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)

        f_offset += PAGE_SIZE;
    }
    if (skipped_pages < pages_to_fetch)
        advisory_read(vp, filesize, f_offset, size, devblocksize);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
        (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);

    return (pages_to_fetch);
}
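
/*
 * Illustrative sketch (not part of the original source): cluster_rd_prefetch
 * clips the requested prefetch three ways before issuing the advisory read:
 * to the maximum UPL transfer, up to a whole page, and down to what is left
 * of the file.  The hypothetical helper below performs the same clamping and
 * returns the resulting page count; max_upl_bytes and page_size stand in for
 * MAX_UPL_TRANSFER * PAGE_SIZE and PAGE_SIZE.
 */
static int
sketch_prefetch_pages(long long f_offset, long long filesize,
        long long size, long long max_upl_bytes, long long page_size)
{
    if (f_offset >= filesize)
        return (0);
    if (size > max_upl_bytes)
        size = max_upl_bytes;
    size = (size + (page_size - 1)) & ~(page_size - 1);
    if (size > (filesize - f_offset))
        size = filesize - f_offset;

    return ((int)((size + (page_size - 1)) / page_size));
}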
cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
{
    int size_of_prefetch;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
        b_lblkno, e_lblkno, vp->v_lastr, 0, 0);

    if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
            vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
    }
    if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
        (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
            vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
    }
    max_pages = MAX_UPL_TRANSFER;

    vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;

    if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
        vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);

    if (e_lblkno < vp->v_maxra) {
        if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
        }
    }
    r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
    f_offset = (off_t)r_lblkno * PAGE_SIZE_64;

    if (f_offset < filesize) {
        size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);

        if (size_of_prefetch)
            vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
        vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
}
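
/*
 * Illustrative sketch (not part of the original source): the read-ahead
 * window in cluster_rd_ahead doubles on each sequential hit, is capped at
 * MAX_UPL_TRANSFER pages, and is widened immediately if the current request
 * already spans more pages than the window.  The hypothetical helper below
 * models just that growth rule on plain integers.
 */
static int
sketch_grow_ralen(int v_ralen, int b_lblkno, int e_lblkno, int max_pages)
{
    int span = (e_lblkno + 1) - b_lblkno;

    v_ralen = v_ralen ? (v_ralen << 1) : 1;     /* double, or start at one page */
    if (v_ralen > max_pages)
        v_ralen = max_pages;
    if (span > v_ralen)
        v_ralen = (span < max_pages) ? span : max_pages;

    return (v_ralen);
}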
cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
    vm_offset_t upl_offset;
{
    int local_flags = CL_PAGEOUT;

    if ((flags & UPL_IOSYNC) == 0)
        local_flags |= CL_ASYNC;
    if ((flags & UPL_NOCOMMIT) == 0)
        local_flags |= CL_COMMIT;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
        (int)f_offset, size, (int)filesize, local_flags, 0);

    /*
     * If they didn't specify any I/O, then we are done...
     * we can't issue an abort because we don't know how
     * big the upl really is
     */
    if (vp->v_mount->mnt_flag & MNT_RDONLY) {
        if (local_flags & CL_COMMIT)
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
    }
    /*
     * can't page-in from a negative offset
     * or if we're starting beyond the EOF
     * or if the file offset isn't page aligned
     * or the size requested isn't a multiple of PAGE_SIZE
     */
    if (f_offset < 0 || f_offset >= filesize ||
        (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
        if (local_flags & CL_COMMIT)
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
    }
    max_size = filesize - f_offset;

    pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

    if (size > pg_size) {
        if (local_flags & CL_COMMIT)
            ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
                UPL_ABORT_FREE_ON_EMPTY);
    }
    while (vp->v_numoutput >= ASYNC_THROTTLE) {
        vp->v_flag |= VTHROTTLED;
        tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
    }
    return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
        local_flags, (struct buf *)0, (struct clios *)0));
}
cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
    vm_offset_t upl_offset;
{
    int local_flags = 0;

    if (upl == NULL || size < 0)
        panic("cluster_pagein: NULL upl passed in");

    if ((flags & UPL_IOSYNC) == 0)
        local_flags |= CL_ASYNC;
    if ((flags & UPL_NOCOMMIT) == 0)
        local_flags |= CL_COMMIT;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
        (int)f_offset, size, (int)filesize, local_flags, 0);

    /*
     * can't page-in from a negative offset
     * or if we're starting beyond the EOF
     * or if the file offset isn't page aligned
     * or the size requested isn't a multiple of PAGE_SIZE
     */
    if (f_offset < 0 || f_offset >= filesize ||
        (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
        if (local_flags & CL_COMMIT)
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
    }
    max_size = filesize - f_offset;

    if (size < max_size)
        io_size = size;
    else
        io_size = max_size;

    rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

    if (size > rounded_size && (local_flags & CL_COMMIT))
        ubc_upl_abort_range(upl, upl_offset + rounded_size,
            size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

    retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
        local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);

    b_lblkno = (int)(f_offset / PAGE_SIZE_64);
    e_lblkno = (int)
        ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);

    if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
        /*
         * we haven't read the last page in of the file yet
         * so let's try to read ahead if we're in
         * a sequential access pattern
         */
        cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
    }
    vp->v_lastr = e_lblkno;
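
/*
 * Illustrative sketch (not part of the original source): before deciding
 * whether to read ahead, cluster_pagein converts the byte range it just
 * issued into first/last page indices.  The helper below is the same
 * conversion with hypothetical names; page_size stands in for PAGE_SIZE_64.
 */
static void
sketch_page_span(long long f_offset, long long io_size, long long page_size,
        int *b_lblkno, int *e_lblkno)
{
    *b_lblkno = (int)(f_offset / page_size);
    *e_lblkno = (int)((f_offset + (io_size - 1)) / page_size);
}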
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
        (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

    if (bp->b_pagelist == (upl_t) 0)
        panic("cluster_bp: can't handle NULL upl yet\n");
    if (bp->b_flags & B_READ)
        flags = CL_ASYNC | CL_READ;

    f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

    return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
    vm_offset_t upl_offset;
    upl_page_info_t *pl;
{
    if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))

        retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);

    while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
    {
        /* we know we have a resid, so this is safe */
        iov = uio->uio_iov;

        while (iov->iov_len == 0) {
        }
        /*
         * We check every vector target and if it is physically
         * contiguous space, we skip the sanity checks.
         */
        upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
        upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE-1)) & ~PAGE_MASK;

        upl_flags = UPL_QUERY_OBJECT_TYPE;
        if ((vm_map_get_upl(current_map(),
                (vm_offset_t)iov->iov_base & ~PAGE_MASK,
                &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
        {
            /*
             * the user app must have passed in an invalid address
             */
        }
        if (upl_flags & UPL_PHYS_CONTIG)
        {
            if (flags & IO_HEADZEROFILL)
            {
                flags &= ~IO_HEADZEROFILL;

                if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
            }
            retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);

            if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
            {
                retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
            }
        }
        else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
        {
            /*
             * We set a threshold of 4 pages to decide if the nocopy
             * write loop is worth the trouble...
             * we also come here if we're trying to zero the head and/or tail
             * of a partially written page, and the user source is not a physically contiguous region
             */
            retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
        }
        else if (uio->uio_offset & PAGE_MASK_64)
        {
            /* Bring the file offset write up to a pagesize boundary */
            clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
            if (uio->uio_resid < clip_size)
                clip_size = uio->uio_resid;
            /*
             * Fake the resid going into the cluster_write_x call
             * and restore it on the way out.
             */
            prev_resid = uio->uio_resid;
            uio->uio_resid = clip_size;
            retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
            uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
        }
        else if ((int)iov->iov_base & PAGE_MASK_64)
        {
            clip_size = iov->iov_len;
            prev_resid = uio->uio_resid;
            uio->uio_resid = clip_size;
            retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
            uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
        }
        else
        {
            /*
             * If we come in here, we know the offset into
             * the file is on a pagesize boundary
             */
            max_io_size = newEOF - uio->uio_offset;
            clip_size = uio->uio_resid;
            if (iov->iov_len < clip_size)
                clip_size = iov->iov_len;
            if (max_io_size < clip_size)
                clip_size = max_io_size;

            if (clip_size < PAGE_SIZE)
            {
                /*
                 * Take care of tail end of write in this vector
                 */
                prev_resid = uio->uio_resid;
                uio->uio_resid = clip_size;
                retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
                uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
            }
            else
            {
                /* round clip_size down to a multiple of pagesize */
                clip_size = clip_size & ~(PAGE_MASK);
                prev_resid = uio->uio_resid;
                uio->uio_resid = clip_size;
                retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
                if ((retval == 0) && uio->uio_resid)
                    retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
                uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
            }
        }
    }
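
/*
 * Illustrative sketch (not part of the original source): cluster_write's
 * front end repeatedly "fakes" uio_resid down to a clip_size before calling
 * one of the worker routines, then restores it so that only the bytes the
 * worker actually consumed are charged against the caller's request.  The
 * helper below shows that bookkeeping in isolation: given the resid before
 * the call and the resid the worker left behind, it returns what the
 * caller's resid should become.
 */
static int
sketch_restore_resid(int prev_resid, int clip_size, int resid_after_call)
{
    /* same as: uio->uio_resid = prev_resid - (clip_size - uio->uio_resid) */
    return (prev_resid - (clip_size - resid_after_call));
}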
static int
cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
{
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;
    int              upl_needed_size;
    int              force_data_sync;
    struct clios     iostate;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
        (int)uio->uio_offset, (int)uio->uio_resid,
        (int)newEOF, devblocksize, 0);

    /*
     * When we enter this routine, we know
     *  -- the offset into the file is on a pagesize boundary
     *  -- the resid is a page multiple
     *  -- the resid will not exceed iov_len
     */
    cluster_try_push(vp, newEOF, 0, 1);

    iostate.io_completed = 0;
    iostate.io_issued = 0;
    iostate.io_error = 0;
    iostate.io_wanted = 0;

    while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
        io_size = uio->uio_resid;

        if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
            io_size = MAX_UPL_TRANSFER * PAGE_SIZE;

        if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
            io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;

        upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
        upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
            (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

        for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
            upl_size = upl_needed_size;
            upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
                UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;

            kret = vm_map_get_upl(current_map(),
                (vm_offset_t)iov->iov_base & ~PAGE_MASK,

            if (kret != KERN_SUCCESS) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                /*
                 * cluster_nocopy_write: failed to get pagelist
                 *
                 * we may have already spun some portion of this request
                 * off as async requests... we need to wait for the I/O
                 * to complete before returning
                 */
                goto wait_for_writes;
            }
            pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
            pages_in_pl = upl_size / PAGE_SIZE;

            for (i = 0; i < pages_in_pl; i++) {
                if (!upl_valid_page(pl, i))
            }
            if (i == pages_in_pl)

            /*
             * didn't get all the pages back that we
             * needed... release this upl and try again
             */
            ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
                UPL_ABORT_FREE_ON_EMPTY);
        }
        if (force_data_sync >= 3) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                i, pages_in_pl, upl_size, kret, 0);
            /*
             * for some reason, we couldn't acquire a hold on all
             * the pages needed in the user's address space
             *
             * we may have already spun some portion of this request
             * off as async requests... we need to wait for the I/O
             * to complete before returning
             */
            goto wait_for_writes;
        }
        /*
         * Consider the possibility that upl_size wasn't satisfied.
         */
        if (upl_size != upl_needed_size)
            io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
            (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);

        ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
            UPL_ABORT_FREE_ON_EMPTY);
        /*
         * we may have already spun some portion of this request
         * off as async requests... we need to wait for the I/O
         * to complete before returning
         */
        goto wait_for_writes;

        /*
         * Now look for pages already in the cache
         * and throw them away.
         */
        upl_f_offset = uio->uio_offset;   /* this is page aligned in the file */
        max_io_size = io_size;

        while (max_io_size) {
            /*
             * Flag UPL_POP_DUMP says if the page is found
             * in the page cache it must be thrown away.
             */
            ubc_page_op(vp, upl_f_offset,
                UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
                0, 0);
            max_io_size  -= PAGE_SIZE_64;
            upl_f_offset += PAGE_SIZE_64;
        }
        /*
         * we want to push out these writes asynchronously so that we can overlap
         * the preparation of the next I/O
         * if there are already too many outstanding writes
         * wait until some complete before issuing the next
         */
        while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
            iostate.io_wanted = 1;
            tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
        }
        if (iostate.io_error) {
            /*
             * one of the earlier writes we issued ran into a hard error
             * don't issue any more writes, cleanup the UPL
             * that was just created but not used, then
             * go wait for all writes that are part of this stream
             * to complete before returning the error to the caller
             */
            ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
                UPL_ABORT_FREE_ON_EMPTY);

            goto wait_for_writes;
        }
        io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
            (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

        error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
            io_size, devblocksize, io_flag, (struct buf *)0, &iostate);

        iov->iov_len    -= io_size;
        iov->iov_base   += io_size;
        uio->uio_resid  -= io_size;
        uio->uio_offset += io_size;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
            (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
    }

wait_for_writes:
    /*
     * make sure all async writes issued as part of this stream
     * have completed before we return
     */
    while (iostate.io_issued != iostate.io_completed) {
        iostate.io_wanted = 1;
        tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
    }
    if (iostate.io_error)
        error = iostate.io_error;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
        (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
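
/*
 * Illustrative sketch (not part of the original source): cluster_nocopy_write
 * lets asynchronous writes run ahead of the caller, but only up to twice the
 * maximum UPL transfer; past that it sets io_wanted and sleeps until the
 * completion path catches up.  The hypothetical predicate below is that gate,
 * with max_upl_bytes standing in for MAX_UPL_TRANSFER * PAGE_SIZE.
 */
static int
sketch_must_wait_for_writes(const struct clios *cl, unsigned int max_upl_bytes)
{
    return ((cl->io_issued - cl->io_completed) > (2 * max_upl_bytes));
}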
static int
cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
{
    upl_page_info_t *pl;
    vm_offset_t      src_paddr;
    vm_offset_t      upl_offset;
    int              upl_needed_size;

    /*
     * When we enter this routine, we know
     *  -- the resid will not exceed iov_len
     *  -- the vector target address is physically contiguous
     */
    cluster_try_push(vp, newEOF, 0, 1);

    io_size = iov->iov_len;
    upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
    upl_needed_size = upl_offset + io_size;

    upl_size = upl_needed_size;
    upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
        UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;

    kret = vm_map_get_upl(current_map(),
        (vm_offset_t)iov->iov_base & ~PAGE_MASK,
        &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

    if (kret != KERN_SUCCESS) {
        /*
         * cluster_phys_write: failed to get pagelist
         * note: return kret here
         */
    }
    /*
     * Consider the possibility that upl_size wasn't satisfied.
     * This is a failure in the physical memory case.
     */
    if (upl_size < upl_needed_size) {
        kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
    }
    pl = ubc_upl_pageinfo(upl);

    src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);

    while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
        head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

        if (head_size > io_size)
            head_size = io_size;

        error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);

        ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

        upl_offset += head_size;
        src_paddr  += head_size;
        io_size    -= head_size;
    }
    tail_size = io_size & (devblocksize - 1);
    io_size  -= tail_size;

    /*
     * issue a synchronous write to cluster_io
     */
    error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
        io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);

    /*
     * The cluster_io write completed successfully,
     * update the uio structure
     */
    uio->uio_resid  -= io_size;
    iov->iov_len    -= io_size;
    iov->iov_base   += io_size;
    uio->uio_offset += io_size;
    src_paddr       += io_size;

    error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);

    /*
     * just release our hold on the physically contiguous
     * region without changing any state
     */
    ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
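
/*
 * Illustrative sketch (not part of the original source): cluster_phys_write
 * peels off a "head" fragment until the file offset is device-block aligned
 * and a "tail" fragment so the middle of the transfer is a whole number of
 * device blocks; only that middle goes through the CL_DEV_MEMORY path.  The
 * hypothetical helper below computes those two fragment sizes.
 */
static void
sketch_split_unaligned(long long uio_offset, int io_size, int devblocksize,
        int *head_size, int *tail_size)
{
    *head_size = 0;
    if (uio_offset & (devblocksize - 1)) {
        *head_size = devblocksize - (int)(uio_offset & (devblocksize - 1));
        if (*head_size > io_size)
            *head_size = io_size;
    }
    *tail_size = (io_size - *head_size) & (devblocksize - 1);
}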
static int
cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
{
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;
    vm_offset_t      io_address;
    long long        total_size;
    long long        zero_cnt1;
    daddr_t          start_blkno;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
        (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);

    uio_resid = uio->uio_resid;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
        0, 0, (int)oldEOF, (int)newEOF, 0);

    if (flags & IO_HEADZEROFILL) {
        /*
         * some filesystems (HFS is one) don't support unallocated holes within a file...
         * so we zero fill the intervening space between the old EOF and the offset
         * where the next chunk of real data begins.... ftruncate will also use this
         * routine to zero fill to the new EOF when growing a file... in this case, the
         * uio structure will not be provided
         */
        if (headOff < uio->uio_offset) {
            zero_cnt = uio->uio_offset - headOff;
        } else if (headOff < newEOF) {
            zero_cnt = newEOF - headOff;
        }
    }
    if (flags & IO_TAILZEROFILL) {
        zero_off1 = uio->uio_offset + uio->uio_resid;

        if (zero_off1 < tailOff)
            zero_cnt1 = tailOff - zero_off1;
    }
    if (zero_cnt == 0 && uio == (struct uio *) 0)
    {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
            retval, 0, 0, 0, 0);
    }
    while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
        /*
         * for this iteration of the loop, figure out where our starting point is
         */
        start_offset = (int)(zero_off & PAGE_MASK_64);
        upl_f_offset = zero_off - start_offset;

        } else if (uio_resid) {
            start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
            upl_f_offset = uio->uio_offset - start_offset;
        } else {
            start_offset = (int)(zero_off1 & PAGE_MASK_64);
            upl_f_offset = zero_off1 - start_offset;
        }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
            (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);

        if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
            total_size = MAX_UPL_TRANSFER * PAGE_SIZE;

        /*
         * compute the size of the upl needed to encompass
         * the requested write... limit each call to cluster_io
         * to the maximum UPL size... cluster_io will clip if
         * this exceeds the maximum io_size for the device,
         * make sure to account for
         * a starting offset that's not page aligned
         */
        upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
            upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;

        pages_in_upl = upl_size / PAGE_SIZE;
        io_size = upl_size - start_offset;

        if ((long long)io_size > total_size)
            io_size = total_size;

        start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
        last_blkno  = start_blkno + pages_in_upl;

        kret = ubc_create_upl(vp,

        if (kret != KERN_SUCCESS)
            panic("cluster_write: failed to get pagelist");

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
            (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

        if (start_offset && !upl_valid_page(pl, 0)) {
            /*
             * we're starting in the middle of the first page of the upl
             * and the page isn't currently valid, so we're going to have
             * to read it in first... this is a synchronous operation
             */
            read_size = PAGE_SIZE;

            if ((upl_f_offset + read_size) > newEOF)
                read_size = newEOF - upl_f_offset;

            retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
                CL_READ, (struct buf *)0, (struct clios *)0);

            /*
             * we had an error during the read which causes us to abort
             * the current cluster_write request... before we do, we need
             * to release the rest of the pages in the upl without modifying
             * their state and mark the failed page in error
             */
            ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
            ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                (int)upl, 0, 0, retval, 0);
        }
        if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
            /*
             * the last offset we're writing to in this upl does not end on a page
             * boundary... if it's not beyond the old EOF, then we'll also need to
             * pre-read this page in if it isn't already valid
             */
            upl_offset = upl_size - PAGE_SIZE;

            if ((upl_f_offset + start_offset + io_size) < oldEOF &&
                !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {

                read_size = PAGE_SIZE;

                if ((upl_f_offset + upl_offset + read_size) > newEOF)
                    read_size = newEOF - (upl_f_offset + upl_offset);

                retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
                    CL_READ, (struct buf *)0, (struct clios *)0);

                /*
                 * we had an error during the read which causes us to abort
                 * the current cluster_write request... before we do, we
                 * need to release the rest of the pages in the upl without
                 * modifying their state and mark the failed page in error
                 */
                ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
                ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                    (int)upl, 0, 0, retval, 0);
            }
        }
        if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
            panic("cluster_write: ubc_upl_map failed\n");
        xfer_resid = io_size;
        io_offset = start_offset;
        while (zero_cnt && xfer_resid) {
            if (zero_cnt < (long long)xfer_resid)
                bytes_to_zero = zero_cnt;
            else
                bytes_to_zero = xfer_resid;

            if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
                bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                    (int)upl_f_offset + io_offset, bytes_to_zero,
                    (int)io_offset, xfer_resid, 0);
            } else {
                bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
                zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);

                if ( !upl_valid_page(pl, zero_pg_index)) {
                    bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                        (int)upl_f_offset + io_offset, bytes_to_zero,
                        (int)io_offset, xfer_resid, 0);
                } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
                    !upl_dirty_page(pl, zero_pg_index)) {
                    bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                        (int)upl_f_offset + io_offset, bytes_to_zero,
                        (int)io_offset, xfer_resid, 0);
                }
            }
            xfer_resid -= bytes_to_zero;
            zero_cnt   -= bytes_to_zero;
            zero_off   += bytes_to_zero;
            io_offset  += bytes_to_zero;
        }
        if (xfer_resid && uio_resid) {
            bytes_to_move = min(uio_resid, xfer_resid);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
                (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);

            retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);

            if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
                panic("cluster_write: kernel_upl_unmap failed\n");

            ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                (int)upl, 0, 0, retval, 0);

            uio_resid  -= bytes_to_move;
            xfer_resid -= bytes_to_move;
            io_offset  += bytes_to_move;
        }
        while (xfer_resid && zero_cnt1 && retval == 0) {
            if (zero_cnt1 < (long long)xfer_resid)
                bytes_to_zero = zero_cnt1;
            else
                bytes_to_zero = xfer_resid;

            if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
                bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                    (int)upl_f_offset + io_offset,
                    bytes_to_zero, (int)io_offset, xfer_resid, 0);
            } else {
                bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
                zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);

                if ( !upl_valid_page(pl, zero_pg_index)) {
                    bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                        (int)upl_f_offset + io_offset,
                        bytes_to_zero, (int)io_offset, xfer_resid, 0);
                } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
                    !upl_dirty_page(pl, zero_pg_index)) {
                    bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                        (int)upl_f_offset + io_offset,
                        bytes_to_zero, (int)io_offset, xfer_resid, 0);
                }
            }
            xfer_resid -= bytes_to_zero;
            zero_cnt1  -= bytes_to_zero;
            zero_off1  += bytes_to_zero;
            io_offset  += bytes_to_zero;
        }
        io_size += start_offset;

        if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
            /*
             * if we're extending the file with this write
             * we'll zero fill the rest of the page so that
             * if the file gets extended again in such a way as to leave a
             * hole starting at this EOF, we'll have zero's in the correct spot
             */
            bzero((caddr_t)(io_address + io_size), upl_size - io_size);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                (int)upl_f_offset + io_size,
                upl_size - io_size, 0, 0, 0);
        }
        if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
            panic("cluster_write: kernel_upl_unmap failed\n");

        if (flags & IO_SYNC)
            /*
             * if the IO_SYNC flag is set then we need to
             * bypass any clusters and immediately issue
             */

        if (vp->v_clen == 0)
            /*
             * no clusters currently present
             */
            goto start_new_cluster;

        /*
         * keep track of the overall dirty page
         * range we've developed
         * in case we have to fall back to the
         * VHASDIRTY method of flushing
         */
        if (vp->v_flag & VHASDIRTY)

        for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
            /*
             * we have an existing cluster... see if this write will extend it nicely
             */
            if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
                /*
                 * the current write starts at or after the current cluster
                 */
                if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
                    /*
                     * we have a write that fits entirely
                     * within the existing cluster limits
                     */
                    if (last_blkno > vp->v_clusters[cl_index].last_pg)
                        /*
                         * update our idea of where the cluster ends
                         */
                        vp->v_clusters[cl_index].last_pg = last_blkno;
                }
                if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
                    /*
                     * we have a write that starts in the middle of the current cluster
                     * but extends beyond the cluster's limit
                     * we'll clip the current cluster if we actually
                     * overlap with the new write
                     * and start a new cluster with the current write
                     */
                    if (vp->v_clusters[cl_index].last_pg > start_blkno)
                        vp->v_clusters[cl_index].last_pg = start_blkno;
                }
                /*
                 * we also get here for the case where the current write starts
                 * beyond the limit of the existing cluster
                 *
                 * in either case, we'll check the remaining clusters before
                 * starting a new one
                 */
            } else {
                /*
                 * the current write starts in front of the current cluster
                 */
                if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
                    /*
                     * we can just merge the old cluster
                     * with the new request and leave it
                     */
                    vp->v_clusters[cl_index].start_pg = start_blkno;

                    if (last_blkno > vp->v_clusters[cl_index].last_pg) {
                        /*
                         * the current write completely
                         * envelops the existing cluster
                         */
                        vp->v_clusters[cl_index].last_pg = last_blkno;
                    }
                } else {
                    /*
                     * if we were to combine this write with the current cluster
                     * we would exceed the cluster size limit.... so,
                     * let's see if there's any overlap of the new I/O with
                     * the existing cluster...
                     */
                    if (last_blkno > vp->v_clusters[cl_index].start_pg)
                        /*
                         * the current write extends into the existing cluster
                         * clip the current cluster by moving the start position
                         * to where the current write ends
                         */
                        vp->v_clusters[cl_index].start_pg = last_blkno;
                    /*
                     * if we get here, there was no way to merge
                     * the new I/O with this cluster and
                     * keep it under our maximum cluster length
                     * we'll check the remaining clusters before starting a new one
                     */
                }
            }
        }
        if (cl_index < vp->v_clen)
            /*
             * we found an existing cluster that we
             * could merge this I/O into
             */

        if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
            /*
             * we didn't find an existing cluster to
             * merge into, but there's room to start
             */
            goto start_new_cluster;

        /*
         * no existing cluster to merge with and no
         * room to start a new one... we'll try
         * pushing the existing ones... if none of
         * them are able to be pushed, we'll have
         * to fall back on the VHASDIRTY mechanism
         * cluster_try_push will set v_clen to the
         * number of remaining clusters if it is
         * unable to push all of them
         */
        if (vp->v_flag & VNOCACHE_DATA)

        if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
            vp->v_flag |= VHASDIRTY;
        }
start_new_cluster:
        if (vp->v_clen == 0) {
            vp->v_ciosiz = devblocksize;
            vp->v_cstart = start_blkno;
            vp->v_lastw  = last_blkno;
        }
        vp->v_clusters[vp->v_clen].start_pg = start_blkno;
        vp->v_clusters[vp->v_clen].last_pg  = last_blkno;

        /*
         * make sure we keep v_cstart and v_lastw up to
         * date in case we have to fall back on the
         * V_HASDIRTY mechanism (or we've already entered it)
         */
        if (start_blkno < vp->v_cstart)
            vp->v_cstart = start_blkno;
        if (last_blkno > vp->v_lastw)
            vp->v_lastw = last_blkno;

        ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

        /*
         * in order to maintain some semblance of coherency with mapped writes
         * we need to write the cluster back out as a multiple of the PAGESIZE
         * unless the cluster encompasses the last page of the file... in this
         * case we'll round out to the nearest device block boundary
         */
        if ((upl_f_offset + io_size) > newEOF) {
            io_size = newEOF - upl_f_offset;
            io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
        }
        if (flags & IO_SYNC)
            io_flags = CL_COMMIT | CL_AGE;
        else
            io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;

        if (vp->v_flag & VNOCACHE_DATA)
            io_flags |= CL_DUMP;

        while (vp->v_numoutput >= ASYNC_THROTTLE) {
            vp->v_flag |= VTHROTTLED;
            tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
        }
        retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
            io_flags, (struct buf *)0, (struct clios *)0);
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
        retval, 0, 0, 0, 0);
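
/*
 * Illustrative sketch (not part of the original source): the delayed-write
 * logic in cluster_write_x treats each v_clusters[] entry as a page-number
 * interval [start_pg, last_pg] and tries to extend it with the pages of the
 * new write, as long as the combined interval stays within MAX_UPL_TRANSFER
 * pages.  The hypothetical helper below captures that acceptance test for a
 * write that starts at or after the cluster's start page.
 */
static int
sketch_write_fits_cluster(int cl_start_pg, int start_blkno, int last_blkno,
        int max_cluster_pages)
{
    if (start_blkno < cl_start_pg)
        return (0);     /* handled by a different case in the real code */
    return (last_blkno <= (cl_start_pg + max_cluster_pages));
}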
2109 cluster_read(vp
, uio
, filesize
, devblocksize
, flags
)
2120 vm_offset_t upl_offset
;
2123 upl_page_info_t
*pl
;
2128 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_START
,
2129 (int)uio
->uio_offset
, uio
->uio_resid
, (int)filesize
, devblocksize
, 0);
2132 * We set a threshhold of 4 pages to decide if the nocopy
2133 * read loop is worth the trouble...
2136 if (!((vp
->v_flag
& VNOCACHE_DATA
) && (uio
->uio_segflg
== UIO_USERSPACE
)))
2138 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2139 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_END
,
2140 (int)uio
->uio_offset
, uio
->uio_resid
, vp
->v_lastr
, retval
, 0);
2144 while (uio
->uio_resid
&& uio
->uio_offset
< filesize
&& retval
== 0)
2146 /* we know we have a resid, so this is safe */
2148 while (iov
->iov_len
== 0) {
2155 * We check every vector target and if it is physically
2156 * contiguous space, we skip the sanity checks.
2159 upl_offset
= (vm_offset_t
)iov
->iov_base
& ~PAGE_MASK
;
2160 upl_size
= (upl_offset
+ PAGE_SIZE
+(PAGE_SIZE
-1)) & ~PAGE_MASK
;
2162 upl_flags
= UPL_QUERY_OBJECT_TYPE
;
2163 if((vm_map_get_upl(current_map(),
2164 (vm_offset_t
)iov
->iov_base
& ~PAGE_MASK
,
2165 &upl_size
, &upl
, NULL
, &pages_in_pl
, &upl_flags
, 0)) != KERN_SUCCESS
)
2168 * the user app must have passed in an invalid address
2173 if (upl_flags
& UPL_PHYS_CONTIG
)
2175 retval
= cluster_phys_read(vp
, uio
, filesize
, devblocksize
, flags
);
2177 else if (uio
->uio_resid
< 4 * PAGE_SIZE
)
2180 * We set a threshhold of 4 pages to decide if the nocopy
2181 * read loop is worth the trouble...
2183 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2184 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_END
,
2185 (int)uio
->uio_offset
, uio
->uio_resid
, vp
->v_lastr
, retval
, 0);
2188 else if (uio
->uio_offset
& PAGE_MASK_64
)
2190 /* Bring the file offset read up to a pagesize boundary */
2191 clip_size
= (PAGE_SIZE
- (int)(uio
->uio_offset
& PAGE_MASK_64
));
2192 		if (uio->uio_resid < clip_size)
2193 		    clip_size = uio->uio_resid;

2195 		 * Fake the resid going into the cluster_read_x call
2196 		 * and restore it on the way out.

2198 		prev_resid = uio->uio_resid;
2199 		uio->uio_resid = clip_size;
2200 		retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2201 		uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

2203 	    else if ((int)iov->iov_base & PAGE_MASK_64)

2205 		clip_size = iov->iov_len;
2206 		prev_resid = uio->uio_resid;
2207 		uio->uio_resid = clip_size;
2208 		retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2209 		uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

2214 		 * If we come in here, we know the offset into
2215 		 * the file is on a pagesize boundary

2218 		max_io_size = filesize - uio->uio_offset;
2219 		clip_size = uio->uio_resid;
2220 		if (iov->iov_len < clip_size)
2221 		    clip_size = iov->iov_len;
2222 		if (max_io_size < clip_size)
2223 		    clip_size = (int)max_io_size;

2225 		if (clip_size < PAGE_SIZE)

2228 		     * Take care of the tail end of the read in this vector.

2230 		    prev_resid = uio->uio_resid;
2231 		    uio->uio_resid = clip_size;
2232 		    retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2233 		    uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

2237 		    /* round clip_size down to a multiple of pagesize */
2238 		    clip_size = clip_size & ~(PAGE_MASK);
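		    /*
		     * e.g. a clip_size of 0x4A00 is trimmed to 0x4000 here; the
		     * sub-page remainder is picked up by a later pass through
		     * the enclosing loop.
		     */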
2239 		    prev_resid = uio->uio_resid;
2240 		    uio->uio_resid = clip_size;
2241 		    retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2242 		    if ((retval == 0) && uio->uio_resid)
2243 			retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2244 		    uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

2249 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2250 		(int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2257 cluster_read_x(vp, uio, filesize, devblocksize, flags)

2264 	upl_page_info_t  *pl;
2266 	vm_offset_t      upl_offset;
2276 	vm_offset_t      io_address;

2284 	b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);

2286 	while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {

2288 	     * compute the size of the upl needed to encompass
2289 	     * the requested read... limit each call to cluster_io
2290 	     * to the maximum UPL size... cluster_io will clip if
2291 	     * this exceeds the maximum io_size for the device,
2292 	     * make sure to account for
2293 	     * a starting offset that's not page aligned

2295 	    start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2296 	    upl_f_offset = uio->uio_offset - (off_t)start_offset;
2297 	    max_size     = filesize - uio->uio_offset;

2299 	    if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2300 	        io_size = uio->uio_resid;

2304 	    if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2305 		segflg = uio->uio_segflg;

2307 		uio->uio_segflg = UIO_PHYS_USERSPACE;

2309 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2310 			(int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);

2312 		while (io_size && retval == 0) {

2318 				UPL_POP_SET | UPL_POP_BUSY,
2319 				&paddr, 0) != KERN_SUCCESS)

2322 		    xsize = PAGE_SIZE - start_offset;

2324 		    if (xsize > io_size)

2327 		    retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);

2329 		    ubc_page_op(vp, upl_f_offset,
2330 				UPL_POP_CLR | UPL_POP_BUSY, 0, 0);

2333 		    start_offset = (int)
2334 				(uio->uio_offset & PAGE_MASK_64);
2335 		    upl_f_offset = uio->uio_offset - start_offset;

2337 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2338 			(int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);

2340 		uio->uio_segflg = segflg;

2347 		 * we're already finished with this read request
2348 		 * let's see if we should do a read-ahead

2351 			((uio->uio_offset - 1) / PAGE_SIZE_64);

2353 		if (!(vp->v_flag & VRAOFF))

2355 		     * let's try to read ahead if we're in
2356 		     * a sequential access pattern

2358 		    cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2359 		vp->v_lastr = e_lblkno;

2363 	    max_size = filesize - uio->uio_offset;

2365 	    upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
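	    /*
	     * round the (possibly unaligned) request up to whole pages:
	     * e.g. start_offset 0x200 plus io_size 0x1000 yields an upl_size
	     * of 0x2000, i.e. two pages are needed to cover the span.
	     */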
2366 	    if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2367 	        upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2368 	    pages_in_upl = upl_size / PAGE_SIZE;

2370 	    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2371 			(int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

2373 	    kret = ubc_create_upl(vp,

2379 	    if (kret != KERN_SUCCESS)
2380 		panic("cluster_read: failed to get pagelist");

2382 	    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2383 			(int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

2386 	     * scan from the beginning of the upl looking for the first
2387 	     * non-valid page.... this will become the first page in
2388 	     * the request we're going to make to 'cluster_io'... if all
2389 	     * of the pages are valid, we won't call through to 'cluster_io'

2391 	    for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2392 		if (!upl_valid_page(pl, start_pg))

2397 	     * scan from the starting invalid page looking for a valid
2398 	     * page before the end of the upl is reached, if we
2399 	     * find one, then it will be the last page of the request to

2402 	    for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2403 		if (upl_valid_page(pl, last_pg))

2407 	    if (start_pg < last_pg) {

2409 		 * we found a range of 'invalid' pages that must be filled
2410 		 * if the last page in this range is the last page of the file
2411 		 * we may have to clip the size of it to keep from reading past
2412 		 * the end of the last physical block associated with the file

2414 		upl_offset = start_pg * PAGE_SIZE;
2415 		io_size    = (last_pg - start_pg) * PAGE_SIZE;

2417 		if ((upl_f_offset + upl_offset + io_size) > filesize)
2418 		    io_size = filesize - (upl_f_offset + upl_offset);

2421 		 * issue a synchronous read to cluster_io

2424 		error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2425 			io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);

2429 	     * if the read completed successfully, or there was no I/O request
2430 	     * issued, then map the upl into kernel address space and
2431 	     * move the data into user land.... we'll first add on any 'valid'
2432 	     * pages that were present in the upl when we acquired it.

2435 		u_int  size_of_prefetch;

2437 		for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2438 		    if (!upl_valid_page(pl, uio_last))

2442 		 * compute size to transfer this round, if uio->uio_resid is
2443 		 * still non-zero after this uiomove, we'll loop around and
2444 		 * set up for another I/O.

2446 		val_size = (uio_last * PAGE_SIZE) - start_offset;

2448 		if (max_size < val_size)
2449 		    val_size = max_size;

2451 		if (uio->uio_resid < val_size)
2452 		    val_size = uio->uio_resid;

2454 		e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);

2456 		if (size_of_prefetch = (uio->uio_resid - val_size)) {
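		    /*
		     * note: the '=' in the test above is an intentional assignment;
		     * size_of_prefetch is the number of bytes left beyond this pass,
		     * and a pre-fetch is issued only when it is non-zero.
		     */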
2458 		     * if there's still I/O left to do for this request, then issue a
2459 		     * pre-fetch I/O... the I/O wait time will overlap
2460 		     * with the copying of the data

2462 		    cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);

2464 		if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))

2466 		     * let's try to read ahead if we're in
2467 		     * a sequential access pattern

2469 		    cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2470 		vp->v_lastr = e_lblkno;

2472 		if (uio->uio_segflg == UIO_USERSPACE) {

2475 		    segflg = uio->uio_segflg;

2477 		    uio->uio_segflg = UIO_PHYS_USERSPACE;

2480 		    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2481 				(int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);

2483 		    offset = start_offset;

2485 		    while (val_size && retval == 0) {

2490 			i = offset / PAGE_SIZE;
2491 			csize = min(PAGE_SIZE - start_offset, val_size);

2493 			paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;

2495 			retval = uiomove(paddr, csize, uio);

2499 			start_offset = offset & PAGE_MASK;

2501 		    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2502 				(int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);

2504 		    uio->uio_segflg = segflg;

2508 		    if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2509 			panic("cluster_read: ubc_upl_map() failed\n");

2511 		    retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);

2513 		    if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2514 			panic("cluster_read: ubc_upl_unmap() failed\n");

2517 	    if (start_pg < last_pg) {

2519 		 * compute the range of pages that we actually issued an I/O for
2520 		 * and either commit them as valid if the I/O succeeded
2521 		 * or abort them if the I/O failed

2523 		io_size = (last_pg - start_pg) * PAGE_SIZE;

2525 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2526 			(int)upl, start_pg * PAGE_SIZE, io_size, error, 0);

2528 		if (error || (vp->v_flag & VNOCACHE_DATA))
2529 		    ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2530 				UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

2532 		    ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2533 				UPL_COMMIT_CLEAR_DIRTY
2534 				| UPL_COMMIT_FREE_ON_EMPTY
2535 				| UPL_COMMIT_INACTIVATE);

2537 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2538 			(int)upl, start_pg * PAGE_SIZE, io_size, error, 0);

2540 	    if ((last_pg - start_pg) < pages_in_upl) {

2545 		 * the set of pages that we issued an I/O for did not encompass
2546 		 * the entire upl... so just release these without modifying

2550 		ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

2552 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2553 			(int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

2557 		 * we found some already valid pages at the beginning of
2558 		 * the upl commit these back to the inactive list with

2561 		for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2562 		    commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2563 				| UPL_COMMIT_INACTIVATE;

2565 		    if (upl_dirty_page(pl, cur_pg))
2566 			commit_flags |= UPL_COMMIT_SET_DIRTY;

2568 		    if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2569 			ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2570 				UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

2572 			ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2573 				PAGE_SIZE, commit_flags);

2576 		if (last_pg < uio_last) {

2578 		     * we found some already valid pages immediately after the
2579 		     * pages we issued I/O for, commit these back to the
2580 		     * inactive list with reference cleared

2582 		    for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2583 			commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2584 				| UPL_COMMIT_INACTIVATE;

2586 			if (upl_dirty_page(pl, cur_pg))
2587 			    commit_flags |= UPL_COMMIT_SET_DIRTY;

2589 			if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2590 			    ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2591 					UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

2593 			    ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2594 					PAGE_SIZE, commit_flags);

2597 		if (uio_last < pages_in_upl) {

2599 		     * there were some invalid pages beyond the valid pages
2600 		     * that we didn't issue an I/O for, just release them

2603 		    ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2604 			    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);

2607 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2608 		(int)upl, -1, -1, 0, 0);
2620 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)

2628 	upl_page_info_t  *pl;
2630 	vm_offset_t      upl_offset;
2631 	off_t            start_upl_f_offset;
2635 	int              upl_needed_size;
2643 	int              force_data_sync;
2646 	struct clios     iostate;

2648 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2649 		(int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);

2652 	 * When we enter this routine, we know
2653 	 *  -- the offset into the file is on a pagesize boundary
2654 	 *  -- the resid is a page multiple
2655 	 *  -- the resid will not exceed iov_len

2658 	iostate.io_completed = 0;
2659 	iostate.io_issued = 0;
2660 	iostate.io_error = 0;
2661 	iostate.io_wanted = 0;

2665 	while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {

2667 	    max_io_size = filesize - uio->uio_offset;

2669 	    if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2670 	        io_size = max_io_size;

2672 	        io_size = uio->uio_resid;

2675 	     * We don't come into this routine unless
2676 	     * UIO_USERSPACE is set.

2678 	    segflg = uio->uio_segflg;

2680 	    uio->uio_segflg = UIO_PHYS_USERSPACE;

2683 	     * First look for pages already in the cache
2684 	     * and move them to user space.

2686 	    while (io_size && (retval == 0)) {
2687 		upl_f_offset = uio->uio_offset;

2690 		 * If this call fails, it means the page is not
2691 		 * in the page cache.

2693 		if (ubc_page_op(vp, upl_f_offset,
2694 			UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)

2697 		retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);

2699 		ubc_page_op(vp, upl_f_offset,
2700 			UPL_POP_CLR | UPL_POP_BUSY, 0, 0);

2702 		io_size -= PAGE_SIZE;
2703 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2704 			(int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);

2706 	    uio->uio_segflg = segflg;

2710 		 * we may have already spun some portion of this request
2711 		 * off as async requests... we need to wait for the I/O
2712 		 * to complete before returning

2714 		goto wait_for_reads;

2717 	     * If we are already finished with this read, then return

2721 		 * we may have already spun some portion of this request
2722 		 * off as async requests... we need to wait for the I/O
2723 		 * to complete before returning

2725 		goto wait_for_reads;

2727 	    max_io_size = io_size;

2729 	    if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2730 	        max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;

2732 	    if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2733 	        max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
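	    /*
	     * note: each pass of the nocopy loop is kept well below the maximum
	     * UPL size (an eighth of it once the request exceeds a quarter of it),
	     * which appears intended to let the loop below keep several async
	     * reads in flight at once.
	     */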
2736 	    start_upl_f_offset = uio->uio_offset;   /* this is page aligned in the file */
2737 	    upl_f_offset = start_upl_f_offset;

2740 	    while (io_size < max_io_size) {
2741 		if (ubc_page_op(vp, upl_f_offset,
2742 			UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS) {
2743 		    ubc_page_op(vp, upl_f_offset,
2744 			    UPL_POP_CLR | UPL_POP_BUSY, 0, 0);

2748 		 * Build up the io request parameters.

2750 		io_size += PAGE_SIZE_64;
2751 		upl_f_offset += PAGE_SIZE_64;

2755 		 * we may have already spun some portion of this request
2756 		 * off as async requests... we need to wait for the I/O
2757 		 * to complete before returning

2759 		goto wait_for_reads;

2761 	    upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2762 	    upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

2764 	    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2765 		    (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

2767 	    for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {

2769 		upl_size = upl_needed_size;
2770 		upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;

2772 		kret = vm_map_get_upl(current_map(),
2773 			(vm_offset_t)iov->iov_base & ~PAGE_MASK,
2774 			&upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);

2776 		if (kret != KERN_SUCCESS) {
2777 		    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2778 			    (int)upl_offset, upl_size, io_size, kret, 0);

2781 		     * cluster_nocopy_read: failed to get pagelist

2783 		     * we may have already spun some portion of this request
2784 		     * off as async requests... we need to wait for the I/O
2785 		     * to complete before returning

2787 		    goto wait_for_reads;

2789 		pages_in_pl = upl_size / PAGE_SIZE;
2790 		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

2792 		for (i = 0; i < pages_in_pl; i++) {
2793 		    if (!upl_valid_page(pl, i))

2796 		if (i == pages_in_pl)

2799 		ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2800 			UPL_ABORT_FREE_ON_EMPTY);

2802 	    if (force_data_sync >= 3) {
2803 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2804 			(int)upl_offset, upl_size, io_size, kret, 0);

2806 		goto wait_for_reads;

2809 	     * Consider the possibility that upl_size wasn't satisfied.

2811 	    if (upl_size != upl_needed_size)
2812 	        io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

2815 		ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2816 			UPL_ABORT_FREE_ON_EMPTY);
2817 		goto wait_for_reads;

2819 	    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2820 		    (int)upl_offset, upl_size, io_size, kret, 0);

2823 	     * request asynchronously so that we can overlap
2824 	     * the preparation of the next I/O
2825 	     * if there are already too many outstanding reads
2826 	     * wait until some have completed before issuing the next read

2828 	    while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2829 		iostate.io_wanted = 1;
2830 		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2832 	    if (iostate.io_error) {

2834 		 * one of the earlier reads we issued ran into a hard error
2835 		 * don't issue any more reads, cleanup the UPL
2836 		 * that was just created but not used, then
2837 		 * go wait for any other reads to complete before
2838 		 * returning the error to the caller

2840 		ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2841 			UPL_ABORT_FREE_ON_EMPTY);

2843 		goto wait_for_reads;

2845 	    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2846 		    (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);

2848 	    retval = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2849 		    io_size, devblocksize,
2850 		    CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2851 		    (struct buf *)0, &iostate);

2854 	     * update the uio structure

2856 	    iov->iov_base   += io_size;
2857 	    iov->iov_len    -= io_size;
2858 	    uio->uio_resid  -= io_size;
2859 	    uio->uio_offset += io_size;

2861 	    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2862 		    (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);

2868 	 * make sure all async reads that are part of this stream
2869 	 * have completed before we return

2871 	while (iostate.io_issued != iostate.io_completed) {
2872 	    iostate.io_wanted = 1;
2873 	    tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);

2875 	if (iostate.io_error)
2876 	    retval = iostate.io_error;

2878 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2879 		(int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2886 cluster_phys_read(vp, uio, filesize, devblocksize, flags)

2893 	upl_page_info_t  *pl;
2895 	vm_offset_t      upl_offset;
2896 	vm_offset_t      dst_paddr;
2901 	int              upl_needed_size;
2906 	struct clios     iostate;

2910 	 * When we enter this routine, we know
2911 	 *  -- the resid will not exceed iov_len
2912 	 *  -- the target address is physically contiguous

2917 	max_size = filesize - uio->uio_offset;

2919 	if (max_size > (off_t)((unsigned int)iov->iov_len))
2920 	    io_size = iov->iov_len;

2924 	upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2925 	upl_needed_size = upl_offset + io_size;

2929 	upl_size = upl_needed_size;
2930 	upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;

2932 	kret = vm_map_get_upl(current_map(),
2933 		(vm_offset_t)iov->iov_base & ~PAGE_MASK,
2934 		&upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

2936 	if (kret != KERN_SUCCESS) {

2938 	     * cluster_phys_read: failed to get pagelist

2942 	if (upl_size < upl_needed_size) {

2944 	     * The upl_size wasn't satisfied.

2946 	    ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

2950 	pl = ubc_upl_pageinfo(upl);

2952 	dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);

2954 	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {

2957 	    head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

2959 	    if (head_size > io_size)
2960 	        head_size = io_size;

2962 	    error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);

2965 		ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

2969 	    upl_offset += head_size;
2970 	    dst_paddr  += head_size;
2971 	    io_size    -= head_size;

2973 	tail_size = io_size & (devblocksize - 1);
2974 	io_size  -= tail_size;
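	/*
	 * e.g. with a 512-byte devblocksize and a file offset of 768,
	 * head_size is 256 (the bytes needed to reach the next device
	 * block boundary) and tail_size is whatever portion of the
	 * remaining length is not a multiple of 512; both pieces go
	 * through cluster_align_phys_io rather than the direct path.
	 */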
2976 	iostate.io_completed = 0;
2977 	iostate.io_issued = 0;
2978 	iostate.io_error = 0;
2979 	iostate.io_wanted = 0;

2981 	while (io_size && error == 0) {

2984 	    if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2985 	        xsize = MAX_UPL_TRANSFER * PAGE_SIZE;

2989 	     * request asynchronously so that we can overlap
2990 	     * the preparation of the next I/O... we'll do
2991 	     * the commit after all the I/O has completed
2992 	     * since it's all issued against the same UPL
2993 	     * if there are already too many outstanding reads
2994 	     * wait until some have completed before issuing the next

2996 	    while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2997 		iostate.io_wanted = 1;
2998 		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);

3001 	    error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
3002 		    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3003 		    (struct buf *)0, &iostate);

3005 	     * The cluster_io read was issued successfully,
3006 	     * update the uio structure

3009 	    uio->uio_resid  -= xsize;
3010 	    iov->iov_len    -= xsize;
3011 	    iov->iov_base   += xsize;
3012 	    uio->uio_offset += xsize;

3014 	    upl_offset += xsize;

3019 	 * make sure all async reads that are part of this stream
3020 	 * have completed before we proceed

3022 	while (iostate.io_issued != iostate.io_completed) {
3023 	    iostate.io_wanted = 1;
3024 	    tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);

3026 	if (iostate.io_error) {
3027 	    error = iostate.io_error;

3029 	if (error == 0 && tail_size)
3030 	    error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);

3033 	 * just release our hold on the physically contiguous
3034 	 * region without changing any state

3036 	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3043  * generate advisory I/O's in the largest chunks possible
3044  * the completed pages will be released into the VM cache

3047 advisory_read(vp, filesize, f_offset, resid, devblocksize)

3054 	upl_page_info_t  *pl;
3056 	vm_offset_t      upl_offset;

3069 	if (!UBCINFOEXISTS(vp))

3072 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3073 		(int)f_offset, resid, (int)filesize, devblocksize, 0);

3075 	while (resid && f_offset < filesize && retval == 0) {

3077 	     * compute the size of the upl needed to encompass
3078 	     * the requested read... limit each call to cluster_io
3079 	     * to the maximum UPL size... cluster_io will clip if
3080 	     * this exceeds the maximum io_size for the device,
3081 	     * make sure to account for
3082 	     * a starting offset that's not page aligned

3084 	    start_offset = (int)(f_offset & PAGE_MASK_64);
3085 	    upl_f_offset = f_offset - (off_t)start_offset;
3086 	    max_size     = filesize - f_offset;

3088 	    if (resid < max_size)

3093 	    upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3094 	    if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3095 	        upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3096 	    pages_in_upl = upl_size / PAGE_SIZE;

3098 	    kret = ubc_create_upl(vp,

3103 			UPL_RET_ONLY_ABSENT);
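	    /*
	     * note: since the upl was created with UPL_RET_ONLY_ABSENT, only
	     * pages that were not already resident should show up as 'present'
	     * in the page list, so the scans below pick out the ranges that
	     * still need a speculative read.
	     */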
3104 	    if (kret != KERN_SUCCESS)

3109 	     * before we start marching forward, we must make sure we end on
3110 	     * a present page, otherwise we will be working with a freed

3113 	    for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3114 		if (upl_page_present(pl, last_pg))

3117 	    pages_in_upl = last_pg + 1;

3120 	    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
3121 		    (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

3124 	    for (last_pg = 0; last_pg < pages_in_upl; ) {

3126 		 * scan from the beginning of the upl looking for the first
3127 		 * page that is present.... this will become the first page in
3128 		 * the request we're going to make to 'cluster_io'... if all
3129 		 * of the pages are absent, we won't call through to 'cluster_io'

3131 		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3132 		    if (upl_page_present(pl, start_pg))

3137 		 * scan from the starting present page looking for an absent
3138 		 * page before the end of the upl is reached, if we
3139 		 * find one, then it will terminate the range of pages being
3140 		 * presented to 'cluster_io'

3142 		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3143 		    if (!upl_page_present(pl, last_pg))

3147 		if (last_pg > start_pg) {

3149 		     * we found a range of pages that must be filled
3150 		     * if the last page in this range is the last page of the file
3151 		     * we may have to clip the size of it to keep from reading past
3152 		     * the end of the last physical block associated with the file

3154 		    upl_offset = start_pg * PAGE_SIZE;
3155 		    io_size    = (last_pg - start_pg) * PAGE_SIZE;

3157 		    if ((upl_f_offset + upl_offset + io_size) > filesize)
3158 		        io_size = filesize - (upl_f_offset + upl_offset);

3161 		     * issue an asynchronous read to cluster_io

3163 		    retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3164 			    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);

3170 	        ubc_upl_abort(upl, 0);

3172 	    io_size = upl_size - start_offset;

3174 	    if (io_size > resid)

3176 	    f_offset += io_size;

3180 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3181 		(int)f_offset, resid, retval, 0, 0);
3193 	if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
3194 	    vp->v_flag &= ~VHASDIRTY;

3198 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3199 		vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);

3201 	if (vp->v_flag & VHASDIRTY) {

3206 	    start_pg = vp->v_cstart;
3207 	    end_pg   = vp->v_lastw;

3209 	    vp->v_flag &= ~VHASDIRTY;

3212 	    while (start_pg < end_pg) {
3213 		last_pg = start_pg + MAX_UPL_TRANSFER;

3215 		if (last_pg > end_pg)

3218 		cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);

3224 	retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);

3226 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3227 		vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3234 cluster_try_push(vp, EOF, can_delay, push_all)

3246 	struct v_cluster l_clusters[MAX_CLUSTERS];

3249 	 * make a local 'sorted' copy of the clusters
3250 	 * and clear vp->v_clen so that new clusters can

3253 	for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3254 	    for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3255 		if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)

3257 		if (min_index == -1)
3258 		    min_index = cl_index1;
3259 		else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3260 		    min_index = cl_index1;

3262 	    if (min_index == -1)

3264 	    l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3265 	    l_clusters[cl_index].last_pg  = vp->v_clusters[min_index].last_pg;

3267 	    vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
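	    /*
	     * note: this is a simple selection sort; on each outer pass the
	     * cluster with the lowest start_pg is copied into l_clusters and
	     * then marked empty (start_pg == last_pg) so later passes skip it.
	     */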
3272 	for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {

3274 	     * try to push each cluster in turn... cluster_push_x may not
3275 	     * push the cluster if can_delay is TRUE and the cluster doesn't
3276 	     * meet the criteria for an immediate push

3278 	    if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3279 		l_clusters[cl_index].start_pg = 0;
3280 		l_clusters[cl_index].last_pg  = 0;

3288 	if (cl_len > cl_pushed) {

3290 	     * we didn't push all of the clusters, so
3291 	     * let's try to merge them back in to the vnode

3293 	    if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {

3295 		 * we picked up some new clusters while we were trying to
3296 		 * push the old ones (I don't think this can happen because
3297 		 * I'm holding the lock, but just in case)... the sum of the
3298 		 * leftovers plus the new cluster count exceeds our ability
3299 		 * to represent them, so fall back to the VHASDIRTY mechanism

3301 		for (cl_index = 0; cl_index < cl_len; cl_index++) {
3302 		    if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)

3305 		    if (l_clusters[cl_index].start_pg < vp->v_cstart)
3306 			vp->v_cstart = l_clusters[cl_index].start_pg;
3307 		    if (l_clusters[cl_index].last_pg > vp->v_lastw)
3308 			vp->v_lastw = l_clusters[cl_index].last_pg;

3310 		vp->v_flag |= VHASDIRTY;

3313 		 * we've got room to merge the leftovers back in
3314 		 * just append them starting at the next 'hole'
3315 		 * represented by vp->v_clen

3317 		for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3318 		    if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)

3321 		    vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3322 		    vp->v_clusters[cl_index1].last_pg  = l_clusters[cl_index].last_pg;

3324 		    if (cl_index1 == 0) {
3325 			vp->v_cstart = l_clusters[cl_index].start_pg;
3326 			vp->v_lastw  = l_clusters[cl_index].last_pg;

3328 			if (l_clusters[cl_index].start_pg < vp->v_cstart)
3329 			    vp->v_cstart = l_clusters[cl_index].start_pg;
3330 			if (l_clusters[cl_index].last_pg > vp->v_lastw)
3331 			    vp->v_lastw = l_clusters[cl_index].last_pg;

3336 		 * update the cluster count

3338 		vp->v_clen = cl_index1;

3341 	return(MAX_CLUSTERS - vp->v_clen);
3347 cluster_push_x(vp, EOF, first, last, can_delay)

3354 	upl_page_info_t  *pl;
3356 	vm_offset_t      upl_offset;

3368 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3369 		vp->v_clen, first, last, EOF, 0);

3371 	if ((pages_in_upl = last - first) == 0) {
3372 	    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);

3376 	upl_size = pages_in_upl * PAGE_SIZE;
3377 	upl_f_offset = ((off_t)first) * PAGE_SIZE_64;

3379 	if (upl_f_offset + upl_size >= EOF) {

3381 	    if (upl_f_offset >= EOF) {

3383 		 * must have truncated the file and missed
3384 		 * clearing a dangling cluster (i.e. it's completely
3385 		 * beyond the new EOF

3387 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);

3391 	    size = EOF - upl_f_offset;

3393 	    upl_size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
3394 	    pages_in_upl = upl_size / PAGE_SIZE;

3396 	if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
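	/*
	 * note: a cluster covering less than half of a maximal UPL may be left
	 * unpushed when can_delay is set; cluster_try_push treats it as not
	 * pushed and merges it back into the vnode's cluster list.
	 */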
3400 	kret = ubc_create_upl(vp,

3405 		UPL_RET_ONLY_DIRTY);
3406 	if (kret != KERN_SUCCESS)
3407 	    panic("cluster_push: failed to get pagelist");

3412 	for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3413 	    if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))

3416 	if (num_of_dirty < pages_in_upl / 2) {
3417 	    ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

3419 	    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);

3428 	    for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3429 		if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))

3432 	    if (start_pg > last_pg) {
3433 		io_size = (start_pg - last_pg) * PAGE_SIZE;

3435 		ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3436 			UPL_ABORT_FREE_ON_EMPTY);

3443 	    for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3444 		if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))

3447 	    upl_offset = start_pg * PAGE_SIZE;

3449 	    io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

3451 	    if (vp->v_flag & VNOCACHE_DATA)
3452 	        io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;

3454 	        io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;

3456 	    while (vp->v_numoutput >= ASYNC_THROTTLE) {
3457 		vp->v_flag |= VTHROTTLED;
3458 		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3460 	    cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);

3464 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3472 cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)

3475 	upl_page_info_t  *pl;
3477 	vm_offset_t      ubc_paddr;

3483 	kret = ubc_create_upl(vp,
3484 		uio->uio_offset & ~PAGE_MASK_64,

3490 	if (kret != KERN_SUCCESS)

3493 	if (!upl_valid_page(pl, 0)) {

3495 	     * issue a synchronous read to cluster_io

3497 	    error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3498 		    CL_READ, (struct buf *)0, (struct clios *)0);

3500 		ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

3505 	ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);

3507 	if (flags & CL_READ)
3508 	    copyp2p(ubc_paddr, usr_paddr, xsize, 2);

3510 	    copyp2p(usr_paddr, ubc_paddr, xsize, 1);
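	/*
	 * note: copyp2p moves xsize bytes between physical pages; for a read
	 * the cached page (ubc_paddr) is the source and the caller's page
	 * (usr_paddr) the destination, and for a write the direction is
	 * reversed.
	 */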
3512 	if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {

3514 	     * issue a synchronous write to cluster_io

3516 	    error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3517 		    0, (struct buf *)0, (struct clios *)0);

3520 	uio->uio_offset += xsize;
3521 	iov->iov_base   += xsize;
3522 	iov->iov_len    -= xsize;
3523 	uio->uio_resid  -= xsize;

3525 	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);