/* apple/xnu: bsd/vfs/vfs_cluster.c (blob 080cd55b199f460366361538478b9db493d78977) */
/*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

#include <vm/vm_pageout.h>

#include <sys/kdebug.h>
#define CL_COMMIT       0x04
#define CL_PAGEOUT      0x10
#define CL_NOZERO       0x80
#define CL_PAGEIN       0x100
#define CL_DEV_MEMORY   0x200
#define CL_PRESERVE     0x400
struct clios {
        u_int   io_completed;   /* amount of io that has currently completed */
        u_int   io_issued;      /* amount of io that was successfully issued */
        int     io_error;       /* error code of first error encountered */
        int     io_wanted;      /* someone is sleeping waiting for a change in state */
};
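/*
 * Added commentary (not part of the original source): the clios fields
 * implement a simple issued/completed accounting scheme for a stream of
 * async I/Os.  The issuer bumps io_issued before handing work off, the
 * completion path bumps io_completed, and anyone who needs to wait sets
 * io_wanted and sleeps until the counters catch up.  A minimal sketch of
 * that drain loop, modeled on the one in cluster_nocopy_write() below and
 * using a hypothetical helper name:
 */
#if 0   /* illustrative sketch only -- not compiled */
static void
clios_wait_for_drain(struct clios *iostate)
{
        while (iostate->io_issued != iostate->io_completed) {
                iostate->io_wanted = 1;
                tsleep((caddr_t)&iostate->io_wanted, PRIBIO + 1, "clios_drain", 0);
        }
}
#endif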
static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
                int size, struct buf *bp);
static int cluster_read_x(struct vnode *vp, struct uio *uio,
                off_t filesize, int devblocksize, int flags);
static int cluster_write_x(struct vnode *vp, struct uio *uio,
                off_t oldEOF, off_t newEOF, off_t headOff,
                off_t tailOff, int devblocksize, int flags);
static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
                off_t filesize, int devblocksize, int flags);
static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
                off_t newEOF, int devblocksize, int flags);
static int cluster_phys_read(struct vnode *vp, struct uio *uio,
                off_t filesize, int devblocksize, int flags);
static int cluster_phys_write(struct vnode *vp, struct uio *uio,
                off_t newEOF, int devblocksize, int flags);
static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
                vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define ASYNC_THROTTLE  9
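/*
 * Added commentary (not part of the original source): the throttle is a
 * two-sided handshake on v_numoutput.  Writers block once ASYNC_THROTTLE
 * async writes are in flight on the vnode, and the completion path wakes
 * them once the backlog drains to roughly a third of the limit.  The two
 * halves, as they appear later in this file:
 */
#if 0   /* illustrative sketch only -- not compiled */
        /* issuing side (see cluster_pageout / cluster_write_x) */
        while (vp->v_numoutput >= ASYNC_THROTTLE) {
                vp->v_flag |= VTHROTTLED;
                tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);  /* wmesg varies by caller */
        }
        /* completion side (see cluster_iodone) */
        if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
                vp->v_flag &= ~VTHROTTLED;
                wakeup((caddr_t)&vp->v_numoutput);
        }
#endif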
/* cluster_iodone */
        struct buf      *cbp_head;
        struct buf      *cbp_next;
        struct clios    *iostate;

        cbp_head = (struct buf *)(bp->b_trans_head);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
                     (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

        for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
                /*
                 * all I/O requests that are part of this transaction
                 * have to complete before we can process it
                 */
                if ( !(cbp->b_flags & B_DONE)) {

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                                     (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);

        upl_offset  = cbp->b_uploffset;
        upl         = cbp->b_pagelist;
        b_flags     = cbp->b_flags;
        real_bp     = cbp->b_real_bp;
        zero_offset = cbp->b_validend;
        iostate     = (struct clios *)cbp->b_iostate;

                if (cbp->b_vectorcount > 1)
                        _FREE(cbp->b_vectorlist, M_SEGMENT);

                if ((cbp->b_flags & B_ERROR) && error == 0)
                        error = cbp->b_error;

                total_resid += cbp->b_resid;
                total_size  += cbp->b_bcount;

                cbp_next = cbp->b_trans_next;

                cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

        if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
                vp->v_flag &= ~VTHROTTLED;
                wakeup((caddr_t)&vp->v_numoutput);

                /*
                 * someone has issued multiple I/Os asynchronously
                 * and is waiting for them to complete (streaming)
                 */
                if (error && iostate->io_error == 0)
                        iostate->io_error = error;

                iostate->io_completed += total_size;

                if (iostate->io_wanted) {
                        /*
                         * someone is waiting for the state of
                         * this io stream to change
                         */
                        iostate->io_wanted = 0;
                        wakeup((caddr_t)&iostate->io_wanted);

        if ((b_flags & B_NEED_IODONE) && real_bp) {
                        real_bp->b_flags |= B_ERROR;
                        real_bp->b_error  = error;

                real_bp->b_resid = total_resid;

        if (error == 0 && total_resid)

        if (b_flags & B_COMMIT_UPL) {
                pg_offset   = upl_offset & PAGE_MASK;
                commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;

                if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {

                        if (b_flags & B_PHYS)
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                        else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                        else if (b_flags & B_PGIN)
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

                        ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                                     (int)upl, upl_offset - pg_offset, commit_size,
                                     0x80000000|upl_abort_code, 0);

                        int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

                        if (b_flags & B_PHYS)
                                upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
                        else if ( !(b_flags & B_PAGEOUT))
                                upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;

                                upl_commit_flags |= UPL_COMMIT_INACTIVATE;

                        ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                                     (int)upl, upl_offset - pg_offset, commit_size,
                                     upl_commit_flags, 0);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                     (int)upl, upl_offset, 0, error, 0);
cluster_zero(upl, upl_offset, size, bp)
        vm_offset_t   upl_offset;

        vm_offset_t   io_addr = 0;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
                     upl_offset, size, (int)bp, 0, 0);

        if (bp == NULL || bp->b_data == NULL) {
                kret = ubc_upl_map(upl, &io_addr);

                if (kret != KERN_SUCCESS)
                        panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
                        panic("cluster_zero: ubc_upl_map() mapped 0");
                io_addr = (vm_offset_t)bp->b_data;

        bzero((caddr_t)(io_addr + upl_offset), size);

                kret = ubc_upl_unmap(upl);

                if (kret != KERN_SUCCESS)
                        panic("cluster_zero: kernel_upl_unmap failed");
cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
        vm_offset_t   upl_offset;
        int           non_rounded_size;
        struct clios *iostate;

        struct buf   *cbp_head = 0;
        struct buf   *cbp_tail = 0;

        if (flags & CL_READ) {
                io_flags = (B_VECTORLIST | B_READ);

                vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);

                io_flags = (B_VECTORLIST | B_WRITEINPROG);

                vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);

        pl = ubc_upl_pageinfo(upl);

                io_flags |= B_NOCACHE;
        if (flags & CL_PAGEIN)
        if (flags & CL_PAGEOUT)
                io_flags |= B_PAGEOUT;
        if (flags & CL_COMMIT)
                io_flags |= B_COMMIT_UPL;
        if (flags & CL_PRESERVE)

                size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
                size = non_rounded_size;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
                     (int)f_offset, size, upl_offset, flags, 0);

        if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
                /*
                 * then we are going to end up
                 * with a page that we can't complete (the file size wasn't a multiple
                 * of PAGE_SIZE and we're trying to read to the end of the file
                 * so we'll go ahead and zero out the portion of the page we can't
                 * read in from the file
                 */
                zero_offset = upl_offset + non_rounded_size;

                if (size > max_iosize)
                        io_size = max_iosize;

                if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
                        if (error == EOPNOTSUPP)
                                panic("VOP_CMAP Unimplemented");

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
                             (int)f_offset, (int)blkno, io_size, zero_offset, 0);

                if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
                        if (flags & CL_PAGEOUT) {

                        /* Try paging out the page individually before
                           giving up entirely and dumping it (it could
                           be mapped in a "hole" and require allocation
                        */
                        ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
                        if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {

                        upl_offset += PAGE_SIZE_64;
                        f_offset   += PAGE_SIZE_64;
                        size       -= PAGE_SIZE_64;

                lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
                /*
                 * we have now figured out how much I/O we can do - this is in 'io_size'
                 * pl_index represents the first page in the 'upl' that the I/O will occur for
                 * pg_offset is the starting point in the first page for the I/O
                 * pg_count is the number of full and partial pages that 'io_size' encompasses
                 */
                pl_index  = upl_offset / PAGE_SIZE;
                pg_offset = upl_offset & PAGE_MASK;
                pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
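                /*
                 * Added worked example (not part of the original source):
                 * with 4K pages, upl_offset = 0x1200 and io_size = 0x2300 give
                 *   pl_index  = 0x1200 / 0x1000                   = 1
                 *   pg_offset = 0x1200 & 0xfff                    = 0x200
                 *   pg_count  = (0x2300 + 0x200 + 0xfff) / 0x1000 = 3
                 * i.e. the transfer starts 0x200 bytes into page 1 of the upl
                 * and touches pages 1, 2 and 3.
                 */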
                if (flags & CL_DEV_MEMORY) {
                        /*
                         * currently, can't deal with reading 'holes' in file
                         */
                        if ((long)blkno == -1) {

                        /*
                         * treat physical requests as one 'giant' page
                         */
                if ((flags & CL_READ) && (long)blkno == -1) {
                        /*
                         * if we're reading and blkno == -1, then we've got a
                         * 'hole' in the file that we need to deal with by zeroing
                         * out the affected area in the upl
                         */
                        if (zero_offset && io_size == size) {
                                /*
                                 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
                                 * then 'zero_offset' will be non-zero
                                 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
                                 * (indicated by the io_size finishing off the I/O request for this UPL)
                                 * then we're not going to issue an I/O for the
                                 * last page in this upl... we need to zero both the hole and the tail
                                 * of the page beyond the EOF, since the delayed zero-fill won't kick in
                                 */
                                bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

                                bytes_to_zero = io_size;

                        cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

                                /*
                                 * if there is a current I/O chain pending
                                 * then the first page of the group we just zero'd
                                 * will be handled by the I/O completion if the zero
                                 * fill started in the middle of the page
                                 */
                                pg_count = (io_size - pg_offset) / PAGE_SIZE;

                                /*
                                 * no pending I/O to pick up that first page
                                 * so, we have to make sure it gets committed
                                 * set the pg_offset to 0 so that the upl_commit_range
                                 * starts with this page
                                 */
                                pg_count = (io_size + pg_offset) / PAGE_SIZE;

                        if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
                                /*
                                 * if we're done with the request for this UPL
                                 * then we have to make sure to commit the last page
                                 * even if we only partially zero-filled it
                                 */
                                        pg_resid = PAGE_SIZE - pg_offset;

                        if (flags & CL_COMMIT)
                                ubc_upl_commit_range(upl,
                                        (upl_offset + pg_resid) & ~PAGE_MASK,
                                        pg_count * PAGE_SIZE,
                                        UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);

                        upl_offset += io_size;

                        if (cbp_head && pg_count)

                } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
                        real_bp->b_blkno = blkno;

                if (pg_count > max_vectors) {
                        io_size -= (pg_count - max_vectors) * PAGE_SIZE;

                                io_size = PAGE_SIZE - pg_offset;

                                pg_count = max_vectors;

                /*
                 * we need to allocate space for the vector list
                 */
                        iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
                                                       M_SEGMENT, M_NOWAIT);

                        if (iovp == (struct iovec *) 0) {
                                /*
                                 * if the allocation fails, then throttle down to a single page
                                 */
                                io_size  = PAGE_SIZE - pg_offset;

                /* Throttle the speculative IO */
                if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))

                cbp = alloc_io_buf(vp, priv);

                        /*
                         * we use the io vector that's reserved in the buffer header
                         * this ensures we can always issue an I/O even in a low memory
                         * condition that prevents the _MALLOC from succeeding... this
                         * is necessary to prevent deadlocks with the pager
                         */
                        iovp = (struct iovec *)(&cbp->b_vects[0]);

                cbp->b_vectorlist  = (void *)iovp;
                cbp->b_vectorcount = pg_count;

                if (flags & CL_DEV_MEMORY) {

                        iovp->iov_len  = io_size;
                        iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);

                        if (iovp->iov_base == (caddr_t) 0) {

                        iovp->iov_base += upl_offset;

                        for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {

                                        psize = PAGE_SIZE - pg_offset;

                                iovp->iov_len  = psize;
                                iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);

                                if (iovp->iov_base == (caddr_t) 0) {
                                                _FREE(cbp->b_vectorlist, M_SEGMENT);

                                iovp->iov_base += pg_offset;

                                if (flags & CL_PAGEOUT) {

                                        if (bp = incore(vp, lblkno + i)) {
                                                if (!ISSET(bp->b_flags, B_BUSY)) {
                                                        SET(bp->b_flags, (B_BUSY | B_INVAL));
                                                        panic("BUSY bp found in cluster_io");

                if (flags & CL_ASYNC) {
                        cbp->b_flags |= (B_CALL | B_ASYNC);
                        cbp->b_iodone = (void *)cluster_iodone;

                cbp->b_flags |= io_flags;

                cbp->b_lblkno = lblkno;
                cbp->b_blkno  = blkno;
                cbp->b_bcount = io_size;
                cbp->b_pagelist  = upl;
                cbp->b_uploffset = upl_offset;
                cbp->b_trans_next = (struct buf *)0;

                if (cbp->b_iostate = (void *)iostate)
                        /*
                         * caller wants to track the state of this
                         * io... bump the amount issued against this stream
                         */
                        iostate->io_issued += io_size;

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
                                     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
                                     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

                        cbp_tail->b_trans_next = cbp;

                        (struct buf *)(cbp->b_trans_head) = cbp_head;

                upl_offset += io_size;

                if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
                        /*
                         * if we have no more I/O to issue or
                         * the current I/O we've prepared fully
                         * completes the last page in this request
                         * and it's either an ASYNC request or
                         * we've already accumulated more than 8 I/O's into
                         * this transaction and it's not an I/O directed to
                         * special DEVICE memory
                         * then go ahead and issue the I/O
                         */
                                cbp_head->b_flags |= B_NEED_IODONE;
                                cbp_head->b_real_bp = real_bp;
                                cbp_head->b_real_bp = (struct buf *)NULL;

                                /*
                                 * we're about to issue the last I/O for this upl
                                 * if this was a read to the eof and the eof doesn't
                                 * finish on a page boundary, then we need to zero-fill
                                 * the rest of the page....
                                 */
                                cbp_head->b_validend = zero_offset;
                                cbp_head->b_validend = 0;

                        for (cbp = cbp_head; cbp;) {
                                struct buf * cbp_next;

                                if (io_flags & B_WRITEINPROG)
                                        cbp->b_vp->v_numoutput++;

                                cbp_next = cbp->b_trans_next;

                                (void) VOP_STRATEGY(cbp);

                        if ( !(flags & CL_ASYNC)) {
                                for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)

                                if (error = cluster_iodone(cbp_head)) {
                                        if ((flags & CL_PAGEOUT) && (error == ENXIO))
                                                retval = 0;     /* drop the error */

                        cbp_head = (struct buf *)0;
                        cbp_tail = (struct buf *)0;

        for (cbp = cbp_head; cbp;) {
                struct buf * cbp_next;

                if (cbp->b_vectorcount > 1)
                        _FREE(cbp->b_vectorlist, M_SEGMENT);
                upl_offset -= cbp->b_bcount;
                size       += cbp->b_bcount;
                io_size    += cbp->b_bcount;

                cbp_next = cbp->b_trans_next;

                /*
                 * update the error condition for this stream
                 * since we never really issued the io
                 * just go ahead and adjust it back
                 */
                if (iostate->io_error == 0)
                        iostate->io_error = error;
                iostate->io_issued -= io_size;

                if (iostate->io_wanted) {
                        /*
                         * someone is waiting for the state of
                         * this io stream to change
                         */
                        iostate->io_wanted = 0;
                        wakeup((caddr_t)&iostate->io_wanted);

                pg_offset  = upl_offset & PAGE_MASK;
                abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;

                if (flags & CL_COMMIT) {

                        if (flags & CL_PRESERVE)
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                        else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                        else if (flags & CL_PAGEIN)
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
                                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

                        ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
                                     (int)upl, upl_offset - pg_offset, abort_size, error, 0);

                        real_bp->b_flags |= B_ERROR;
                        real_bp->b_error  = error;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
                     (int)f_offset, size, upl_offset, retval, 0);
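/*
 * Added commentary (not part of the original source): cluster_io() is the
 * common back end for the read/write/pagein/pageout paths in this file.
 * It carves the byte range [f_offset, f_offset + non_rounded_size) covered
 * by the upl into one or more struct buf's sized by vfs_io_attributes(),
 * chains them through b_trans_head/b_trans_next, issues them with
 * VOP_STRATEGY(), and then either commits or aborts the upl pages according
 * to the CL_* flags.  cluster_iodone() above is the completion half of the
 * same transaction.
 */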
cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
                     (int)f_offset, size, (int)filesize, 0, 0);

        if (f_offset >= filesize) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
                             (int)f_offset, 0, 0, 0, 0);

        if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
                size = MAX_UPL_TRANSFER * PAGE_SIZE;

                size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);

        if ((off_t)size > (filesize - f_offset))
                size = filesize - f_offset;

        pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

        for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
                if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)

                f_offset += PAGE_SIZE;

        if (skipped_pages < pages_to_fetch)
                advisory_read(vp, filesize, f_offset, size, devblocksize);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
                     (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);

        return (pages_to_fetch);
cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)

        int           size_of_prefetch;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
                     b_lblkno, e_lblkno, vp->v_lastr, 0, 0);

        if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                             vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);

        if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
            (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                             vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);

        max_pages = MAX_UPL_TRANSFER;

        vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;

        if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
                vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);

        if (e_lblkno < vp->v_maxra) {
                if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                                     vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);

        r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
        f_offset = (off_t)r_lblkno * PAGE_SIZE_64;

        if (f_offset < filesize) {
                size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);

                if (size_of_prefetch)
                        vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                     vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
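/*
 * Added commentary (not part of the original source): the read-ahead window
 * kept in v_ralen ramps up geometrically while the access pattern stays
 * sequential -- 1 page on the first hit, then 2, 4, 8, ... capped at
 * MAX_UPL_TRANSFER pages -- and v_maxra remembers the last page already
 * prefetched so the next call starts just beyond it.  A read that is neither
 * a repeat of v_lastr, its successor, nor the block just past v_maxra takes
 * the early-return path above instead of extending the ramp.
 */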
cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
        vm_offset_t   upl_offset;

        int local_flags = CL_PAGEOUT;

        if ((flags & UPL_IOSYNC) == 0)
                local_flags |= CL_ASYNC;
        if ((flags & UPL_NOCOMMIT) == 0)
                local_flags |= CL_COMMIT;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
                     (int)f_offset, size, (int)filesize, local_flags, 0);

        /*
         * If they didn't specify any I/O, then we are done...
         * we can't issue an abort because we don't know how
         * big the upl really is
         */
        if (vp->v_mount->mnt_flag & MNT_RDONLY) {
                if (local_flags & CL_COMMIT)
                        ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);

        /*
         * can't page-out from a negative offset
         * or if we're starting beyond the EOF
         * or if the file offset isn't page aligned
         * or the size requested isn't a multiple of PAGE_SIZE
         */
        if (f_offset < 0 || f_offset >= filesize ||
           (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
                if (local_flags & CL_COMMIT)
                        ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);

        max_size = filesize - f_offset;

        pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (size > pg_size) {
                if (local_flags & CL_COMMIT)
                        ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
                                            UPL_ABORT_FREE_ON_EMPTY);

        while (vp->v_numoutput >= ASYNC_THROTTLE) {
                vp->v_flag |= VTHROTTLED;
                tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);

        return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
                           local_flags, (struct buf *)0, (struct clios *)0));
cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
        vm_offset_t   upl_offset;

        int local_flags = 0;

        if (upl == NULL || size < 0)
                panic("cluster_pagein: NULL upl passed in");

        if ((flags & UPL_IOSYNC) == 0)
                local_flags |= CL_ASYNC;
        if ((flags & UPL_NOCOMMIT) == 0)
                local_flags |= CL_COMMIT;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
                     (int)f_offset, size, (int)filesize, local_flags, 0);

        /*
         * can't page-in from a negative offset
         * or if we're starting beyond the EOF
         * or if the file offset isn't page aligned
         * or the size requested isn't a multiple of PAGE_SIZE
         */
        if (f_offset < 0 || f_offset >= filesize ||
           (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
                if (local_flags & CL_COMMIT)
                        ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

        max_size = filesize - f_offset;

        if (size < max_size)

        rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (size > rounded_size && (local_flags & CL_COMMIT))
                ubc_upl_abort_range(upl, upl_offset + rounded_size,
                                    size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

        retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
                            local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);

        b_lblkno = (int)(f_offset / PAGE_SIZE_64);
        e_lblkno = (int)
                ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);

        if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
                /*
                 * we haven't read the last page in of the file yet
                 * so let's try to read ahead if we're in
                 * a sequential access pattern
                 */
                cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);

                vp->v_lastr = e_lblkno;
/* cluster_bp */
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
                     (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

        if (bp->b_pagelist == (upl_t) 0)
                panic("cluster_bp: can't handle NULL upl yet\n");
        if (bp->b_flags & B_READ)
                flags = CL_ASYNC | CL_READ;

        f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

        return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)

        vm_offset_t      upl_offset;
        upl_page_info_t *pl;

        if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))

                retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);

        while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)

                /* we know we have a resid, so this is safe */

                while (iov->iov_len == 0) {

                /*
                 * We check every vector target and if it is physically
                 * contiguous space, we skip the sanity checks.
                 */
                upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
                upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;

                upl_flags = UPL_QUERY_OBJECT_TYPE;
                if ((vm_map_get_upl(current_map(),
                                    (vm_offset_t)iov->iov_base & ~PAGE_MASK,
                                    &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
                        /*
                         * the user app must have passed in an invalid address
                         */

                if (upl_flags & UPL_PHYS_CONTIG)

                        if (flags & IO_HEADZEROFILL)

                                flags &= ~IO_HEADZEROFILL;

                                if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))

                        retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);

                        if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))

                                retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);

                else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
                        /*
                         * We set a threshold of 4 pages to decide if the nocopy
                         * write loop is worth the trouble...
                         * we also come here if we're trying to zero the head and/or tail
                         * of a partially written page, and the user source is not a physically contiguous region
                         */
                        retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);

                else if (uio->uio_offset & PAGE_MASK_64)

                        /* Bring the file write offset up to a pagesize boundary */
                        clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
                        if (uio->uio_resid < clip_size)
                                clip_size = uio->uio_resid;
                        /*
                         * Fake the resid going into the cluster_write_x call
                         * and restore it on the way out.
                         */
                        prev_resid = uio->uio_resid;
                        uio->uio_resid = clip_size;
                        retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
                        uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

                else if ((int)iov->iov_base & PAGE_MASK_64)

                        clip_size = iov->iov_len;
                        prev_resid = uio->uio_resid;
                        uio->uio_resid = clip_size;
                        retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
                        uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

                        /*
                         * If we come in here, we know the offset into
                         * the file is on a pagesize boundary
                         */
                        max_io_size = newEOF - uio->uio_offset;
                        clip_size = uio->uio_resid;
                        if (iov->iov_len < clip_size)
                                clip_size = iov->iov_len;
                        if (max_io_size < clip_size)
                                clip_size = max_io_size;

                        if (clip_size < PAGE_SIZE)
                                /*
                                 * Take care of tail end of write in this vector
                                 */
                                prev_resid = uio->uio_resid;
                                uio->uio_resid = clip_size;
                                retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
                                uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

                                /* round clip_size down to a multiple of pagesize */
                                clip_size = clip_size & ~(PAGE_MASK);
                                prev_resid = uio->uio_resid;
                                uio->uio_resid = clip_size;
                                retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
                                if ((retval == 0) && uio->uio_resid)
                                        retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
                                uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
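/*
 * Added commentary (not part of the original source): cluster_write() is a
 * dispatcher.  Cached writes, non-user-space writes, small writes (under the
 * four page threshold) and head/tail zero-fill cases go through
 * cluster_write_x(); a physically contiguous user buffer goes through
 * cluster_phys_write(); otherwise the uio is clipped until the file offset
 * and the user address become page aligned, the page-aligned middle is sent
 * through cluster_nocopy_write(), and any remainder falls back to
 * cluster_write_x().  The prev_resid/clip_size dance fakes uio_resid down to
 * the clipped size for the inner call and restores the true residual
 * afterwards.
 */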
cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)

        upl_page_info_t *pl;
        vm_offset_t      upl_offset;
        int              upl_needed_size;
        int              force_data_sync;
        struct clios     iostate;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
                     (int)uio->uio_offset, (int)uio->uio_resid,
                     (int)newEOF, devblocksize, 0);

        /*
         * When we enter this routine, we know
         *  -- the offset into the file is on a pagesize boundary
         *  -- the resid is a page multiple
         *  -- the resid will not exceed iov_len
         */
        cluster_try_push(vp, newEOF, 0, 1);

        iostate.io_completed = 0;
        iostate.io_issued = 0;
        iostate.io_error = 0;
        iostate.io_wanted = 0;

        while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
                io_size = uio->uio_resid;

                if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
                        io_size = MAX_UPL_TRANSFER * PAGE_SIZE;

                        if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
                                io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;

                upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
                upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
                             (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

                for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {

                        upl_size = upl_needed_size;
                        upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
                                    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;

                        kret = vm_map_get_upl(current_map(),
                                              (vm_offset_t)iov->iov_base & ~PAGE_MASK,

                        if (kret != KERN_SUCCESS) {
                                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,

                                /*
                                 * cluster_nocopy_write: failed to get pagelist
                                 *
                                 * we may have already spun some portion of this request
                                 * off as async requests... we need to wait for the I/O
                                 * to complete before returning
                                 */
                                goto wait_for_writes;

                        pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
                        pages_in_pl = upl_size / PAGE_SIZE;

                        for (i = 0; i < pages_in_pl; i++) {
                                if (!upl_valid_page(pl, i))

                        if (i == pages_in_pl)

                        /*
                         * didn't get all the pages back that we
                         * needed... release this upl and try again
                         */
                        ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
                                            UPL_ABORT_FREE_ON_EMPTY);

                if (force_data_sync >= 3) {
                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                                     i, pages_in_pl, upl_size, kret, 0);

                        /*
                         * for some reason, we couldn't acquire a hold on all
                         * the pages needed in the user's address space
                         *
                         * we may have already spun some portion of this request
                         * off as async requests... we need to wait for the I/O
                         * to complete before returning
                         */
                        goto wait_for_writes;

                /*
                 * Consider the possibility that upl_size wasn't satisfied.
                 */
                if (upl_size != upl_needed_size)
                        io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                             (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);

                        ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
                                            UPL_ABORT_FREE_ON_EMPTY);

                        /*
                         * we may have already spun some portion of this request
                         * off as async requests... we need to wait for the I/O
                         * to complete before returning
                         */
                        goto wait_for_writes;

                /*
                 * Now look for pages already in the cache
                 * and throw them away.
                 */
                upl_f_offset = uio->uio_offset;   /* this is page aligned in the file */
                max_io_size = io_size;

                while (max_io_size) {
                        /*
                         * Flag UPL_POP_DUMP says if the page is found
                         * in the page cache it must be thrown away.
                         */
                                    UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,

                        max_io_size  -= PAGE_SIZE_64;
                        upl_f_offset += PAGE_SIZE_64;

                /*
                 * we want to push out these writes asynchronously so that we can overlap
                 * the preparation of the next I/O
                 * if there are already too many outstanding writes
                 * wait until some complete before issuing the next
                 */
                while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
                        iostate.io_wanted = 1;
                        tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);

                if (iostate.io_error) {
                        /*
                         * one of the earlier writes we issued ran into a hard error
                         * don't issue any more writes, cleanup the UPL
                         * that was just created but not used, then
                         * go wait for all writes that are part of this stream
                         * to complete before returning the error to the caller
                         */
                        ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
                                            UPL_ABORT_FREE_ON_EMPTY);

                        goto wait_for_writes;

                io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT;

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
                             (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

                error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
                                   io_size, devblocksize, io_flag, (struct buf *)0, &iostate);

                iov->iov_len    -= io_size;
                iov->iov_base   += io_size;
                uio->uio_resid  -= io_size;
                uio->uio_offset += io_size;

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
                             (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);

wait_for_writes:
        /*
         * make sure all async writes issued as part of this stream
         * have completed before we return
         */
        while (iostate.io_issued != iostate.io_completed) {
                iostate.io_wanted = 1;
                tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);

        if (iostate.io_error)
                error = iostate.io_error;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
                     (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
cluster_phys_write(vp, uio, newEOF, devblocksize, flags)

        upl_page_info_t *pl;
        vm_offset_t      src_paddr;
        vm_offset_t      upl_offset;
        int              upl_needed_size;

        /*
         * When we enter this routine, we know
         *  -- the resid will not exceed iov_len
         *  -- the vector target address is physically contiguous
         */
        cluster_try_push(vp, newEOF, 0, 1);

        io_size = iov->iov_len;
        upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
        upl_needed_size = upl_offset + io_size;

        upl_size = upl_needed_size;
        upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
                    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;

        kret = vm_map_get_upl(current_map(),
                              (vm_offset_t)iov->iov_base & ~PAGE_MASK,
                              &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

        if (kret != KERN_SUCCESS) {
                /*
                 * cluster_phys_write: failed to get pagelist
                 * note: return kret here
                 */

        /*
         * Consider the possibility that upl_size wasn't satisfied.
         * This is a failure in the physical memory case.
         */
        if (upl_size < upl_needed_size) {
                kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

        pl = ubc_upl_pageinfo(upl);

        src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);

        while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {

                head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

                if (head_size > io_size)
                        head_size = io_size;

                error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);

                        ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

                upl_offset += head_size;
                src_paddr  += head_size;
                io_size    -= head_size;

        tail_size = io_size & (devblocksize - 1);
        io_size  -= tail_size;

                /*
                 * issue a synchronous write to cluster_io
                 */
                error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
                                   io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);

                        /*
                         * The cluster_io write completed successfully,
                         * update the uio structure
                         */
                        uio->uio_resid  -= io_size;
                        iov->iov_len    -= io_size;
                        iov->iov_base   += io_size;
                        uio->uio_offset += io_size;
                        src_paddr       += io_size;

                error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);

                /*
                 * just release our hold on the physically contiguous
                 * region without changing any state
                 */
                ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
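/*
 * Added worked example (not part of the original source): the head/tail
 * split above aligns a physically contiguous write to device blocks.  With
 * devblocksize = 512, uio_offset = 1000 and io_size = 3000:
 *   head_size = 512 - (1000 & 511) = 24    (written via cluster_align_phys_io)
 *   io_size   = 3000 - 24          = 2976
 *   tail_size = 2976 & 511         = 416   (also via cluster_align_phys_io)
 *   io_size   = 2976 - 416         = 2560  (five whole blocks starting at the
 *                                           now-aligned offset 1024, issued
 *                                           directly through cluster_io)
 */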
cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)

        upl_page_info_t *pl;
        vm_offset_t      upl_offset;
        vm_offset_t      io_address;
        long long        total_size;
        long long        zero_cnt1;
        daddr_t          start_blkno;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
                     (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);

        uio_resid = uio->uio_resid;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
                     0, 0, (int)oldEOF, (int)newEOF, 0);

        if (flags & IO_HEADZEROFILL) {
                /*
                 * some filesystems (HFS is one) don't support unallocated holes within a file...
                 * so we zero fill the intervening space between the old EOF and the offset
                 * where the next chunk of real data begins.... ftruncate will also use this
                 * routine to zero fill to the new EOF when growing a file... in this case, the
                 * uio structure will not be provided
                 */
                        if (headOff < uio->uio_offset) {
                                zero_cnt = uio->uio_offset - headOff;

                } else if (headOff < newEOF) {
                        zero_cnt = newEOF - headOff;

        if (flags & IO_TAILZEROFILL) {

                        zero_off1 = uio->uio_offset + uio->uio_resid;

                if (zero_off1 < tailOff)
                        zero_cnt1 = tailOff - zero_off1;

        if (zero_cnt == 0 && uio == (struct uio *) 0)

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
                             retval, 0, 0, 0, 0);

        while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
                /*
                 * for this iteration of the loop, figure out where our starting point is
                 */
                        start_offset = (int)(zero_off & PAGE_MASK_64);
                        upl_f_offset = zero_off - start_offset;
                } else if (uio_resid) {
                        start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
                        upl_f_offset = uio->uio_offset - start_offset;

                        start_offset = (int)(zero_off1 & PAGE_MASK_64);
                        upl_f_offset = zero_off1 - start_offset;

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
                             (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);

                if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
                        total_size = MAX_UPL_TRANSFER * PAGE_SIZE;

                /*
                 * compute the size of the upl needed to encompass
                 * the requested write... limit each call to cluster_io
                 * to the maximum UPL size... cluster_io will clip if
                 * this exceeds the maximum io_size for the device,
                 * make sure to account for
                 * a starting offset that's not page aligned
                 */
                upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

                if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
                        upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;

                pages_in_upl = upl_size / PAGE_SIZE;
                io_size      = upl_size - start_offset;

                if ((long long)io_size > total_size)
                        io_size = total_size;

                start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
                last_blkno  = start_blkno + pages_in_upl;

                kret = ubc_create_upl(vp,

                if (kret != KERN_SUCCESS)
                        panic("cluster_write: failed to get pagelist");

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
                             (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

                if (start_offset && !upl_valid_page(pl, 0)) {

                        /*
                         * we're starting in the middle of the first page of the upl
                         * and the page isn't currently valid, so we're going to have
                         * to read it in first... this is a synchronous operation
                         */
                        read_size = PAGE_SIZE;

                        if ((upl_f_offset + read_size) > newEOF)
                                read_size = newEOF - upl_f_offset;

                        retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
                                            CL_READ, (struct buf *)0, (struct clios *)0);

                                /*
                                 * we had an error during the read which causes us to abort
                                 * the current cluster_write request... before we do, we need
                                 * to release the rest of the pages in the upl without modifying
                                 * their state and mark the failed page in error
                                 */
                                ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
                                ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

                                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                                             (int)upl, 0, 0, retval, 0);

                if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
                        /*
                         * the last offset we're writing to in this upl does not end on a page
                         * boundary... if it's not beyond the old EOF, then we'll also need to
                         * pre-read this page in if it isn't already valid
                         */
                        upl_offset = upl_size - PAGE_SIZE;

                        if ((upl_f_offset + start_offset + io_size) < oldEOF &&
                            !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {

                                read_size = PAGE_SIZE;

                                if ((upl_f_offset + upl_offset + read_size) > newEOF)
                                        read_size = newEOF - (upl_f_offset + upl_offset);

                                retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
                                                    CL_READ, (struct buf *)0, (struct clios *)0);

                                        /*
                                         * we had an error during the read which causes us to abort
                                         * the current cluster_write request... before we do, we
                                         * need to release the rest of the pages in the upl without
                                         * modifying their state and mark the failed page in error
                                         */
                                        ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
                                        ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

                                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                                                     (int)upl, 0, 0, retval, 0);

                if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
                        panic("cluster_write: ubc_upl_map failed\n");
                xfer_resid = io_size;
                io_offset = start_offset;

                while (zero_cnt && xfer_resid) {

                        if (zero_cnt < (long long)xfer_resid)
                                bytes_to_zero = zero_cnt;

                                bytes_to_zero = xfer_resid;

                        if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
                                bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                                             (int)upl_f_offset + io_offset, bytes_to_zero,
                                             (int)io_offset, xfer_resid, 0);

                                bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
                                zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);

                                if ( !upl_valid_page(pl, zero_pg_index)) {
                                        bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                                                     (int)upl_f_offset + io_offset, bytes_to_zero,
                                                     (int)io_offset, xfer_resid, 0);

                                } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
                                           !upl_dirty_page(pl, zero_pg_index)) {
                                        bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                                                     (int)upl_f_offset + io_offset, bytes_to_zero,
                                                     (int)io_offset, xfer_resid, 0);

                        xfer_resid -= bytes_to_zero;
                        zero_cnt   -= bytes_to_zero;
                        zero_off   += bytes_to_zero;
                        io_offset  += bytes_to_zero;

                if (xfer_resid && uio_resid) {
                        bytes_to_move = min(uio_resid, xfer_resid);

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
                                     (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);

                        retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);

                                if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
                                        panic("cluster_write: kernel_upl_unmap failed\n");

                                ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

                                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                                             (int)upl, 0, 0, retval, 0);

                                uio_resid  -= bytes_to_move;
                                xfer_resid -= bytes_to_move;
                                io_offset  += bytes_to_move;

                while (xfer_resid && zero_cnt1 && retval == 0) {

                        if (zero_cnt1 < (long long)xfer_resid)
                                bytes_to_zero = zero_cnt1;

                                bytes_to_zero = xfer_resid;

                        if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
                                bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                                             (int)upl_f_offset + io_offset,
                                             bytes_to_zero, (int)io_offset, xfer_resid, 0);

                                bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
                                zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);

                                if ( !upl_valid_page(pl, zero_pg_index)) {
                                        bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                                                     (int)upl_f_offset + io_offset,
                                                     bytes_to_zero, (int)io_offset, xfer_resid, 0);

                                } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
                                           !upl_dirty_page(pl, zero_pg_index)) {
                                        bzero((caddr_t)(io_address + io_offset), bytes_to_zero);

                                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                                                     (int)upl_f_offset + io_offset,
                                                     bytes_to_zero, (int)io_offset, xfer_resid, 0);

                        xfer_resid -= bytes_to_zero;
                        zero_cnt1  -= bytes_to_zero;
                        zero_off1  += bytes_to_zero;
                        io_offset  += bytes_to_zero;

                        io_size += start_offset;

                        if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
                                /*
                                 * if we're extending the file with this write
                                 * we'll zero fill the rest of the page so that
                                 * if the file gets extended again in such a way as to leave a
                                 * hole starting at this EOF, we'll have zero's in the correct spot
                                 */
                                bzero((caddr_t)(io_address + io_size), upl_size - io_size);

                                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
                                             (int)upl_f_offset + io_size,
                                             upl_size - io_size, 0, 0, 0);

                        if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
                                panic("cluster_write: kernel_upl_unmap failed\n");

                        if (flags & IO_SYNC)
                                /*
                                 * if the IO_SYNC flag is set then we need to
                                 * bypass any clusters and immediately issue
                                 */

                        if (vp->v_clen == 0)
                                /*
                                 * no clusters currently present
                                 */
                                goto start_new_cluster;

                        /*
                         * keep track of the overall dirty page
                         * range we've developed
                         * in case we have to fall back to the
                         * VHASDIRTY method of flushing
                         */
                        if (vp->v_flag & VHASDIRTY)

                        for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
                                /*
                                 * we have an existing cluster... see if this write will extend it nicely
                                 */
                                if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
                                        /*
                                         * the current write starts at or after the current cluster
                                         */
                                        if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
                                                /*
                                                 * we have a write that fits entirely
                                                 * within the existing cluster limits
                                                 */
                                                if (last_blkno > vp->v_clusters[cl_index].last_pg)
                                                        /*
                                                         * update our idea of where the cluster ends
                                                         */
                                                        vp->v_clusters[cl_index].last_pg = last_blkno;

                                        if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
                                                /*
                                                 * we have a write that starts in the middle of the current cluster
                                                 * but extends beyond the cluster's limit
                                                 * we'll clip the current cluster if we actually
                                                 * overlap with the new write
                                                 * and start a new cluster with the current write
                                                 */
                                                if (vp->v_clusters[cl_index].last_pg > start_blkno)
                                                        vp->v_clusters[cl_index].last_pg = start_blkno;

                                        /*
                                         * we also get here for the case where the current write starts
                                         * beyond the limit of the existing cluster
                                         *
                                         * in either case, we'll check the remaining clusters before
                                         * starting a new one
                                         */
                                        /*
                                         * the current write starts in front of the current cluster
                                         */
                                        if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
                                                /*
                                                 * we can just merge the old cluster
                                                 * with the new request and leave it
                                                 */
                                                vp->v_clusters[cl_index].start_pg = start_blkno;

                                                if (last_blkno > vp->v_clusters[cl_index].last_pg) {
                                                        /*
                                                         * the current write completely
                                                         * envelops the existing cluster
                                                         */
                                                        vp->v_clusters[cl_index].last_pg = last_blkno;

                                                /*
                                                 * if we were to combine this write with the current cluster
                                                 * we would exceed the cluster size limit.... so,
                                                 * let's see if there's any overlap of the new I/O with
                                                 * the existing cluster...
                                                 */
                                                if (last_blkno > vp->v_clusters[cl_index].start_pg)
                                                        /*
                                                         * the current write extends into the existing cluster
                                                         * clip the current cluster by moving the start position
                                                         * to where the current write ends
                                                         */
                                                        vp->v_clusters[cl_index].start_pg = last_blkno;
                                                /*
                                                 * if we get here, there was no way to merge
                                                 * the new I/O with this cluster and
                                                 * keep it under our maximum cluster length
                                                 * we'll check the remaining clusters before starting a new one
                                                 */
                        if (cl_index < vp->v_clen)
                                /*
                                 * we found an existing cluster that we
                                 * could merge this I/O into
                                 */

                        if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
                                /*
                                 * we didn't find an existing cluster to
                                 * merge into, but there's room to start
                                 */
                                goto start_new_cluster;

                        /*
                         * no existing cluster to merge with and no
                         * room to start a new one... we'll try
                         * pushing the existing ones... if none of
                         * them are able to be pushed, we'll have
                         * to fall back on the VHASDIRTY mechanism
                         * cluster_try_push will set v_clen to the
                         * number of remaining clusters if it is
                         * unable to push all of them
                         */
                        if (vp->v_flag & VNOCACHE_DATA)

                        if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
                                vp->v_flag |= VHASDIRTY;

start_new_cluster:
                        if (vp->v_clen == 0) {
                                vp->v_ciosiz = devblocksize;
                                vp->v_cstart = start_blkno;
                                vp->v_lastw  = last_blkno;

                        vp->v_clusters[vp->v_clen].start_pg = start_blkno;
                        vp->v_clusters[vp->v_clen].last_pg  = last_blkno;

                        /*
                         * make sure we keep v_cstart and v_lastw up to
                         * date in case we have to fall back on the
                         * V_HASDIRTY mechanism (or we've already entered it)
                         */
                        if (start_blkno < vp->v_cstart)
                                vp->v_cstart = start_blkno;
                        if (last_blkno > vp->v_lastw)
                                vp->v_lastw = last_blkno;

                        ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

                        /*
                         * in order to maintain some semblance of coherency with mapped writes
                         * we need to write the cluster back out as a multiple of the PAGESIZE
                         * unless the cluster encompasses the last page of the file... in this
                         * case we'll round out to the nearest device block boundary
                         */
                        if ((upl_f_offset + io_size) > newEOF) {
                                io_size = newEOF - upl_f_offset;
                                io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);

                        if (flags & IO_SYNC)
                                io_flags = CL_COMMIT | CL_AGE;

                                io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;

                        if (vp->v_flag & VNOCACHE_DATA)
                                io_flags |= CL_DUMP;

                        while (vp->v_numoutput >= ASYNC_THROTTLE) {
                                vp->v_flag |= VTHROTTLED;
                                tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);

                        retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
                                            io_flags, (struct buf *)0, (struct clios *)0);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
                     retval, 0, 0, 0, 0);
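/*
 * Added commentary (not part of the original source): the bookkeeping above
 * is how delayed writes are remembered.  Each vnode keeps up to MAX_CLUSTERS
 * dirty page ranges in v_clusters[]; a new write is merged into an existing
 * range only while the combined span stays within MAX_UPL_TRANSFER pages,
 * otherwise the overlapping range is clipped and the search continues.  If
 * no range can absorb the write and none is free, cluster_try_push() flushes
 * what it can, and as a last resort the vnode falls back to the coarser
 * VHASDIRTY/v_cstart/v_lastw tracking.
 */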
2106 cluster_read(vp
, uio
, filesize
, devblocksize
, flags
)
2117 vm_offset_t upl_offset
;
2120 upl_page_info_t
*pl
;
2125 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_START
,
2126 (int)uio
->uio_offset
, uio
->uio_resid
, (int)filesize
, devblocksize
, 0);
2129 * We set a threshhold of 4 pages to decide if the nocopy
2130 * read loop is worth the trouble...
2133 if (!((vp
->v_flag
& VNOCACHE_DATA
) && (uio
->uio_segflg
== UIO_USERSPACE
)))
2135 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2136 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_END
,
2137 (int)uio
->uio_offset
, uio
->uio_resid
, vp
->v_lastr
, retval
, 0);
2141 while (uio
->uio_resid
&& uio
->uio_offset
< filesize
&& retval
== 0)
2143 /* we know we have a resid, so this is safe */
2145 while (iov
->iov_len
== 0) {
2152 * We check every vector target and if it is physically
2153 * contiguous space, we skip the sanity checks.
2156 upl_offset
= (vm_offset_t
)iov
->iov_base
& ~PAGE_MASK
;
2157 upl_size
= (upl_offset
+ PAGE_SIZE
+(PAGE_SIZE
-1)) & ~PAGE_MASK
;
2159 upl_flags
= UPL_QUERY_OBJECT_TYPE
;
2160 if((vm_map_get_upl(current_map(),
2161 (vm_offset_t
)iov
->iov_base
& ~PAGE_MASK
,
2162 &upl_size
, &upl
, NULL
, &pages_in_pl
, &upl_flags
, 0)) != KERN_SUCCESS
)
2165 * the user app must have passed in an invalid address
2170 if (upl_flags
& UPL_PHYS_CONTIG
)
2172 retval
= cluster_phys_read(vp
, uio
, filesize
, devblocksize
, flags
);
2174 else if (uio
->uio_resid
< 4 * PAGE_SIZE
)
2177 * We set a threshhold of 4 pages to decide if the nocopy
2178 * read loop is worth the trouble...
2180 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2181 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_END
,
2182 (int)uio
->uio_offset
, uio
->uio_resid
, vp
->v_lastr
, retval
, 0);
2185 else if (uio
->uio_offset
& PAGE_MASK_64
)
2187 /* Bring the file offset read up to a pagesize boundary */
2188 clip_size
= (PAGE_SIZE
- (int)(uio
->uio_offset
& PAGE_MASK_64
));
2189 if (uio
->uio_resid
< clip_size
)
2190 clip_size
= uio
->uio_resid
;
2192 * Fake the resid going into the cluster_read_x call
2193 * and restore it on the way out.
2195 prev_resid
= uio
->uio_resid
;
2196 uio
->uio_resid
= clip_size
;
2197 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2198 uio
->uio_resid
= prev_resid
- (clip_size
- uio
->uio_resid
);
2200 else if ((int)iov
->iov_base
& PAGE_MASK_64
)
2202 clip_size
= iov
->iov_len
;
2203 prev_resid
= uio
->uio_resid
;
2204 uio
->uio_resid
= clip_size
;
2205 retval
= cluster_read_x(vp
, uio
, filesize
, devblocksize
, flags
);
2206 uio
->uio_resid
= prev_resid
- (clip_size
- uio
->uio_resid
);
2211              * If we come in here, we know the offset into
2212              * the file is on a pagesize boundary
2215             max_io_size = filesize - uio->uio_offset;
2216             clip_size = uio->uio_resid;
2217             if (iov->iov_len < clip_size)
2218                 clip_size = iov->iov_len;
2219             if (max_io_size < clip_size)
2220                 clip_size = (int)max_io_size;
2222             if (clip_size < PAGE_SIZE)
2225                  * Take care of the tail end of the read in this vector.
2227                 prev_resid = uio->uio_resid;
2228                 uio->uio_resid = clip_size;
2229                 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2230                 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2234                 /* round clip_size down to a multiple of pagesize */
2235                 clip_size = clip_size & ~(PAGE_MASK);
2236                 prev_resid = uio->uio_resid;
2237                 uio->uio_resid = clip_size;
2238                 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2239                 if ((retval == 0) && uio->uio_resid)
2240                     retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2241                 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2246     KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2247         (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
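    /*
     * Note on the clip/restore pattern used throughout cluster_read():
     * uio_resid is temporarily clipped so the callee only sees the slice
     * this pass should handle, then whatever it consumed is folded back
     * into the caller's resid.  A rough walk-through, assuming 4 KB pages
     * (for illustration only), a file offset of 0x1200 and a 64 KB request:
     *
     *     clip_size  = PAGE_SIZE - (0x1200 & 0xFFF) = 0xE00
     *     prev_resid = 0x10000
     *     uio_resid  = 0xE00                         (clipped)
     *     ... cluster_read_x() consumes those 0xE00 bytes ...
     *     uio_resid  = prev_resid - (clip_size - uio_resid) = 0xF200
     *
     * which leaves the offset page aligned so the nocopy path can be used
     * on the next trip around the loop.
     */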
2254 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2261     upl_page_info_t *pl;
2263     vm_offset_t      upl_offset;
2273     vm_offset_t      io_address;
2281     b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2283     while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2285          * compute the size of the upl needed to encompass
2286          * the requested read... limit each call to cluster_io
2287          * to the maximum UPL size... cluster_io will clip if
2288          * this exceeds the maximum io_size for the device,
2289          * make sure to account for
2290          * a starting offset that's not page aligned
2292         start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2293         upl_f_offset = uio->uio_offset - (off_t)start_offset;
2294         max_size     = filesize - uio->uio_offset;
2296         if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2297             io_size = uio->uio_resid;
2301         if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2302             segflg = uio->uio_segflg;
2304             uio->uio_segflg = UIO_PHYS_USERSPACE;
2306             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2307                 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2309             while (io_size && retval == 0) {
2315                     UPL_POP_SET | UPL_POP_BUSY,
2316                     &paddr, 0) != KERN_SUCCESS)
2319                 xsize = PAGE_SIZE - start_offset;
2321                 if (xsize > io_size)
2324                 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2326                 ubc_page_op(vp, upl_f_offset,
2327                     UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2330                 start_offset = (int)
2331                     (uio->uio_offset & PAGE_MASK_64);
2332                 upl_f_offset = uio->uio_offset - start_offset;
2334             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2335                 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2337             uio->uio_segflg = segflg;
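            /*
             * Note: the loop above is the cached-data fast path.
             * ubc_page_op() with UPL_POP_SET | UPL_POP_BUSY looks the page
             * up in the UBC and, if it is resident, returns its physical
             * address with the busy bit held; the data is then copied out
             * with uiomove() while uio_segflg is temporarily switched to
             * UIO_PHYS_USERSPACE, and the busy bit is dropped again with
             * UPL_POP_CLR | UPL_POP_BUSY.  Only when a page is missing does
             * the routine fall through to build a UPL and read from disk.
             */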
2344              * we're already finished with this read request
2345              * let's see if we should do a read-ahead
2348                 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2350             if (!(vp->v_flag & VRAOFF))
2352                  * let's try to read ahead if we're in
2353                  * a sequential access pattern
2355                 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2356             vp->v_lastr = e_lblkno;
2360         max_size = filesize - uio->uio_offset;
2362         upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2363         if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2364             upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2365         pages_in_upl = upl_size / PAGE_SIZE;
2367         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2368             (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2370         kret = ubc_create_upl(vp,
2376         if (kret != KERN_SUCCESS)
2377             panic("cluster_read: failed to get pagelist");
2379         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2380             (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
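        /*
         * The upl_size rounding above is the usual "cover every byte, whole
         * pages only" calculation.  For illustration, assuming 4 KB pages,
         * start_offset = 0x123 and io_size = 0x2000:
         *
         *     upl_size = (0x123 + 0x2000 + 0xFFF) & ~0xFFF = 0x3000
         *
         * i.e. three pages, because the transfer straddles part of a first
         * and part of a last page.  The result is then capped at
         * MAX_UPL_TRANSFER pages per trip through cluster_io().
         */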
2383          * scan from the beginning of the upl looking for the first
2384          * non-valid page.... this will become the first page in
2385          * the request we're going to make to 'cluster_io'... if all
2386          * of the pages are valid, we won't call through to 'cluster_io'
2388         for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2389             if (!upl_valid_page(pl, start_pg))
2394          * scan from the starting invalid page looking for a valid
2395          * page before the end of the upl is reached, if we
2396          * find one, then it will be the last page of the request to
2399         for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2400             if (upl_valid_page(pl, last_pg))
2404         if (start_pg < last_pg) {
2406              * we found a range of 'invalid' pages that must be filled
2407              * if the last page in this range is the last page of the file
2408              * we may have to clip the size of it to keep from reading past
2409              * the end of the last physical block associated with the file
2411             upl_offset = start_pg * PAGE_SIZE;
2412             io_size    = (last_pg - start_pg) * PAGE_SIZE;
2414             if ((upl_f_offset + upl_offset + io_size) > filesize)
2415                 io_size = filesize - (upl_f_offset + upl_offset);
2418              * issue a synchronous read to cluster_io
2421             error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2422                 io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
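        /*
         * The two scans above pick out the first run of not-yet-valid pages
         * in the UPL; only that run has to be read from disk.  A minimal
         * sketch of the same idea (hypothetical helper, a boolean "valid"
         * array standing in for upl_valid_page()):
         *
         *     /+ returns the first invalid run as [*startp, *endp) +/
         *     static void
         *     first_invalid_run(const int *valid, int npages,
         *                       int *startp, int *endp)
         *     {
         *             int start, end;
         *
         *             for (start = 0; start < npages; start++)
         *                     if (!valid[start])
         *                             break;
         *             for (end = start; end < npages; end++)
         *                     if (valid[end])
         *                             break;
         *             *startp = start;
         *             *endp = end;
         *     }
         *
         * Pages before the run, and valid pages after it, are dealt with
         * separately when the UPL is committed further down.
         */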
2426          * if the read completed successfully, or there was no I/O request
2427          * issued, then map the upl into kernel address space and
2428          * move the data into user land.... we'll first add on any 'valid'
2429          * pages that were present in the upl when we acquired it.
2432             u_int  size_of_prefetch;
2434             for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2435                 if (!upl_valid_page(pl, uio_last))
2439              * compute size to transfer this round, if uio->uio_resid is
2440              * still non-zero after this uiomove, we'll loop around and
2441              * set up for another I/O.
2443             val_size = (uio_last * PAGE_SIZE) - start_offset;
2445             if (max_size < val_size)
2446                 val_size = max_size;
2448             if (uio->uio_resid < val_size)
2449                 val_size = uio->uio_resid;
2451             e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2453             if (size_of_prefetch = (uio->uio_resid - val_size)) {
2455                  * if there's still I/O left to do for this request, then issue a
2456                  * pre-fetch I/O... the I/O wait time will overlap
2457                  * with the copying of the data
2459                 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2461             if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2463                  * let's try to read ahead if we're in
2464                  * a sequential access pattern
2466                 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2467             vp->v_lastr = e_lblkno;
2469             if (uio->uio_segflg == UIO_USERSPACE) {
2472                 segflg = uio->uio_segflg;
2474                 uio->uio_segflg = UIO_PHYS_USERSPACE;
2477                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2478                     (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2480                 offset = start_offset;
2482                 while (val_size && retval == 0) {
2487                     i = offset / PAGE_SIZE;
2488                     csize = min(PAGE_SIZE - start_offset, val_size);
2490                     paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2492                     retval = uiomove(paddr, csize, uio);
2496                     start_offset = offset & PAGE_MASK;
2498                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2499                     (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2501                 uio->uio_segflg = segflg;
2505                 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2506                     panic("cluster_read: ubc_upl_map() failed\n");
2508                 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2510                 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2511                     panic("cluster_read: ubc_upl_unmap() failed\n");
2514         if (start_pg < last_pg) {
2516              * compute the range of pages that we actually issued an I/O for
2517              * and either commit them as valid if the I/O succeeded
2518              * or abort them if the I/O failed
2520             io_size = (last_pg - start_pg) * PAGE_SIZE;
2522             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2523                 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2525             if (error || (vp->v_flag & VNOCACHE_DATA))
2526                 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2527                     UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2529                 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2530                     UPL_COMMIT_CLEAR_DIRTY
2531                     | UPL_COMMIT_FREE_ON_EMPTY
2532                     | UPL_COMMIT_INACTIVATE);
2534             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2535                 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2537         if ((last_pg - start_pg) < pages_in_upl) {
2542              * the set of pages that we issued an I/O for did not encompass
2543              * the entire upl... so just release these without modifying
2547                 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2549             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2550                 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2554                  * we found some already valid pages at the beginning of
2555                  * the upl commit these back to the inactive list with
2558                 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2559                     commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2560                         | UPL_COMMIT_INACTIVATE;
2562                     if (upl_dirty_page(pl, cur_pg))
2563                         commit_flags |= UPL_COMMIT_SET_DIRTY;
2565                     if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2566                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2567                             UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2569                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2570                             PAGE_SIZE, commit_flags);
2573             if (last_pg < uio_last) {
2575                  * we found some already valid pages immediately after the
2576                  * pages we issued I/O for, commit these back to the
2577                  * inactive list with reference cleared
2579                 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2580                     commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2581                         | UPL_COMMIT_INACTIVATE;
2583                     if (upl_dirty_page(pl, cur_pg))
2584                         commit_flags |= UPL_COMMIT_SET_DIRTY;
2586                     if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2587                         ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2588                             UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2590                         ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2591                             PAGE_SIZE, commit_flags);
2594             if (uio_last < pages_in_upl) {
2596                  * there were some invalid pages beyond the valid pages
2597                  * that we didn't issue an I/O for, just release them
2600                 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2601                     (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2604             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2605                 (int)upl, -1, -1, 0, 0);
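        /*
         * Note on the commit logic above: pages the read actually filled are
         * committed (or dumped on error / VNOCACHE_DATA), while pages that
         * were already valid when the UPL was grabbed are pushed to the
         * inactive queue with UPL_COMMIT_INACTIVATE.  A clean, already-valid
         * page on a VNOCACHE_DATA vnode is simply dumped, since caching it
         * would defeat the no-cache hint, and dirty pages keep
         * UPL_COMMIT_SET_DIRTY so their modifications are not lost.
         */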
2617 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2625     upl_page_info_t *pl;
2627     vm_offset_t      upl_offset;
2628     off_t            start_upl_f_offset;
2632     int              upl_needed_size;
2640     int              force_data_sync;
2643     struct clios     iostate;
2645     KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2646         (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2649      * When we enter this routine, we know
2650      *  -- the offset into the file is on a pagesize boundary
2651      *  -- the resid is a page multiple
2652      *  -- the resid will not exceed iov_len
2655     iostate.io_completed = 0;
2656     iostate.io_issued = 0;
2657     iostate.io_error = 0;
2658     iostate.io_wanted = 0;
2662     while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2664         max_io_size = filesize - uio->uio_offset;
2666         if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2667             io_size = max_io_size;
2669             io_size = uio->uio_resid;
2672          * We don't come into this routine unless
2673          * UIO_USERSPACE is set.
2675         segflg = uio->uio_segflg;
2677         uio->uio_segflg = UIO_PHYS_USERSPACE;
2680          * First look for pages already in the cache
2681          * and move them to user space.
2683         while (io_size && (retval == 0)) {
2684             upl_f_offset = uio->uio_offset;
2687              * If this call fails, it means the page is not
2688              * in the page cache.
2690             if (ubc_page_op(vp, upl_f_offset,
2691                 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2694             retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2696             ubc_page_op(vp, upl_f_offset,
2697                 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2699             io_size -= PAGE_SIZE;
2700             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2701                 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2703         uio->uio_segflg = segflg;
2707              * we may have already spun some portion of this request
2708              * off as async requests... we need to wait for the I/O
2709              * to complete before returning
2711             goto wait_for_reads;
2714          * If we are already finished with this read, then return
2718              * we may have already spun some portion of this request
2719              * off as async requests... we need to wait for the I/O
2720              * to complete before returning
2722             goto wait_for_reads;
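        /*
         * Note: iostate carries the async-read bookkeeping for this call.
         * Reads are issued below with CL_ASYNC and complete on the I/O
         * completion path; the wait_for_reads code (the drain loop near the
         * end of this routine) sleeps in tsleep() until
         * io_issued == io_completed, setting io_wanted so the completion
         * side knows a wakeup is needed, and any error recorded in io_error
         * by an earlier read is returned once everything has drained.
         */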
2724         max_io_size = io_size;
2726         if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2727             max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2729         if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2730             max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
2733         start_upl_f_offset = uio->uio_offset;   /* this is page aligned in the file */
2734         upl_f_offset = start_upl_f_offset;
2737         while (io_size < max_io_size) {
2738             if (ubc_page_op(vp, upl_f_offset,
2739                 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS) {
2740                 ubc_page_op(vp, upl_f_offset,
2741                     UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2745              * Build up the io request parameters.
2747             io_size += PAGE_SIZE_64;
2748             upl_f_offset += PAGE_SIZE_64;
2752              * we may have already spun some portion of this request
2753              * off as async requests... we need to wait for the I/O
2754              * to complete before returning
2756             goto wait_for_reads;
2758         upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2759         upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2761         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2762             (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2764         for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2766             upl_size = upl_needed_size;
2767             upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2769             kret = vm_map_get_upl(current_map(),
2770                 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2771                 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2773             if (kret != KERN_SUCCESS) {
2774                 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2775                     (int)upl_offset, upl_size, io_size, kret, 0);
2778                  * cluster_nocopy_read: failed to get pagelist
2780                  * we may have already spun some portion of this request
2781                  * off as async requests... we need to wait for the I/O
2782                  * to complete before returning
2784                 goto wait_for_reads;
2786             pages_in_pl = upl_size / PAGE_SIZE;
2787             pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2789             for (i = 0; i < pages_in_pl; i++) {
2790                 if (!upl_valid_page(pl, i))
2793             if (i == pages_in_pl)
2796             ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2797                 UPL_ABORT_FREE_ON_EMPTY);
2799         if (force_data_sync >= 3) {
2800             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2801                 (int)upl_offset, upl_size, io_size, kret, 0);
2803             goto wait_for_reads;
2806          * Consider the possibility that upl_size wasn't satisfied.
2808         if (upl_size != upl_needed_size)
2809             io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2812             ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2813                 UPL_ABORT_FREE_ON_EMPTY);
2814             goto wait_for_reads;
2816         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2817             (int)upl_offset, upl_size, io_size, kret, 0);
2820          * request asynchronously so that we can overlap
2821          * the preparation of the next I/O
2822          * if there are already too many outstanding reads
2823          * wait until some have completed before issuing the next read
2825         while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2826             iostate.io_wanted = 1;
2827             tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2829         if (iostate.io_error) {
2831              * one of the earlier reads we issued ran into a hard error
2832              * don't issue any more reads, cleanup the UPL
2833              * that was just created but not used, then
2834              * go wait for any other reads to complete before
2835              * returning the error to the caller
2837             ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2838                 UPL_ABORT_FREE_ON_EMPTY);
2840             goto wait_for_reads;
2842         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2843             (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2845         retval = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2846             io_size, devblocksize,
2847             CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2848             (struct buf *)0, &iostate);
2851          * update the uio structure
2853         iov->iov_base   += io_size;
2854         iov->iov_len    -= io_size;
2855         uio->uio_resid  -= io_size;
2856         uio->uio_offset += io_size;
2858         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2859             (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
2865      * make sure all async reads that are part of this stream
2866      * have completed before we return
2868     while (iostate.io_issued != iostate.io_completed) {
2869         iostate.io_wanted = 1;
2870         tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2872     if (iostate.io_error)
2873         retval = iostate.io_error;
2875     KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2876         (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
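    /*
     * Note on the throttle in the loop above: the amount of read I/O left
     * outstanding against this stream is capped at
     * 2 * MAX_UPL_TRANSFER * PAGE_SIZE bytes (with MAX_UPL_TRANSFER at 256
     * and 4 KB pages that would be 2 MB, though both values are
     * configuration-dependent); beyond that the issuing thread sleeps until
     * completions catch up rather than queueing more work.
     */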
2883 cluster_phys_read(vp, uio, filesize, devblocksize, flags)
2890     upl_page_info_t *pl;
2892     vm_offset_t      upl_offset;
2893     vm_offset_t      dst_paddr;
2898     int              upl_needed_size;
2903     struct clios     iostate;
2907      * When we enter this routine, we know
2908      *  -- the resid will not exceed iov_len
2909      *  -- the target address is physically contiguous
2914     max_size = filesize - uio->uio_offset;
2916     if (max_size > (off_t)((unsigned int)iov->iov_len))
2917         io_size = iov->iov_len;
2921     upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2922     upl_needed_size = upl_offset + io_size;
2926     upl_size = upl_needed_size;
2927     upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2929     kret = vm_map_get_upl(current_map(),
2930         (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2931         &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2933     if (kret != KERN_SUCCESS) {
2935          * cluster_phys_read: failed to get pagelist
2939     if (upl_size < upl_needed_size) {
2941          * The upl_size wasn't satisfied.
2943         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2947     pl = ubc_upl_pageinfo(upl);
2949     dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
2951     while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2954         head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
2956         if (head_size > io_size)
2957             head_size = io_size;
2959         error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
2962             ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2966         upl_offset += head_size;
2967         dst_paddr  += head_size;
2968         io_size    -= head_size;
2970     tail_size = io_size & (devblocksize - 1);
2971     io_size  -= tail_size;
2973     iostate.io_completed = 0;
2974     iostate.io_issued = 0;
2975     iostate.io_error = 0;
2976     iostate.io_wanted = 0;
2978     while (io_size && error == 0) {
2981         if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2982             xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
2986          * request asynchronously so that we can overlap
2987          * the preparation of the next I/O... we'll do
2988          * the commit after all the I/O has completed
2989          * since it's all issued against the same UPL
2990          * if there are already too many outstanding reads
2991          * wait until some have completed before issuing the next
2993         while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2994             iostate.io_wanted = 1;
2995             tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2998         error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
2999             CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3000             (struct buf *)0, &iostate);
3002          * The cluster_io read was issued successfully,
3003          * update the uio structure
3006         uio->uio_resid  -= xsize;
3007         iov->iov_len    -= xsize;
3008         iov->iov_base   += xsize;
3009         uio->uio_offset += xsize;
3011         upl_offset      += xsize;
3016      * make sure all async reads that are part of this stream
3017      * have completed before we proceed
3019     while (iostate.io_issued != iostate.io_completed) {
3020         iostate.io_wanted = 1;
3021         tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3023     if (iostate.io_error) {
3024         error = iostate.io_error;
3026     if (error == 0 && tail_size)
3027         error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
3030      * just release our hold on the physically contiguous
3031      * region without changing any state
3033     ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
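    /*
     * The head/tail handling above keeps the direct path device-block
     * aligned.  A rough example, assuming devblocksize = 512 and a starting
     * uio_offset of 0x1234:
     *
     *     head_size = 512 - (0x1234 & 511) = 512 - 0x34 = 0x1CC
     *
     * so the first 0x1CC bytes go through cluster_align_phys_io() (a
     * page-buffered copy through the UBC), the block-aligned middle is
     * issued directly with CL_DEV_MEMORY, and tail_size picks up whatever
     * sub-block fragment is left at the end, handled the same way once the
     * async reads have drained.
     */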
3040  * generate advisory I/O's in the largest chunks possible
3041  * the completed pages will be released into the VM cache
3044 advisory_read(vp, filesize, f_offset, resid, devblocksize)
3051     upl_page_info_t *pl;
3053     vm_offset_t      upl_offset;
3066     if (!UBCINFOEXISTS(vp))
3069     KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3070         (int)f_offset, resid, (int)filesize, devblocksize, 0);
3072     while (resid && f_offset < filesize && retval == 0) {
3074          * compute the size of the upl needed to encompass
3075          * the requested read... limit each call to cluster_io
3076          * to the maximum UPL size... cluster_io will clip if
3077          * this exceeds the maximum io_size for the device,
3078          * make sure to account for
3079          * a starting offset that's not page aligned
3081         start_offset = (int)(f_offset & PAGE_MASK_64);
3082         upl_f_offset = f_offset - (off_t)start_offset;
3083         max_size     = filesize - f_offset;
3085         if (resid < max_size)
3090         upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3091         if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3092             upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3093         pages_in_upl = upl_size / PAGE_SIZE;
3095         kret = ubc_create_upl(vp,
3100             UPL_RET_ONLY_ABSENT);
3101         if (kret != KERN_SUCCESS)
3106          * before we start marching forward, we must make sure we end on
3107          * a present page, otherwise we will be working with a freed
3110         for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3111             if (upl_page_present(pl, last_pg))
3114         pages_in_upl = last_pg + 1;
3117         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
3118             (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3121         for (last_pg = 0; last_pg < pages_in_upl; ) {
3123              * scan from the beginning of the upl looking for the first
3124              * page that is present.... this will become the first page in
3125              * the request we're going to make to 'cluster_io'... if all
3126              * of the pages are absent, we won't call through to 'cluster_io'
3128             for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3129                 if (upl_page_present(pl, start_pg))
3134              * scan from the starting present page looking for an absent
3135              * page before the end of the upl is reached, if we
3136              * find one, then it will terminate the range of pages being
3137              * presented to 'cluster_io'
3139             for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3140                 if (!upl_page_present(pl, last_pg))
3144             if (last_pg > start_pg) {
3146                  * we found a range of pages that must be filled
3147                  * if the last page in this range is the last page of the file
3148                  * we may have to clip the size of it to keep from reading past
3149                  * the end of the last physical block associated with the file
3151                 upl_offset = start_pg * PAGE_SIZE;
3152                 io_size    = (last_pg - start_pg) * PAGE_SIZE;
3154                 if ((upl_f_offset + upl_offset + io_size) > filesize)
3155                     io_size = filesize - (upl_f_offset + upl_offset);
3158                  * issue an asynchronous read to cluster_io
3160                 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3161                     CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
3167             ubc_upl_abort(upl, 0);
3169         io_size = upl_size - start_offset;
3171         if (io_size > resid)
3173         f_offset += io_size;
3177     KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3178         (int)f_offset, resid, retval, 0, 0);
3190     if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
3191         vp->v_flag &= ~VHASDIRTY;
3195     KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3196         vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3198     if (vp->v_flag & VHASDIRTY) {
3203         start_pg = vp->v_cstart;
3204         end_pg   = vp->v_lastw;
3206         vp->v_flag &= ~VHASDIRTY;
3209         while (start_pg < end_pg) {
3210             last_pg = start_pg + MAX_UPL_TRANSFER;
3212             if (last_pg > end_pg)
3215             cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
3221     retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3223     KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3224         vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3231 cluster_try_push(vp, EOF, can_delay, push_all)
3243     struct v_cluster l_clusters[MAX_CLUSTERS];
3246      * make a local 'sorted' copy of the clusters
3247      * and clear vp->v_clen so that new clusters can
3250     for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3251         for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3252             if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3254             if (min_index == -1)
3255                 min_index = cl_index1;
3256             else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3257                 min_index = cl_index1;
3259         if (min_index == -1)
3261         l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3262         l_clusters[cl_index].last_pg  = vp->v_clusters[min_index].last_pg;
3264         vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3269     for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3271          * try to push each cluster in turn... cluster_push_x may not
3272          * push the cluster if can_delay is TRUE and the cluster doesn't
3273          * meet the criteria for an immediate push
3275         if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3276             l_clusters[cl_index].start_pg = 0;
3277             l_clusters[cl_index].last_pg  = 0;
3285     if (cl_len > cl_pushed) {
3287          * we didn't push all of the clusters, so
3288          * let's try to merge them back in to the vnode
3290         if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3292              * we picked up some new clusters while we were trying to
3293              * push the old ones (I don't think this can happen because
3294              * I'm holding the lock, but just in case)... the sum of the
3295              * leftovers plus the new cluster count exceeds our ability
3296              * to represent them, so fall back to the VHASDIRTY mechanism
3298             for (cl_index = 0; cl_index < cl_len; cl_index++) {
3299                 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3302                 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3303                     vp->v_cstart = l_clusters[cl_index].start_pg;
3304                 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3305                     vp->v_lastw = l_clusters[cl_index].last_pg;
3307             vp->v_flag |= VHASDIRTY;
3310              * we've got room to merge the leftovers back in
3311              * just append them starting at the next 'hole'
3312              * represented by vp->v_clen
3314             for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3315                 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3318                 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3319                 vp->v_clusters[cl_index1].last_pg  = l_clusters[cl_index].last_pg;
3321                 if (cl_index1 == 0) {
3322                     vp->v_cstart = l_clusters[cl_index].start_pg;
3323                     vp->v_lastw  = l_clusters[cl_index].last_pg;
3325                     if (l_clusters[cl_index].start_pg < vp->v_cstart)
3326                         vp->v_cstart = l_clusters[cl_index].start_pg;
3327                     if (l_clusters[cl_index].last_pg > vp->v_lastw)
3328                         vp->v_lastw = l_clusters[cl_index].last_pg;
3333              * update the cluster count
3335             vp->v_clen = cl_index1;
3338     return(MAX_CLUSTERS - vp->v_clen);
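    /*
     * The nested loops at the top of cluster_try_push() are a selection
     * sort: on each outer pass the live v_clusters entry with the lowest
     * start_pg is copied into l_clusters and then neutralized by setting
     * its start_pg equal to its last_pg (an empty cluster).  A compact
     * sketch of the same idea, using a hypothetical pair array:
     *
     *     for (i = 0; i < n; i++) {
     *             min = -1;
     *             for (j = 0; j < n; j++) {
     *                     if (c[j].start == c[j].end)   /+ empty / consumed +/
     *                             continue;
     *                     if (min == -1 || c[j].start < c[min].start)
     *                             min = j;
     *             }
     *             if (min == -1)
     *                     break;
     *             sorted[i] = c[min];
     *             c[min].start = c[min].end;            /+ mark consumed +/
     *     }
     *
     * Pushing the clusters in ascending file order keeps the resulting
     * writes as sequential as possible for the underlying device.
     */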
3344 cluster_push_x(vp, EOF, first, last, can_delay)
3351     upl_page_info_t *pl;
3353     vm_offset_t      upl_offset;
3365     KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3366         vp->v_clen, first, last, EOF, 0);
3368     if ((pages_in_upl = last - first) == 0) {
3369         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3373     upl_size = pages_in_upl * PAGE_SIZE;
3374     upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
3376     if (upl_f_offset + upl_size >= EOF) {
3378         if (upl_f_offset >= EOF) {
3380              * must have truncated the file and missed
3381              * clearing a dangling cluster (i.e. it's completely
3382              * beyond the new EOF
3384             KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3388         size = EOF - upl_f_offset;
3390         upl_size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
3391         pages_in_upl = upl_size / PAGE_SIZE;
3393     if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3397     kret = ubc_create_upl(vp,
3402         UPL_RET_ONLY_DIRTY);
3403     if (kret != KERN_SUCCESS)
3404         panic("cluster_push: failed to get pagelist");
3409     for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3410         if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3413     if (num_of_dirty < pages_in_upl / 2) {
3414         ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3416         KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3425         for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3426             if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3429         if (start_pg > last_pg) {
3430             io_size = (start_pg - last_pg) * PAGE_SIZE;
3432             ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3433                 UPL_ABORT_FREE_ON_EMPTY);
3440         for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3441             if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3444         upl_offset = start_pg * PAGE_SIZE;
3446         io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3448         if (vp->v_flag & VNOCACHE_DATA)
3449             io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
3451             io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3453         while (vp->v_numoutput >= ASYNC_THROTTLE) {
3454             vp->v_flag |= VTHROTTLED;
3455             tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3457         cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
3461     KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
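    /*
     * Note: the UPL here is created with UPL_RET_ONLY_DIRTY, and the scan
     * above counts how many of its pages are actually valid and dirty.
     * When the caller allows the push to be delayed (can_delay) and fewer
     * than half the pages are dirty, the whole UPL is aborted and the push
     * is deferred, on the theory that a sparse cluster is cheaper to write
     * once it has filled in; otherwise each contiguous dirty run is handed
     * to cluster_io() with CL_COMMIT | CL_AGE | CL_ASYNC (plus CL_DUMP for
     * VNOCACHE_DATA vnodes), throttled by ASYNC_THROTTLE via v_numoutput.
     */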
3469 cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)
3472     upl_page_info_t *pl;
3474     vm_offset_t      ubc_paddr;
3480     kret = ubc_create_upl(vp,
3481         uio->uio_offset & ~PAGE_MASK_64,
3487     if (kret != KERN_SUCCESS)
3490     if (!upl_valid_page(pl, 0)) {
3492          * issue a synchronous read to cluster_io
3494         error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3495             CL_READ, (struct buf *)0, (struct clios *)0);
3497             ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3502     ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);
3504     if (flags & CL_READ)
3505         copyp2p(ubc_paddr, usr_paddr, xsize, 2);
3507         copyp2p(usr_paddr, ubc_paddr, xsize, 1);
3509     if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
3511          * issue a synchronous write to cluster_io
3513         error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3514             0, (struct buf *)0, (struct clios *)0);
3517     uio->uio_offset += xsize;
3518     iov->iov_base   += xsize;
3519     iov->iov_len    -= xsize;
3520     uio->uio_resid  -= xsize;
3522     ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
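    /*
     * Note: cluster_align_phys_io() handles the sub-devblocksize head and
     * tail fragments of a physically contiguous transfer by staging them
     * through a single UBC page: the page is read in synchronously if it is
     * not already valid, the fragment is copied between that page and the
     * user's physical buffer with copyp2p(), and for writes (or if the page
     * was already dirty) the page is written back synchronously before the
     * UPL is released.
     */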