/*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <vm/vm_pageout.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>

#include <sys/kdebug.h>
#define CL_COMMIT	0x04
#define CL_PAGEOUT	0x10
#define CL_NOZERO	0x80
#define CL_PAGEIN	0x100
#define CL_DEV_MEMORY	0x200
#define CL_PRESERVE	0x400
#define CL_THROTTLE	0x800
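
/*
 * The CL_* values above are bit flags interpreted by cluster_io() in its
 * 'flags' argument; the page-level entry points below (cluster_pageout()
 * and cluster_pagein()) translate their UPL_* request flags into this set
 * before calling it.
 */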
struct clios {
	u_int	io_completed;	/* amount of io that has currently completed */
	u_int	io_issued;	/* amount of io that was successfully issued */
	int	io_error;	/* error code of first error encountered */
	int	io_wanted;	/* someone is sleeping waiting for a change in state */
};
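
/*
 * A 'struct clios' tracks a stream of asynchronous cluster_io() requests.
 * As an illustrative sketch (roughly the pattern cluster_nocopy_write()
 * uses further below):
 *
 *	struct clios iostate;
 *
 *	iostate.io_completed = 0;
 *	iostate.io_issued    = 0;
 *	iostate.io_error     = 0;
 *	iostate.io_wanted    = 0;
 *
 *	error = cluster_io(vp, upl, upl_offset, f_offset, io_size,
 *			   devblocksize, CL_ASYNC | io_flags,
 *			   (struct buf *)0, &iostate);
 *
 *	while (iostate.io_issued != iostate.io_completed) {
 *		iostate.io_wanted = 1;
 *		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "caller", 0);
 *	}
 *
 * cluster_iodone() adds each completed transaction to io_completed, records
 * the first failure in io_error, and wakes any thread sleeping on io_wanted.
 */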
static void	cluster_zero(upl_t upl, vm_offset_t upl_offset,
			int size, struct buf *bp);
static int	cluster_read_x(struct vnode *vp, struct uio *uio,
			off_t filesize, int devblocksize, int flags);
static int	cluster_write_x(struct vnode *vp, struct uio *uio,
			off_t oldEOF, off_t newEOF, off_t headOff,
			off_t tailOff, int devblocksize, int flags);
static int	cluster_nocopy_read(struct vnode *vp, struct uio *uio,
			off_t filesize, int devblocksize, int flags);
static int	cluster_nocopy_write(struct vnode *vp, struct uio *uio,
			off_t newEOF, int devblocksize, int flags);
static int	cluster_phys_read(struct vnode *vp, struct uio *uio,
			off_t filesize, int devblocksize, int flags);
static int	cluster_phys_write(struct vnode *vp, struct uio *uio,
			off_t newEOF, int devblocksize, int flags);
static int	cluster_align_phys_io(struct vnode *vp, struct uio *uio,
			addr64_t usr_paddr, int xsize, int devblocksize, int flags);
static int	cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
static int	cluster_try_push(struct vnode *vp, off_t EOF, int can_delay, int push_all);

static int	sparse_cluster_switch(struct vnode *vp, off_t EOF);
static int	sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all);
static int	sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
static kern_return_t vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int	ubc_page_op_with_control __P((memory_object_control_t, off_t, int, ppnum_t *, int *));
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define ASYNC_THROTTLE		18
#define HARD_THROTTLE_MAXCNT	1
#define HARD_THROTTLE_MAXSIZE	(64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;
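
/*
 * ASYNC_THROTTLE is the normal per-vnode limit applied by cluster_io() when
 * CL_THROTTLE is set; when cluster_hard_throttle_on() reports recent priority
 * I/O against the root device, cluster_io() instead limits itself to
 * HARD_THROTTLE_MAXCNT outstanding I/Os of at most HARD_THROTTLE_MAXSIZE
 * bytes each.  cluster_iodone() wakes throttled writers once v_numoutput
 * drops to ASYNC_THROTTLE / 3.
 */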
static int
cluster_hard_throttle_on(vp)
	struct vnode *vp;
{
	static struct timeval hard_throttle_maxelapsed = { 0, 300000 };

	if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
		struct timeval elapsed;

		if (hard_throttle_on_root)
			return(1);

		microuptime(&elapsed);
		timevalsub(&elapsed, &priority_IO_timestamp_for_root);

		if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
			return(1);
	}
	return(0);
}
static int
cluster_iodone(bp)
	struct buf *bp;
{
	struct buf   *cbp_head;
	struct buf   *cbp_next;
	struct clios *iostate;

	cbp_head = (struct buf *)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START, (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
		/*
		 * all I/O requests that are part of this transaction
		 * have to complete before we can process it
		 */
		if ( !(cbp->b_flags & B_DONE)) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
			return 0;
		}
	}
	upl_offset  = cbp->b_uploffset;
	upl         = cbp->b_pagelist;
	b_flags     = cbp->b_flags;
	real_bp     = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate     = (struct clios *)cbp->b_iostate;

	if ((cbp->b_flags & B_ERROR) && error == 0)
		error = cbp->b_error;

	total_resid += cbp->b_resid;
	total_size  += cbp->b_bcount;

	cbp_next = cbp->b_trans_next;

	cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
		vp->v_flag &= ~VTHROTTLED;
		wakeup((caddr_t)&vp->v_numoutput);
	}
	/*
	 * someone has issued multiple I/Os asynchronously
	 * and is waiting for them to complete (streaming)
	 */
	if (error && iostate->io_error == 0)
		iostate->io_error = error;

	iostate->io_completed += total_size;

	if (iostate->io_wanted) {
		/*
		 * someone is waiting for the state of
		 * this io stream to change
		 */
		iostate->io_wanted = 0;
		wakeup((caddr_t)&iostate->io_wanted);
	}
	if ((b_flags & B_NEED_IODONE) && real_bp) {
		real_bp->b_flags |= B_ERROR;
		real_bp->b_error  = error;

		real_bp->b_resid  = total_resid;
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (b_flags & B_COMMIT_UPL) {
		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error || (b_flags & B_NOCACHE)) {
			if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (b_flags & B_PGIN)
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
					    upl_abort_code);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     0x80000000|upl_abort_code, 0);
		} else {
			int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if (b_flags & B_PHYS) {
				if (b_flags & B_READ)
					upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
			} else if ( !(b_flags & B_PAGEOUT))
				upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;

			upl_commit_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
					     upl_commit_flags);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     upl_commit_flags, 0);
		}
	} else
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			     (int)upl, upl_offset, 0, error, 0);

	return (error);
}
static void
cluster_zero(upl, upl_offset, size, bp)
	upl_t		upl;
	vm_offset_t	upl_offset;
	int		size;
	struct buf     *bp;
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START, upl_offset, size, (int)bp, 0, 0);

	if (bp == NULL || bp->b_data == NULL) {

		pl = ubc_upl_pageinfo(upl);

		page_index  = upl_offset / PAGE_SIZE;
		page_offset = upl_offset & PAGE_MASK;

		zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
		zero_cnt  = min(PAGE_SIZE - page_offset, size);

		bzero_phys(zero_addr, zero_cnt);

		upl_offset += zero_cnt;
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_data + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END, upl_offset, size, 0, 0, 0);
}
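
/*
 * cluster_zero() has two paths: when no kernel mapping is available
 * (bp == NULL or bp->b_data == NULL) it zeroes the UPL's physical pages
 * directly via upl_phys_page()/bzero_phys(), one page fragment at a time;
 * otherwise it simply bzero()'s within bp->b_data at the requested offset.
 */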
static int
cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
	vm_offset_t	upl_offset;
	int		non_rounded_size;
	struct clios   *iostate;
{
	struct buf *cbp_head = 0;
	struct buf *cbp_tail = 0;

	size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);

	size = non_rounded_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);

	if (flags & CL_READ) {
		io_flags = (B_VECTORLIST | B_READ);

		vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
	} else {
		io_flags = (B_VECTORLIST | B_WRITEINPROG);

		vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
	}
	/*
	 * make sure the maximum iosize is at least the size of a page
	 * and that it is a multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
			if (max_iosize > HARD_THROTTLE_MAXSIZE)
				max_iosize = HARD_THROTTLE_MAXSIZE;
			async_throttle = HARD_THROTTLE_MAXCNT;
		} else
			async_throttle = ASYNC_THROTTLE;
	}
	io_flags |= B_NOCACHE;

	if (flags & CL_PAGEIN)
		io_flags |= B_PGIN;
	if (flags & CL_PAGEOUT)
		io_flags |= B_PAGEOUT;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_PRESERVE)
		io_flags |= B_PHYS;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file)
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
	if (size > max_iosize)
		io_size = max_iosize;

	if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
		if (error == EOPNOTSUPP)
			panic("VOP_CMAP Unimplemented");
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE, (int)f_offset, (int)blkno, io_size, zero_offset, 0);

	if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
		if (flags & CL_PAGEOUT) {
			/*
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (it could
			 * be mapped in a "hole" and require allocation)
			 */
			ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);

			if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
			}
		}
		f_offset   += PAGE_SIZE_64;
		upl_offset += PAGE_SIZE;
	}
	lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
	/*
	 * we have now figured out how much I/O we can do - this is in 'io_size'
	 * pg_offset is the starting point in the first page for the I/O
	 * pg_count is the number of full and partial pages that 'io_size' encompasses
	 */
	pg_offset = upl_offset & PAGE_MASK;

	if (flags & CL_DEV_MEMORY) {
		/*
		 * currently, can't deal with reading 'holes' in file
		 */
		if ((long)blkno == -1) {
		}
		/*
		 * treat physical requests as one 'giant' page
		 */
	}
	pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

	if ((flags & CL_READ) && (long)blkno == -1) {
		/*
		 * if we're reading and blkno == -1, then we've got a
		 * 'hole' in the file that we need to deal with by zeroing
		 * out the affected area in the upl
		 */
		if (zero_offset && io_size == size) {
			/*
			 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
			 * then 'zero_offset' will be non-zero
			 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
			 * (indicated by the io_size finishing off the I/O request for this UPL)
			 * then we're not going to issue an I/O for the
			 * last page in this upl... we need to zero both the hole and the tail
			 * of the page beyond the EOF, since the delayed zero-fill won't kick in
			 */
			bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
		} else
			bytes_to_zero = io_size;

		cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

		if (cbp_head) {
			/*
			 * if there is a current I/O chain pending
			 * then the first page of the group we just zero'd
			 * will be handled by the I/O completion if the zero
			 * fill started in the middle of the page
			 */
			pg_count = (io_size - pg_offset) / PAGE_SIZE;
		} else {
			/*
			 * no pending I/O to pick up that first page
			 * so, we have to make sure it gets committed...
			 * set the pg_offset to 0 so that the upl_commit_range
			 * starts with this page
			 */
			pg_count = (io_size + pg_offset) / PAGE_SIZE;
		}
		if (io_size == size && ((upl_offset + io_size) & PAGE_MASK)) {
			/*
			 * if we're done with the request for this UPL
			 * then we have to make sure to commit the last page
			 * even if we only partially zero-filled it
			 */
		}
		pg_resid = PAGE_SIZE - pg_offset;

		if (flags & CL_COMMIT)
			ubc_upl_commit_range(upl,
				(upl_offset + pg_resid) & ~PAGE_MASK,
				pg_count * PAGE_SIZE,
				UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);

		upl_offset += io_size;

		if (cbp_head && pg_count)

	} else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
		real_bp->b_blkno = blkno;
	}
	if (pg_count > max_vectors) {
		io_size -= (pg_count - max_vectors) * PAGE_SIZE;

		io_size  = PAGE_SIZE - pg_offset;

		pg_count = max_vectors;
	}
	if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV))
		/*
		 * if we're not targeting a virtual device i.e. a disk image
		 * it's safe to dip into the reserve pool since real devices
		 * can complete this I/O request without requiring additional
		 * bufs from the alloc_io_buf pool
		 */
		priv = 1;
	else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
		/*
		 * Throttle the speculative IO
		 */
		priv = 0;

	cbp = alloc_io_buf(vp, priv);

	if (flags & CL_PAGEOUT) {
		for (i = 0; i < pg_count; i++) {
			if (bp = incore(vp, lblkno + i)) {
				if (!ISSET(bp->b_flags, B_BUSY)) {
					SET(bp->b_flags, (B_BUSY | B_INVAL));
				} else
					panic("BUSY bp found in cluster_io");
			}
		}
	}
	if (flags & CL_ASYNC) {
		cbp->b_flags |= (B_CALL | B_ASYNC);
		cbp->b_iodone = (void *)cluster_iodone;
	}
	cbp->b_flags |= io_flags;

	cbp->b_lblkno     = lblkno;
	cbp->b_blkno      = blkno;
	cbp->b_bcount     = io_size;
	cbp->b_pagelist   = upl;
	cbp->b_uploffset  = upl_offset;
	cbp->b_trans_next = (struct buf *)0;

	if (cbp->b_iostate = (void *)iostate)
		/*
		 * caller wants to track the state of this
		 * io... bump the amount issued against this stream
		 */
		iostate->io_issued += io_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE, cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE, cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

	cbp_tail->b_trans_next = cbp;

	(struct buf *)(cbp->b_trans_head) = cbp_head;

	upl_offset += io_size;

	if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
		/*
		 * if we have no more I/O to issue or
		 * the current I/O we've prepared fully
		 * completes the last page in this request
		 * and it's either an ASYNC request or
		 * we've already accumulated more than 8 I/O's into
		 * this transaction and it's not an I/O directed to
		 * special DEVICE memory
		 * then go ahead and issue the I/O
		 */
		cbp_head->b_flags |= B_NEED_IODONE;
		cbp_head->b_real_bp = real_bp;

		cbp_head->b_real_bp = (struct buf *)NULL;

		/*
		 * we're about to issue the last I/O for this upl
		 * if this was a read to the eof and the eof doesn't
		 * finish on a page boundary, then we need to zero-fill
		 * the rest of the page....
		 */
		cbp_head->b_validend = zero_offset;

		cbp_head->b_validend = 0;

		if (flags & CL_THROTTLE) {
			while (vp->v_numoutput >= async_throttle) {
				vp->v_flag |= VTHROTTLED;
				tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_io", 0);
			}
		}
		for (cbp = cbp_head; cbp;) {
			struct buf * cbp_next;

			if (io_flags & B_WRITEINPROG)
				cbp->b_vp->v_numoutput++;

			cbp_next = cbp->b_trans_next;

			(void) VOP_STRATEGY(cbp);
		}
		if ( !(flags & CL_ASYNC)) {
			for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
				biowait(cbp);

			if (error = cluster_iodone(cbp_head)) {
				if ((flags & CL_PAGEOUT) && (error == ENXIO))
					retval = 0;	/* drop the error */
			}
		}
		cbp_head = (struct buf *)0;
		cbp_tail = (struct buf *)0;
	}
	for (cbp = cbp_head; cbp;) {
		struct buf * cbp_next;

		upl_offset -= cbp->b_bcount;
		size       += cbp->b_bcount;
		io_size    += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;
	}
	/*
	 * update the error condition for this stream
	 * since we never really issued the io
	 * just go ahead and adjust it back
	 */
	if (iostate->io_error == 0)
		iostate->io_error = error;
	iostate->io_issued -= io_size;

	if (iostate->io_wanted) {
		/*
		 * someone is waiting for the state of
		 * this io stream to change
		 */
		iostate->io_wanted = 0;
		wakeup((caddr_t)&iostate->io_wanted);
	}
	pg_offset  = upl_offset & PAGE_MASK;
	abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (flags & CL_COMMIT) {
		if (flags & CL_PRESERVE) {
			ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
					     UPL_COMMIT_FREE_ON_EMPTY);
		} else {
			if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (flags & CL_PAGEIN)
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
					    upl_abort_code);
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
			     (int)upl, upl_offset - pg_offset, abort_size, error, 0);
	}
	real_bp->b_flags |= B_ERROR;
	real_bp->b_error  = error;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
		     (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}
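
/*
 * cluster_io() carves a request into device-sized chunks, allocates a
 * struct buf for each chunk via alloc_io_buf(), and links the bufs through
 * b_trans_next with b_trans_head pointing at the head of the chain.
 * cluster_iodone() (above) only processes the transaction once every buf
 * in the chain has been marked B_DONE.
 */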
static int
cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
{
	int pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return(0);
	}
	if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
		size = (MAX_UPL_TRANSFER * PAGE_SIZE);

	size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;
	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read(vp, filesize, f_offset, size, devblocksize);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}
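
/*
 * cluster_rd_ahead() (below) derives the read-ahead window from the last
 * read (v_lastr), the current read-ahead length (v_ralen) and the furthest
 * page already prefetched (v_maxra), doubling v_ralen up to MAX_UPL_TRANSFER
 * pages while the access pattern remains sequential, and then hands the
 * window to cluster_rd_prefetch().
 */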
static void
cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
{
	int size_of_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     b_lblkno, e_lblkno, vp->v_lastr, 0, 0);

	if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
		return;
	}
	if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
	    (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
		return;
	}
	if (e_lblkno < vp->v_maxra) {
		if ((vp->v_maxra - e_lblkno) > (MAX_UPL_TRANSFER / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
			return;
		}
	}
	r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
	f_offset = (off_t)r_lblkno * PAGE_SIZE_64;

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		vp->v_ralen = vp->v_ralen ? min(MAX_UPL_TRANSFER, vp->v_ralen << 1) : 1;

		if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
			vp->v_ralen = min(MAX_UPL_TRANSFER, (e_lblkno + 1) - b_lblkno);

		size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);

		if (size_of_prefetch)
			vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     vp->v_ralen, vp->v_maxra, vp->v_lastr, 4, 0);
}
int
cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
	vm_offset_t	upl_offset;
{
	if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
		/*
		 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
		 * then we don't want to enforce this throttle... if we do, we can
		 * potentially deadlock since we're stalling the pageout thread at a time
		 * when the disk image might need additional memory (which won't be available
		 * if the pageout thread can't run)... instead we'll just depend on the throttle
		 * that the pageout thread now has in place to deal with external files
		 */
		local_flags = CL_PAGEOUT;
	else
		local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	}
	/*
	 * can't page-out from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	}
	max_size = filesize - f_offset;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
					    UPL_ABORT_FREE_ON_EMPTY);
	}
	vp->v_flag |= VHASBEENPAGED;

	return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
			   local_flags, (struct buf *)0, (struct clios *)0));
}
int
cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
	vm_offset_t	upl_offset;
{
	int local_flags = 0;

	if (upl == NULL || size < 0)
		panic("cluster_pagein: NULL upl passed in");

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size && (local_flags & CL_COMMIT))
		ubc_upl_abort_range(upl, upl_offset + rounded_size,
				    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
			    local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);

	b_lblkno = (int)(f_offset / PAGE_SIZE_64);
	e_lblkno = (int)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);

	if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
		/*
		 * we haven't read the last page in of the file yet
		 * so let's try to read ahead if we're in
		 * a sequential access pattern
		 */
		cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
	}
	vp->v_lastr = e_lblkno;

	return (retval);
}
int
cluster_bp(bp)
	struct buf *bp;
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
		     (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (bp->b_pagelist == (upl_t) 0)
		panic("cluster_bp: can't handle NULL upl yet\n");
	if (bp->b_flags & B_READ)
		flags = CL_ASYNC | CL_READ;
	else
		flags = CL_ASYNC;

	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

	return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
}
int
cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
{
	if (vp->v_flag & VHASBEENPAGED) {
		/*
		 * this vnode had pages cleaned to it by
		 * the pager which indicates that either
		 * it's not very 'hot', or the system is
		 * being overwhelmed by a lot of dirty
		 * data being delayed in the VM cache...
		 * in either event, we'll push our remaining
		 * delayed data at this point...  this will
		 * be more efficient than paging out 1 page at
		 * a time, and will also act as a throttle
		 * by delaying this client from writing any
		 * more data until all his delayed data has
		 * at least been queued to the underlying driver.
		 */
		vp->v_flag &= ~VHASBEENPAGED;
	}
	if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE)) {
		/*
		 * go do a write through the cache if one of the following is true....
		 *   NOCACHE is not true
		 *   there is no uio structure or it doesn't target USERSPACE
		 */
		return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
	}
	while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0) {
		/*
		 * we know we have a resid, so this is safe
		 * skip over any empty vectors
		 */
		while (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			iov = uio->uio_iov;
		}
		upl_size  = PAGE_SIZE;
		upl_flags = UPL_QUERY_OBJECT_TYPE;

		if ((vm_map_get_upl(current_map(),
				    (vm_offset_t)iov->iov_base & ~PAGE_MASK,
				    &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
			/*
			 * the user app must have passed in an invalid address
			 */
		}
		/*
		 * We check every vector target but if it is physically
		 * contiguous space, we skip the sanity checks.
		 */
		if (upl_flags & UPL_PHYS_CONTIG) {
			if (flags & IO_HEADZEROFILL) {
				flags &= ~IO_HEADZEROFILL;

				if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
					return(retval);
			}
			retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);

			if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL)) {
				return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL));
			}
		} else if ((uio->uio_resid < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
			/*
			 * we're here because we don't have a physically contiguous target buffer
			 * go do a write through the cache if one of the following is true....
			 *   the total xfer size is less than a page...
			 *   we're being asked to ZEROFILL either the head or the tail of the I/O...
			 */
			return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
		} else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK)) {
			if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK)) {
				/*
				 * Bring the file offset write up to a pagesize boundary
				 * this will also bring the base address to a page boundary
				 * since they both are currently on the same offset within a page
				 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
				 * so the computed clip_size must always be less than the current uio_resid
				 */
				clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));

				/*
				 * Fake the resid going into the cluster_write_x call
				 * and restore it on the way out.
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			} else {
				/*
				 * can't get both the file offset and the buffer offset aligned to a page boundary
				 * so fire an I/O through the cache for this entire vector
				 */
				clip_size = iov->iov_len;
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			}
		} else {
			/*
			 * If we come in here, we know the offset into
			 * the file is on a pagesize boundary and the
			 * target buffer address is also on a page boundary
			 */
			max_io_size = newEOF - uio->uio_offset;
			clip_size = uio->uio_resid;
			if (iov->iov_len < clip_size)
				clip_size = iov->iov_len;
			if (max_io_size < clip_size)
				clip_size = max_io_size;

			if (clip_size < PAGE_SIZE) {
				/*
				 * Take care of tail end of write in this vector
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			} else {
				/* round clip_size down to a multiple of pagesize */
				clip_size = clip_size & ~(PAGE_MASK);
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
				if ((retval == 0) && uio->uio_resid)
					retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			}
		}
	}
	return (retval);
}
static int
cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
{
	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset;
	int		 upl_needed_size;
	int		 force_data_sync;
	struct clios	 iostate;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
		     (int)uio->uio_offset, (int)uio->uio_resid,
		     (int)newEOF, devblocksize, 0);

	/*
	 * When we enter this routine, we know
	 *  -- the offset into the file is on a pagesize boundary
	 *  -- the resid is a page multiple
	 *  -- the resid will not exceed iov_len
	 */
	cluster_try_push(vp, newEOF, 0, 1);

	iostate.io_completed = 0;
	iostate.io_issued    = 0;
	iostate.io_error     = 0;
	iostate.io_wanted    = 0;

	while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
		io_size = uio->uio_resid;

		if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			io_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
				    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			kret = vm_map_get_upl(current_map(),
					      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
					      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
					     0, 0, 0, kret, 0);
				/*
				 * cluster_nocopy_write: failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_writes;
			}
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			pages_in_pl = upl_size / PAGE_SIZE;

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			/*
			 * didn't get all the pages back that we
			 * needed... release this upl and try again
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				     i, pages_in_pl, upl_size, kret, 0);
			/*
			 * for some reason, we couldn't acquire a hold on all
			 * the pages needed in the user's address space
			 *
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_writes;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size != upl_needed_size)
			io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);

		if (io_size == 0) {
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_writes;
		}
		/*
		 * Now look for pages already in the cache
		 * and throw them away.
		 * uio->uio_offset is page aligned within the file
		 * io_size is a multiple of PAGE_SIZE
		 */
		ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);

		/*
		 * we want push out these writes asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding writes
		 * wait until some complete before issuing the next
		 */
		while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
		}
		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);

			goto wait_for_writes;
		}
		io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
			     (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

		error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				   io_size, devblocksize, io_flag, (struct buf *)0, &iostate);

		iov->iov_len    -= io_size;
		iov->iov_base   += io_size;
		uio->uio_resid  -= io_size;
		uio->uio_offset += io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
			     (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
	}

wait_for_writes:
	/*
	 * make sure all async writes issued as part of this stream
	 * have completed before we return
	 */
	while (iostate.io_issued != iostate.io_completed) {
		iostate.io_wanted = 1;
		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
	}
	if (iostate.io_error)
		error = iostate.io_error;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);

	return (error);
}
static int
cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
{
	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset;
	int		 upl_needed_size;

	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 *  -- the vector target address is physically contiguous
	 */
	cluster_try_push(vp, newEOF, 0, 1);

	io_size = iov->iov_len;
	upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
	upl_needed_size = upl_offset + io_size;

	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
		    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	kret = vm_map_get_upl(current_map(),
			      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
			      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * cluster_phys_write: failed to get pagelist
		 * note: return kret here
		 */
	}
	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 * This is a failure in the physical memory case.
	 */
	if (upl_size < upl_needed_size) {
		kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	}
	pl = ubc_upl_pageinfo(upl);

	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);

		ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

		upl_offset += head_size;
		src_paddr  += head_size;
		io_size    -= head_size;
	}
	tail_size = io_size & (devblocksize - 1);
	io_size  -= tail_size;

	/*
	 * issue a synchronous write to cluster_io
	 */
	error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
			   io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);

	/*
	 * The cluster_io write completed successfully,
	 * update the uio structure
	 */
	uio->uio_resid  -= io_size;
	iov->iov_len    -= io_size;
	iov->iov_base   += io_size;
	uio->uio_offset += io_size;
	src_paddr       += io_size;

	error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);

	/*
	 * just release our hold on the physically contiguous
	 * region without changing any state
	 */
	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

	return (error);
}
1601 cluster_write_x(vp
, uio
, oldEOF
, newEOF
, headOff
, tailOff
, devblocksize
, flags
)
1611 upl_page_info_t
*pl
;
1613 vm_offset_t upl_offset
;
1627 long long total_size
;
1630 long long zero_cnt1
;
1632 daddr_t start_blkno
;
1638 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_START
,
1639 (int)uio
->uio_offset
, uio
->uio_resid
, (int)oldEOF
, (int)newEOF
, 0);
1641 uio_resid
= uio
->uio_resid
;
1643 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_START
,
1644 0, 0, (int)oldEOF
, (int)newEOF
, 0);
1651 if (flags
& IO_HEADZEROFILL
) {
1653 * some filesystems (HFS is one) don't support unallocated holes within a file...
1654 * so we zero fill the intervening space between the old EOF and the offset
1655 * where the next chunk of real data begins.... ftruncate will also use this
1656 * routine to zero fill to the new EOF when growing a file... in this case, the
1657 * uio structure will not be provided
1660 if (headOff
< uio
->uio_offset
) {
1661 zero_cnt
= uio
->uio_offset
- headOff
;
1664 } else if (headOff
< newEOF
) {
1665 zero_cnt
= newEOF
- headOff
;
1669 if (flags
& IO_TAILZEROFILL
) {
1671 zero_off1
= uio
->uio_offset
+ uio
->uio_resid
;
1673 if (zero_off1
< tailOff
)
1674 zero_cnt1
= tailOff
- zero_off1
;
1677 if (zero_cnt
== 0 && uio
== (struct uio
*) 0) {
1678 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_END
,
1679 retval
, 0, 0, 0, 0);
1683 while ((total_size
= (uio_resid
+ zero_cnt
+ zero_cnt1
)) && retval
== 0) {
1685 * for this iteration of the loop, figure out where our starting point is
1688 start_offset
= (int)(zero_off
& PAGE_MASK_64
);
1689 upl_f_offset
= zero_off
- start_offset
;
1690 } else if (uio_resid
) {
1691 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
1692 upl_f_offset
= uio
->uio_offset
- start_offset
;
1694 start_offset
= (int)(zero_off1
& PAGE_MASK_64
);
1695 upl_f_offset
= zero_off1
- start_offset
;
1697 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 46)) | DBG_FUNC_NONE
,
1698 (int)zero_off
, (int)zero_cnt
, (int)zero_off1
, (int)zero_cnt1
, 0);
1700 if (total_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1701 total_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
1703 start_blkno
= (daddr_t
)(upl_f_offset
/ PAGE_SIZE_64
);
1705 if (uio
&& !(vp
->v_flag
& VNOCACHE_DATA
) &&
1706 (flags
& (IO_SYNC
| IO_HEADZEROFILL
| IO_TAILZEROFILL
)) == 0) {
1708 * assumption... total_size <= uio_resid
1709 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1711 if ((start_offset
+ total_size
) > (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1712 total_size
-= start_offset
;
1713 xfer_resid
= total_size
;
1715 retval
= cluster_copy_ubc_data(vp
, uio
, &xfer_resid
, 1);
1720 uio_resid
-= (total_size
- xfer_resid
);
1721 total_size
= xfer_resid
;
1722 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
1723 upl_f_offset
= uio
->uio_offset
- start_offset
;
1725 if (total_size
== 0) {
1728 * the write did not finish on a page boundary
1729 * which will leave upl_f_offset pointing to the
1730 * beginning of the last page written instead of
1731 * the page beyond it... bump it in this case
1732 * so that the cluster code records the last page
1735 upl_f_offset
+= PAGE_SIZE_64
;
1743 * compute the size of the upl needed to encompass
1744 * the requested write... limit each call to cluster_io
1745 * to the maximum UPL size... cluster_io will clip if
1746 * this exceeds the maximum io_size for the device,
1747 * make sure to account for
1748 * a starting offset that's not page aligned
1750 upl_size
= (start_offset
+ total_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
1752 if (upl_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1753 upl_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
1755 pages_in_upl
= upl_size
/ PAGE_SIZE
;
1756 io_size
= upl_size
- start_offset
;
1758 if ((long long)io_size
> total_size
)
1759 io_size
= total_size
;
1761 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, io_size
, total_size
, 0, 0);
1764 kret
= ubc_create_upl(vp
,
1770 if (kret
!= KERN_SUCCESS
)
1771 panic("cluster_write: failed to get pagelist");
1773 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
,
1774 (int)upl
, (int)upl_f_offset
, start_offset
, 0, 0);
1776 if (start_offset
&& !upl_valid_page(pl
, 0)) {
1780 * we're starting in the middle of the first page of the upl
1781 * and the page isn't currently valid, so we're going to have
1782 * to read it in first... this is a synchronous operation
1784 read_size
= PAGE_SIZE
;
1786 if ((upl_f_offset
+ read_size
) > newEOF
)
1787 read_size
= newEOF
- upl_f_offset
;
1789 retval
= cluster_io(vp
, upl
, 0, upl_f_offset
, read_size
, devblocksize
,
1790 CL_READ
, (struct buf
*)0, (struct clios
*)0);
1793 * we had an error during the read which causes us to abort
1794 * the current cluster_write request... before we do, we need
1795 * to release the rest of the pages in the upl without modifying
1796 * there state and mark the failed page in error
1798 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
);
1799 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
1801 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1802 (int)upl
, 0, 0, retval
, 0);
1806 if ((start_offset
== 0 || upl_size
> PAGE_SIZE
) && ((start_offset
+ io_size
) & PAGE_MASK
)) {
1808 * the last offset we're writing to in this upl does not end on a page
1809 * boundary... if it's not beyond the old EOF, then we'll also need to
1810 * pre-read this page in if it isn't already valid
1812 upl_offset
= upl_size
- PAGE_SIZE
;
1814 if ((upl_f_offset
+ start_offset
+ io_size
) < oldEOF
&&
1815 !upl_valid_page(pl
, upl_offset
/ PAGE_SIZE
)) {
1818 read_size
= PAGE_SIZE
;
1820 if ((upl_f_offset
+ upl_offset
+ read_size
) > newEOF
)
1821 read_size
= newEOF
- (upl_f_offset
+ upl_offset
);
1823 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, read_size
, devblocksize
,
1824 CL_READ
, (struct buf
*)0, (struct clios
*)0);
1827 * we had an error during the read which causes us to abort
1828 * the current cluster_write request... before we do, we
1829 * need to release the rest of the pages in the upl without
1830 * modifying there state and mark the failed page in error
1832 ubc_upl_abort_range(upl
, upl_offset
, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
);
1833 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
1835 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1836 (int)upl
, 0, 0, retval
, 0);
1841 xfer_resid
= io_size
;
1842 io_offset
= start_offset
;
1844 while (zero_cnt
&& xfer_resid
) {
1846 if (zero_cnt
< (long long)xfer_resid
)
1847 bytes_to_zero
= zero_cnt
;
1849 bytes_to_zero
= xfer_resid
;
1851 if ( !(flags
& (IO_NOZEROVALID
| IO_NOZERODIRTY
))) {
1852 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1856 bytes_to_zero
= min(bytes_to_zero
, PAGE_SIZE
- (int)(zero_off
& PAGE_MASK_64
));
1857 zero_pg_index
= (int)((zero_off
- upl_f_offset
) / PAGE_SIZE_64
);
1859 if ( !upl_valid_page(pl
, zero_pg_index
)) {
1860 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1862 } else if ((flags
& (IO_NOZERODIRTY
| IO_NOZEROVALID
)) == IO_NOZERODIRTY
&&
1863 !upl_dirty_page(pl
, zero_pg_index
)) {
1864 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1867 xfer_resid
-= bytes_to_zero
;
1868 zero_cnt
-= bytes_to_zero
;
1869 zero_off
+= bytes_to_zero
;
1870 io_offset
+= bytes_to_zero
;
1872 if (xfer_resid
&& uio_resid
) {
1873 bytes_to_move
= min(uio_resid
, xfer_resid
);
1875 retval
= cluster_copy_upl_data(uio
, upl
, io_offset
, bytes_to_move
);
1879 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
1881 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
1882 (int)upl
, 0, 0, retval
, 0);
1884 uio_resid
-= bytes_to_move
;
1885 xfer_resid
-= bytes_to_move
;
1886 io_offset
+= bytes_to_move
;
1889 while (xfer_resid
&& zero_cnt1
&& retval
== 0) {
1891 if (zero_cnt1
< (long long)xfer_resid
)
1892 bytes_to_zero
= zero_cnt1
;
1894 bytes_to_zero
= xfer_resid
;
1896 if ( !(flags
& (IO_NOZEROVALID
| IO_NOZERODIRTY
))) {
1897 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1901 bytes_to_zero
= min(bytes_to_zero
, PAGE_SIZE
- (int)(zero_off1
& PAGE_MASK_64
));
1902 zero_pg_index
= (int)((zero_off1
- upl_f_offset
) / PAGE_SIZE_64
);
1904 if ( !upl_valid_page(pl
, zero_pg_index
)) {
1905 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1906 } else if ((flags
& (IO_NOZERODIRTY
| IO_NOZEROVALID
)) == IO_NOZERODIRTY
&&
1907 !upl_dirty_page(pl
, zero_pg_index
)) {
1908 cluster_zero(upl
, io_offset
, bytes_to_zero
, NULL
);
1911 xfer_resid
-= bytes_to_zero
;
1912 zero_cnt1
-= bytes_to_zero
;
1913 zero_off1
+= bytes_to_zero
;
1914 io_offset
+= bytes_to_zero
;
1921 io_size
+= start_offset
;
1923 if ((upl_f_offset
+ io_size
) >= newEOF
&& io_size
< upl_size
) {
1925 * if we're extending the file with this write
1926 * we'll zero fill the rest of the page so that
1927 * if the file gets extended again in such a way as to leave a
1928 * hole starting at this EOF, we'll have zero's in the correct spot
1930 cluster_zero(upl
, io_size
, upl_size
- io_size
, NULL
);
1932 if (flags
& IO_SYNC
)
1934 * if the IO_SYNC flag is set than we need to
1935 * bypass any clusters and immediately issue
1941 * calculate the last logical block number
1942 * that this delayed I/O encompassed
1944 last_blkno
= (upl_f_offset
+ (off_t
)upl_size
) / PAGE_SIZE_64
;
1946 if (vp
->v_flag
& VHASDIRTY
) {
1948 if ( !(vp
->v_flag
& VNOCACHE_DATA
)) {
1950 * we've fallen into the sparse
1951 * cluster method of delaying dirty pages
1952 * first, we need to release the upl if we hold one
1953 * since pages in it may be present in the sparse cluster map
1954 * and may span 2 separate buckets there... if they do and
1955 * we happen to have to flush a bucket to make room and it intersects
1956 * this upl, a deadlock may result on page BUSY
1959 ubc_upl_commit_range(upl
, 0, upl_size
,
1960 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
1962 sparse_cluster_add(vp
, newEOF
, start_blkno
, last_blkno
);
1967 * must have done cached writes that fell into
1968 * the sparse cluster mechanism... we've switched
1969 * to uncached writes on the file, so go ahead
1970 * and push whatever's in the sparse map
1971 * and switch back to normal clustering
1973 * see the comment above concerning a possible deadlock...
1976 ubc_upl_commit_range(upl
, 0, upl_size
,
1977 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
1979 * setting upl_size to 0 keeps us from committing a
1980 * second time in the start_new_cluster path
1984 sparse_cluster_push(vp
, ubc_getsize(vp
), 1);
1987 * no clusters of either type present at this point
1988 * so just go directly to start_new_cluster since
1989 * we know we need to delay this I/O since we've
1990 * already released the pages back into the cache
1991 * to avoid the deadlock with sparse_cluster_push
1993 goto start_new_cluster
;
1997 if (vp
->v_clen
== 0)
1999 * no clusters currently present
2001 goto start_new_cluster
;
2003 for (cl_index
= 0; cl_index
< vp
->v_clen
; cl_index
++) {
2005 * check each cluster that we currently hold
2006 * try to merge some or all of this write into
2007 * one or more of the existing clusters... if
2008 * any portion of the write remains, start a
2011 if (start_blkno
>= vp
->v_clusters
[cl_index
].start_pg
) {
2013 * the current write starts at or after the current cluster
2015 if (last_blkno
<= (vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
)) {
2017 * we have a write that fits entirely
2018 * within the existing cluster limits
2020 if (last_blkno
> vp
->v_clusters
[cl_index
].last_pg
)
2022 * update our idea of where the cluster ends
2024 vp
->v_clusters
[cl_index
].last_pg
= last_blkno
;
2027 if (start_blkno
< (vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
)) {
2029 * we have a write that starts in the middle of the current cluster
2030 * but extends beyond the cluster's limit... we know this because
2031 * of the previous checks
2032 * we'll extend the current cluster to the max
2033 * and update the start_blkno for the current write to reflect that
2034 * the head of it was absorbed into this cluster...
2035 * note that we'll always have a leftover tail in this case since
2036 * full absorbtion would have occurred in the clause above
2038 vp
->v_clusters
[cl_index
].last_pg
= vp
->v_clusters
[cl_index
].start_pg
+ MAX_UPL_TRANSFER
;
2041 int start_pg_in_upl
;
2043 start_pg_in_upl
= upl_f_offset
/ PAGE_SIZE_64
;
2045 if (start_pg_in_upl
< vp
->v_clusters
[cl_index
].last_pg
) {
2046 intersection
= (vp
->v_clusters
[cl_index
].last_pg
- start_pg_in_upl
) * PAGE_SIZE
;
2048 ubc_upl_commit_range(upl
, upl_offset
, intersection
,
2049 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2050 upl_f_offset
+= intersection
;
2051 upl_offset
+= intersection
;
			upl_size -= intersection;

			start_blkno = vp->v_clusters[cl_index].last_pg;

			/*
			 * we come here for the case where the current write starts beyond the limit
			 * of the existing cluster or we have a leftover tail after a partial absorption
			 *
			 * in either case, we'll check the remaining clusters before starting a new one
			 */

			/*
			 * the current write starts in front of the cluster we're currently considering
			 */
			if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
				/*
				 * we can just merge the new request into this cluster and leave it
				 * in the cache since the resulting cluster is still less than the
				 * maximum allowable size
				 */
				vp->v_clusters[cl_index].start_pg = start_blkno;

				if (last_blkno > vp->v_clusters[cl_index].last_pg) {
					/*
					 * the current write completely envelops the existing cluster and since
					 * each write is limited to at most MAX_UPL_TRANSFER bytes we can just use
					 * the start and last blocknos of the write to generate the cluster limits
					 */
					vp->v_clusters[cl_index].last_pg = last_blkno;
				}
			}
			/*
			 * if we were to combine this write with the current cluster we would exceed
			 * the cluster size limit.... so, let's see if there's any overlap of the new
			 * I/O with the cluster we're currently considering... in fact, we'll stretch
			 * the cluster out to its full limit and see if we get an intersection with
			 * the current write
			 */
			if (last_blkno > vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER) {
				/*
				 * the current write extends into the proposed cluster...
				 * clip the length of the current write after first combining its
				 * tail with the newly shaped cluster
				 */
				vp->v_clusters[cl_index].start_pg = vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER;

				intersection = (last_blkno - vp->v_clusters[cl_index].start_pg) * PAGE_SIZE;

				if (intersection > upl_size)
					/*
					 * because the current write may consist of a number of pages found in
					 * the cache which are not part of the UPL, we may have an intersection
					 * that exceeds the size of the UPL that is also part of this write
					 */
					intersection = upl_size;

				ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
						     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
				upl_size -= intersection;

				last_blkno = vp->v_clusters[cl_index].start_pg;
			}
			/*
			 * if we get here, there was no way to merge any portion of this write with
			 * this cluster or we could only merge part of it which will leave a tail...
			 * we'll check the remaining clusters before starting a new one
			 */

		if (cl_index < vp->v_clen)
			/*
			 * we found an existing cluster(s) that we
			 * could entirely merge this I/O into
			 */

		if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
			/*
			 * we didn't find an existing cluster to
			 * merge into, but there's room to start a new one
			 */
			goto start_new_cluster;

		/*
		 * no existing cluster to merge with and no room to start a new one... we'll
		 * try pushing one of the existing ones... if none of them are able to be
		 * pushed, we'll switch to the sparse cluster mechanism...
		 * cluster_try_push updates v_clen to the number of remaining clusters
		 * and returns the number of currently unused clusters
		 */
		if (vp->v_flag & VNOCACHE_DATA)

		if (cluster_try_push(vp, newEOF, can_delay, 0) == 0) {
			/*
			 * no more room in the normal cluster mechanism so let's switch to the
			 * more expansive but expensive sparse mechanism....
			 * first, we need to release the upl if we hold one since pages in it may
			 * be present in the sparse cluster map (after the cluster_switch) and may
			 * span 2 separate buckets there... if they do and we happen to have to
			 * flush a bucket to make room and it intersects this upl, a deadlock may
			 * result on page BUSY
			 */
			ubc_upl_commit_range(upl, upl_offset, upl_size,
					     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

			sparse_cluster_switch(vp, newEOF);
			sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
		}
		/*
		 * we pushed one cluster successfully, so we must be sequentially writing this
		 * file, otherwise we would have failed and fallen into the sparse cluster support...
		 * so let's take the opportunity to push out additional clusters as long as we
		 * remain below the throttle... this will give us better I/O locality if we're
		 * in a copy loop (i.e. we won't jump back and forth between the read and write points)...
		 * however, we don't want to push so much out that the write throttle kicks in and
		 * hangs this thread up until some of the I/O completes...
		 */
		while (vp->v_clen && (vp->v_numoutput <= (ASYNC_THROTTLE / 2)))
			cluster_try_push(vp, newEOF, 0, 0);

start_new_cluster:
		if (vp->v_clen == 0)
			vp->v_ciosiz = devblocksize;

		vp->v_clusters[vp->v_clen].start_pg = start_blkno;
		vp->v_clusters[vp->v_clen].last_pg  = last_blkno;

		ubc_upl_commit_range(upl, upl_offset, upl_size,
				     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

		/*
		 * in order to maintain some semblance of coherency with mapped writes
		 * we need to write the cluster back out as a multiple of the PAGESIZE
		 * unless the cluster encompasses the last page of the file... in this
		 * case we'll round out to the nearest device block boundary
		 */
		if ((upl_f_offset + io_size) > newEOF) {
			io_size = newEOF - upl_f_offset;
			io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
		}

		if (flags & IO_SYNC)
			io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE;
		else
			io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | CL_ASYNC;

		if (vp->v_flag & VNOCACHE_DATA)
			io_flags |= CL_DUMP;

		retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
				    io_flags, (struct buf *)0, (struct clios *)0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
		     retval, 0, uio_resid, 0, 0);
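/*
 * The rounding above assumes devblocksize is a power of two: adding
 * (devblocksize - 1) and masking off the low-order bits rounds io_size up to
 * the next device block boundary.  For example, with a (hypothetical)
 * 512-byte device block, io_size = 1000 becomes (1000 + 511) & ~511 == 1024,
 * i.e. two full device blocks.
 */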
cluster_read(vp, uio, filesize, devblocksize, flags)

	if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
		/*
		 * go do a read through the cache if one of the following is true....
		 *   NOCACHE is not true
		 *   the uio request doesn't target USERSPACE
		 */
		return (cluster_read_x(vp, uio, filesize, devblocksize, flags));

	while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)

		/*
		 * we know we have a resid, so this is safe
		 * skip over any empty vectors
		 */
		while (iov->iov_len == 0) {

		upl_size  = PAGE_SIZE;
		upl_flags = UPL_QUERY_OBJECT_TYPE;

		if ((vm_map_get_upl(current_map(),
				    (vm_offset_t)iov->iov_base & ~PAGE_MASK,
				    &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
			/*
			 * the user app must have passed in an invalid address
			 */

		/*
		 * We check every vector target but if it is physically
		 * contiguous space, we skip the sanity checks.
		 */
		if (upl_flags & UPL_PHYS_CONTIG)
			retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
		else if (uio->uio_resid < PAGE_SIZE)
			/*
			 * we're here because we don't have a physically contiguous target buffer
			 * go do a read through the cache if
			 * the total xfer size is less than a page...
			 */
			return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
		else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))

			if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
				/*
				 * Bring the file offset read up to a pagesize boundary; this will also
				 * bring the base address to a page boundary since they both are currently
				 * on the same offset within a page
				 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
				 * so the computed clip_size must always be less than the current uio_resid
				 */
				clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));

				/*
				 * Fake the resid going into the cluster_read_x call
				 * and restore it on the way out.
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

				/*
				 * can't get both the file offset and the buffer offset aligned to a
				 * page boundary, so fire an I/O through the cache for this entire vector
				 */
				clip_size = iov->iov_len;
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

			/*
			 * If we come in here, we know the offset into
			 * the file is on a pagesize boundary
			 */
			max_io_size = filesize - uio->uio_offset;
			clip_size = uio->uio_resid;
			if (iov->iov_len < clip_size)
				clip_size = iov->iov_len;
			if (max_io_size < clip_size)
				clip_size = (int)max_io_size;

			if (clip_size < PAGE_SIZE)
				/*
				 * Take care of the tail end of the read in this vector.
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

				/* round clip_size down to a multiple of pagesize */
				clip_size = clip_size & ~(PAGE_MASK);
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
				if ((retval == 0) && uio->uio_resid)
					retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
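/*
 * A condensed sketch of the alignment decision made above (illustrative only;
 * the helper and return codes below are assumptions, not part of this file).
 * The nocopy path is only worth taking when both the file offset and the
 * user buffer can be brought onto page boundaries together.
 */
#if 0
static int
read_strategy_example(off_t uio_offset, long iov_base, int resid)
{
	int foff = (int)uio_offset & PAGE_MASK;	/* offset within the file's page */
	int boff = (int)iov_base   & PAGE_MASK;	/* offset within the user buffer's page */

	if (resid < PAGE_SIZE)
		return (READ_THROUGH_CACHE);	/* too small to bother with nocopy */
	if (foff == 0 && boff == 0)
		return (READ_NOCOPY);		/* fully aligned: go straight to the user's pages */
	if (foff == boff)
		return (READ_CLIP_TO_PAGE);	/* one clipped cached read re-aligns both offsets */
	return (READ_WHOLE_VECTOR_CACHED);	/* can never align: keep this vector in the cache path */
}
#endif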
cluster_read_x(vp, uio, filesize, devblocksize, flags)

	upl_page_info_t *pl;
	vm_offset_t      upl_offset;
	off_t            last_ioread_offset;
	off_t            last_request_offset;
	u_int            size_of_prefetch;
	struct clios     iostate;
	u_int            max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
	u_int            rd_ahead_enabled = 1;
	u_int            prefetch_enabled = 1;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);

	if (cluster_hard_throttle_on(vp)) {
		rd_ahead_enabled = 0;
		prefetch_enabled = 0;

		max_rd_size = HARD_THROTTLE_MAXSIZE;
	}
	if (vp->v_flag & (VRAOFF|VNOCACHE_DATA))
		rd_ahead_enabled = 0;

	last_request_offset = uio->uio_offset + uio->uio_resid;

	if (last_request_offset > filesize)
		last_request_offset = filesize;
	b_lblkno = (u_int)(uio->uio_offset / PAGE_SIZE_64);
	e_lblkno = (u_int)((last_request_offset - 1) / PAGE_SIZE_64);

	if (vp->v_ralen && (vp->v_lastr == b_lblkno || (vp->v_lastr + 1) == b_lblkno)) {
		/*
		 * determine if we already have a read-ahead in the pipe courtesy of the
		 * last read system call that was issued... if so, pick up its extent to
		 * determine where we should start with respect to any read-ahead that might
		 * be necessary to garner all the data needed to complete this read system call
		 */
		last_ioread_offset = (vp->v_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;

		if (last_ioread_offset < uio->uio_offset)
			last_ioread_offset = (off_t)0;
		else if (last_ioread_offset > last_request_offset)
			last_ioread_offset = last_request_offset;
	} else
		last_ioread_offset = (off_t)0;

	while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
		/*
		 * compute the size of the upl needed to encompass the requested read...
		 * limit each call to cluster_io to the maximum UPL size... cluster_io will
		 * clip if this exceeds the maximum io_size for the device,
		 * make sure to account for a starting offset that's not page aligned
		 */
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		upl_f_offset = uio->uio_offset - (off_t)start_offset;
		max_size     = filesize - uio->uio_offset;

		if ((off_t)((unsigned int)uio->uio_resid) < max_size)
			io_size = uio->uio_resid;

		if (!(vp->v_flag & VNOCACHE_DATA)) {
			/*
			 * if we keep finding the pages we need already in the cache, then don't
			 * bother to call cluster_rd_prefetch since it costs CPU cycles to determine
			 * that we have all the pages we need... once we miss in the cache and have
			 * issued an I/O, then we'll assume that we're likely to continue to miss in
			 * the cache and it's to our advantage to try and prefetch
			 */
			if (last_request_offset && last_ioread_offset &&
			    (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
				if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
					/*
					 * we've already issued I/O for this request and there's still work
					 * to do and our prefetch stream is running dry, so issue a pre-fetch
					 * I/O... the I/O latency will overlap with the copying of the data
					 */
					if (size_of_prefetch > max_rd_size)
						size_of_prefetch = max_rd_size;

					size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);

					last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

					if (last_ioread_offset > last_request_offset)
						last_ioread_offset = last_request_offset;
				}
			}
			/*
			 * limit the size of the copy we're about to do so that we can notice that
			 * our I/O pipe is running dry and get the next I/O issued before it does go dry
			 */
			if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
				io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);

			io_requested = io_resid;

			retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);

			io_size -= (io_requested - io_resid);

			if (retval || io_resid)
				/*
				 * if we run into a real error or a page that is not in the cache
				 * we need to leave streaming mode
				 */
				break;

			if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
				/*
				 * we're already finished the I/O for this read request
				 * let's see if we should do a read-ahead
				 */
				cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
			}
			if (e_lblkno < vp->v_lastr)
				vp->v_lastr = e_lblkno;
		}
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		upl_f_offset = uio->uio_offset - (off_t)start_offset;
		max_size     = filesize - uio->uio_offset;

		if (io_size > max_rd_size)
			io_size = max_rd_size;

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
			upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
			     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl(vp,

		if (kret != KERN_SUCCESS)
			panic("cluster_read: failed to get pagelist");

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
			     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

		/*
		 * scan from the beginning of the upl looking for the first non-valid page....
		 * this will become the first page in the request we're going to make to
		 * 'cluster_io'... if all of the pages are valid, we won't call through to 'cluster_io'
		 */
		for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
			if (!upl_valid_page(pl, start_pg))
				break;
		}
		/*
		 * scan from the starting invalid page looking for a valid page before the
		 * end of the upl is reached, if we find one, then it will be the last page
		 * of the request to 'cluster_io'
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (upl_valid_page(pl, last_pg))
				break;
		}
		iostate.io_completed = 0;
		iostate.io_issued = 0;
		iostate.io_error = 0;
		iostate.io_wanted = 0;

		if (start_pg < last_pg) {
			/*
			 * we found a range of 'invalid' pages that must be filled...
			 * if the last page in this range is the last page of the file
			 * we may have to clip the size of it to keep from reading past
			 * the end of the last physical block associated with the file
			 */
			upl_offset = start_pg * PAGE_SIZE;
			io_size    = (last_pg - start_pg) * PAGE_SIZE;

			if ((upl_f_offset + upl_offset + io_size) > filesize)
				io_size = filesize - (upl_f_offset + upl_offset);

			/*
			 * issue an asynchronous read to cluster_io
			 */
			error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
					   io_size, devblocksize, CL_READ | CL_ASYNC, (struct buf *)0, &iostate);
		}
		/*
		 * if the read completed successfully, or there was no I/O request issued,
		 * then copy the data into user land via 'cluster_upl_copy_data'...
		 * we'll first add on any 'valid' pages that were present in the upl when we acquired it.
		 */
		for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
			if (!upl_valid_page(pl, uio_last))
				break;
		}
		/*
		 * compute size to transfer this round, if uio->uio_resid is
		 * still non-zero after this attempt, we'll loop around and
		 * set up for another I/O.
		 */
		val_size = (uio_last * PAGE_SIZE) - start_offset;

		if (val_size > max_size)
			val_size = max_size;

		if (val_size > uio->uio_resid)
			val_size = uio->uio_resid;

		if (last_ioread_offset == 0)
			last_ioread_offset = uio->uio_offset + val_size;

		if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
			/*
			 * if there's still I/O left to do for this request, and...
			 * we're not in hard throttle mode, then issue a pre-fetch I/O...
			 * the I/O latency will overlap with the copying of the data
			 */
			size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);

			last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

			if (last_ioread_offset > last_request_offset)
				last_ioread_offset = last_request_offset;

		} else if ((uio->uio_offset + val_size) == last_request_offset) {
			/*
			 * this transfer will finish this request, so...
			 * let's try to read ahead if we're in a sequential access pattern
			 * and we haven't explicitly disabled it
			 */
			if (rd_ahead_enabled)
				cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);

			if (e_lblkno < vp->v_lastr)
				vp->v_lastr = e_lblkno;
		}
		while (iostate.io_issued != iostate.io_completed) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_read_x", 0);
		}
		if (iostate.io_error)
			error = iostate.io_error;

		retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);

		if (start_pg < last_pg) {
			/*
			 * compute the range of pages that we actually issued an I/O for
			 * and either commit them as valid if the I/O succeeded
			 * or abort them if the I/O failed
			 */
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
				     (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);

			if (error || (vp->v_flag & VNOCACHE_DATA))
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
						    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
			else
				ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
						     UPL_COMMIT_CLEAR_DIRTY |
						     UPL_COMMIT_FREE_ON_EMPTY |
						     UPL_COMMIT_INACTIVATE);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
				     (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
		}
		if ((last_pg - start_pg) < pages_in_upl) {
			/*
			 * the set of pages that we issued an I/O for did not encompass
			 * the entire upl... so just release these without modifying their state
			 */
			ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
				     (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

			/*
			 * we found some already valid pages at the beginning of the upl...
			 * commit these back to the inactive list with reference cleared
			 */
			for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
				commit_flags = UPL_COMMIT_FREE_ON_EMPTY
					     | UPL_COMMIT_INACTIVATE;

				if (upl_dirty_page(pl, cur_pg))
					commit_flags |= UPL_COMMIT_SET_DIRTY;

				if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
					ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
							    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
				else
					ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
							     PAGE_SIZE, commit_flags);
			}
			if (last_pg < uio_last) {
				/*
				 * we found some already valid pages immediately after the
				 * pages we issued I/O for, commit these back to the
				 * inactive list with reference cleared
				 */
				for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
					commit_flags = UPL_COMMIT_FREE_ON_EMPTY
						     | UPL_COMMIT_INACTIVATE;

					if (upl_dirty_page(pl, cur_pg))
						commit_flags |= UPL_COMMIT_SET_DIRTY;

					if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
						ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
								    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
					else
						ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
								     PAGE_SIZE, commit_flags);
				}
			}
			if (uio_last < pages_in_upl) {
				/*
				 * there were some invalid pages beyond the valid pages
				 * that we didn't issue an I/O for, just release them
				 */
				ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
						    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
				     (int)upl, -1, -1, 0, 0);
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
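/*
 * The iostate/tsleep sequence used above is the common pattern in this file
 * for draining async cluster_io requests: the issuer sets io_wanted and
 * sleeps until the completion side has advanced io_completed to match
 * io_issued, then picks up any error out of io_error.  A minimal sketch of
 * that wait loop (field names come from struct clios; the helper itself and
 * its name are illustrative only):
 */
#if 0
static int
wait_for_cluster_io(struct clios *iostate)
{
	while (iostate->io_issued != iostate->io_completed) {
		iostate->io_wanted = 1;
		tsleep((caddr_t)&iostate->io_wanted, PRIBIO + 1, "cluster_io_wait", 0);
	}
	return (iostate->io_error);
}
#endif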
cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)

	upl_page_info_t *pl;
	vm_offset_t      upl_offset;
	int              upl_needed_size;
	int              force_data_sync;
	struct clios     iostate;
	u_int            max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
	u_int            max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);

	/*
	 * When we enter this routine, we know
	 *  -- the offset into the file is on a pagesize boundary
	 *  -- the resid is a page multiple
	 *  -- the resid will not exceed iov_len
	 */
	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	if (cluster_hard_throttle_on(vp)) {
		max_rd_size  = HARD_THROTTLE_MAXSIZE;
		max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
	}
	while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {

		max_io_size = filesize - uio->uio_offset;

		if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
			io_size = max_io_size;
		else
			io_size = uio->uio_resid;

		/*
		 * First look for pages already in the cache
		 * and move them to user space.
		 */
		retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);

			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_reads;

		/*
		 * If we are already finished with this read, then return
		 */
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_reads;

		max_io_size = io_size;

		if (max_io_size > max_rd_size)
			max_io_size = max_rd_size;

		ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);

			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_reads;

		upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {

			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			kret = vm_map_get_upl(current_map(),
					      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
					      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
					     (int)upl_offset, upl_size, io_size, kret, 0);
				/*
				 * cluster_nocopy_read: failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_reads;
			}
			pages_in_pl = upl_size / PAGE_SIZE;
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
				     (int)upl_offset, upl_size, io_size, kret, 0);

			goto wait_for_reads;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size != upl_needed_size)
			io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
			goto wait_for_reads;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, io_size, kret, 0);

		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O...
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next read
		 */
		while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
		}
		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error...
			 * don't issue any more reads, cleanup the UPL that was just created
			 * but not used, then go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);

			goto wait_for_reads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
			     (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);

		retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				    io_size, devblocksize,
				    CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
				    (struct buf *)0, &iostate);

		/*
		 * update the uio structure
		 */
		iov->iov_base   += io_size;
		iov->iov_len    -= io_size;
		uio->uio_resid  -= io_size;
		uio->uio_offset += io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
			     (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
	}

wait_for_reads:
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we return
	 */
	while (iostate.io_issued != iostate.io_completed) {
		iostate.io_wanted = 1;
		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
	}
	if (iostate.io_error)
		retval = iostate.io_error;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
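/*
 * upl_offset/upl_needed_size above follow the usual sub-page bookkeeping:
 * upl_offset is where the user buffer starts within its first page, and
 * upl_needed_size rounds (upl_offset + io_size) up to whole pages so the UPL
 * covers every page the transfer touches.  For example (illustrative values,
 * 4K pages): an iov_base ending in 0x600 with io_size 0x2000 gives
 * upl_offset 0x600 and upl_needed_size (0x600 + 0x2000 + 0xfff) & ~0xfff == 0x3000,
 * i.e. three pages.
 */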
cluster_phys_read(vp, uio, filesize, devblocksize, flags)

	upl_page_info_t *pl;
	vm_offset_t      upl_offset;
	int              upl_needed_size;
	struct clios     iostate;

	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 *  -- the target address is physically contiguous
	 */
	max_size = filesize - uio->uio_offset;

	if (max_size > (off_t)((unsigned int)iov->iov_len))
		io_size = iov->iov_len;

	upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
	upl_needed_size = upl_offset + io_size;

	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	kret = vm_map_get_upl(current_map(),
			      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
			      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * cluster_phys_read: failed to get pagelist
		 */
	}
	if (upl_size < upl_needed_size) {
		/*
		 * The upl_size wasn't satisfied.
		 */
		ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	}
	pl = ubc_upl_pageinfo(upl);

	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {

		head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);

			ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

		upl_offset += head_size;
		dst_paddr  += head_size;
		io_size    -= head_size;
	}
	tail_size = io_size & (devblocksize - 1);
	io_size  -= tail_size;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	while (io_size && error == 0) {

		if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			xsize = MAX_UPL_TRANSFER * PAGE_SIZE;

		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since it's all issued against the same UPL...
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next
		 */
		while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
			iostate.io_wanted = 1;
			tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
		}

		error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
				   CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
				   (struct buf *)0, &iostate);
		/*
		 * The cluster_io read was issued successfully,
		 * update the uio structure
		 */
		uio->uio_resid  -= xsize;
		iov->iov_len    -= xsize;
		iov->iov_base   += xsize;
		uio->uio_offset += xsize;

		upl_offset += xsize;
	}
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we proceed
	 */
	while (iostate.io_issued != iostate.io_completed) {
		iostate.io_wanted = 1;
		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
	}
	if (iostate.io_error) {
		error = iostate.io_error;
	}
	if (error == 0 && tail_size)
		error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);

	/*
	 * just release our hold on the physically contiguous
	 * region without changing any state
	 */
	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
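/*
 * The head/tail handling above splits a physically contiguous transfer into
 * (1) an unaligned head up to the next devblocksize boundary, handled by
 * cluster_align_phys_io, (2) a block-aligned middle issued directly as
 * CL_DEV_MEMORY I/O, and (3) an unaligned tail, again via cluster_align_phys_io.
 * With a (hypothetical) 512-byte device block, an offset of 0x2ff and a
 * length of 0x1000 would give head_size 0x101 (reaching offset 0x400),
 * tail_size 0xff, and a 0xe00-byte aligned middle issued as device-memory I/O.
 */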
/*
 * generate advisory I/O's in the largest chunks possible
 * the completed pages will be released into the VM cache
 */
advisory_read(vp, filesize, f_offset, resid, devblocksize)

	upl_page_info_t *pl;
	vm_offset_t      upl_offset;

	if (!UBCINFOEXISTS(vp))

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
		     (int)f_offset, resid, (int)filesize, devblocksize, 0);

	while (resid && f_offset < filesize && retval == 0) {
		/*
		 * compute the size of the upl needed to encompass the requested read...
		 * limit each call to cluster_io to the maximum UPL size... cluster_io will
		 * clip if this exceeds the maximum io_size for the device,
		 * make sure to account for a starting offset that's not page aligned
		 */
		start_offset = (int)(f_offset & PAGE_MASK_64);
		upl_f_offset = f_offset - (off_t)start_offset;
		max_size     = filesize - f_offset;

		if (resid < max_size)

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		/*
		 * return the number of contiguously present pages in the cache
		 * starting at upl_f_offset within the file
		 */
		ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);

			/*
			 * skip over pages already present in the cache
			 */
			io_size = skip_range - start_offset;

			f_offset += io_size;

			if (skip_range == upl_size)
				continue;
			/*
			 * have to issue some real I/O...
			 * at this point, we know it's starting on a page boundary
			 * because we've skipped over at least the first page in the request
			 */
			upl_f_offset += skip_range;
			upl_size     -= skip_range;

		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
			     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl(vp,
				      UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
		if (kret != KERN_SUCCESS)

		/*
		 * before we start marching forward, we must make sure we end on
		 * a present page, otherwise we will be working with a freed upl
		 */
		for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
			if (upl_page_present(pl, last_pg))
				break;
		}
		pages_in_upl = last_pg + 1;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
			     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

		for (last_pg = 0; last_pg < pages_in_upl; ) {
			/*
			 * scan from the beginning of the upl looking for the first page that is
			 * present.... this will become the first page in the request we're going
			 * to make to 'cluster_io'... if all of the pages are absent, we won't call
			 * through to 'cluster_io'
			 */
			for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
				if (upl_page_present(pl, start_pg))
					break;
			}
			/*
			 * scan from the starting present page looking for an absent page before
			 * the end of the upl is reached, if we find one, then it will terminate
			 * the range of pages being presented to 'cluster_io'
			 */
			for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
				if (!upl_page_present(pl, last_pg))
					break;
			}
			if (last_pg > start_pg) {
				/*
				 * we found a range of pages that must be filled...
				 * if the last page in this range is the last page of the file
				 * we may have to clip the size of it to keep from reading past
				 * the end of the last physical block associated with the file
				 */
				upl_offset = start_pg * PAGE_SIZE;
				io_size    = (last_pg - start_pg) * PAGE_SIZE;

				if ((upl_f_offset + upl_offset + io_size) > filesize)
					io_size = filesize - (upl_f_offset + upl_offset);

				/*
				 * issue an asynchronous read to cluster_io
				 */
				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
						    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
			}
		}
			ubc_upl_abort(upl, 0);

		io_size = upl_size - start_offset;

		if (io_size > resid)

		f_offset += io_size;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
		     (int)f_offset, resid, retval, 0, 0);
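/*
 * Note on the loop above: ubc_range_op(UPL_ROP_PRESENT) is used purely as an
 * optimization so that advisory reads never build a UPL for ranges that are
 * already resident; only the absent remainder of each window is turned into
 * CL_READ | CL_ASYNC | CL_COMMIT | CL_AGE requests, and those pages are
 * committed straight back to the VM cache when the I/O completes.
 */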
	if (!UBCINFOEXISTS(vp) || (vp->v_clen == 0 && !(vp->v_flag & VHASDIRTY)))

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
		     vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);

	if (vp->v_flag & VHASDIRTY) {
		sparse_cluster_push(vp, ubc_getsize(vp), 1);
	}
		retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
		     vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);

	if (vp->v_flag & VHASDIRTY) {
		vfs_drt_control(&(vp->v_scmap), 0);

		vp->v_flag &= ~VHASDIRTY;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
cluster_try_push(vp, EOF, can_delay, push_all)

	struct v_cluster l_clusters[MAX_CLUSTERS];

	/*
	 * make a local 'sorted' copy of the clusters
	 * and clear vp->v_clen so that new clusters can be developed
	 */
	for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
		for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
			if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
				continue;
			if (min_index == -1)
				min_index = cl_index1;
			else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
				min_index = cl_index1;
		}
		if (min_index == -1)
			break;
		l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
		l_clusters[cl_index].last_pg  = vp->v_clusters[min_index].last_pg;

		vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
	}
	if (can_delay && cl_len == MAX_CLUSTERS) {
		/*
		 * determine if we appear to be writing the file sequentially...
		 * if not, by returning without having pushed any clusters we will cause this
		 * vnode to be pushed into the sparse cluster mechanism used for managing
		 * more random I/O patterns
		 *
		 * we know that we've got all clusters currently in use and the next write
		 * doesn't fit into one of them... that's why we're in try_push with can_delay true...
		 *
		 * check to make sure that all the clusters except the last one are 'full'...
		 * and that each cluster is adjacent to the next (i.e. we're looking for
		 * sequential writes); they were sorted above so we can just make a simple
		 * pass through, up to but not including the last one...
		 * note that last_pg is not inclusive, so it will be equal to the start_pg
		 * of the next cluster if they are adjacent
		 *
		 * we let the last one be partial as long as it was adjacent to the previous one...
		 * we need to do this to deal with multi-threaded servers that might write an
		 * I/O or 2 out of order... if this occurs at the tail of the last cluster,
		 * we don't want to fall into the sparse cluster world...
		 */
		for (i = 0; i < MAX_CLUSTERS - 1; i++) {
			if ((l_clusters[i].last_pg - l_clusters[i].start_pg) != MAX_UPL_TRANSFER)
				break;
			if (l_clusters[i].last_pg != l_clusters[i+1].start_pg)
				break;
		}
	}
	for (cl_index = 0; cl_index < cl_len; cl_index++) {
		/*
		 * try to push each cluster in turn... cluster_push_x may not
		 * push the cluster if can_delay is TRUE and the cluster doesn't
		 * meet the criteria for an immediate push
		 */
		if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
			l_clusters[cl_index].start_pg = 0;
			l_clusters[cl_index].last_pg  = 0;
		}
	}
	if (cl_len > cl_pushed) {
		/*
		 * we didn't push all of the clusters, so
		 * let's try to merge them back in to the vnode
		 */
		if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
			/*
			 * we picked up some new clusters while we were trying to push the old
			 * ones (I don't think this can happen because I'm holding the lock, but
			 * just in case)... the sum of the leftovers plus the new cluster count
			 * exceeds our ability to represent them, so switch to the sparse cluster mechanism
			 *
			 * first collect the new clusters sitting in the vp
			 */
			sparse_cluster_switch(vp, EOF);

			for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
					continue;
				vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
				vp->v_clusters[cl_index1].last_pg  = l_clusters[cl_index].last_pg;
			}
			/*
			 * update the cluster count
			 */
			vp->v_clen = cl_index1;

			/*
			 * and collect the original clusters that were moved into the
			 * local storage for sorting purposes
			 */
			sparse_cluster_switch(vp, EOF);

			/*
			 * we've got room to merge the leftovers back in
			 * just append them starting at the next 'hole'
			 * represented by vp->v_clen
			 */
			for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
					continue;

				vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
				vp->v_clusters[cl_index1].last_pg  = l_clusters[cl_index].last_pg;
			}
			/*
			 * update the cluster count
			 */
			vp->v_clen = cl_index1;
	}
	return(MAX_CLUSTERS - vp->v_clen);
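/*
 * The nested loops at the top of cluster_try_push perform a simple selection
 * sort: on each pass the remaining cluster with the smallest start_pg is
 * copied into l_clusters[] and its slot in vp->v_clusters[] is emptied by
 * setting start_pg == last_pg.  A stand-alone sketch of the same idea
 * (the helper name and array arguments are illustrative):
 */
#if 0
static void
selection_copy_example(struct v_cluster *src, struct v_cluster *dst, int n)
{
	int i, j, min;

	for (i = 0; i < n; i++) {
		for (min = -1, j = 0; j < n; j++) {
			if (src[j].start_pg == src[j].last_pg)
				continue;			/* empty slot */
			if (min == -1 || src[j].start_pg < src[min].start_pg)
				min = j;
		}
		if (min == -1)
			break;					/* nothing left to copy */
		dst[i] = src[min];
		src[min].start_pg = src[min].last_pg;		/* mark the source slot empty */
	}
}
#endif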
cluster_push_x(vp, EOF, first, last, can_delay)

	upl_page_info_t *pl;
	vm_offset_t      upl_offset;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
		     vp->v_clen, first, last, EOF, 0);

	if ((pages_in_upl = last - first) == 0) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
	}
	upl_size = pages_in_upl * PAGE_SIZE;
	upl_f_offset = ((off_t)first) * PAGE_SIZE_64;

	if (upl_f_offset + upl_size >= EOF) {

		if (upl_f_offset >= EOF) {
			/*
			 * must have truncated the file and missed
			 * clearing a dangling cluster (i.e. it's completely
			 * beyond the new EOF)
			 */
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
		}
		size = EOF - upl_f_offset;

		upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		pages_in_upl = upl_size / PAGE_SIZE;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);

	if (vp->v_flag & VNOCACHE_DATA)
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
	else
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;

	kret = ubc_create_upl(vp,

	if (kret != KERN_SUCCESS)
		panic("cluster_push: failed to get pagelist");

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);

	/*
	 * since we only asked for the dirty pages back
	 * it's possible that we may only get a few or even none, so...
	 * before we start marching forward, we must make sure we know
	 * where the last present page is in the UPL, otherwise we could
	 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
	 * employed by commit_range and abort_range.
	 */
	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
		if (upl_page_present(pl, last_pg))
			break;
	}
	pages_in_upl = last_pg + 1;

	if (pages_in_upl == 0) {
		ubc_upl_abort(upl, 0);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
	}
	for (last_pg = 0; last_pg < pages_in_upl; ) {
		/*
		 * find the next dirty page in the UPL
		 * this will become the first page in the
		 * next I/O to generate
		 */
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
			if (upl_dirty_page(pl, start_pg))
				break;
			if (upl_page_present(pl, start_pg))
				/*
				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
				 * just release these unchanged since we're not going
				 * to steal them or change their state
				 */
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
		}
		if (start_pg >= pages_in_upl)
			/*
			 * done... no more dirty pages to push
			 */
			break;
		if (start_pg > last_pg)
			/*
			 * skipped over some non-dirty pages
			 */
			size -= ((start_pg - last_pg) * PAGE_SIZE);

		/*
		 * find a range of dirty pages to write
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (!upl_dirty_page(pl, last_pg))
				break;
		}
		upl_offset = start_pg * PAGE_SIZE;

		io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

		if (vp->v_flag & VNOCACHE_DATA)
			io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC | CL_DUMP;
		else
			io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC;

		cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
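/*
 * The scan above walks the UPL in two alternating passes: advance start_pg to
 * the next dirty page (aborting any present-but-clean "precious" pages that
 * UPL_RET_ONLY_DIRTY handed back), then extend last_pg across the contiguous
 * dirty run and hand that run to cluster_io.  Repeating until pages_in_upl is
 * exhausted pushes every dirty range in the cluster with at most one I/O per
 * contiguous run.
 */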
sparse_cluster_switch(struct vnode *vp, off_t EOF)

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);

	if ( !(vp->v_flag & VHASDIRTY)) {
		vp->v_flag |= VHASDIRTY;
	}
	for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {

		for (start_pg = vp->v_clusters[cl_index].start_pg; start_pg < vp->v_clusters[cl_index].last_pg; start_pg++) {

			if (ubc_page_op(vp, (off_t)(((off_t)start_pg) * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
				if (flags & UPL_POP_DIRTY)
					sparse_cluster_add(vp, EOF, start_pg, start_pg + 1);
			}
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all)

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, push_all, 0);

		vfs_drt_control(&(vp->v_scmap), 1);

		if (vfs_drt_get_cluster(&(vp->v_scmap), &offset, &length) != KERN_SUCCESS) {
			vp->v_flag &= ~VHASDIRTY;
		}
		first = (daddr_t)(offset / PAGE_SIZE_64);
		last  = (daddr_t)((offset + length) / PAGE_SIZE_64);

		cluster_push_x(vp, EOF, first, last, 0);

		vp->v_scdirty -= (last - first);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last)

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)vp->v_scmap, vp->v_scdirty, first, last, 0);

	offset = (off_t)first * PAGE_SIZE_64;
	length = (last - first) * PAGE_SIZE;

	while (vfs_drt_mark_pages(&(vp->v_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */
		vp->v_scdirty += new_dirty;

		sparse_cluster_push(vp, EOF, 0);

		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	vp->v_scdirty += new_dirty;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
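/*
 * sparse_cluster_add works in bytes while its callers work in pages:
 * offset = first * PAGE_SIZE_64 and length = (last - first) * PAGE_SIZE.
 * When vfs_drt_mark_pages can only record a prefix of the range it reports
 * how many pages it did take in new_dirty, so advancing offset by
 * new_dirty * PAGE_SIZE_64 and shrinking length by new_dirty * PAGE_SIZE
 * retries exactly the unrecorded suffix after a push.
 */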
cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags)

	upl_page_info_t *pl;

	kret = ubc_create_upl(vp,
			      uio->uio_offset & ~PAGE_MASK_64,

	if (kret != KERN_SUCCESS)

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
				   CL_READ, (struct buf *)0, (struct clios *)0);

			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
	}
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

	/*
	 * NOTE: There is no prototype for the following in BSD. It, and the definitions
	 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
	 * way to do so without exporting them to kexts as well.
	 */
	if (flags & CL_READ)
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4);		/* Copy physical to physical and flush the destination */
	else
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8);		/* Copy physical to physical and flush the source */

	if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
				   0, (struct buf *)0, (struct clios *)0);
	}
	uio->uio_offset += xsize;
	iov->iov_base   += xsize;
	iov->iov_len    -= xsize;
	uio->uio_resid  -= xsize;

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)

	upl_page_info_t *pl;
	boolean_t        funnel_state = FALSE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio->uio_resid, upl_offset, xsize, 0);

	if (xsize >= (16 * 1024))
		funnel_state = thread_funnel_set(kernel_flock, FALSE);

	segflg = uio->uio_segflg;

	switch (segflg) {

	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;

		uio->uio_segflg = UIO_PHYS_SYSSPACE;
	}
	pl = ubc_upl_pageinfo(upl);

	pg_index  = upl_offset / PAGE_SIZE;
	pg_offset = upl_offset & PAGE_MASK;
	csize     = min(PAGE_SIZE - pg_offset, xsize);

	while (xsize && retval == 0) {

		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;

		retval = uiomove64(paddr, csize, uio);

		csize = min(PAGE_SIZE, xsize);
	}
	uio->uio_segflg = segflg;

	if (funnel_state == TRUE)
		thread_funnel_set(kernel_flock, TRUE);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio->uio_resid, retval, segflg, 0);
cluster_copy_ubc_data(struct vnode *vp, struct uio *uio, int *io_resid, int mark_dirty)

	memory_object_control_t control;
	int       op_flags = UPL_POP_SET | UPL_POP_BUSY;
	boolean_t funnel_state = FALSE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio->uio_resid, 0, *io_resid, 0);

	control = ubc_getobject(vp, UBC_FLAGS_NONE);
	if (control == MEMORY_OBJECT_CONTROL_NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
			     (int)uio->uio_offset, uio->uio_resid, retval, 3, 0);
	}
		op_flags |= UPL_POP_DIRTY;

	segflg = uio->uio_segflg;

	switch (segflg) {

	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;

		uio->uio_segflg = UIO_PHYS_SYSSPACE;
	}
	io_size      = *io_resid;
	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	f_offset     = uio->uio_offset - start_offset;
	xsize        = min(PAGE_SIZE - start_offset, io_size);

	while (io_size && retval == 0) {

		if (ubc_page_op_with_control(control, f_offset, op_flags, &pgframe, 0) != KERN_SUCCESS)
			break;

		if (funnel_state == FALSE && io_size >= (16 * 1024))
			funnel_state = thread_funnel_set(kernel_flock, FALSE);

		retval = uiomove64((addr64_t)(((addr64_t)pgframe << 12) + start_offset), xsize, uio);

		ubc_page_op_with_control(control, f_offset, UPL_POP_CLR | UPL_POP_BUSY, 0, 0);

		f_offset = uio->uio_offset;
		xsize    = min(PAGE_SIZE, io_size);
	}
	uio->uio_segflg = segflg;
	*io_resid       = io_size;

	if (funnel_state == TRUE)
		thread_funnel_set(kernel_flock, TRUE);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio->uio_resid, retval, 0x80000000 | segflg, 0);
is_file_clean(struct vnode *vp, off_t filesize)

	int total_dirty = 0;

	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
		if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
			if (flags & UPL_POP_DIRTY) {
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */

/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES		256

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is (~(DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
 */
#define DRT_ADDRESS_MASK		(~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)

/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)							\
	do {										\
		(scm)->scm_hashtable[(i)].dhe_control =					\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)							\
	do {										\
		(scm)->scm_hashtable[(i)].dhe_control =					\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
	} while (0)
#define DRT_HASH_CLEAR(scm, i)								\
	do {										\
		(scm)->scm_hashtable[(i)].dhe_control = 0;				\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)							\
	do {										\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);					\
	} while (0)

/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */

/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */
#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	      &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	      (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
	u_int32_t	dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};
/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};

#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)

/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82))	/* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83))	/* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84))	/* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85))	/* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86))	/* offset, length */
								/* 1 (clean, no map) */
								/* 2 (map alloc fail) */
								/* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88))	/* modulus, buckets,
								 * lastclean, iskips */

static void		vfs_drt_sanity(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
					     u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
static kern_return_t	vfs_drt_do_mark_pages(
static void		vfs_drt_trace(
			    struct vfs_drt_clustermap *cmap,
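/*
 * dhe_control packs two fields into one 64-bit word: the aligned file offset
 * of the bucket in the high bits (DRT_ADDRESS_MASK) and the dirty-page count
 * in the low bits (DRT_HASH_COUNT_MASK).  A small sketch of how the macros
 * above compose, using an assumed offset of 0x12345678 (the helper and its
 * values are illustrative only):
 */
#if 0
static void
drt_control_example(struct vfs_drt_clustermap *scm)
{
	u_int64_t offset = 0x12345678;			/* arbitrary file offset */

	DRT_HASH_SET_ADDRESS(scm, 0, offset);		/* stores DRT_ALIGN_ADDRESS(offset): 0x12300000 */
	DRT_HASH_SET_COUNT(scm, 0, 3);			/* low-order bits now hold the page count */

	/* DRT_HASH_GET_ADDRESS(scm, 0) == 0x12300000, DRT_HASH_GET_COUNT(scm, 0) == 3 */
}
#endif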
4197 * Allocate and initialise a sparse cluster map.
4199 * Will allocate a new map, resize or compact an existing map.
4201 * XXX we should probably have at least one intermediate map size,
4202 * as the 1:16 ratio seems a bit drastic.
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
    struct vfs_drt_clustermap *cmap, *ocmap;
    kern_return_t kret;
    u_int64_t offset;
    int nsize, i, active_buckets, index, copycount;

    ocmap = NULL;
    if (cmapp != NULL)
        ocmap = *cmapp;

    /*
     * Decide on the size of the new map.
     */
    if (ocmap == NULL) {
        nsize = DRT_HASH_SMALL_MODULUS;
    } else {
        /* count the number of active buckets in the old map */
        active_buckets = 0;
        for (i = 0; i < ocmap->scm_modulus; i++) {
            if (!DRT_HASH_VACANT(ocmap, i) &&
                (DRT_HASH_GET_COUNT(ocmap, i) != 0))
                active_buckets++;
        }
        /*
         * If we're currently using the small allocation, check to
         * see whether we should grow to the large one.
         */
        if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
            /* if the ring is nearly full */
            if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
                nsize = DRT_HASH_LARGE_MODULUS;
            } else {
                nsize = DRT_HASH_SMALL_MODULUS;
            }
        } else {
            /* already using the large modulus */
            nsize = DRT_HASH_LARGE_MODULUS;
            /*
             * If the ring is completely full, there's
             * nothing useful for us to do.  Behave as
             * though we had compacted into the new
             * array and return.
             */
            if (active_buckets >= DRT_HASH_LARGE_MODULUS)
                return(KERN_SUCCESS);
        }
    }

    /*
     * Allocate and initialise the new map.
     */
    kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
        (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
    if (kret != KERN_SUCCESS)
        return(kret);
    cmap->scm_magic = DRT_SCM_MAGIC;
    cmap->scm_modulus = nsize;
    cmap->scm_buckets = 0;
    cmap->scm_lastclean = 0;
    cmap->scm_iskips = 0;
    for (i = 0; i < cmap->scm_modulus; i++) {
        DRT_HASH_CLEAR(cmap, i);
        DRT_HASH_VACATE(cmap, i);
        DRT_BITVECTOR_CLEAR(cmap, i);
    }

    /*
     * If there's an old map, re-hash entries from it into the new map.
     */
    copycount = 0;
    if (ocmap != NULL) {
        for (i = 0; i < ocmap->scm_modulus; i++) {
            /* skip empty buckets */
            if (DRT_HASH_VACANT(ocmap, i) ||
                (DRT_HASH_GET_COUNT(ocmap, i) == 0))
                continue;
            /* find a slot for this entry in the new map */
            offset = DRT_HASH_GET_ADDRESS(ocmap, i);
            kret = vfs_drt_get_index(&cmap, offset, &index, 1);
            if (kret != KERN_SUCCESS) {
                /* XXX need to bail out gracefully here */
                panic("vfs_drt: new cluster map mysteriously too small");
            }
            /* copy the bucket across */
            DRT_HASH_COPY(ocmap, i, cmap, index);
            copycount++;
        }
    }

    /* log what we've done */
    vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

    /*
     * It's important to ensure that *cmapp always points to
     * a valid map, so we must overwrite it before freeing
     * the old map.
     */
    *cmapp = cmap;
    if (ocmap != NULL) {
        /* emit stats into trace buffer */
        vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
                      ocmap->scm_modulus,
                      ocmap->scm_buckets,
                      ocmap->scm_lastclean,
                      ocmap->scm_iskips);

        vfs_drt_free_map(ocmap);
    }
    return(KERN_SUCCESS);
}

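/*
 * Illustrative sketch (not compiled): the sizing decision made above, in
 * isolation.  It uses the DRT_HASH_*_MODULUS macros already referenced in
 * this file; the helper function itself is invented purely for the example.
 */
#if 0
static int
example_pick_modulus(struct vfs_drt_clustermap *ocmap, int active_buckets)
{
    if (ocmap == NULL)
        return (DRT_HASH_SMALL_MODULUS);        /* a fresh map starts small */

    if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
        /* grow only once the small ring is nearly full */
        return ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) ?
            DRT_HASH_LARGE_MODULUS : DRT_HASH_SMALL_MODULUS);
    }
    return (DRT_HASH_LARGE_MODULUS);            /* already large; stay large */
}
#endif
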
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
    kmem_free(kernel_map, (vm_offset_t)cmap,
        (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
    return(KERN_SUCCESS);
}

/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
    int index, i, tries;

    offset = DRT_ALIGN_ADDRESS(offset);
    index = DRT_HASH(cmap, offset);

    /* traverse the hashtable */
    for (i = 0; i < cmap->scm_modulus; i++) {

        /*
         * If the slot is vacant, we can stop.
         */
        if (DRT_HASH_VACANT(cmap, index))
            break;

        /*
         * If the address matches our offset, we have success.
         */
        if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
            *indexp = index;
            return(KERN_SUCCESS);
        }

        /*
         * Move to the next slot, try again.
         */
        index = DRT_HASH_NEXT(cmap, index);
    }
    return(KERN_FAILURE);
}

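/*
 * Illustrative sketch (not compiled): the lookup above is plain open
 * addressing with linear probing.  The model below uses stand-in types and
 * a caller-supplied table instead of the DRT_HASH_* macros, purely to show
 * the probe-and-wrap pattern in isolation.
 */
#if 0
struct example_slot {
    int       es_vacant;        /* stand-in for DRT_HASH_VACANT() */
    u_int64_t es_address;       /* stand-in for the slot address */
};

static int
example_search(struct example_slot *table, int modulus, u_int64_t key, int *indexp)
{
    int index, i;

    index = (int)(key % modulus);               /* stand-in for DRT_HASH() */
    for (i = 0; i < modulus; i++) {
        if (table[index].es_vacant)
            break;                              /* a hole ends the probe chain */
        if (table[index].es_address == key) {
            *indexp = index;
            return (1);                         /* found */
        }
        index = (index + 1) % modulus;          /* stand-in for DRT_HASH_NEXT() */
    }
    return (0);                                 /* not present */
}
#endif
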
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
    struct vfs_drt_clustermap *cmap;
    kern_return_t kret;
    int index, i;

    cmap = *cmapp;

    /* look for an existing entry */
    kret = vfs_drt_search_index(cmap, offset, indexp);
    if (kret == KERN_SUCCESS)
        return(kret);

    /* need to allocate an entry */
    offset = DRT_ALIGN_ADDRESS(offset);
    index = DRT_HASH(cmap, offset);

    /* scan from the index forwards looking for a vacant slot */
    for (i = 0; i < cmap->scm_modulus; i++) {
        /* slot vacant? */
        if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
            cmap->scm_buckets++;
            if (index < cmap->scm_lastclean)
                cmap->scm_lastclean = index;
            DRT_HASH_SET_ADDRESS(cmap, index, offset);
            DRT_HASH_SET_COUNT(cmap, index, 0);
            DRT_BITVECTOR_CLEAR(cmap, index);
            *indexp = index;
            vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
            return(KERN_SUCCESS);
        }
        cmap->scm_iskips += i;
        index = DRT_HASH_NEXT(cmap, index);
    }

    /*
     * We haven't found a vacant slot, so the map is full.  If we're not
     * already recursed, try reallocating/compacting it.
     */
    if (recursed)
        return(KERN_FAILURE);
    kret = vfs_drt_alloc_map(cmapp);
    if (kret == KERN_SUCCESS) {
        /* now try to insert again */
        kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
    }
    return(kret);
}

/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
    void      **private,
    u_int64_t   offset,
    u_int       length,
    int        *setcountp,
    int         dirty)
{
    struct vfs_drt_clustermap *cmap, **cmapp;
    kern_return_t kret;
    int i, index, pgoff, pgcount, setcount, ecount;

    cmapp = (struct vfs_drt_clustermap **)private;
    cmap = *cmapp;

    vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

    if (setcountp != NULL)
        *setcountp = 0;

    /* allocate a cluster map if we don't already have one */
    if (cmap == NULL) {
        /* no cluster map, nothing to clean */
        if (!dirty) {
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
            return(KERN_SUCCESS);
        }
        kret = vfs_drt_alloc_map(cmapp);
        if (kret != KERN_SUCCESS) {
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
            return(kret);
        }
    }
    setcount = 0;

    /*
     * Iterate over the length of the region.
     */
    while (length > 0) {
        /*
         * Get the hashtable index for this offset.
         *
         * XXX this will add blank entries if we are clearing a range
         * that hasn't been dirtied.
         */
        kret = vfs_drt_get_index(cmapp, offset, &index, 0);
        cmap = *cmapp;  /* may have changed! */
        /* this may be a partial-success return */
        if (kret != KERN_SUCCESS) {
            if (setcountp != NULL)
                *setcountp = setcount;
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

            return(kret);
        }

        /*
         * Work out how many pages we're modifying in this
         * hashtable entry.
         */
        pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
        pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

        /*
         * Iterate over pages, dirty/clearing as we go.
         */
        ecount = DRT_HASH_GET_COUNT(cmap, index);
        for (i = 0; i < pgcount; i++) {
            if (dirty) {
                if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
                    DRT_HASH_SET_BIT(cmap, index, pgoff + i);
                    ecount++;
                    setcount++;
                }
            } else {
                if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
                    DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
                    ecount--;
                    setcount++;
                }
            }
        }
        DRT_HASH_SET_COUNT(cmap, index, ecount);

        offset += pgcount * PAGE_SIZE;
        length -= pgcount * PAGE_SIZE;
    }
    if (setcountp != NULL)
        *setcountp = setcount;

    vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

    return(KERN_SUCCESS);
}

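/*
 * Illustrative sketch (not compiled): how the loop above splits a byte range
 * across hashtable entries.  Each entry covers DRT_BITVECTOR_PAGES pages
 * starting at a DRT_ALIGN_ADDRESS()-aligned offset; the page size, bitvector
 * size and alignment macro below are stand-in values for the example only.
 */
#if 0
#define EXAMPLE_PAGE_SIZE        4096   /* stand-in for PAGE_SIZE */
#define EXAMPLE_BITVECTOR_PAGES  256    /* stand-in for DRT_BITVECTOR_PAGES */
#define EXAMPLE_ALIGN(o) \
    ((o) & ~((u_int64_t)EXAMPLE_BITVECTOR_PAGES * EXAMPLE_PAGE_SIZE - 1))

static void
example_split_range(u_int64_t offset, u_int length)
{
    int pgoff, pgcount;

    while (length > 0) {
        /* page offset of 'offset' within its entry's bitvector */
        pgoff = (offset - EXAMPLE_ALIGN(offset)) / EXAMPLE_PAGE_SIZE;
        /* pages we can mark before running off the end of this entry */
        pgcount = min((length / EXAMPLE_PAGE_SIZE),
            (EXAMPLE_BITVECTOR_PAGES - pgoff));
        /* ... mark bits [pgoff, pgoff + pgcount) in this entry here ... */
        offset += pgcount * EXAMPLE_PAGE_SIZE;
        length -= pgcount * EXAMPLE_PAGE_SIZE;
    }
}
#endif
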
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
{
    /* XXX size unused, drop from interface */
    return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}

static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
    return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}

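/*
 * Illustrative sketch (not compiled): marking and unmarking pages with the
 * two entry points above.  The offsets and lengths are arbitrary example
 * values; 'private' is the opaque per-file map pointer, which must start
 * out NULL.
 */
#if 0
static void
example_mark_unmark(void)
{
    void *private = NULL;
    int   newly_dirty = 0;

    /* mark four pages dirty starting at an (example) 1MB file offset */
    (void) vfs_drt_mark_pages(&private, (off_t)1024 * 1024, 4 * PAGE_SIZE, &newly_dirty);

    /* later, mark the first two of those pages clean again */
    (void) vfs_drt_unmark_pages(&private, (off_t)1024 * 1024, 2 * PAGE_SIZE);
}
#endif
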
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
    struct vfs_drt_clustermap *cmap;
    u_int64_t offset;
    u_int     length;
    int       index, i, j, fs, ls;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL))
        return(KERN_FAILURE);
    cmap = *cmapp;

    /* walk the hashtable */
    for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
        index = DRT_HASH(cmap, offset);

        if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
            continue;

        /* scan the bitfield for a string of bits */
        fs = -1;

        for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
            if (DRT_HASH_TEST_BIT(cmap, index, i)) {
                fs = i;
                break;
            }
        }
        if (fs == -1) {
            /* didn't find any bits set */
            panic("vfs_drt: entry summary count > 0 but no bits set in map");
        }
        for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
            if (!DRT_HASH_TEST_BIT(cmap, index, i))
                break;
        }

        /* compute offset and length, mark pages clean */
        offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
        length = ls * PAGE_SIZE;
        vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
        cmap->scm_lastclean = index;

        /* return successful */
        *offsetp = (off_t)offset;
        *lengthp = length;

        vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
        return(KERN_SUCCESS);
    }
    /*
     * We didn't find anything... hashtable is empty
     * emit stats into trace buffer and
     * then free it
     */
    vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
                  cmap->scm_modulus,
                  cmap->scm_buckets,
                  cmap->scm_lastclean,
                  cmap->scm_iskips);

    vfs_drt_free_map(cmap);
    *cmapp = NULL;

    return(KERN_FAILURE);
}

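/*
 * Illustrative sketch (not compiled): draining dirty clusters with
 * vfs_drt_get_cluster().  'privatep' points at the opaque map pointer
 * previously populated by vfs_drt_mark_pages(); the function name and loop
 * body are invented for the example.  Each successful call hands back one
 * contiguous run of dirty pages and marks it clean; once the map is empty
 * the call returns KERN_FAILURE and the private storage has been released.
 */
#if 0
static void
example_drain_clusters(void **privatep)
{
    off_t offset;
    u_int length;

    while (vfs_drt_get_cluster(privatep, &offset, &length) == KERN_SUCCESS) {
        /* ... issue the write for pages [offset, offset + length) here ... */
    }
    /* *privatep is NULL here; the map was freed when it became empty */
}
#endif
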
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
    struct vfs_drt_clustermap *cmap;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL))
        return(KERN_FAILURE);
    cmap = *cmapp;

    switch (op_type) {
    case 0:
        /* emit stats into trace buffer */
        vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
                      cmap->scm_modulus,
                      cmap->scm_buckets,
                      cmap->scm_lastclean,
                      cmap->scm_iskips);

        vfs_drt_free_map(cmap);
        *cmapp = NULL;
        break;

    case 1:
        cmap->scm_lastclean = 0;
        break;
    }
    return(KERN_SUCCESS);
}

/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
static void
vfs_drt_trace(struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
    KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}

/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
    int index, i;
    int bits_on;

    for (index = 0; index < cmap->scm_modulus; index++) {
        if (DRT_HASH_VACANT(cmap, index))
            continue;

        for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
            if (DRT_HASH_TEST_BIT(cmap, index, i))
                bits_on++;
        }
        if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
            panic("bits_on = %d,  index = %d\n", bits_on, index);
    }
}
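
/*
 * Illustrative sketch (not compiled): the invariant vfs_drt_sanity() checks.
 * Each hash entry carries a summary count that must equal the number of set
 * bits in its page bitvector; the model below states the same check for a
 * single stand-in 64-bit bitvector word rather than the DRT_HASH_* macros.
 */
#if 0
static int
example_count_matches(u_int64_t bitvector, int summary_count)
{
    int bits_on;

    for (bits_on = 0; bitvector != 0; bitvector >>= 1) {
        if (bitvector & 1)
            bits_on++;
    }
    return (bits_on == summary_count);
}
#endif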