/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>

#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>
#define CL_COMMIT	0x04
#define CL_PAGEOUT	0x10
#define CL_NOZERO	0x80
#define CL_PAGEIN	0x100
#define CL_DEV_MEMORY	0x200
#define CL_PRESERVE	0x400
#define CL_THROTTLE	0x800
#define CL_KEEPCACHED	0x1000
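/*
 * State shared between an issuer of async cluster I/O and cluster_iodone():
 * the issuing side adds to io_issued, the completion side adds to
 * io_completed and latches the first error, and io_wanted marks a thread
 * sleeping on the stream so the completion side knows to issue a wakeup.
 */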
struct clios {
	u_int	io_completed;	/* amount of io that has currently completed */
	u_int	io_issued;	/* amount of io that was successfully issued */
	int	io_error;	/* error code of first error encountered */
	int	io_wanted;	/* someone is sleeping waiting for a change in state */
};
static lck_grp_t	*cl_mtx_grp;
static lck_attr_t	*cl_mtx_attr;
static lck_grp_attr_t	*cl_mtx_grp_attr;
static lck_mtx_t	*cl_mtxp;
static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
		      int flags, buf_t real_bp, struct clios *iostate);
static int cluster_iodone(buf_t bp, void *dummy);
static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
static int cluster_hard_throttle_on(vnode_t vp);

static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
			   off_t headOff, off_t tailOff, int flags);
static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);

static void cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);

static int  cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
static void cluster_push_EOF(vnode_t vp, off_t EOF);

static int  cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);

static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int	is_file_clean(vnode_t, off_t);
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define HARD_THROTTLE_MAXCNT	0
#define HARD_THROTTLE_MAXSIZE	(64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;
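/*
 * cluster_init: one-time allocation of the lock group, lock attribute and
 * the cl_mtxp mutex that serializes updates and waits on the per-stream
 * clios state used by the async cluster I/O paths.
 */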
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();

	/*
	 * allocate and initialize mutex's used to protect updates and waits
	 * on the cluster_io context
	 */
	cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_mtxp == NULL)
		panic("cluster_init: failed to allocate cl_mtxp");
#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02
/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
			ubc->cl_rahead = rap;
		else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
		return (rap);

	return ((struct cl_readahead *)NULL);
}
/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning it to the caller
 */
static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
	struct ubc_info		*ubc;
	struct cl_writebehind	*wbp;

	ubc = vp->v_ubcinfo;

	if ((wbp = ubc->cl_wbehind) == NULL) {

		if ( !(flags & CLW_ALLOCATE))
			return ((struct cl_writebehind *)NULL);

		MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL)
			ubc->cl_wbehind = wbp;
		else {
			lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
			FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED)
		lck_mtx_lock(&wbp->cl_lockw);

	return (wbp);
}
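/*
 * Roughly: writes to the root device are "hard throttled" while the root is
 * flagged busy, or while a priority I/O to it completed within the last
 * 200 msec (hard_throttle_maxelapsed); cluster_io then clamps the I/O size
 * to HARD_THROTTLE_MAXSIZE and uses HARD_THROTTLE_MAXCNT as the async
 * throttle for such requests.
 */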
static int
cluster_hard_throttle_on(vnode_t vp)
{
	static struct timeval hard_throttle_maxelapsed = { 0, 200000 };

	if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
		struct timeval elapsed;

		if (hard_throttle_on_root)
			return(1);

		microuptime(&elapsed);
		timevalsub(&elapsed, &priority_IO_timestamp_for_root);

		if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
			return(1);
	}
	return(0);
}
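/*
 * cluster_iodone runs as each component buf of a chained transaction
 * completes; once every buf in the chain is B_DONE it accumulates the
 * error/resid totals, zero-fills any partial EOF page, updates the shared
 * clios stream state, finishes the caller's real_bp if one was attached,
 * and commits or aborts the UPL range covered by the transaction.
 */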
static int
cluster_iodone(buf_t bp, __unused void *dummy)
{
	struct clios *iostate;

	cbp_head = (buf_t)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
		/*
		 * all I/O requests that are part of this transaction
		 * have to complete before we can process it
		 */
		if ( !(cbp->b_flags & B_DONE)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);

			return 0;
		}
	}
	upl_offset  = cbp->b_uploffset;
	b_flags     = cbp->b_flags;
	real_bp     = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate     = (struct clios *)cbp->b_iostate;

	if (real_bp)
		real_bp->b_dev = cbp->b_dev;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		free_io_buf(cbp);

		cbp = cbp_next;
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock(cl_mtxp);

		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(cl_mtxp);

		if (need_wakeup)
			wakeup((caddr_t)&iostate->io_wanted);
	}
	if ((b_flags & B_NEED_IODONE) && real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (b_flags & B_COMMIT_UPL) {
		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error || (b_flags & B_NOCACHE)) {
			if (b_flags & B_PAGEIO) {
				if (b_flags & B_READ)
					page_in  = 1;
				else
					page_out = 1;
			}
			if (b_flags & B_CACHE)		/* leave pages in the cache unchanged on error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (page_out && (error != ENXIO))	/* transient error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (page_in)
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
					    upl_abort_code);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     0x80000000|upl_abort_code, 0);
		} else {
			int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ))
				upl_commit_flags |= UPL_COMMIT_SET_DIRTY;

			if (b_flags & B_AGE)
				upl_commit_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
					     upl_commit_flags);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     upl_commit_flags, 0);
		}
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			     (int)upl, upl_offset, 0, error, 0);
	}
	return (error);
}
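/*
 * cluster_zero: zero 'size' bytes of the UPL starting at upl_offset.  If the
 * buffer isn't mapped (no b_datap) the pages are zeroed one at a time through
 * their physical addresses with bzero_phys; otherwise a plain bzero on the
 * mapped address suffices.
 */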
void
cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, (int)bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {

		pl = ubc_upl_pageinfo(upl);

		while (size) {
			page_index  = upl_offset / PAGE_SIZE;
			page_offset = upl_offset & PAGE_MASK;

			zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
			zero_cnt  = min(PAGE_SIZE - page_offset, size);

			bzero_phys(zero_addr, zero_cnt);

			size       -= zero_cnt;
			upl_offset += zero_cnt;
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}
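/*
 * cluster_io is the common back end for the read, write, pagein and pageout
 * paths: it walks the byte range, asks the filesystem for physical extents
 * via VNOP_BLOCKMAP, carves the UPL into one or more chained bufs sized to
 * the device limits, and hands them to VNOP_STRATEGY, with cluster_iodone
 * committing or aborting the UPL as the chained transactions complete.
 */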
static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	   int flags, buf_t real_bp, struct clios *iostate)
{
	buf_t	cbp_head = NULL;
	buf_t	cbp_tail = NULL;
	int	async_throttle = 0;

	if (mp->mnt_devblocksize > 1) {
		/*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zero's that are sitting beyond the EOF
		 * so that in case the filesystem doesn't explicitly zero this area
		 * if a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
		pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	} else {
		/*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size
		 * AFP is one such filesystem/device
		 */
		size = non_rounded_size;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
		     (int)f_offset, size, upl_offset, flags, 0);
	if (flags & CL_READ) {
		bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
		bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
			if (max_iosize > HARD_THROTTLE_MAXSIZE)
				max_iosize = HARD_THROTTLE_MAXSIZE;
			async_throttle = HARD_THROTTLE_MAXCNT;
		} else
			async_throttle = VNODE_ASYNC_THROTTLE;
	}
	io_flags |= B_NOCACHE;
	if (flags & (CL_PAGEIN | CL_PAGEOUT))
		io_flags |= B_PAGEIO;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_PRESERVE)
		io_flags |= B_PHYS;
	if (flags & CL_KEEPCACHED)
		io_flags |= B_CACHE;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL)))
			break;

		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
			real_bp->b_blkno = blkno;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)blkno, io_size, zero_offset, 0);

		/*
		 * vnop_blockmap didn't return an error... however, it did
		 * return an extent size of 0 which means we can't
		 * make forward progress on this I/O... a hole in the
		 * file would be returned as a blkno of -1 with a non-zero io_size
		 * a real extent is returned with a blkno != -1 and a non-zero io_size
		 */
		if ( !(flags & CL_READ) && blkno == -1) {
			off_t	e_offset;

			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			if ( !(flags & CL_COMMIT)) {
				/*
				 * currently writes always request the commit to happen
				 * as part of the io completion... however, if the CL_COMMIT
				 * flag isn't specified, then we can't issue the abort_range
				 * since the call site is going to abort or commit the same upl..
				 * in this case we can only return an error
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will insure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 */
			ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);

			e_offset = round_page_64(f_offset + 1);

			if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
				error = EINVAL;
				break;
			}
			io_size = e_offset - f_offset;

			f_offset   += io_size;
			upl_offset += io_size;

			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			continue;
		}
		lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * currently, can't deal with reading 'holes' in file
			 */
			if (blkno == -1) {
				error = EINVAL;
				break;
			}
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && blkno == -1) {
			int bytes_to_zero;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (zero_offset && io_size == size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
			} else
				bytes_to_zero = io_size;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head)
				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				pg_count = (io_size - pg_offset) / PAGE_SIZE;
			else {
				/*
				 * no pending I/O to pick up that first page
				 * so, we have to make sure it gets committed
				 * set the pg_offset to 0 so that the upl_commit_range
				 * starts with this page
				 */
				pg_count = (io_size + pg_offset) / PAGE_SIZE;
			}
			if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
				/*
				 * if we're done with the request for this UPL
				 * then we have to make sure to commit the last page
				 * even if we only partially zero-filled it
				 */
				pg_count++;

			if (pg_count) {
				if (pg_offset)
					pg_resid = PAGE_SIZE - pg_offset;
				else
					pg_resid = 0;

				if (flags & CL_COMMIT)
					ubc_upl_commit_range(upl,
							     (upl_offset + pg_resid) & ~PAGE_MASK,
							     pg_count * PAGE_SIZE,
							     UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;
			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (cbp_head && pg_count)
			continue;
		}
		if (pg_count > max_vectors) {
			if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
				io_size  = PAGE_SIZE - pg_offset;
				pg_count = 1;
			} else {
				io_size -= (pg_count - max_vectors) * PAGE_SIZE;
				pg_count = max_vectors;
			}
		}
		if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
			/*
			 * if we're not targeting a virtual device i.e. a disk image
			 * it's safe to dip into the reserve pool since real devices
			 * can complete this I/O request without requiring additional
			 * bufs from the alloc_io_buf pool
			 */
			priv = 1;
		else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
			/*
			 * Throttle the speculative IO
			 */
			priv = 0;
		else
			priv = 1;

		cbp = alloc_io_buf(vp, priv);

		if (flags & CL_PAGEOUT) {
			for (i = 0; i < pg_count; i++) {
				if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
					panic("BUSY bp found in cluster_io");
			}
		}
		if (flags & CL_ASYNC) {
			if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
				panic("buf_setcallback failed\n");
		}
		cbp->b_flags |= io_flags;

		cbp->b_lblkno = lblkno;
		cbp->b_blkno  = blkno;
		cbp->b_bcount = io_size;

		if (buf_setupl(cbp, upl, upl_offset))
			panic("buf_setupl failed\n");

		cbp->b_trans_next = (buf_t)NULL;

		if ((cbp->b_iostate = (void *)iostate))
			/*
			 * caller wants to track the state of this
			 * io... bump the amount issued against this stream
			 */
			iostate->io_issued += io_size;

		if (flags & CL_READ) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		} else {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}
		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;
		}
		(buf_t)(cbp->b_trans_head) = cbp_head;
		trans_count++;
		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;
		/*
		 * keep track of how much of the original request
		 * that we've actually completed... non_rounded_size
		 * may go negative due to us rounding the request
		 * to a page size multiple (i.e. size > non_rounded_size)
		 */
		non_rounded_size -= io_size;

		if (non_rounded_size <= 0) {
			/*
			 * we've transferred all of the data in the original
			 * request, but we were unable to complete the tail
			 * of the last page because the file didn't have
			 * an allocation to back that portion... this is ok.
			 */
			size = 0;
		}
		if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
			/*
			 * if we have no more I/O to issue or
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and it's either an ASYNC request or
			 * we've already accumulated more than 8 I/O's into
			 * this transaction and it's not an I/O directed to
			 * special DEVICE memory
			 * then go ahead and issue the I/O
			 */
			if (real_bp) {
				cbp_head->b_flags |= B_NEED_IODONE;
				cbp_head->b_real_bp = real_bp;
			} else
				cbp_head->b_real_bp = (buf_t)NULL;

			if (size == 0) {
				/*
				 * we're about to issue the last I/O for this upl
				 * if this was a read to the eof and the eof doesn't
				 * finish on a page boundary, then we need to zero-fill
				 * the rest of the page....
				 */
				cbp_head->b_validend = zero_offset;
			} else
				cbp_head->b_validend = 0;

			if (flags & CL_THROTTLE)
				(void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");
			for (cbp = cbp_head; cbp;) {

				if ( !(io_flags & B_READ))
					vnode_startwrite(vp);

				cbp_next = cbp->b_trans_next;

				(void) VNOP_STRATEGY(cbp);

				cbp = cbp_next;
			}
			if ( !(flags & CL_ASYNC)) {

				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
					buf_biowait(cbp);

				if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
					if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
						error = 0;	/* drop the error */
				}
			}
			cbp_head = (buf_t)NULL;
			cbp_tail = (buf_t)NULL;
= cbp_head
; cbp
;) {
1007 upl_offset
-= cbp
->b_bcount
;
1008 size
+= cbp
->b_bcount
;
1009 io_size
+= cbp
->b_bcount
;
1011 cbp_next
= cbp
->b_trans_next
;
1016 int need_wakeup
= 0;
1019 * update the error condition for this stream
1020 * since we never really issued the io
1021 * just go ahead and adjust it back
1023 lck_mtx_lock(cl_mtxp
);
1025 if (iostate
->io_error
== 0)
1026 iostate
->io_error
= error
;
1027 iostate
->io_issued
-= io_size
;
1029 if (iostate
->io_wanted
) {
1031 * someone is waiting for the state of
1032 * this io stream to change
1034 iostate
->io_wanted
= 0;
1037 lck_mtx_unlock(cl_mtxp
);
1040 wakeup((caddr_t
)&iostate
->io_wanted
);
1042 pg_offset
= upl_offset
& PAGE_MASK
;
1043 abort_size
= (size
+ pg_offset
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
1045 if (flags
& CL_COMMIT
) {
1048 if (flags
& CL_PRESERVE
) {
1049 ubc_upl_commit_range(upl
, upl_offset
- pg_offset
, abort_size
,
1050 UPL_COMMIT_FREE_ON_EMPTY
);
1052 if ((flags
& CL_PAGEOUT
) && (error
!= ENXIO
)) /* transient error */
1053 upl_abort_code
= UPL_ABORT_FREE_ON_EMPTY
;
1054 else if (flags
& CL_PAGEIN
)
1055 upl_abort_code
= UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_ERROR
;
1057 upl_abort_code
= UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_DUMP_PAGES
;
1059 ubc_upl_abort_range(upl
, upl_offset
- pg_offset
, abort_size
,
1062 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 28)) | DBG_FUNC_NONE
,
1063 (int)upl
, upl_offset
- pg_offset
, abort_size
, error
, 0);
1066 real_bp
->b_flags
|= B_ERROR
;
1067 real_bp
->b_error
= error
;
1069 buf_biodone(real_bp
);
1074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 22)) | DBG_FUNC_END
,
1075 (int)f_offset
, size
, upl_offset
, retval
, 0);
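/*
 * cluster_rd_prefetch: clip the requested read-ahead to MAX_UPL_TRANSFER
 * pages and to the end of the file, hand it to advisory_read(), and return
 * the number of pages the prefetch covers (0 if already at or past EOF).
 */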
static int
cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
{
	int	pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return(0);
	}
	if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
		size = (MAX_UPL_TRANSFER * PAGE_SIZE);
	else
		size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;
	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read(vp, filesize, f_offset, size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}
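/*
 * cluster_rd_ahead: decide whether the current read extends a sequential
 * pattern; if so, double cl_ralen (up to MAX_UPL_TRANSFER pages), start the
 * prefetch just past the larger of the request and cl_maxra, and remember in
 * cl_maxra how far read-ahead has now gone.
 */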
static void
cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
{
	int	size_of_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
		return;
	}
	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
				    (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {
		rap->cl_ralen = 0;
		rap->cl_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
		return;
	}
	if (extent->e_addr < rap->cl_maxra) {
		if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
			return;
		}
	}
	r_addr   = max(extent->e_addr, rap->cl_maxra) + 1;
	f_offset = (off_t)(r_addr * PAGE_SIZE_64);

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		daddr64_t read_size;

		rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;

		read_size = (extent->e_addr + 1) - extent->b_addr;

		if (read_size > rap->cl_ralen) {
			if (read_size > MAX_UPL_TRANSFER)
				rap->cl_ralen = MAX_UPL_TRANSFER;
			else
				rap->cl_ralen = read_size;
		}
		size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);

		if (size_of_prefetch)
			rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
}
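/*
 * cluster_pageout: VM-initiated cleaning of dirty file pages.  Translate the
 * UPL flags into CL_* flags (throttling only for non-virtual devices), sanity
 * check the range against the EOF, mark the vnode as having been paged, and
 * push the clipped range through cluster_io.
 */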
int
cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags)
{
	struct cl_writebehind *wbp;

	if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
		/*
		 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
		 * then we don't want to enforce this throttle... if we do, we can
		 * potentially deadlock since we're stalling the pageout thread at a time
		 * when the disk image might need additional memory (which won't be available
		 * if the pageout thread can't run)... instead we'll just depend on the throttle
		 * that the pageout thread now has in place to deal with external files
		 */
		local_flags = CL_PAGEOUT;
	else
		local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;
	if ((flags & UPL_KEEPCACHED))
		local_flags |= CL_KEEPCACHED;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (size <= 0)
		return (EINVAL);

	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EROFS);
	}
	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
					    UPL_ABORT_FREE_ON_EMPTY);
	}
	if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
		wbp->cl_hasbeenpaged = 1;

	return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
			   local_flags, (buf_t)NULL, (struct clios *)NULL));
}
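/*
 * cluster_pagein: fault-driven read of file-backed pages.  After the same
 * range checks as pageout, the clipped range is issued via cluster_io with
 * CL_READ | CL_PAGEIN, and on success the read-ahead context is updated (and
 * possibly primed) unless read-ahead is disabled for this vnode or request.
 */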
int
cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
	       int size, off_t filesize, int flags)
{
	int	local_flags = 0;

	if (upl == NULL || size < 0)
		panic("cluster_pagein: NULL upl passed in");

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size && (local_flags & CL_COMMIT))
		ubc_upl_abort_range(upl, upl_offset + rounded_size,
				    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
			    local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);

	if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
		struct cl_readahead *rap;

		rap = cluster_get_rap(vp);

		if (rap != NULL) {
			struct cl_extent extent;

			extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
			extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);

			if (rounded_size == PAGE_SIZE) {
				/*
				 * we haven't read the last page in of the file yet
				 * so let's try to read ahead if we're in
				 * a sequential access pattern
				 */
				cluster_rd_ahead(vp, &extent, filesize, rap);
			}
			rap->cl_lastr = extent.e_addr;

			lck_mtx_unlock(&rap->cl_lockr);
		}
	}
	return (retval);
}
int
cluster_bp(buf_t bp)
{
	off_t	f_offset;
	int	flags;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
		     (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (bp->b_flags & B_READ)
		flags = CL_ASYNC | CL_READ;
	else
		flags = CL_ASYNC;

	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

	return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
}
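/*
 * cluster_write: top-level entry for file writes.  Cached writes (or calls
 * with no uio, or with a non-user-space uio) go straight to cluster_write_x.
 * Uncached user-space writes are examined one iovec at a time: physically
 * contiguous buffers go through cluster_phys_write, page-aligned bulk data
 * goes through cluster_nocopy_write, and unaligned head/tail pieces fall
 * back to the cached path.
 */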
int
cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
{
	flags = xflags;

	if (vp->v_flag & VNOCACHE_DATA)
		flags |= IO_NOCACHE;

	if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
		/*
		 * go do a write through the cache if one of the following is true....
		 *   NOCACHE is not true
		 *   there is no uio structure or it doesn't target USERSPACE
		 */
		return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
	}
#if LP64_DEBUG
	if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
		panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
	}
#endif /* LP64_DEBUG */

	while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
		user_size_t iov_len;
		user_addr_t iov_base;

		/*
		 * we know we have a resid, so this is safe
		 * skip over any empty vectors
		 */
		uio_update(uio, (user_size_t)0);

		iov_len  = uio_curriovlen(uio);
		iov_base = uio_curriovbase(uio);

		upl_size  = PAGE_SIZE;
		upl_flags = UPL_QUERY_OBJECT_TYPE;

		// LP64todo - fix this!
		if ((vm_map_get_upl(current_map(),
				    (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
				    &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
			/*
			 * the user app must have passed in an invalid address
			 */
			return (EFAULT);
		}
		/*
		 * We check every vector target but if it is physically
		 * contiguous space, we skip the sanity checks.
		 */
		if (upl_flags & UPL_PHYS_CONTIG) {

			zflags  = flags & ~IO_TAILZEROFILL;
			zflags |= IO_HEADZEROFILL;

			if (flags & IO_HEADZEROFILL) {
				/*
				 * in case we have additional vectors, we don't want to do this again
				 */
				flags &= ~IO_HEADZEROFILL;

				if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
					return(retval);
			}
			retval = cluster_phys_write(vp, uio, newEOF);

			if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
				return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
			}
		}
		else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
			/*
			 * we're here because we don't have a physically contiguous target buffer
			 * go do a write through the cache if one of the following is true....
			 *   the total xfer size is less than a page...
			 *   we're being asked to ZEROFILL either the head or the tail of the I/O...
			 */
			return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
		}
		// LP64todo - fix this!
		else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
			if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
				/*
				 * Bring the file offset write up to a pagesize boundary
				 * this will also bring the base address to a page boundary
				 * since they both are currently on the same offset within a page
				 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
				 * so the computed clip_size must always be less than the current uio_resid
				 */
				clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));

				/*
				 * Fake the resid going into the cluster_write_x call
				 * and restore it on the way out.
				 */
				// LP64todo - fix this
				prev_resid = uio_resid(uio);
				uio_setresid(uio, clip_size);

				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

				uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
			} else {
				/*
				 * can't get both the file offset and the buffer offset aligned to a page boundary
				 * so fire an I/O through the cache for this entire vector
				 */
				// LP64todo - fix this
				clip_size = iov_len;
				// LP64todo - fix this
				prev_resid = uio_resid(uio);
				uio_setresid(uio, clip_size);

				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

				uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
			}
		} else {
			/*
			 * If we come in here, we know the offset into
			 * the file is on a pagesize boundary and the
			 * target buffer address is also on a page boundary
			 */
			max_io_size = newEOF - uio->uio_offset;
			// LP64todo - fix this
			clip_size = uio_resid(uio);
			if (iov_len < clip_size)
				// LP64todo - fix this!
				clip_size = iov_len;
			if (max_io_size < clip_size)
				clip_size = max_io_size;

			if (clip_size < PAGE_SIZE) {
				/*
				 * Take care of tail end of write in this vector
				 */
				// LP64todo - fix this
				prev_resid = uio_resid(uio);
				uio_setresid(uio, clip_size);

				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

				uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
			} else {
				/* round clip_size down to a multiple of pagesize */
				clip_size = clip_size & ~(PAGE_MASK);
				// LP64todo - fix this
				prev_resid = uio_resid(uio);
				uio_setresid(uio, clip_size);

				retval = cluster_nocopy_write(vp, uio, newEOF);

				if ((retval == 0) && uio_resid(uio))
					retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

				uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
			}
		}
	}
	return (retval);
}
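/*
 * cluster_nocopy_write: direct (uncached) write path.  The caller guarantees
 * a page-aligned file offset and a page-multiple resid; each pass wires up to
 * MAX_UPL_TRANSFER pages of the user buffer, dumps any stale cached pages for
 * that range, and issues the I/O async through cluster_io, using the clios
 * counters to cap outstanding I/O and to drain it before returning.
 */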
static int
cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
{
	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset;
	int		 upl_needed_size;
	int		 force_data_sync;
	struct clios	 iostate;
	struct cl_writebehind *wbp;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
		     (int)uio->uio_offset, (int)uio_resid(uio),
		     (int)newEOF, 0, 0);

	/*
	 * When we enter this routine, we know
	 *  -- the offset into the file is on a pagesize boundary
	 *  -- the resid is a page multiple
	 *  -- the resid will not exceed iov_len
	 */
	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {

		cluster_try_push(wbp, vp, newEOF, 0, 1);

		lck_mtx_unlock(&wbp->cl_lockw);
	}
	iostate.io_completed = 0;
	iostate.io_issued    = 0;
	iostate.io_error     = 0;
	iostate.io_wanted    = 0;

	while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
		user_addr_t iov_base;

		io_size = uio_resid(uio);

		if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			io_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		iov_base = uio_curriovbase(uio);

		// LP64todo - fix this!
		upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;

		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			pages_in_pl = 0;
			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
				    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			// LP64todo - fix this!
			kret = vm_map_get_upl(current_map(),
					      (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
					      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
					     0, 0, 0, kret, 0);
				/*
				 * cluster_nocopy_write: failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_writes;
			}
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			pages_in_pl = upl_size / PAGE_SIZE;

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			/*
			 * didn't get all the pages back that we
			 * needed... release this upl and try again
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				     i, pages_in_pl, upl_size, kret, 0);
			/*
			 * for some reason, we couldn't acquire a hold on all
			 * the pages needed in the user's address space
			 *
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_writes;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size != upl_needed_size)
			io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, (int)iov_base, io_size, 0);

		if (io_size == 0) {
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_writes;
		}
		/*
		 * Now look for pages already in the cache
		 * and throw them away.
		 * uio->uio_offset is page aligned within the file
		 * io_size is a multiple of PAGE_SIZE
		 */
		ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);

		/*
		 * we want push out these writes asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding writes
		 * wait until some complete before issuing the next
		 */
		lck_mtx_lock(cl_mtxp);

		while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
			iostate.io_wanted = 1;
			msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
		}
		lck_mtx_unlock(cl_mtxp);

		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);

			goto wait_for_writes;
		}
		io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
			     (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

		error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				   io_size, io_flag, (buf_t)NULL, &iostate);

		uio_update(uio, (user_size_t)io_size);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
			     (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
	}

wait_for_writes:
	/*
	 * make sure all async writes issued as part of this stream
	 * have completed before we return
	 */
	lck_mtx_lock(cl_mtxp);

	while (iostate.io_issued != iostate.io_completed) {
		iostate.io_wanted = 1;
		msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
	}
	lck_mtx_unlock(cl_mtxp);

	if (iostate.io_error)
		error = iostate.io_error;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);

	return (error);
}
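/*
 * cluster_phys_write: write from a physically contiguous user buffer.  After
 * pushing any delayed clusters, the buffer is wired via vm_map_get_upl;
 * device-block-unaligned head and tail pieces go through
 * cluster_align_phys_io, and the aligned middle is issued synchronously with
 * CL_DEV_MEMORY so cluster_io treats it as one physical extent.
 */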
static int
cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
{
	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset;
	int		 upl_needed_size;
	user_addr_t	 iov_base;
	struct cl_writebehind *wbp;

	devblocksize = vp->v_mount->mnt_devblocksize;
	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 *  -- the vector target address is physically contiguous
	 */
	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {

		cluster_try_push(wbp, vp, newEOF, 0, 1);

		lck_mtx_unlock(&wbp->cl_lockw);
	}
#if LP64_DEBUG
	if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
		panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
	}
#endif /* LP64_DEBUG */

	// LP64todo - fix this!
	io_size  = (int)uio_curriovlen(uio);
	iov_base = uio_curriovbase(uio);

	upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
	upl_needed_size = upl_offset + io_size;

	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
		    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	// LP64todo - fix this!
	kret = vm_map_get_upl(current_map(),
			      (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
			      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * cluster_phys_write: failed to get pagelist
		 * note: return kret here
		 */
		return(EINVAL);
	}
	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 * This is a failure in the physical memory case.
	 */
	if (upl_size < upl_needed_size) {
		ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
		return(EINVAL);
	}
	pl = ubc_upl_pageinfo(upl);

	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {

		head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);

		if (error) {
			ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
			return(EINVAL);
		}
		upl_offset += head_size;
		src_paddr  += head_size;
		io_size    -= head_size;
	}
	tail_size = io_size & (devblocksize - 1);
	io_size  -= tail_size;

	if (io_size) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				   io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
	}
	if (error == 0) {
		/*
		 * The cluster_io write completed successfully,
		 * update the uio structure
		 */
		uio_update(uio, (user_size_t)io_size);

		src_paddr += io_size;

		if (tail_size)
			error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
	}
	/*
	 * just release our hold on the physically contiguous
	 * region without changing any state
	 */
	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

	return (error);
}
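/*
 * cluster_write_x: buffered write path.  Each pass of the main loop maps up
 * to MAX_UPL_TRANSFER pages of the file into a UPL, pre-reads any partially
 * valid edge pages, zero-fills head/tail ranges as requested, copies the
 * user data into the UPL, and then either issues the I/O immediately
 * (IO_SYNC) or hands the dirty range to the delayed write-behind/sparse
 * cluster machinery.
 */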
static int
cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
{
	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset = 0;
	long long	 total_size;
	long long	 zero_cnt1;
	struct cl_extent cl;
	struct cl_writebehind *wbp;

	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {

		if (wbp->cl_hasbeenpaged) {
			/*
			 * this vnode had pages cleaned to it by
			 * the pager which indicates that either
			 * it's not very 'hot', or the system is
			 * being overwhelmed by a lot of dirty
			 * data being delayed in the VM cache...
			 * in either event, we'll push our remaining
			 * delayed data at this point...  this will
			 * be more efficient than paging out 1 page at
			 * a time, and will also act as a throttle
			 * by delaying this client from writing any
			 * more data until all his delayed data has
			 * at least been queued to the underlying driver.
			 */
			if (wbp->cl_number || wbp->cl_scmap)
				cluster_push_EOF(vp, newEOF);

			wbp->cl_hasbeenpaged = 0;
		}
	}
	if (uio) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
			     (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);

		// LP64todo - fix this
		io_resid = uio_resid(uio);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
			     0, 0, (int)oldEOF, (int)newEOF, 0);

		io_resid = 0;
	}
	if (flags & IO_HEADZEROFILL) {
		/*
		 * some filesystems (HFS is one) don't support unallocated holes within a file...
		 * so we zero fill the intervening space between the old EOF and the offset
		 * where the next chunk of real data begins.... ftruncate will also use this
		 * routine to zero fill to the new EOF when growing a file... in this case, the
		 * uio structure will not be provided
		 */
		if (uio) {
			if (headOff < uio->uio_offset) {
				zero_cnt = uio->uio_offset - headOff;
				zero_off = headOff;
			}
		} else if (headOff < newEOF) {
			zero_cnt = newEOF - headOff;
			zero_off = headOff;
		}
	}
	if (flags & IO_TAILZEROFILL) {
		if (uio) {
			// LP64todo - fix this
			zero_off1 = uio->uio_offset + uio_resid(uio);

			if (zero_off1 < tailOff)
				zero_cnt1 = tailOff - zero_off1;
		}
	}
	if (zero_cnt == 0 && uio == (struct uio *) 0) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
			     retval, 0, 0, 0, 0);
		return (0);
	}
	while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
		/*
		 * for this iteration of the loop, figure out where our starting point is
		 */
		if (zero_cnt) {
			start_offset = (int)(zero_off & PAGE_MASK_64);
			upl_f_offset = zero_off - start_offset;
		} else if (io_resid) {
			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
			upl_f_offset = uio->uio_offset - start_offset;
		} else {
			start_offset = (int)(zero_off1 & PAGE_MASK_64);
			upl_f_offset = zero_off1 - start_offset;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
			     (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);

		if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			total_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);

		if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
			/*
			 * assumption... total_size <= io_resid
			 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
			 */
			if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
				total_size -= start_offset;
			xfer_resid = total_size;

			retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);

			if (retval)
				break;

			io_resid    -= (total_size - xfer_resid);
			total_size   = xfer_resid;
			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
			upl_f_offset = uio->uio_offset - start_offset;

			if (total_size == 0) {
				/*
				 * the write did not finish on a page boundary
				 * which will leave upl_f_offset pointing to the
				 * beginning of the last page written instead of
				 * the page beyond it... bump it in this case
				 * so that the cluster code records the last page
				 * written as dirty
				 */
				upl_f_offset += PAGE_SIZE_64;
			}
		}
		/*
		 * compute the size of the upl needed to encompass
		 * the requested write... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		pages_in_upl = upl_size / PAGE_SIZE;
		io_size      = upl_size - start_offset;

		if ((long long)io_size > total_size)
			io_size = total_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);

		/*
		 * Gather the pages from the buffer cache.
		 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
		 * that we intend to modify these pages.
		 */
		kret = ubc_create_upl(vp,
				      upl_f_offset,
				      upl_size,
				      &upl,
				      &pl,
				      UPL_SET_LITE | UPL_WILL_MODIFY);
		if (kret != KERN_SUCCESS)
			panic("cluster_write: failed to get pagelist");

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
			     (int)upl, (int)upl_f_offset, start_offset, 0, 0);

		if (start_offset && !upl_valid_page(pl, 0)) {

			/*
			 * we're starting in the middle of the first page of the upl
			 * and the page isn't currently valid, so we're going to have
			 * to read it in first... this is a synchronous operation
			 */
			read_size = PAGE_SIZE;

			if ((upl_f_offset + read_size) > newEOF)
				read_size = newEOF - upl_f_offset;

			retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
					    CL_READ, (buf_t)NULL, (struct clios *)NULL);
			if (retval) {
				/*
				 * we had an error during the read which causes us to abort
				 * the current cluster_write request... before we do, we need
				 * to release the rest of the pages in the upl without modifying
				 * their state and mark the failed page in error
				 */
				ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);

				if (upl_size > PAGE_SIZE)
					ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
					     (int)upl, 0, 0, retval, 0);
				break;
			}
		}
		if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
			/*
			 * the last offset we're writing to in this upl does not end on a page
			 * boundary... if it's not beyond the old EOF, then we'll also need to
			 * pre-read this page in if it isn't already valid
			 */
			upl_offset = upl_size - PAGE_SIZE;

			if ((upl_f_offset + start_offset + io_size) < oldEOF &&
			    !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {

				read_size = PAGE_SIZE;

				if ((upl_f_offset + upl_offset + read_size) > newEOF)
					read_size = newEOF - (upl_f_offset + upl_offset);

				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
						    CL_READ, (buf_t)NULL, (struct clios *)NULL);
				if (retval) {
					/*
					 * we had an error during the read which causes us to abort
					 * the current cluster_write request... before we do, we
					 * need to release the rest of the pages in the upl without
					 * modifying their state and mark the failed page in error
					 */
					ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);

					if (upl_size > PAGE_SIZE)
						ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
						     (int)upl, 0, 0, retval, 0);
					break;
				}
			}
		}
        xfer_resid = io_size;
        io_offset  = start_offset;

        while (zero_cnt && xfer_resid) {

            if (zero_cnt < (long long)xfer_resid)
                bytes_to_zero = zero_cnt;
            else
                bytes_to_zero = xfer_resid;

            if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
                cluster_zero(upl, io_offset, bytes_to_zero, NULL);
            } else {
                bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
                zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);

                if ( !upl_valid_page(pl, zero_pg_index)) {
                    cluster_zero(upl, io_offset, bytes_to_zero, NULL);

                } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
                           !upl_dirty_page(pl, zero_pg_index)) {
                    cluster_zero(upl, io_offset, bytes_to_zero, NULL);
                }
            }
            xfer_resid -= bytes_to_zero;
            zero_cnt   -= bytes_to_zero;
            zero_off   += bytes_to_zero;
            io_offset  += bytes_to_zero;
        }
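        /*
         * Illustrative note (not part of the original source): when IO_NOZERODIRTY
         * is the only zero-suppression flag set, a page is still zeroed here if it
         * is not valid in the UPL, or if it is valid but not dirty; only valid and
         * dirty pages are left alone.  With IO_NOZEROVALID set, any valid page is
         * skipped.  bytes_to_zero is also clipped so each pass never crosses a page
         * boundary of zero_off, e.g. with PAGE_SIZE = 4096 and zero_off = 0x1f00,
         * at most 4096 - 0xf00 = 256 bytes are zeroed before the next page is
         * considered.
         */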
        if (xfer_resid && io_resid) {
            bytes_to_move = min(io_resid, xfer_resid);

            retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);

            if (retval) {
                ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                             (int)upl, 0, 0, retval, 0);
            } else {
                io_resid   -= bytes_to_move;
                xfer_resid -= bytes_to_move;
                io_offset  += bytes_to_move;
            }
        }
        while (xfer_resid && zero_cnt1 && retval == 0) {

            if (zero_cnt1 < (long long)xfer_resid)
                bytes_to_zero = zero_cnt1;
            else
                bytes_to_zero = xfer_resid;

            if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
                cluster_zero(upl, io_offset, bytes_to_zero, NULL);
            } else {
                bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
                zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);

                if ( !upl_valid_page(pl, zero_pg_index)) {
                    cluster_zero(upl, io_offset, bytes_to_zero, NULL);
                } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
                           !upl_dirty_page(pl, zero_pg_index)) {
                    cluster_zero(upl, io_offset, bytes_to_zero, NULL);
                }
            }
            xfer_resid -= bytes_to_zero;
            zero_cnt1  -= bytes_to_zero;
            zero_off1  += bytes_to_zero;
            io_offset  += bytes_to_zero;
        }
        io_size += start_offset;

        if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
            /*
             * if we're extending the file with this write
             * we'll zero fill the rest of the page so that
             * if the file gets extended again in such a way as to leave a
             * hole starting at this EOF, we'll have zeros in the correct spot
             */
            cluster_zero(upl, io_size, upl_size - io_size, NULL);
        }
        if (flags & IO_SYNC)
            /*
             * if the IO_SYNC flag is set then we need to
             * bypass any clusters and immediately issue the I/O
             */
            goto issue_io;

        /*
         * take the lock to protect our accesses
         * of the writebehind and sparse cluster state
         */
        wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

        /*
         * calculate the last logical block number
         * that this delayed I/O encompassed
         */
        cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);

        if (wbp->cl_scmap) {

            if ( !(flags & IO_NOCACHE)) {
                /*
                 * we've fallen into the sparse
                 * cluster method of delaying dirty pages
                 * first, we need to release the upl if we hold one
                 * since pages in it may be present in the sparse cluster map
                 * and may span 2 separate buckets there... if they do and
                 * we happen to have to flush a bucket to make room and it intersects
                 * this upl, a deadlock may result on page BUSY
                 */
                ubc_upl_commit_range(upl, 0, upl_size,
                                     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

                sparse_cluster_add(wbp, vp, &cl, newEOF);

                lck_mtx_unlock(&wbp->cl_lockw);

                continue;
            }
            /*
             * must have done cached writes that fell into
             * the sparse cluster mechanism... we've switched
             * to uncached writes on the file, so go ahead
             * and push whatever's in the sparse map
             * and switch back to normal clustering
             *
             * see the comment above concerning a possible deadlock...
             */
            ubc_upl_commit_range(upl, 0, upl_size,
                                 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
            /*
             * setting upl_size to 0 keeps us from committing a
             * second time in the start_new_cluster path
             */
            upl_size = 0;

            sparse_cluster_push(wbp, vp, newEOF, 1);

            /*
             * no clusters of either type present at this point
             * so just go directly to start_new_cluster since
             * we know we need to delay this I/O since we've
             * already released the pages back into the cache
             * to avoid the deadlock with sparse_cluster_push
             */
            goto start_new_cluster;
        }
        if (wbp->cl_number == 0)
            /*
             * no clusters currently present
             */
            goto start_new_cluster;

        for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
            /*
             * check each cluster that we currently hold
             * try to merge some or all of this write into
             * one or more of the existing clusters... if
             * any portion of the write remains, start a
             * new cluster
             */
            if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
                /*
                 * the current write starts at or after the current cluster
                 */
                if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
                    /*
                     * we have a write that fits entirely
                     * within the existing cluster limits
                     */
                    if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
                        /*
                         * update our idea of where the cluster ends
                         */
                        wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
                    break;
                }
                if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
                    /*
                     * we have a write that starts in the middle of the current cluster
                     * but extends beyond the cluster's limit... we know this because
                     * of the previous checks
                     * we'll extend the current cluster to the max
                     * and update the b_addr for the current write to reflect that
                     * the head of it was absorbed into this cluster...
                     * note that we'll always have a leftover tail in this case since
                     * full absorption would have occurred in the clause above
                     */
                    wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;

                    if (upl_size) {
                        daddr64_t start_pg_in_upl;

                        start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);

                        if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
                            intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);

                            ubc_upl_commit_range(upl, upl_offset, intersection,
                                                 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
                            upl_f_offset += intersection;
                            upl_offset   += intersection;
                            upl_size     -= intersection;
                        }
                    }
                    cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
                }
                /*
                 * we come here for the case where the current write starts
                 * beyond the limit of the existing cluster or we have a leftover
                 * tail after a partial absorption
                 *
                 * in either case, we'll check the remaining clusters before
                 * starting a new one
                 */
            } else {
                /*
                 * the current write starts in front of the cluster we're currently considering
                 */
                if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
                    /*
                     * we can just merge the new request into
                     * this cluster and leave it in the cache
                     * since the resulting cluster is still
                     * less than the maximum allowable size
                     */
                    wbp->cl_clusters[cl_index].b_addr = cl.b_addr;

                    if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
                        /*
                         * the current write completely
                         * envelops the existing cluster and since
                         * each write is limited to at most MAX_UPL_TRANSFER bytes
                         * we can just use the start and last blocknos of the write
                         * to generate the cluster limits
                         */
                        wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
                    }
                    break;
                }
                /*
                 * if we were to combine this write with the current cluster
                 * we would exceed the cluster size limit.... so,
                 * let's see if there's any overlap of the new I/O with
                 * the cluster we're currently considering... in fact, we'll
                 * stretch the cluster out to its full limit and see if we
                 * get an intersection with the current write
                 */
                if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
                    /*
                     * the current write extends into the proposed cluster
                     * clip the length of the current write after first combining its
                     * tail with the newly shaped cluster
                     */
                    wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;

                    if (upl_size) {
                        intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);

                        if (intersection > upl_size)
                            /*
                             * because the current write may consist of a number of pages found in the cache
                             * which are not part of the UPL, we may have an intersection that exceeds
                             * the size of the UPL that is also part of this write
                             */
                            intersection = upl_size;

                        ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
                                             UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
                        upl_size -= intersection;
                    }
                    cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
                }
                /*
                 * if we get here, there was no way to merge
                 * any portion of this write with this cluster
                 * or we could only merge part of it which
                 * will leave a tail...
                 * we'll check the remaining clusters before starting a new one
                 */
            }
        }
        if (cl_index < wbp->cl_number)
            /*
             * we found an existing cluster(s) that we
             * could entirely merge this I/O into
             */
            goto delay_io;

        if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
            /*
             * we didn't find an existing cluster to
             * merge into, but there's room to start
             * a new one
             */
            goto start_new_cluster;
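        /*
         * Worked example (illustrative only, not from the original source;
         * assumes MAX_UPL_TRANSFER is 256 pages): given an existing cluster with
         * b_addr = 100, e_addr = 300 and a new write covering pages [250, 400),
         * the write starts at or after the cluster's b_addr but its e_addr (400)
         * exceeds b_addr + MAX_UPL_TRANSFER (356), so it cannot be absorbed whole.
         * Since 250 < 356 the cluster is stretched to its limit (e_addr = 356),
         * the head of the write [250, 356) is treated as absorbed, and cl.b_addr
         * is advanced to 356, leaving the tail [356, 400) to be matched against
         * the remaining clusters or used to start a new cluster.
         */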
        /*
         * no existing cluster to merge with and no
         * room to start a new one... we'll try
         * pushing one of the existing ones... if none of
         * them are able to be pushed, we'll switch
         * to the sparse cluster mechanism
         * cluster_try_push updates cl_number to the
         * number of remaining clusters... and
         * returns the number of currently unused clusters
         */
        int ret_cluster_try_push = 0;
        /* if writes are not deferred, call cluster push immediately */
        if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
            if (flags & IO_NOCACHE)
                can_delay = 0;
            else
                can_delay = 1;

            ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
        }
        /* execute the following regardless of whether writes are deferred or not */
        if (ret_cluster_try_push == 0) {
            /*
             * no more room in the normal cluster mechanism
             * so let's switch to the more expansive but expensive
             * sparse mechanism....
             * first, we need to release the upl if we hold one
             * since pages in it may be present in the sparse cluster map (after the cluster_switch)
             * and may span 2 separate buckets there... if they do and
             * we happen to have to flush a bucket to make room and it intersects
             * this upl, a deadlock may result on page BUSY
             */
            ubc_upl_commit_range(upl, upl_offset, upl_size,
                                 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

            sparse_cluster_switch(wbp, vp, newEOF);
            sparse_cluster_add(wbp, vp, &cl, newEOF);

            lck_mtx_unlock(&wbp->cl_lockw);
        }
        /*
         * we pushed one cluster successfully, so we must be sequentially writing this file
         * otherwise, we would have failed and fallen into the sparse cluster support
         * so let's take the opportunity to push out additional clusters as long as we
         * remain below the throttle... this will give us better I/O locality if we're
         * in a copy loop (i.e. we won't jump back and forth between the read and write points)
         * however, we don't want to push so much out that the write throttle kicks in and
         * hangs this thread up until some of the I/O completes...
         */
        if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
            while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
                cluster_try_push(wbp, vp, newEOF, 0, 0);
        }

start_new_cluster:
        wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
        wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;

        if (flags & IO_NOCACHE)
            wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
        else
            wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
        wbp->cl_number++;

delay_io:
        ubc_upl_commit_range(upl, upl_offset, upl_size,
                             UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

        lck_mtx_unlock(&wbp->cl_lockw);
issue_io:
        /*
         * we don't hold the vnode lock at this point
         *
         * because we had to ask for a UPL that provides currently non-present pages, the
         * UPL has been automatically set to clear the dirty flags (both software and hardware)
         * upon committing it... this is not the behavior we want since it's possible for
         * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
         * in order to maintain some semblance of coherency with mapped writes
         * we need to drop the current upl and pick it back up with COPYOUT_FROM set
         * so that we correctly deal with a change in state of the hardware modify bit...
         * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
         * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
         * responsible for generating the correct sized I/O(s)
         */
        ubc_upl_commit_range(upl, 0, upl_size,
                             UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

        cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;

        retval = cluster_push_x(vp, &cl, newEOF, flags);
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
                 retval, 0, io_resid, 0, 0);
cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
{
    if (vp->v_flag & VNOCACHE_DATA)
        flags |= IO_NOCACHE;
    if (vp->v_flag & VRAOFF)
        flags |= IO_RAOFF;

    if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
        /*
         * go do a read through the cache if one of the following is true....
         *   NOCACHE is not true
         *   the uio request doesn't target USERSPACE
         */
        return (cluster_read_x(vp, uio, filesize, flags));
    }

#if LP64_DEBUG
    if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
        panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
    }
#endif /* LP64_DEBUG */

    while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
        user_size_t iov_len;
        user_addr_t iov_base;

        /*
         * we know we have a resid, so this is safe
         * skip over any empty vectors
         */
        uio_update(uio, (user_size_t)0);

        iov_len  = uio_curriovlen(uio);
        iov_base = uio_curriovbase(uio);

        upl_size  = PAGE_SIZE;
        upl_flags = UPL_QUERY_OBJECT_TYPE;

        // LP64todo - fix this!
        if ((vm_map_get_upl(current_map(),
                            (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
                            &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
            /*
             * the user app must have passed in an invalid address
             */
        }

        /*
         * We check every vector target but if it is physically
         * contiguous space, we skip the sanity checks.
         */
        if (upl_flags & UPL_PHYS_CONTIG) {
            retval = cluster_phys_read(vp, uio, filesize);
        }
        else if (uio_resid(uio) < PAGE_SIZE) {
            /*
             * we're here because we don't have a physically contiguous target buffer
             * go do a read through the cache if
             * the total xfer size is less than a page...
             */
            return (cluster_read_x(vp, uio, filesize, flags));
        }
        // LP64todo - fix this!
        else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
            if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
                /*
                 * Bring the file offset read up to a pagesize boundary
                 * this will also bring the base address to a page boundary
                 * since they both are currently on the same offset within a page
                 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
                 * so the computed clip_size must always be less than the current uio_resid
                 */
                clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));

                /*
                 * Fake the resid going into the cluster_read_x call
                 * and restore it on the way out.
                 */
                prev_resid = uio_resid(uio);
                // LP64todo - fix this
                uio_setresid(uio, clip_size);

                retval = cluster_read_x(vp, uio, filesize, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
            } else {
                /*
                 * can't get both the file offset and the buffer offset aligned to a page boundary
                 * so fire an I/O through the cache for this entire vector
                 */
                // LP64todo - fix this!
                clip_size = iov_len;
                prev_resid = uio_resid(uio);
                uio_setresid(uio, clip_size);

                retval = cluster_read_x(vp, uio, filesize, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
            }
        } else {
            /*
             * If we come in here, we know the offset into
             * the file is on a pagesize boundary
             */
            max_io_size = filesize - uio->uio_offset;
            // LP64todo - fix this
            clip_size = uio_resid(uio);
            if (iov_len < clip_size)
                clip_size = iov_len;
            if (max_io_size < clip_size)
                clip_size = (int)max_io_size;

            if (clip_size < PAGE_SIZE) {
                /*
                 * Take care of the tail end of the read in this vector.
                 */
                // LP64todo - fix this
                prev_resid = uio_resid(uio);
                uio_setresid(uio, clip_size);

                retval = cluster_read_x(vp, uio, filesize, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
            } else {
                /* round clip_size down to a multiple of pagesize */
                clip_size = clip_size & ~(PAGE_MASK);
                // LP64todo - fix this
                prev_resid = uio_resid(uio);
                uio_setresid(uio, clip_size);

                retval = cluster_nocopy_read(vp, uio, filesize);

                if ((retval == 0) && uio_resid(uio))
                    retval = cluster_read_x(vp, uio, filesize, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
            }
        }
    }
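    /*
     * Illustrative example (not part of the original source): with
     * PAGE_SIZE = 4096, an uncached user-space read at file offset 0x1200 into
     * a buffer at address 0x7fff5200 has matching page offsets (0x200 == 0x200),
     * so the request is clipped to clip_size = 4096 - 0x200 = 3584 bytes and
     * sent through cluster_read_x; once both the file offset and the buffer are
     * page aligned, later passes can use cluster_nocopy_read.  If the two page
     * offsets had differed, the whole vector would go through the cache instead.
     */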
cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
{
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;
    off_t            last_ioread_offset;
    off_t            last_request_offset;
    u_int            size_of_prefetch;
    u_int            max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
    u_int            rd_ahead_enabled = 1;
    u_int            prefetch_enabled = 1;
    struct cl_readahead * rap;
    struct clios          iostate;
    struct cl_extent      extent;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
                 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);

    // LP64todo - fix this
    last_request_offset = uio->uio_offset + uio_resid(uio);

    if ((flags & (IO_RAOFF|IO_NOCACHE)) ||
        ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
        rd_ahead_enabled = 0;
        rap = NULL;
    } else {
        if (cluster_hard_throttle_on(vp)) {
            rd_ahead_enabled = 0;
            prefetch_enabled = 0;

            max_rd_size = HARD_THROTTLE_MAXSIZE;
        }
        if ((rap = cluster_get_rap(vp)) == NULL)
            rd_ahead_enabled = 0;
    }
    if (last_request_offset > filesize)
        last_request_offset = filesize;
    extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
    extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;

    if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
        /*
         * determine if we already have a read-ahead in the pipe courtesy of the
         * last read systemcall that was issued...
         * if so, pick up its extent to determine where we should start
         * with respect to any read-ahead that might be necessary to
         * garner all the data needed to complete this read systemcall
         */
        last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;

        if (last_ioread_offset < uio->uio_offset)
            last_ioread_offset = (off_t)0;
        else if (last_ioread_offset > last_request_offset)
            last_ioread_offset = last_request_offset;
    } else
        last_ioread_offset = (off_t)0;
    while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
        /*
         * compute the size of the upl needed to encompass
         * the requested read... limit each call to cluster_io
         * to the maximum UPL size... cluster_io will clip if
         * this exceeds the maximum io_size for the device,
         * make sure to account for
         * a starting offset that's not page aligned
         */
        start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
        upl_f_offset = uio->uio_offset - (off_t)start_offset;
        max_size     = filesize - uio->uio_offset;

        // LP64todo - fix this!
        if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
            io_size = uio_resid(uio);
        else
            io_size = max_size;

        if (!(flags & IO_NOCACHE)) {
            /*
             * if we keep finding the pages we need already in the cache, then
             * don't bother to call cluster_rd_prefetch since it costs CPU cycles
             * to determine that we have all the pages we need... once we miss in
             * the cache and have issued an I/O, then we'll assume that we're likely
             * to continue to miss in the cache and it's to our advantage to try and prefetch
             */
            if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
                if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
                    /*
                     * we've already issued I/O for this request and
                     * there's still work to do and
                     * our prefetch stream is running dry, so issue a
                     * pre-fetch I/O... the I/O latency will overlap
                     * with the copying of the data
                     */
                    if (size_of_prefetch > max_rd_size)
                        size_of_prefetch = max_rd_size;

                    size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);

                    last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

                    if (last_ioread_offset > last_request_offset)
                        last_ioread_offset = last_request_offset;
                }
            }
            /*
             * limit the size of the copy we're about to do so that
             * we can notice that our I/O pipe is running dry and
             * get the next I/O issued before it does go dry
             */
            if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
                io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
            else
                io_resid = io_size;

            io_requested = io_resid;

            retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);

            io_size -= (io_requested - io_resid);

            if (retval || io_resid)
                /*
                 * if we run into a real error or
                 * a page that is not in the cache
                 * we need to leave streaming mode
                 */
                break;

            if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
                /*
                 * we're already finished the I/O for this read request
                 * let's see if we should do a read-ahead
                 */
                cluster_rd_ahead(vp, &extent, filesize, rap);
            }
        }
        if (io_size == 0) {
            if (rap != NULL) {
                if (extent.e_addr < rap->cl_lastr)
                    rap->cl_lastr = -1;
                else
                    rap->cl_lastr = extent.e_addr;
            }
            break;
        }
        start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
        upl_f_offset = uio->uio_offset - (off_t)start_offset;
        max_size     = filesize - uio->uio_offset;

        if (io_size > max_rd_size)
            io_size = max_rd_size;

        upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
            upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
        pages_in_upl = upl_size / PAGE_SIZE;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
                     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

        kret = ubc_create_upl(vp,
                              upl_f_offset,
                              upl_size,
                              &upl,
                              &pl,
                              UPL_SET_LITE);
        if (kret != KERN_SUCCESS)
            panic("cluster_read: failed to get pagelist");

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
                     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

        /*
         * scan from the beginning of the upl looking for the first
         * non-valid page.... this will become the first page in
         * the request we're going to make to 'cluster_io'... if all
         * of the pages are valid, we won't call through to 'cluster_io'
         */
        for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
            if (!upl_valid_page(pl, start_pg))
                break;
        }

        /*
         * scan from the starting invalid page looking for a valid
         * page before the end of the upl is reached, if we
         * find one, then it will be the last page of the request to
         * 'cluster_io'
         */
        for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
            if (upl_valid_page(pl, last_pg))
                break;
        }
        iostate.io_completed = 0;
        iostate.io_issued = 0;
        iostate.io_error = 0;
        iostate.io_wanted = 0;
        if (start_pg < last_pg) {
            /*
             * we found a range of 'invalid' pages that must be filled
             * if the last page in this range is the last page of the file
             * we may have to clip the size of it to keep from reading past
             * the end of the last physical block associated with the file
             */
            upl_offset = start_pg * PAGE_SIZE;
            io_size    = (last_pg - start_pg) * PAGE_SIZE;

            if ((upl_f_offset + upl_offset + io_size) > filesize)
                io_size = filesize - (upl_f_offset + upl_offset);

            /*
             * issue an asynchronous read to cluster_io
             */
            error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
                               io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
        }
        if (error == 0) {
            /*
             * if the read completed successfully, or there was no I/O request
             * issued, then copy the data into user land via 'cluster_upl_copy_data'
             * we'll first add on any 'valid'
             * pages that were present in the upl when we acquired it.
             */
            for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
                if (!upl_valid_page(pl, uio_last))
                    break;
            }
            /*
             * compute size to transfer this round, if uio->uio_resid is
             * still non-zero after this attempt, we'll loop around and
             * set up for another I/O.
             */
            val_size = (uio_last * PAGE_SIZE) - start_offset;

            if (val_size > max_size)
                val_size = max_size;

            if (val_size > uio_resid(uio))
                // LP64todo - fix this
                val_size = uio_resid(uio);

            if (last_ioread_offset == 0)
                last_ioread_offset = uio->uio_offset + val_size;

            if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
                /*
                 * if there's still I/O left to do for this request, and...
                 * we're not in hard throttle mode, then issue a
                 * pre-fetch I/O... the I/O latency will overlap
                 * with the copying of the data
                 */
                size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);

                last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

                if (last_ioread_offset > last_request_offset)
                    last_ioread_offset = last_request_offset;

            } else if ((uio->uio_offset + val_size) == last_request_offset) {
                /*
                 * this transfer will finish this request, so...
                 * let's try to read ahead if we're in
                 * a sequential access pattern and we haven't
                 * explicitly disabled it
                 */
                if (rd_ahead_enabled)
                    cluster_rd_ahead(vp, &extent, filesize, rap);

                if (rap != NULL) {
                    if (extent.e_addr < rap->cl_lastr)
                        rap->cl_lastr = -1;
                    else
                        rap->cl_lastr = extent.e_addr;
                }
            }
            lck_mtx_lock(cl_mtxp);

            while (iostate.io_issued != iostate.io_completed) {
                iostate.io_wanted = 1;
                msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
            }
            lck_mtx_unlock(cl_mtxp);

            if (iostate.io_error)
                error = iostate.io_error;
            else
                retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
        }
        if (start_pg < last_pg) {
            /*
             * compute the range of pages that we actually issued an I/O for
             * and either commit them as valid if the I/O succeeded
             * or abort them if the I/O failed
             */
            io_size = (last_pg - start_pg) * PAGE_SIZE;

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
                         (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);

            if (error || (flags & IO_NOCACHE))
                ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
                                    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
            else
                ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
                                     UPL_COMMIT_CLEAR_DIRTY |
                                     UPL_COMMIT_FREE_ON_EMPTY |
                                     UPL_COMMIT_INACTIVATE);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
                         (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
        }
        if ((last_pg - start_pg) < pages_in_upl) {
            /*
             * the set of pages that we issued an I/O for did not encompass
             * the entire upl... so just release these without modifying
             * their state
             */
            if (error)
                ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
            else {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
                             (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

                /*
                 * we found some already valid pages at the beginning of
                 * the upl commit these back to the inactive list with
                 * reference cleared
                 */
                for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
                    commit_flags = UPL_COMMIT_FREE_ON_EMPTY
                                 | UPL_COMMIT_INACTIVATE;

                    if (upl_dirty_page(pl, cur_pg))
                        commit_flags |= UPL_COMMIT_SET_DIRTY;

                    if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
                        ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
                                            UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
                    else
                        ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
                                             PAGE_SIZE, commit_flags);
                }
                if (last_pg < uio_last) {
                    /*
                     * we found some already valid pages immediately after the
                     * pages we issued I/O for, commit these back to the
                     * inactive list with reference cleared
                     */
                    for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
                        commit_flags = UPL_COMMIT_FREE_ON_EMPTY
                                     | UPL_COMMIT_INACTIVATE;

                        if (upl_dirty_page(pl, cur_pg))
                            commit_flags |= UPL_COMMIT_SET_DIRTY;

                        if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
                            ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
                                                UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
                        else
                            ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
                                                 PAGE_SIZE, commit_flags);
                    }
                }
                if (uio_last < pages_in_upl) {
                    /*
                     * there were some invalid pages beyond the valid pages
                     * that we didn't issue an I/O for, just release them
                     */
                    ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
                                        (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
                }

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
                             (int)upl, -1, -1, 0, 0);
            }
        }
        if ( uio_resid(uio) ) {
            if (cluster_hard_throttle_on(vp)) {
                rd_ahead_enabled = 0;
                prefetch_enabled = 0;

                max_rd_size = HARD_THROTTLE_MAXSIZE;
            } else {
                rd_ahead_enabled = 1;
                prefetch_enabled = 1;

                max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
            }
        }
    }
    if (rap != NULL) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
                     (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);

        lck_mtx_unlock(&rap->cl_lockr);
    } else {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
                     (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
    }
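    /*
     * Illustrative note (not part of the original source): the streaming
     * heuristic above keeps last_ioread_offset pointing just past the data
     * already covered by issued I/O.  For example, if a previous call left
     * rap->cl_maxra at page 99 (PAGE_SIZE = 4096), last_ioread_offset starts at
     * (99 * 4096) + 4096 = 409600; as long as the gap between last_ioread_offset
     * and the current uio offset stays within max_rd_size, a prefetch for the
     * remaining (last_request_offset - last_ioread_offset) bytes is issued so
     * the disk latency overlaps with copying data out of the cache.
     */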
cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
{
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;
    int              upl_needed_size;
    int              force_data_sync;
    int              no_zero_fill = 0;
    struct clios     iostate;
    u_int            max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
    u_int            max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
                 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);

    /*
     * When we enter this routine, we know
     *  -- the offset into the file is on a pagesize boundary
     *  -- the resid is a page multiple
     *  -- the resid will not exceed iov_len
     */

    iostate.io_completed = 0;
    iostate.io_issued = 0;
    iostate.io_error = 0;
    iostate.io_wanted = 0;

    while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
        user_addr_t iov_base;

        if (cluster_hard_throttle_on(vp)) {
            max_rd_size  = HARD_THROTTLE_MAXSIZE;
            max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
        } else {
            max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
            max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 8;
        }
        max_io_size = filesize - uio->uio_offset;

        // LP64todo - fix this
        if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
            io_size = max_io_size;
        else
            io_size = uio_resid(uio);

        /*
         * First look for pages already in the cache
         * and move them to user space.
         */
        retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);

        if (retval) {
            /*
             * we may have already spun some portion of this request
             * off as async requests... we need to wait for the I/O
             * to complete before returning
             */
            goto wait_for_reads;
        }
        /*
         * If we are already finished with this read, then return
         */
        if (io_size == 0) {
            /*
             * we may have already spun some portion of this request
             * off as async requests... we need to wait for the I/O
             * to complete before returning
             */
            goto wait_for_reads;
        }
        max_io_size = io_size;

        if (max_io_size > max_rd_size)
            max_io_size = max_rd_size;

        ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);

        if (io_size == 0) {
            /*
             * we may have already spun some portion of this request
             * off as async requests... we need to wait for the I/O
             * to complete before returning
             */
            goto wait_for_reads;
        }
        iov_base = uio_curriovbase(uio);

        // LP64todo - fix this!
        upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
        upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
                     (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

        if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
            no_zero_fill = 1;
            abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
        } else {
            no_zero_fill = 0;
            abort_flag = UPL_ABORT_FREE_ON_EMPTY;
        }
        for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
            upl_size = upl_needed_size;
            upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

            if (no_zero_fill)
                upl_flags |= UPL_NOZEROFILL;
            if (force_data_sync)
                upl_flags |= UPL_FORCE_DATA_SYNC;

            // LP64todo - fix this!
            kret = vm_map_create_upl(current_map(),
                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
                                     &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);

            if (kret != KERN_SUCCESS) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
                             (int)upl_offset, upl_size, io_size, kret, 0);
                /*
                 * cluster_nocopy_read: failed to get pagelist
                 *
                 * we may have already spun some portion of this request
                 * off as async requests... we need to wait for the I/O
                 * to complete before returning
                 */
                goto wait_for_reads;
            }
            pages_in_pl = upl_size / PAGE_SIZE;
            pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

            for (i = 0; i < pages_in_pl; i++) {
                if (!upl_valid_page(pl, i))
                    break;
            }
            if (i == pages_in_pl)
                break;

            ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
        }
        if (force_data_sync >= 3) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
                         (int)upl_offset, upl_size, io_size, kret, 0);

            goto wait_for_reads;
        }
        /*
         * Consider the possibility that upl_size wasn't satisfied.
         */
        if (upl_size != upl_needed_size)
            io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

        if (io_size == 0) {
            ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
            goto wait_for_reads;
        }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
                     (int)upl_offset, upl_size, io_size, kret, 0);
        /*
         * request asynchronously so that we can overlap
         * the preparation of the next I/O
         * if there are already too many outstanding reads
         * wait until some have completed before issuing the next read
         */
        lck_mtx_lock(cl_mtxp);

        while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
            iostate.io_wanted = 1;
            msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
        }
        lck_mtx_unlock(cl_mtxp);

        if (iostate.io_error) {
            /*
             * one of the earlier reads we issued ran into a hard error
             * don't issue any more reads, cleanup the UPL
             * that was just created but not used, then
             * go wait for any other reads to complete before
             * returning the error to the caller
             */
            ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);

            goto wait_for_reads;
        }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
                     (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);

        retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
                            CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
                            (buf_t)NULL, &iostate);

        /*
         * update the uio structure
         */
        uio_update(uio, (user_size_t)io_size);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
                     (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
    }

wait_for_reads:
    /*
     * make sure all async reads that are part of this stream
     * have completed before we return
     */
    lck_mtx_lock(cl_mtxp);

    while (iostate.io_issued != iostate.io_completed) {
        iostate.io_wanted = 1;
        msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
    }
    lck_mtx_unlock(cl_mtxp);

    if (iostate.io_error)
        retval = iostate.io_error;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
                 (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);
cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
{
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;
    user_size_t      iov_len;
    user_addr_t      iov_base;
    int              upl_needed_size;
    struct clios     iostate;

    devblocksize = vp->v_mount->mnt_devblocksize;

    /*
     * When we enter this routine, we know
     *  -- the resid will not exceed iov_len
     *  -- the target address is physically contiguous
     */

#if LP64_DEBUG
    if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
        panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
    }
#endif /* LP64_DEBUG */

    iov_len = uio_curriovlen(uio);
    iov_base = uio_curriovbase(uio);

    max_size = filesize - uio->uio_offset;

    // LP64todo - fix this!
    if (max_size < 0 || (u_int64_t)max_size > iov_len)
        io_size = iov_len;
    else
        io_size = (int)max_size;

    // LP64todo - fix this!
    upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
    upl_needed_size = upl_offset + io_size;

    upl_size = upl_needed_size;
    upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

    kret = vm_map_get_upl(current_map(),
                          (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
                          &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

    if (kret != KERN_SUCCESS) {
        /*
         * cluster_phys_read: failed to get pagelist
         */
    }
    if (upl_size < upl_needed_size) {
        /*
         * The upl_size wasn't satisfied.
         */
        ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
    }
    pl = ubc_upl_pageinfo(upl);

    dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;

    while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {

        head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

        if (head_size > io_size)
            head_size = io_size;

        error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);

        if (error) {
            ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
        }
        upl_offset += head_size;
        dst_paddr  += head_size;
        io_size    -= head_size;
    }
    tail_size = io_size & (devblocksize - 1);
    io_size  -= tail_size;

    iostate.io_completed = 0;
    iostate.io_issued = 0;
    iostate.io_error = 0;
    iostate.io_wanted = 0;
    while (io_size && error == 0) {

        if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
            xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
        else
            xsize = io_size;
        /*
         * request asynchronously so that we can overlap
         * the preparation of the next I/O... we'll do
         * the commit after all the I/O has completed
         * since it's all issued against the same UPL
         * if there are already too many outstanding reads
         * wait until some have completed before issuing the next
         */
        lck_mtx_lock(cl_mtxp);

        while ((iostate.io_issued - iostate.io_completed) > (8 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
            iostate.io_wanted = 1;
            msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
        }
        lck_mtx_unlock(cl_mtxp);

        error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
                           CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
                           (buf_t)NULL, &iostate);
        /*
         * The cluster_io read was issued successfully,
         * update the uio structure
         */
        if (error == 0) {
            uio_update(uio, (user_size_t)xsize);

            upl_offset += xsize;
        }
    }
    /*
     * make sure all async reads that are part of this stream
     * have completed before we proceed
     */
    lck_mtx_lock(cl_mtxp);

    while (iostate.io_issued != iostate.io_completed) {
        iostate.io_wanted = 1;
        msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
    }
    lck_mtx_unlock(cl_mtxp);

    if (iostate.io_error)
        error = iostate.io_error;

    if (error == 0 && tail_size)
        error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);

    /*
     * just release our hold on the physically contiguous
     * region without changing any state
     */
    ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
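    /*
     * Worked example (illustrative only, not from the original source): with
     * devblocksize = 512 and a physically contiguous read of 1300 bytes starting
     * at file offset 0x10101, the offset sits 257 bytes into a device block, so
     * a head of 512 - 257 = 255 bytes is first handled via cluster_align_phys_io.
     * The remaining 1045 bytes then carry a tail of 1045 & 511 = 21 bytes,
     * leaving 1024 bytes (two full device blocks) for the CL_DEV_MEMORY
     * cluster_io path; the tail is aligned separately once the async reads
     * have drained.
     */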
/*
 * generate advisory I/O's in the largest chunks possible
 * the completed pages will be released into the VM cache
 */
advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
{
    upl_page_info_t *pl;
    vm_offset_t upl_offset;

    if ( !UBCINFOEXISTS(vp))
        return(EINVAL);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
                 (int)f_offset, resid, (int)filesize, 0, 0);

    while (resid && f_offset < filesize && retval == 0) {
        /*
         * compute the size of the upl needed to encompass
         * the requested read... limit each call to cluster_io
         * to the maximum UPL size... cluster_io will clip if
         * this exceeds the maximum io_size for the device,
         * make sure to account for
         * a starting offset that's not page aligned
         */
        start_offset = (int)(f_offset & PAGE_MASK_64);
        upl_f_offset = f_offset - (off_t)start_offset;
        max_size     = filesize - f_offset;

        if (resid < max_size)
            io_size = resid;
        else
            io_size = max_size;

        upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
        if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
            upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;

        /*
         * return the number of contiguously present pages in the cache
         * starting at upl_f_offset within the file
         */
        ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);

        if (skip_range) {
            /*
             * skip over pages already present in the cache
             */
            io_size = skip_range - start_offset;

            f_offset += io_size;

            if (skip_range == upl_size)
                continue;
            /*
             * have to issue some real I/O
             * at this point, we know it's starting on a page boundary
             * because we've skipped over at least the first page in the request
             */
            upl_f_offset += skip_range;
            upl_size     -= skip_range;
        }
        pages_in_upl = upl_size / PAGE_SIZE;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
                     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
        kret = ubc_create_upl(vp,
                              upl_f_offset,
                              upl_size,
                              &upl,
                              &pl,
                              UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
        if (kret != KERN_SUCCESS)
            return(retval);

        /*
         * before we start marching forward, we must make sure we end on
         * a present page, otherwise we will be working with a freed upl
         */
        for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
            if (upl_page_present(pl, last_pg))
                break;
        }
        pages_in_upl = last_pg + 1;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
                     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

        for (last_pg = 0; last_pg < pages_in_upl; ) {
            /*
             * scan from the beginning of the upl looking for the first
             * page that is present.... this will become the first page in
             * the request we're going to make to 'cluster_io'... if all
             * of the pages are absent, we won't call through to 'cluster_io'
             */
            for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
                if (upl_page_present(pl, start_pg))
                    break;
            }
            /*
             * scan from the starting present page looking for an absent
             * page before the end of the upl is reached, if we
             * find one, then it will terminate the range of pages being
             * presented to 'cluster_io'
             */
            for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
                if (!upl_page_present(pl, last_pg))
                    break;
            }
            if (last_pg > start_pg) {
                /*
                 * we found a range of pages that must be filled
                 * if the last page in this range is the last page of the file
                 * we may have to clip the size of it to keep from reading past
                 * the end of the last physical block associated with the file
                 */
                upl_offset = start_pg * PAGE_SIZE;
                io_size    = (last_pg - start_pg) * PAGE_SIZE;

                if ((upl_f_offset + upl_offset + io_size) > filesize)
                    io_size = filesize - (upl_f_offset + upl_offset);

                /*
                 * issue an asynchronous read to cluster_io
                 */
                retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
                                    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);
            }
        }
        ubc_upl_abort(upl, 0);

        io_size = upl_size - start_offset;

        if (io_size > resid)
            io_size = resid;
        f_offset += io_size;
    }

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
                 (int)f_offset, resid, retval, 0, 0);
cluster_push(vnode_t vp, int flags)
{
    struct cl_writebehind *wbp;

    if ( !UBCINFOEXISTS(vp)) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
        return (0);
    }
    /* return if deferred write is set */
    if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
        return (0);
    }
    if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
        return (0);
    }
    if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
        lck_mtx_unlock(&wbp->cl_lockw);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
        return (0);
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
                 (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);

    if (wbp->cl_scmap) {
        sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
    } else
        retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);

    lck_mtx_unlock(&wbp->cl_lockw);

    if (flags & IO_SYNC)
        (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
                 (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
__private_extern__ void
cluster_release(struct ubc_info *ubc)
{
    struct cl_writebehind *wbp;
    struct cl_readahead   *rap;

    if ((wbp = ubc->cl_wbehind)) {

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);

        if (wbp->cl_scmap)
            vfs_drt_control(&(wbp->cl_scmap), 0);
    } else {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
    }

    rap = ubc->cl_rahead;

    if (wbp != NULL) {
        lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
        FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
    }
    if ((rap = ubc->cl_rahead)) {
        lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
        FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
    }
    ubc->cl_rahead  = NULL;
    ubc->cl_wbehind = NULL;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
cluster_push_EOF(vnode_t vp, off_t EOF)
{
    struct cl_writebehind *wbp;

    wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
                 (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);

    if (wbp->cl_scmap)
        sparse_cluster_push(wbp, vp, EOF, 1);
    else
        cluster_try_push(wbp, vp, EOF, 0, 1);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
                 (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);

    lck_mtx_unlock(&wbp->cl_lockw);
cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
{
    struct cl_wextent l_clusters[MAX_CLUSTERS];

    /*
     * the write behind context exists and has
     * already been locked...
     *
     * make a local 'sorted' copy of the clusters
     * and clear wbp->cl_number so that new clusters can
     * be developed
     */
    for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
        for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
            if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
                continue;
            if (min_index == -1)
                min_index = cl_index1;
            else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
                min_index = cl_index1;
        }
        if (min_index == -1)
            break;
        l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
        l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
        l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;

        wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
    }
    wbp->cl_number = 0;

    cl_len = cl_index;
    if (can_delay && cl_len == MAX_CLUSTERS) {
        /*
         * determine if we appear to be writing the file sequentially
         * if not, by returning without having pushed any clusters
         * we will cause this vnode to be pushed into the sparse cluster mechanism
         * used for managing more random I/O patterns
         *
         * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
         * that's why we're in try_push with can_delay true...
         *
         * check to make sure that all the clusters except the last one are 'full'... and that each cluster
         * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
         * so we can just make a simple pass through, up to, but not including the last one...
         * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
         * are sequential
         *
         * we let the last one be partial as long as it was adjacent to the previous one...
         * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
         * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
         */
        for (i = 0; i < MAX_CLUSTERS - 1; i++) {
            if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
                goto dont_try;
            if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
                goto dont_try;
        }
    }
    /*
     * drop the lock while we're firing off the I/Os...
     * this is safe since I'm working off of a private sorted copy
     * of the clusters, and I'm going to re-evaluate the public
     * state after I retake the lock
     */
    lck_mtx_unlock(&wbp->cl_lockw);

    for (cl_index = 0; cl_index < cl_len; cl_index++) {
        struct cl_extent cl;

        /*
         * try to push each cluster in turn...
         */
        if (l_clusters[cl_index].io_nocache)
            flags = IO_NOCACHE;
        else
            flags = 0;

        cl.b_addr = l_clusters[cl_index].b_addr;
        cl.e_addr = l_clusters[cl_index].e_addr;

        cluster_push_x(vp, &cl, EOF, flags);

        l_clusters[cl_index].b_addr = 0;
        l_clusters[cl_index].e_addr = 0;

        cl_pushed++;
    }
    lck_mtx_lock(&wbp->cl_lockw);
dont_try:
    if (cl_len > cl_pushed) {
        /*
         * we didn't push all of the clusters, so
         * let's try to merge them back into the vnode
         */
        if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
            /*
             * we picked up some new clusters while we were trying to
             * push the old ones... this can happen because I've dropped
             * the vnode lock... the sum of the
             * leftovers plus the new cluster count exceeds our ability
             * to represent them, so switch to the sparse cluster mechanism
             *
             * collect the active public clusters...
             */
            sparse_cluster_switch(wbp, vp, EOF);

            for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
                if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
                    continue;
                wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
                wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
                wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;

                cl_index1++;
            }
            /*
             * update the cluster count
             */
            wbp->cl_number = cl_index1;

            /*
             * and collect the original clusters that were moved into the
             * local storage for sorting purposes
             */
            sparse_cluster_switch(wbp, vp, EOF);
        } else {
            /*
             * we've got room to merge the leftovers back in
             * just append them starting at the next 'hole'
             * represented by wbp->cl_number
             */
            for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
                if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
                    continue;

                wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
                wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
                wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;

                cl_index1++;
            }
            /*
             * update the cluster count
             */
            wbp->cl_number = cl_index1;
        }
    }
    return(MAX_CLUSTERS - wbp->cl_number);
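    /*
     * Illustrative example (not part of the original source; assumes
     * MAX_CLUSTERS is 4 and MAX_UPL_TRANSFER is 256 pages): after sorting,
     * clusters covering pages [0,256) [256,512) [512,768) [768,900) pass the
     * sequential test above, since the first three are full (256 pages each)
     * and every e_addr equals the next cluster's b_addr; the partial last
     * cluster is allowed.  A sorted set such as [0,256) [300,556) ... fails
     * the adjacency check (256 != 300), so nothing is pushed and the caller
     * falls back to the sparse cluster mechanism.
     */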
cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
{
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
                 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);

    if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
    }
    upl_size = pages_in_upl * PAGE_SIZE;
    upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);

    if (upl_f_offset + upl_size >= EOF) {

        if (upl_f_offset >= EOF) {
            /*
             * must have truncated the file and missed
             * clearing a dangling cluster (i.e. it's completely
             * beyond the new EOF)
             */
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
        }
        size = EOF - upl_f_offset;

        upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
        pages_in_upl = upl_size / PAGE_SIZE;
    } else
        size = upl_size;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);

    /*
     * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
     *
     * - only pages that are currently dirty are returned... these are the ones we need to clean
     * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
     * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
     * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
     *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
     *
     * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
     */
    if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
        upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
    else
        upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;

    kret = ubc_create_upl(vp,
                          upl_f_offset,
                          upl_size,
                          &upl,
                          &pl,
                          upl_flags);
    if (kret != KERN_SUCCESS)
        panic("cluster_push: failed to get pagelist");

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
    /*
     * since we only asked for the dirty pages back
     * it's possible that we may only get a few or even none, so...
     * before we start marching forward, we must make sure we know
     * where the last present page is in the UPL, otherwise we could
     * end up working with a freed upl due to the FREE_ON_EMPTY semantics
     * employed by commit_range and abort_range.
     */
    for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
        if (upl_page_present(pl, last_pg))
            break;
    }
    pages_in_upl = last_pg + 1;

    if (pages_in_upl == 0) {
        ubc_upl_abort(upl, 0);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
    }

    for (last_pg = 0; last_pg < pages_in_upl; ) {
        /*
         * find the next dirty page in the UPL
         * this will become the first page in the
         * next I/O to generate
         */
        for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
            if (upl_dirty_page(pl, start_pg))
                break;
            if (upl_page_present(pl, start_pg))
                /*
                 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
                 * just release these unchanged since we're not going
                 * to steal them or change their state
                 */
                ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
        }
        if (start_pg >= pages_in_upl)
            /*
             * done... no more dirty pages to push
             */
            break;
        if (start_pg > last_pg)
            /*
             * skipped over some non-dirty pages
             */
            size -= ((start_pg - last_pg) * PAGE_SIZE);

        /*
         * find a range of dirty pages to write
         */
        for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
            if (!upl_dirty_page(pl, last_pg))
                break;
        }
        upl_offset = start_pg * PAGE_SIZE;

        io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

        io_flags = CL_THROTTLE | CL_COMMIT;

        if ( !(flags & IO_SYNC))
            io_flags |= CL_ASYNC;

        retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
                            io_flags, (buf_t)NULL, (struct clios *)NULL);

        if (error == 0 && retval)
            error = retval;
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
/*
 * sparse_cluster_switch is called with the write behind lock held
 */
static void
sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
{
	int	cl_index;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);

	if (wbp->cl_scmap == NULL)
		wbp->cl_scdirty = 0;

	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		int	flags;
		struct cl_extent cl;

		for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {

			if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
				if (flags & UPL_POP_DIRTY) {
					cl.e_addr = cl.b_addr + 1;

					sparse_cluster_add(wbp, vp, &cl, EOF);
				}
			}
		}
	}
	wbp->cl_number = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
}
/*
 * sparse_cluster_push is called with the write behind lock held
 */
static void
sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
{
	struct cl_extent cl;
	off_t		offset;
	u_int		length;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);

	if (push_all)
		vfs_drt_control(&(wbp->cl_scmap), 1);

	for (;;) {
		if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
			break;

		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);

		wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);

		cluster_push_x(vp, &cl, EOF, 0);

		if (push_all == 0)
			break;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
}
/*
 * sparse_cluster_add is called with the write behind lock held
 */
static void
sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
{
	u_int	length;
	off_t	offset;
	int	new_dirty;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);

	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;

	while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */
		wbp->cl_scdirty += new_dirty;

		sparse_cluster_push(wbp, vp, EOF, 0);

		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	wbp->cl_scdirty += new_dirty;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
}
static int
cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
{
	upl_page_info_t  *pl;
	upl_t            upl;
	addr64_t	 ubc_paddr;
	kern_return_t    kret;
	int              error = 0;
	int		 did_read = 0;
	int		 abort_flags;
	int		 upl_flags;

	upl_flags = UPL_SET_LITE;
	if (! (flags & CL_READ)) {
		/*
		 * "write" operation:  let the UPL subsystem know
		 * that we intend to modify the buffer cache pages
		 * we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	}

	kret = ubc_create_upl(vp,
			      uio->uio_offset & ~PAGE_MASK_64,
			      PAGE_SIZE,
			      &upl,
			      &pl,
			      upl_flags);

	if (kret != KERN_SUCCESS)
		return(EINVAL);

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
				   CL_READ, (buf_t)NULL, (struct clios *)NULL);
		if (error) {
			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

			return(error);
		}
		did_read = 1;
	}
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

/*
 *	NOTE:  There is no prototype for the following in BSD. It, and the definitions
 *	of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
 *	osfmk/ppc/mappings.h.  They are not included here because there appears to be no
 *	way to do so without exporting them to kexts as well.
 */
	if (flags & CL_READ)
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);	/* Copy physical to physical and flush the destination */
	else
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);	/* Copy physical to physical and flush the source */

	if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
				   0, (buf_t)NULL, (struct clios *)NULL);
	}
	if (error == 0)
		uio_update(uio, (user_size_t)xsize);

	if (did_read)
		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	else
		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);

	return (error);
}
int
cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
{
	int       pg_offset;
	int       pg_index;
	int       csize;
	int       segflg;
	int       retval = 0;
	upl_page_info_t *pl;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);

	segflg = uio->uio_segflg;

	switch(segflg) {

	  case UIO_USERSPACE32:
	  case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	  case UIO_USERSPACE:
	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	  case UIO_USERSPACE64:
	  case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	  case UIO_SYSSPACE32:
		uio->uio_segflg = UIO_PHYS_SYSSPACE32;
		break;

	  case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;

	  case UIO_SYSSPACE64:
		uio->uio_segflg = UIO_PHYS_SYSSPACE64;
		break;
	}
	pl = ubc_upl_pageinfo(upl);

	pg_index  = upl_offset / PAGE_SIZE;
	pg_offset = upl_offset & PAGE_MASK;
	csize     = min(PAGE_SIZE - pg_offset, xsize);

	while (xsize && retval == 0) {
		addr64_t  paddr;

		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;

		retval = uiomove64(paddr, csize, uio);

		pg_index += 1;
		pg_offset = 0;
		xsize    -= csize;
		csize     = min(PAGE_SIZE, xsize);
	}
	uio->uio_segflg = segflg;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);

	return (retval);
}
int
cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
{
	int       segflg;
	int       io_size;
	int       xsize;
	int       start_offset;
	int       retval = 0;
	memory_object_control_t control;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);

	control = ubc_getobject(vp, UBC_FLAGS_NONE);
	if (control == MEMORY_OBJECT_CONTROL_NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
			     (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);

		return(0);
	}
	segflg = uio->uio_segflg;

	switch(segflg) {

	  case UIO_USERSPACE32:
	  case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	  case UIO_USERSPACE64:
	  case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	  case UIO_SYSSPACE32:
		uio->uio_segflg = UIO_PHYS_SYSSPACE32;
		break;

	  case UIO_SYSSPACE64:
		uio->uio_segflg = UIO_PHYS_SYSSPACE64;
		break;

	  case UIO_USERSPACE:
	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	  case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}

	if ( (io_size = *io_resid) ) {
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		xsize = uio_resid(uio);

		retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
						       uio, start_offset, io_size, mark_dirty);
		xsize -= uio_resid(uio);
		io_size -= xsize;
	}
	uio->uio_segflg = segflg;
	*io_resid       = io_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);

	return(retval);
}
int
is_file_clean(vnode_t vp, off_t filesize)
{
	off_t	f_offset;
	int	flags;
	int	total_dirty = 0;

	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
		if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
			if (flags & UPL_POP_DIRTY) {
				total_dirty++;
			}
		}
	}
	if (total_dirty)
		return(EINVAL);

	return (0);
}
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */
/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES		256
/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK		(~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)
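
/*
 * Illustrative sketch (not referenced by anything in this file; the
 * vfs_drt_example_* name is hypothetical): with DRT_BITVECTOR_PAGES == 256
 * and the 4K pages assumed by the hard-coded mask above, each hash bucket
 * spans 256 * 4096 == 0x100000 bytes, so DRT_ALIGN_ADDRESS() simply rounds
 * a file offset down to a 1MB boundary.
 */
#if 0	/* example only */
static u_int64_t
vfs_drt_example_align(void)
{
	/* 0x123456 aligns to 0x100000; 0x0fffff aligns to 0 */
	return (DRT_ALIGN_ADDRESS((u_int64_t)0x123456));
}
#endif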
/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space; they hold the entry's page count (see DRT_HASH_COUNT_MASK below).
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)											\
	do {														\
		(scm)->scm_hashtable[(i)].dhe_control =									\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
	} while (0)
#define DRT_HASH_CLEAR(scm, i)							\
	do {									\
		(scm)->scm_hashtable[(i)].dhe_control = 0;			\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
	} while (0)
/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
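
/*
 * Worked check of the constants above (illustrative only; the
 * vfs_drt_example_* name is hypothetical): each hash entry is an 8-byte
 * control word plus a DRT_BITVECTOR_PAGES-bit vector (256 / 32 words of
 * 4 bytes == 32 bytes), i.e. 40 bytes.  23 * 40 == 920, leaving
 * 1024 - 920 == 104 bytes spare in the small allocation; 401 * 40 == 16040,
 * leaving 16384 - 16040 == 344 bytes spare in the large one.
 */
#if 0	/* example only */
static int
vfs_drt_example_check_moduli(void)
{
	int entry_size = 8 + (DRT_BITVECTOR_PAGES / 32) * 4;	/* 40 bytes */

	return ((DRT_SMALL_ALLOCATION - DRT_HASH_SMALL_MODULUS * entry_size) == 104 &&
		(DRT_LARGE_ALLOCATION - DRT_HASH_LARGE_MODULUS * entry_size) == 344);
}
#endif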
/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */

#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
	u_int32_t	dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};
/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};


#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)
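
/*
 * Illustrative sketch of how the macros above cooperate (the
 * vfs_drt_example_* name is hypothetical and the map is assumed to have
 * been allocated already): a file offset is aligned to its bucket base,
 * hashed, probed circularly until a vacant or matching slot is found, and
 * the page's bit within that bucket's bitvector tells whether it is dirty.
 * The real logic, including count maintenance and resizing, lives in
 * vfs_drt_search_index(), vfs_drt_get_index() and vfs_drt_do_mark_pages()
 * below.
 */
#if 0	/* example only */
static int
vfs_drt_example_page_is_dirty(struct vfs_drt_clustermap *cmap, u_int64_t offset)
{
	u_int64_t	bucket_base = DRT_ALIGN_ADDRESS(offset);
	int		index = DRT_HASH(cmap, bucket_base);
	int		pgoff = (offset - bucket_base) / PAGE_SIZE;
	int		i;

	/* circular probe, bounded by the table size as in vfs_drt_search_index() */
	for (i = 0; i < cmap->scm_modulus; i++) {
		if (DRT_HASH_VACANT(cmap, index))
			return (0);
		if (DRT_HASH_GET_ADDRESS(cmap, index) == bucket_base)
			return (DRT_HASH_TEST_BIT(cmap, index, pgoff) != 0);
		index = DRT_HASH_NEXT(cmap, index);
	}
	return (0);
}
#endif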
/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
							    * dirty */
							   /* 1 (clean, no map) */
							   /* 2 (map alloc fail) */
							   /* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
							    * lastclean, iskips */
static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
	u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
	u_int64_t offset,
	int *indexp,
	int recursed);
static kern_return_t	vfs_drt_do_mark_pages(
	void		**cmapp,
	u_int64_t	offset,
	u_int		length,
	int		*setcountp,
	int		dirty);
static void		vfs_drt_trace(
	struct vfs_drt_clustermap *cmap,
	int code,
	int arg1,
	int arg2,
	int arg3,
	int arg4);
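
/*
 * Illustrative sketch of the calling pattern for the vfs_drt_* interface
 * (the vfs_drt_example_* name is hypothetical; this is essentially what
 * sparse_cluster_add()/sparse_cluster_push() above do).  The map pointer
 * starts out NULL and is managed entirely by these routines; offsets and
 * lengths are assumed page-aligned.
 */
#if 0	/* example only */
static void
vfs_drt_example_usage(void)
{
	void	*map = NULL;		/* opaque, allocated on first mark */
	off_t	offset;
	u_int	length;
	int	new_dirty;

	/* record two dirty pages starting at file offset 0x100000 */
	vfs_drt_mark_pages(&map, (off_t)0x100000, 2 * PAGE_SIZE, &new_dirty);

	/* drain the map: each call returns one contiguous dirty run */
	while (vfs_drt_get_cluster(&map, &offset, &length) == KERN_SUCCESS) {
		/* write out [offset, offset + length) here */
	}
	/* the map frees itself once the last dirty run has been returned */
}
#endif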
/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap, *ocmap;
	kern_return_t	kret;
	u_int64_t	offset;
	int		nsize, i, active_buckets, index, copycount;

	ocmap = NULL;
	if (cmapp != NULL)
		ocmap = *cmapp;

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		nsize = DRT_HASH_SMALL_MODULUS;
	} else {
		/* count the number of active buckets in the old map */
		active_buckets = 0;
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0))
				active_buckets++;
		}
		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/* if the ring is nearly full */
			if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
				nsize = DRT_HASH_LARGE_MODULUS;
			} else {
				nsize = DRT_HASH_SMALL_MODULUS;
			}
		} else {
			/* already using the large modulus */
			nsize = DRT_HASH_LARGE_MODULUS;
			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * array and return.
			 */
			if (active_buckets >= DRT_HASH_LARGE_MODULUS)
				return(KERN_SUCCESS);
		}
	}

	/*
	 * Allocate and initialise the new map.
	 */
	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
	    (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	if (kret != KERN_SUCCESS)
		return(kret);
	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = nsize;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;
	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}

	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	copycount = 0;
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0))
				continue;
			/* get new index */
			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
			}
			/* copy */
			DRT_HASH_COPY(ocmap, i, cmap, index);
			copycount++;
		}
	}

	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	*cmapp = cmap;
	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
			      ocmap->scm_modulus,
			      ocmap->scm_buckets,
			      ocmap->scm_lastclean,
			      ocmap->scm_iskips);

		vfs_drt_free_map(ocmap);
	}
	return(KERN_SUCCESS);
}
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
	kmem_free(kernel_map, (vm_offset_t)cmap,
		  (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	return(KERN_SUCCESS);
}
/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
	int	index, i;

	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* traverse the hashtable */
	for (i = 0; i < cmap->scm_modulus; i++) {

		/*
		 * If the slot is vacant, we can stop.
		 */
		if (DRT_HASH_VACANT(cmap, index))
			break;

		/*
		 * If the address matches our offset, we have success.
		 */
		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
			*indexp = index;
			return(KERN_SUCCESS);
		}

		/*
		 * Move to the next slot, try again.
		 */
		index = DRT_HASH_NEXT(cmap, index);
	}
	/*
	 * It's not there.
	 */
	return(KERN_FAILURE);
}
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t	kret;
	int		index, i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS)
		return(kret);

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/* slot vacant? */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean)
				cmap->scm_lastclean = index;
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return(KERN_SUCCESS);
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed)
		return(KERN_FAILURE);
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return(kret);
}
/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void		**private,
	u_int64_t	offset,
	u_int		length,
	int		*setcountp,
	int		dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t	kret;
	int		i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL)
		*setcountp = 0;

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return(KERN_SUCCESS);
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return(kret);
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;	/* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL)
				*setcountp = setcount;
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return(kret);
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL)
		*setcountp = setcount;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return(KERN_SUCCESS);
}
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
{
	/* XXX size unused, drop from interface */
	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}

static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t	offset;
	u_int		length;
	int		index, i, j, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
			continue;

		/* scan the bitfield for a string of bits */
		fs = -1;

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/* didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map");
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i))
				break;
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return(KERN_SUCCESS);
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * then free it
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		      cmap->scm_modulus,
		      cmap->scm_buckets,
		      cmap->scm_lastclean,
		      cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return(KERN_FAILURE);
}
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	switch (op_type) {
	case 0:
		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
			      cmap->scm_modulus,
			      cmap->scm_buckets,
			      cmap->scm_lastclean,
			      cmap->scm_iskips);

		vfs_drt_free_map(cmap);
		*cmapp = NULL;

		break;

	case 1:
		/* coalesce the clustermap */
		cmap->scm_lastclean = 0;
		break;
	}
	return(KERN_SUCCESS);
}
/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
	      __unused int arg1, __unused int arg2, __unused int arg3,
	      __unused int arg4)
{
}
#endif
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index))
			continue;

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i))
				bits_on++;
		}
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
			panic("bits_on = %d,  index = %d\n", bits_on, index);
	}
}