 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
 * The Regents of the University of California. All rights reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>
#include <sys/ubc_internal.h>
#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <sys/kdebug.h>

#define CL_COMMIT       0x04
#define CL_PAGEOUT      0x10
#define CL_NOZERO       0x80
#define CL_PAGEIN       0x100
#define CL_DEV_MEMORY   0x200
#define CL_PRESERVE     0x400
#define CL_THROTTLE     0x800
#define CL_KEEPCACHED   0x1000
u_int  io_completed;       /* amount of io that has currently completed */
u_int  io_issued;          /* amount of io that was successfully issued */
int    io_error;           /* error code of first error encountered */
int    io_wanted;          /* someone is sleeping waiting for a change in state */

static lck_grp_t        *cl_mtx_grp;
static lck_attr_t       *cl_mtx_attr;
static lck_grp_attr_t   *cl_mtx_grp_attr;
static lck_mtx_t        *cl_mtxp;
static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
                      int flags, buf_t real_bp, struct clios *iostate);
static int cluster_iodone(buf_t bp, void *dummy);
static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
static int cluster_hard_throttle_on(vnode_t vp);

static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
                           off_t headOff, off_t tailOff, int flags);
static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);

static void cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);

static int cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
static void cluster_push_EOF(vnode_t vp, off_t EOF);

static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);

static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int is_file_clean(vnode_t, off_t);
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
#define HARD_THROTTLE_MAXCNT    0
#define HARD_THROTTLE_MAXSIZE   (64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;
     * allocate lock group attribute and group
    cl_mtx_grp_attr = lck_grp_attr_alloc_init();
    //lck_grp_attr_setstat(cl_mtx_grp_attr);
    cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

     * allocate the lock attribute
    cl_mtx_attr = lck_attr_alloc_init();
    //lck_attr_setdebug(clf_mtx_attr);

     * allocate and initialize mutexes used to protect updates and waits
     * on the cluster_io context
    cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

        panic("cluster_init: failed to allocate cl_mtxp");

#define CLW_ALLOCATE            0x01
#define CLW_RETURNLOCKED        0x02
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
    struct ubc_info         *ubc;
    struct cl_readahead     *rap;

    if ((rap = ubc->cl_rahead) == NULL) {
        MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

        bzero(rap, sizeof *rap);
        lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

        if (ubc->cl_rahead == NULL)
            ubc->cl_rahead = rap;
            lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
            FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
            rap = ubc->cl_rahead;
    if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)

    return ((struct cl_readahead *)NULL);
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
    struct ubc_info         *ubc;
    struct cl_writebehind   *wbp;

    if ((wbp = ubc->cl_wbehind) == NULL) {

        if ( !(flags & CLW_ALLOCATE))
            return ((struct cl_writebehind *)NULL);

        MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

        bzero(wbp, sizeof *wbp);
        lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

        if (ubc->cl_wbehind == NULL)
            ubc->cl_wbehind = wbp;
            lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
            FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
            wbp = ubc->cl_wbehind;
    if (flags & CLW_RETURNLOCKED)
        lck_mtx_lock(&wbp->cl_lockw);
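
/*
 * cluster_hard_throttle_on:
 * decide whether I/O against this vnode should be throttled hard.  For the
 * root device, honor hard_throttle_on_root if it is already set, or compare
 * the time elapsed since the last priority I/O
 * (priority_IO_timestamp_for_root) against hard_throttle_maxelapsed.
 */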
cluster_hard_throttle_on(vnode_t vp)
    static struct timeval hard_throttle_maxelapsed = { 0, 200000 };

    if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
        struct timeval elapsed;

        if (hard_throttle_on_root)

        microuptime(&elapsed);
        timevalsub(&elapsed, &priority_IO_timestamp_for_root);

        if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
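
/*
 * cluster_iodone:
 * I/O completion handler for a cluster transaction.  Waits until every buf
 * in the transaction chain is B_DONE, accumulates resid/size and the first
 * error, zero-fills any tail beyond the EOF, wakes up anyone sleeping on
 * the associated clios state, and then commits or aborts the UPL range as
 * directed by the b_flags that were set up in cluster_io.
 */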
cluster_iodone(buf_t bp, __unused void *dummy)
    struct clios *iostate;

    cbp_head = (buf_t)(bp->b_trans_head);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
                 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

    for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
         * all I/O requests that are part of this transaction
         * have to complete before we can process it
        if ( !(cbp->b_flags & B_DONE)) {

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                         (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);

    upl_offset  = cbp->b_uploffset;
    b_flags     = cbp->b_flags;
    real_bp     = cbp->b_real_bp;
    zero_offset = cbp->b_validend;
    iostate     = (struct clios *)cbp->b_iostate;

        real_bp->b_dev = cbp->b_dev;

        if ((cbp->b_flags & B_ERROR) && error == 0)
            error = cbp->b_error;

        total_resid += cbp->b_resid;
        total_size  += cbp->b_bcount;

        cbp_next = cbp->b_trans_next;

        cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

         * someone has issued multiple I/Os asynchronously
         * and is waiting for them to complete (streaming)
        lck_mtx_lock(cl_mtxp);

        if (error && iostate->io_error == 0)
            iostate->io_error = error;

        iostate->io_completed += total_size;

        if (iostate->io_wanted) {
             * someone is waiting for the state of
             * this io stream to change
            iostate->io_wanted = 0;

        lck_mtx_unlock(cl_mtxp);

            wakeup((caddr_t)&iostate->io_wanted);

    if ((b_flags & B_NEED_IODONE) && real_bp) {
            real_bp->b_flags |= B_ERROR;
            real_bp->b_error = error;

        real_bp->b_resid = total_resid;

        buf_biodone(real_bp);

    if (error == 0 && total_resid)

    if (b_flags & B_COMMIT_UPL) {
        pg_offset   = upl_offset & PAGE_MASK;
        commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (error || (b_flags & B_NOCACHE)) {
            if (b_flags & B_PAGEIO) {
                if (b_flags & B_READ)

            if (b_flags & B_CACHE)              /* leave pages in the cache unchanged on error */
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
            else if (page_out && (error != ENXIO))      /* transient error */
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

            ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                         (int)upl, upl_offset - pg_offset, commit_size,
                         0x80000000|upl_abort_code, 0);
            int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

            if ((b_flags & B_PHYS) && (b_flags & B_READ))
                upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
                upl_commit_flags |= UPL_COMMIT_INACTIVATE;

            ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                         (int)upl, upl_offset - pg_offset, commit_size,
                         upl_commit_flags, 0);
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                     (int)upl, upl_offset, 0, error, 0);
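
/*
 * cluster_zero:
 * zero 'size' bytes starting at 'upl_offset'.  If the buf has no mapped
 * data pointer, zero the backing physical pages of the UPL page by page
 * with bzero_phys; otherwise just bzero the mapped buffer.
 */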
cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
                 upl_offset, size, (int)bp, 0, 0);

    if (bp == NULL || bp->b_datap == 0) {

        pl = ubc_upl_pageinfo(upl);

            page_index  = upl_offset / PAGE_SIZE;
            page_offset = upl_offset & PAGE_MASK;

            zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
            zero_cnt  = min(PAGE_SIZE - page_offset, size);

            bzero_phys(zero_addr, zero_cnt);

            upl_offset += zero_cnt;
        bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
                 upl_offset, size, 0, 0, 0);
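
/*
 * cluster_io:
 * core routine that carves a UPL-backed request into one or more bufs,
 * maps file offsets to device blocks with VNOP_BLOCKMAP, handles holes,
 * zero-fill and throttling, and issues the bufs through VNOP_STRATEGY.
 * Completion is handled by cluster_iodone (called synchronously for
 * non-async requests); on error the unissued portion of the UPL is
 * committed or aborted according to the CL_* flags.
 */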
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
           int flags, buf_t real_bp, struct clios *iostate)
    buf_t   cbp_head = NULL;
    buf_t   cbp_tail = NULL;
    int     async_throttle = 0;

    if (mp->mnt_devblocksize > 1) {
         * round the requested size up so that this I/O ends on a
         * page boundary in case this is a 'write'... if the filesystem
         * has blocks allocated to back the page beyond the EOF, we want to
         * make sure to write out the zero's that are sitting beyond the EOF
         * so that in case the filesystem doesn't explicitly zero this area
         * if a hole is created via a lseek/write beyond the current EOF,
         * it will return zeros when it's read back from the disk.  If the
         * physical allocation doesn't extend for the whole page, we'll
         * only write/read from the disk up to the end of this allocation
         * via the extent info returned from the VNOP_BLOCKMAP call.
        pg_offset = upl_offset & PAGE_MASK;

        size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
         * anyone advertising a blocksize of 1 byte probably
         * can't deal with us rounding up the request size
         * AFP is one such filesystem/device
        size = non_rounded_size;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
                 (int)f_offset, size, upl_offset, flags, 0);

    if (flags & CL_READ) {
        bmap_flags = VNODE_READ;

        max_iosize  = mp->mnt_maxreadcnt;
        max_vectors = mp->mnt_segreadcnt;
        bmap_flags = VNODE_WRITE;

        max_iosize  = mp->mnt_maxwritecnt;
        max_vectors = mp->mnt_segwritecnt;
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

     * make sure the maximum iosize is a
     * multiple of the page size
    max_iosize &= ~PAGE_MASK;

    if (flags & CL_THROTTLE) {
        if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
            if (max_iosize > HARD_THROTTLE_MAXSIZE)
                max_iosize = HARD_THROTTLE_MAXSIZE;
            async_throttle = HARD_THROTTLE_MAXCNT;
            async_throttle = VNODE_ASYNC_THROTTLE;

        io_flags |= B_NOCACHE;
    if (flags & (CL_PAGEIN | CL_PAGEOUT))
        io_flags |= B_PAGEIO;
    if (flags & CL_COMMIT)
        io_flags |= B_COMMIT_UPL;
    if (flags & CL_PRESERVE)
    if (flags & CL_KEEPCACHED)

    if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
         * then we are going to end up
         * with a page that we can't complete (the file size wasn't a multiple
         * of PAGE_SIZE and we're trying to read to the end of the file
         * so we'll go ahead and zero out the portion of the page we can't
         * read in from the file
        zero_offset = upl_offset + non_rounded_size;

        if (size > max_iosize)
            io_size = max_iosize;
        if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {

        if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
            real_bp->b_blkno = blkno;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
                     (int)f_offset, (int)blkno, io_size, zero_offset, 0);

             * vnop_blockmap didn't return an error... however, it did
             * return an extent size of 0 which means we can't
             * make forward progress on this I/O... a hole in the
             * file would be returned as a blkno of -1 with a non-zero io_size
             * a real extent is returned with a blkno != -1 and a non-zero io_size

        if ( !(flags & CL_READ) && blkno == -1) {
             * we're writing into a 'hole'
            if (flags & CL_PAGEOUT) {
                 * if we got here via cluster_pageout
                 * then just error the request and return
                 * the 'hole' should already have been covered
            if ( !(flags & CL_COMMIT)) {
                 * currently writes always request the commit to happen
                 * as part of the io completion... however, if the CL_COMMIT
                 * flag isn't specified, then we can't issue the abort_range
                 * since the call site is going to abort or commit the same upl..
                 * in this case we can only return an error
             * we can get here if the cluster code happens to
             * pick up a page that was dirtied via mmap vs
             * a 'write' and the page targets a 'hole'...
             * i.e. the writes to the cluster were sparse
             * and the file was being written for the first time
             * we can also get here if the filesystem supports
             * 'holes' that are less than PAGE_SIZE.... because
             * we can't know if the range in the page that covers
             * the 'hole' has been dirtied via an mmap or not,
             * we have to assume the worst and try to push the
             * entire page to storage.
             * Try paging out the page individually before
             * giving up entirely and dumping it (the pageout
             * path will insure that the zero extent accounting
             * has been taken care of before we get back into cluster_io)
            ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);

            e_offset = round_page_64(f_offset + 1);

            if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {

            io_size = e_offset - f_offset;

            upl_offset += io_size;
             * keep track of how much of the original request
             * that we've actually completed... non_rounded_size
             * may go negative due to us rounding the request
             * to a page size multiple (i.e.  size > non_rounded_size)
            non_rounded_size -= io_size;

            if (non_rounded_size <= 0) {
                 * we've transferred all of the data in the original
                 * request, but we were unable to complete the tail
                 * of the last page because the file didn't have
                 * an allocation to back that portion... this is ok.
        lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
         * we have now figured out how much I/O we can do - this is in 'io_size'
         * pg_offset is the starting point in the first page for the I/O
         * pg_count is the number of full and partial pages that 'io_size' encompasses
        pg_offset = upl_offset & PAGE_MASK;

        if (flags & CL_DEV_MEMORY) {
             * currently, can't deal with reading 'holes' in file
             * treat physical requests as one 'giant' page
        pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

        if ((flags & CL_READ) && blkno == -1) {
             * if we're reading and blkno == -1, then we've got a
             * 'hole' in the file that we need to deal with by zeroing
             * out the affected area in the upl
            if (zero_offset && io_size == size) {
                 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
                 * then 'zero_offset' will be non-zero
                 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
                 * (indicated by the io_size finishing off the I/O request for this UPL)
                 * then we're not going to issue an I/O for the
                 * last page in this upl... we need to zero both the hole and the tail
                 * of the page beyond the EOF, since the delayed zero-fill won't kick in
                bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
                bytes_to_zero = io_size;

            cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

                 * if there is a current I/O chain pending
                 * then the first page of the group we just zero'd
                 * will be handled by the I/O completion if the zero
                 * fill started in the middle of the page
                pg_count = (io_size - pg_offset) / PAGE_SIZE;
                 * no pending I/O to pick up that first page
                 * so, we have to make sure it gets committed
                 * set the pg_offset to 0 so that the upl_commit_range
                 * starts with this page
                pg_count = (io_size + pg_offset) / PAGE_SIZE;

            if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
                 * if we're done with the request for this UPL
                 * then we have to make sure to commit the last page
                 * even if we only partially zero-filled it
                    pg_resid = PAGE_SIZE - pg_offset;

            if (flags & CL_COMMIT)
                ubc_upl_commit_range(upl,
                                     (upl_offset + pg_resid) & ~PAGE_MASK,
                                     pg_count * PAGE_SIZE,
                                     UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
            upl_offset += io_size;
             * keep track of how much of the original request
             * that we've actually completed... non_rounded_size
             * may go negative due to us rounding the request
             * to a page size multiple (i.e.  size > non_rounded_size)
            non_rounded_size -= io_size;

            if (non_rounded_size <= 0) {
                 * we've transferred all of the data in the original
                 * request, but we were unable to complete the tail
                 * of the last page because the file didn't have
                 * an allocation to back that portion... this is ok.
            if (cbp_head && pg_count)

        if (pg_count > max_vectors) {
            if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
                io_size = PAGE_SIZE - pg_offset;
                io_size -= (pg_count - max_vectors) * PAGE_SIZE;
                pg_count = max_vectors;

        if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
             * if we're not targeting a virtual device i.e. a disk image
             * it's safe to dip into the reserve pool since real devices
             * can complete this I/O request without requiring additional
             * bufs from the alloc_io_buf pool
        else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
             * Throttle the speculative IO
        cbp = alloc_io_buf(vp, priv);

        if (flags & CL_PAGEOUT) {
            for (i = 0; i < pg_count; i++) {
                if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
                    panic("BUSY bp found in cluster_io");
        if (flags & CL_ASYNC) {
            if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
                panic("buf_setcallback failed\n");
        cbp->b_flags |= io_flags;

        cbp->b_lblkno = lblkno;
        cbp->b_blkno  = blkno;
        cbp->b_bcount = io_size;

        if (buf_setupl(cbp, upl, upl_offset))
            panic("buf_setupl failed\n");

        cbp->b_trans_next = (buf_t)NULL;

        if ((cbp->b_iostate = (void *)iostate))
             * caller wants to track the state of this
             * io... bump the amount issued against this stream
            iostate->io_issued += io_size;

        if (flags & CL_READ) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
                         (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
                         (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);

            cbp_tail->b_trans_next = cbp;

            (buf_t)(cbp->b_trans_head) = cbp_head;

        upl_offset += io_size;
         * keep track of how much of the original request
         * that we've actually completed... non_rounded_size
         * may go negative due to us rounding the request
         * to a page size multiple (i.e.  size > non_rounded_size)
        non_rounded_size -= io_size;

        if (non_rounded_size <= 0) {
             * we've transferred all of the data in the original
             * request, but we were unable to complete the tail
             * of the last page because the file didn't have
             * an allocation to back that portion... this is ok.
        if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
             * if we have no more I/O to issue or
             * the current I/O we've prepared fully
             * completes the last page in this request
             * and it's either an ASYNC request or
             * we've already accumulated more than 8 I/O's into
             * this transaction and it's not an I/O directed to
             * special DEVICE memory
             * then go ahead and issue the I/O

                cbp_head->b_flags |= B_NEED_IODONE;
                cbp_head->b_real_bp = real_bp;
                cbp_head->b_real_bp = (buf_t)NULL;

                 * we're about to issue the last I/O for this upl
                 * if this was a read to the eof and the eof doesn't
                 * finish on a page boundary, then we need to zero-fill
                 * the rest of the page....
                cbp_head->b_validend = zero_offset;
                cbp_head->b_validend = 0;

            if (flags & CL_THROTTLE)
                (void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");

            for (cbp = cbp_head; cbp;) {

                if ( !(io_flags & B_READ))
                    vnode_startwrite(vp);

                cbp_next = cbp->b_trans_next;

                (void) VNOP_STRATEGY(cbp);
            if ( !(flags & CL_ASYNC)) {
                for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)

                if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
                    if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
                        error = 0;      /* drop the error */
            cbp_head = (buf_t)NULL;
            cbp_tail = (buf_t)NULL;

    for (cbp = cbp_head; cbp;) {

        upl_offset -= cbp->b_bcount;
        size       += cbp->b_bcount;
        io_size    += cbp->b_bcount;

        cbp_next = cbp->b_trans_next;
        int need_wakeup = 0;

         * update the error condition for this stream
         * since we never really issued the io
         * just go ahead and adjust it back
        lck_mtx_lock(cl_mtxp);

        if (iostate->io_error == 0)
            iostate->io_error = error;
        iostate->io_issued -= io_size;

        if (iostate->io_wanted) {
             * someone is waiting for the state of
             * this io stream to change
            iostate->io_wanted = 0;

        lck_mtx_unlock(cl_mtxp);

            wakeup((caddr_t)&iostate->io_wanted);

        pg_offset  = upl_offset & PAGE_MASK;
        abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (flags & CL_COMMIT) {

            if (flags & CL_PRESERVE) {
                ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
                                     UPL_COMMIT_FREE_ON_EMPTY);
                if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
                    upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                else if (flags & CL_PAGEIN)
                    upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
                    upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

                ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
                             (int)upl, upl_offset - pg_offset, abort_size, error, 0);
            real_bp->b_flags |= B_ERROR;
            real_bp->b_error = error;

            buf_biodone(real_bp);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
                 (int)f_offset, size, upl_offset, retval, 0);
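
/*
 * cluster_rd_prefetch:
 * issue an advisory read of up to MAX_UPL_TRANSFER pages starting at
 * f_offset (clipped to the file size) and return the number of pages
 * spanned by the prefetch.
 */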
cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
    int pages_in_prefetch;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
                 (int)f_offset, size, (int)filesize, 0, 0);

    if (f_offset >= filesize) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
                     (int)f_offset, 0, 0, 0, 0);
    if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
        size = (MAX_UPL_TRANSFER * PAGE_SIZE);
        size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

    if ((off_t)size > (filesize - f_offset))
        size = filesize - f_offset;
    pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

    advisory_read(vp, filesize, f_offset, size);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
                 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

    return (pages_in_prefetch);
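
/*
 * cluster_rd_ahead:
 * detect sequential access using the per-vnode cl_readahead state and,
 * when appropriate, extend the read-ahead window (doubling cl_ralen up to
 * MAX_UPL_TRANSFER) by calling cluster_rd_prefetch past the highest page
 * already read ahead (cl_maxra).
 */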
cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
    int size_of_prefetch;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
                 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

    if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
    if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
                                (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
    if (extent->e_addr < rap->cl_maxra) {
        if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                         rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
    r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
    f_offset = (off_t)(r_addr * PAGE_SIZE_64);

    size_of_prefetch = 0;

    ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

    if (size_of_prefetch) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
    if (f_offset < filesize) {
        daddr64_t read_size;

        rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;

        read_size = (extent->e_addr + 1) - extent->b_addr;

        if (read_size > rap->cl_ralen) {
            if (read_size > MAX_UPL_TRANSFER)
                rap->cl_ralen = MAX_UPL_TRANSFER;
                rap->cl_ralen = read_size;
        size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);

        if (size_of_prefetch)
            rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
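
/*
 * cluster_pageout:
 * VNOP_PAGEOUT helper: validate the request, translate the UPL_* flags
 * into CL_* flags (throttling is skipped for virtual devices to avoid
 * deadlocking the pageout thread), trim the request to the EOF, and hand
 * it to cluster_io.
 */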
cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
                int size, off_t filesize, int flags)
    struct cl_writebehind *wbp;

    if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
         * if we know we're issuing this I/O to a virtual device (i.e. disk image)
         * then we don't want to enforce this throttle... if we do, we can
         * potentially deadlock since we're stalling the pageout thread at a time
         * when the disk image might need additional memory (which won't be available
         * if the pageout thread can't run)... instead we'll just depend on the throttle
         * that the pageout thread now has in place to deal with external files
        local_flags = CL_PAGEOUT;
        local_flags = CL_PAGEOUT | CL_THROTTLE;

    if ((flags & UPL_IOSYNC) == 0)
        local_flags |= CL_ASYNC;
    if ((flags & UPL_NOCOMMIT) == 0)
        local_flags |= CL_COMMIT;
    if ((flags & UPL_KEEPCACHED))
        local_flags |= CL_KEEPCACHED;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
                 (int)f_offset, size, (int)filesize, local_flags, 0);

     * If they didn't specify any I/O, then we are done...
     * we can't issue an abort because we don't know how
     * big the upl really is
    if (vp->v_mount->mnt_flag & MNT_RDONLY) {
        if (local_flags & CL_COMMIT)
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
     * can't page-out from a negative offset
     * or if we're starting beyond the EOF
     * or if the file offset isn't page aligned
     * or the size requested isn't a multiple of PAGE_SIZE
    if (f_offset < 0 || f_offset >= filesize ||
        (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
        if (local_flags & CL_COMMIT)
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
    max_size = filesize - f_offset;

    if (size < max_size)

    rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

    if (size > rounded_size) {
        if (local_flags & CL_COMMIT)
            ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
                                UPL_ABORT_FREE_ON_EMPTY);
    if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
        wbp->cl_hasbeenpaged = 1;

    return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
                       local_flags, (buf_t)NULL, (struct clios *)NULL));
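
/*
 * cluster_pagein:
 * VNOP_PAGEIN helper: validate the request, trim it to the EOF, issue it
 * through cluster_io with CL_READ | CL_PAGEIN, and then kick off
 * read-ahead via cluster_rd_ahead when the access pattern allows it.
 */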
cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
               int size, off_t filesize, int flags)
    int local_flags = 0;

    if (upl == NULL || size < 0)
        panic("cluster_pagein: NULL upl passed in");

    if ((flags & UPL_IOSYNC) == 0)
        local_flags |= CL_ASYNC;
    if ((flags & UPL_NOCOMMIT) == 0)
        local_flags |= CL_COMMIT;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
                 (int)f_offset, size, (int)filesize, local_flags, 0);

     * can't page-in from a negative offset
     * or if we're starting beyond the EOF
     * or if the file offset isn't page aligned
     * or the size requested isn't a multiple of PAGE_SIZE
    if (f_offset < 0 || f_offset >= filesize ||
        (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
        if (local_flags & CL_COMMIT)
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
    max_size = filesize - f_offset;

    if (size < max_size)

    rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

    if (size > rounded_size && (local_flags & CL_COMMIT))
        ubc_upl_abort_range(upl, upl_offset + rounded_size,
                            size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

    retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
                        local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);

    if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
        struct cl_readahead *rap;

        rap = cluster_get_rap(vp);
            struct cl_extent extent;

            extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
            extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);

            if (rounded_size == PAGE_SIZE) {
                 * we haven't read the last page in of the file yet
                 * so let's try to read ahead if we're in
                 * a sequential access pattern
                cluster_rd_ahead(vp, &extent, filesize, rap);
            rap->cl_lastr = extent.e_addr;

            lck_mtx_unlock(&rap->cl_lockr);
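
/*
 * cluster_bp:
 * issue the I/O described by a conventional buf through cluster_io,
 * converting its logical block number to a file offset with ubc_blktooff.
 */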
cluster_bp(buf_t bp)
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
                 (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

    if (bp->b_flags & B_READ)
        flags = CL_ASYNC | CL_READ;

    f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

    return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
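
/*
 * cluster_write:
 * top-level write entry point.  Depending on IO_NOCACHE, the alignment of
 * the user buffer and file offset, and any head/tail zero-fill
 * requirements, each uio vector is routed to cluster_write_x (cached),
 * cluster_nocopy_write (direct) or cluster_phys_write (physically
 * contiguous target).
 */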
cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
    if (vp->v_flag & VNOCACHE_DATA)
        flags |= IO_NOCACHE;

    if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
         * go do a write through the cache if one of the following is true....
         *   NOCACHE is not true
         *   there is no uio structure or it doesn't target USERSPACE
        return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
    if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
        panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
#endif /* LP64_DEBUG */

    while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
        user_size_t iov_len;
        user_addr_t iov_base;

         * we know we have a resid, so this is safe
         * skip over any empty vectors
        uio_update(uio, (user_size_t)0);

        iov_len  = uio_curriovlen(uio);
        iov_base = uio_curriovbase(uio);

        upl_size  = PAGE_SIZE;
        upl_flags = UPL_QUERY_OBJECT_TYPE;

        // LP64todo - fix this!
        if ((vm_map_get_upl(current_map(),
                            (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
                            &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
             * the user app must have passed in an invalid address
         * We check every vector target but if it is physically
         * contiguous space, we skip the sanity checks.
        if (upl_flags & UPL_PHYS_CONTIG) {

            zflags = flags & ~IO_TAILZEROFILL;
            zflags |= IO_HEADZEROFILL;

            if (flags & IO_HEADZEROFILL) {
                 * in case we have additional vectors, we don't want to do this again
                flags &= ~IO_HEADZEROFILL;

                if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
            retval = cluster_phys_write(vp, uio, newEOF);

            if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
                return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
        else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
             * we're here because we don't have a physically contiguous target buffer
             * go do a write through the cache if one of the following is true....
             *   the total xfer size is less than a page...
             *   we're being asked to ZEROFILL either the head or the tail of the I/O...
            return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
        // LP64todo - fix this!
        else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
            if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
                 * Bring the file offset write up to a pagesize boundary
                 * this will also bring the base address to a page boundary
                 * since they both are currently on the same offset within a page
                 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
                 * so the computed clip_size must always be less than the current uio_resid
                clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));

                 * Fake the resid going into the cluster_write_x call
                 * and restore it on the way out.
                // LP64todo - fix this
                prev_resid = uio_resid(uio);
                uio_setresid(uio, clip_size);

                retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
                 * can't get both the file offset and the buffer offset aligned to a page boundary
                 * so fire an I/O through the cache for this entire vector
                // LP64todo - fix this
                clip_size = iov_len;
                // LP64todo - fix this
                prev_resid = uio_resid(uio);
                uio_setresid(uio, clip_size);

                retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
             * If we come in here, we know the offset into
             * the file is on a pagesize boundary and the
             * target buffer address is also on a page boundary
            max_io_size = newEOF - uio->uio_offset;
            // LP64todo - fix this
            clip_size = uio_resid(uio);
            if (iov_len < clip_size)
                // LP64todo - fix this!
                clip_size = iov_len;
            if (max_io_size < clip_size)
                clip_size = max_io_size;

            if (clip_size < PAGE_SIZE) {
                 * Take care of tail end of write in this vector
                // LP64todo - fix this
                prev_resid = uio_resid(uio);
                uio_setresid(uio, clip_size);

                retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
                /* round clip_size down to a multiple of pagesize */
                clip_size = clip_size & ~(PAGE_MASK);
                // LP64todo - fix this
                prev_resid = uio_resid(uio);
                uio_setresid(uio, clip_size);

                retval = cluster_nocopy_write(vp, uio, newEOF);

                if ((retval == 0) && uio_resid(uio))
                    retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
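
/*
 * cluster_nocopy_write:
 * direct (uncached) write path.  Wires the user pages with
 * vm_map_get_upl, evicts any overlapping cached pages, and streams
 * asynchronous cluster_io requests, using the clios state to limit the
 * number of outstanding bytes and to wait for completion before
 * returning.
 */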
cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
    upl_page_info_t  *pl;
    vm_offset_t      upl_offset;
    int              upl_needed_size;
    int              force_data_sync;
    struct clios     iostate;
    struct cl_writebehind *wbp;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
                 (int)uio->uio_offset, (int)uio_resid(uio),

     * When we enter this routine, we know
     *  -- the offset into the file is on a pagesize boundary
     *  -- the resid is a page multiple
     *  -- the resid will not exceed iov_len
    if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {

        cluster_try_push(wbp, vp, newEOF, 0, 1);

        lck_mtx_unlock(&wbp->cl_lockw);
    iostate.io_completed = 0;
    iostate.io_issued = 0;
    iostate.io_error = 0;
    iostate.io_wanted = 0;

    while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
        user_addr_t iov_base;

        io_size = uio_resid(uio);

        if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
            io_size = MAX_UPL_TRANSFER * PAGE_SIZE;

        iov_base = uio_curriovbase(uio);

        // LP64todo - fix this!
        upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;

        upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
                     (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

        for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {

            upl_size = upl_needed_size;
            upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
                        UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

            // LP64todo - fix this!
            kret = vm_map_get_upl(current_map(),
                                  (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),

            if (kret != KERN_SUCCESS) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                 * cluster_nocopy_write: failed to get pagelist
                 * we may have already spun some portion of this request
                 * off as async requests... we need to wait for the I/O
                 * to complete before returning
                goto wait_for_writes;
            pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
            pages_in_pl = upl_size / PAGE_SIZE;

            for (i = 0; i < pages_in_pl; i++) {
                if (!upl_valid_page(pl, i))
            if (i == pages_in_pl)

             * didn't get all the pages back that we
             * needed... release this upl and try again
            ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
                                UPL_ABORT_FREE_ON_EMPTY);
        if (force_data_sync >= 3) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                         i, pages_in_pl, upl_size, kret, 0);
             * for some reason, we couldn't acquire a hold on all
             * the pages needed in the user's address space
             * we may have already spun some portion of this request
             * off as async requests... we need to wait for the I/O
             * to complete before returning
            goto wait_for_writes;
         * Consider the possibility that upl_size wasn't satisfied.
        if (upl_size != upl_needed_size)
            io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                     (int)upl_offset, upl_size, (int)iov_base, io_size, 0);

            ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
                                UPL_ABORT_FREE_ON_EMPTY);
             * we may have already spun some portion of this request
             * off as async requests... we need to wait for the I/O
             * to complete before returning
            goto wait_for_writes;
         * Now look for pages already in the cache
         * and throw them away.
         * uio->uio_offset is page aligned within the file
         * io_size is a multiple of PAGE_SIZE
        ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);

         * we want to push out these writes asynchronously so that we can overlap
         * the preparation of the next I/O
         * if there are already too many outstanding writes
         * wait until some complete before issuing the next
        lck_mtx_lock(cl_mtxp);

        while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
            iostate.io_wanted = 1;
            msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
        lck_mtx_unlock(cl_mtxp);

        if (iostate.io_error) {
             * one of the earlier writes we issued ran into a hard error
             * don't issue any more writes, cleanup the UPL
             * that was just created but not used, then
             * go wait for all writes that are part of this stream
             * to complete before returning the error to the caller
            ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
                                UPL_ABORT_FREE_ON_EMPTY);

            goto wait_for_writes;
        io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
                     (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

        error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
                           io_size, io_flag, (buf_t)NULL, &iostate);

        uio_update(uio, (user_size_t)io_size);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
                     (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);

     * make sure all async writes issued as part of this stream
     * have completed before we return
    lck_mtx_lock(cl_mtxp);

    while (iostate.io_issued != iostate.io_completed) {
        iostate.io_wanted = 1;
        msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
    lck_mtx_unlock(cl_mtxp);

    if (iostate.io_error)
        error = iostate.io_error;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
                 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
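
/*
 * cluster_phys_write:
 * write from a physically contiguous user buffer.  Unaligned head and
 * tail fragments (relative to the device block size) are handled by
 * cluster_align_phys_io; the aligned middle is issued synchronously
 * through cluster_io with CL_DEV_MEMORY.
 */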
cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;
    int              upl_needed_size;
    user_addr_t      iov_base;
    struct cl_writebehind *wbp;

    devblocksize = vp->v_mount->mnt_devblocksize;
     * When we enter this routine, we know
     *  -- the resid will not exceed iov_len
     *  -- the vector target address is physically contiguous
    if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {

        cluster_try_push(wbp, vp, newEOF, 0, 1);

        lck_mtx_unlock(&wbp->cl_lockw);
    if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
        panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
#endif /* LP64_DEBUG */

    // LP64todo - fix this!
    io_size = (int)uio_curriovlen(uio);
    iov_base = uio_curriovbase(uio);

    upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
    upl_needed_size = upl_offset + io_size;

    upl_size = upl_needed_size;
    upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
                UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

    // LP64todo - fix this!
    kret = vm_map_get_upl(current_map(),
                          (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
                          &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

    if (kret != KERN_SUCCESS) {
         * cluster_phys_write: failed to get pagelist
         * note: return kret here
     * Consider the possibility that upl_size wasn't satisfied.
     * This is a failure in the physical memory case.
    if (upl_size < upl_needed_size) {
        ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
    pl = ubc_upl_pageinfo(upl);

    src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;

    while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {

        head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

        if (head_size > io_size)
            head_size = io_size;

        error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);

            ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
        upl_offset += head_size;
        src_paddr  += head_size;
        io_size    -= head_size;
    tail_size = io_size & (devblocksize - 1);
    io_size  -= tail_size;

         * issue a synchronous write to cluster_io
        error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
                           io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);

         * The cluster_io write completed successfully,
         * update the uio structure
        uio_update(uio, (user_size_t)io_size);

        src_paddr += io_size;

        error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
     * just release our hold on the physically contiguous
     * region without changing any state
    ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
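
/*
 * cluster_write_x:
 * cached write path.  Zero-fills the head/tail ranges when requested,
 * copies user data into UPLs created over the file (pre-reading partial
 * first/last pages when necessary), and records the dirtied extents in
 * the write-behind state so they can be clustered and pushed later.
 */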
cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
    upl_page_info_t *pl;
    vm_offset_t      upl_offset = 0;
    long long        total_size;
    long long        zero_cnt1;
    struct cl_extent cl;
    struct cl_writebehind *wbp;

    if ((wbp = cluster_get_wbp(vp, 0)) != NULL)

        if (wbp->cl_hasbeenpaged) {
             * this vnode had pages cleaned to it by
             * the pager which indicates that either
             * it's not very 'hot', or the system is
             * being overwhelmed by a lot of dirty
             * data being delayed in the VM cache...
             * in either event, we'll push our remaining
             * delayed data at this point...  this will
             * be more efficient than paging out 1 page at
             * a time, and will also act as a throttle
             * by delaying this client from writing any
             * more data until all his delayed data has
             * at least been queued to the underlying driver.
            if (wbp->cl_number || wbp->cl_scmap)
                cluster_push_EOF(vp, newEOF);

            wbp->cl_hasbeenpaged = 0;
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
                     (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);

        // LP64todo - fix this
        io_resid = uio_resid(uio);
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
                     0, 0, (int)oldEOF, (int)newEOF, 0);
    if (flags & IO_HEADZEROFILL) {
         * some filesystems (HFS is one) don't support unallocated holes within a file...
         * so we zero fill the intervening space between the old EOF and the offset
         * where the next chunk of real data begins.... ftruncate will also use this
         * routine to zero fill to the new EOF when growing a file... in this case, the
         * uio structure will not be provided
        if (headOff < uio->uio_offset) {
            zero_cnt = uio->uio_offset - headOff;
        } else if (headOff < newEOF) {
            zero_cnt = newEOF - headOff;
    if (flags & IO_TAILZEROFILL) {
        // LP64todo - fix this
        zero_off1 = uio->uio_offset + uio_resid(uio);

        if (zero_off1 < tailOff)
            zero_cnt1 = tailOff - zero_off1;
    if (zero_cnt == 0 && uio == (struct uio *) 0) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
                     retval, 0, 0, 0, 0);
    while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
         * for this iteration of the loop, figure out where our starting point is
            start_offset = (int)(zero_off & PAGE_MASK_64);
            upl_f_offset = zero_off - start_offset;
        } else if (io_resid) {
            start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
            upl_f_offset = uio->uio_offset - start_offset;
            start_offset = (int)(zero_off1 & PAGE_MASK_64);
            upl_f_offset = zero_off1 - start_offset;
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
                     (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);

        if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
            total_size = MAX_UPL_TRANSFER * PAGE_SIZE;

        cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);

        if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
             * assumption... total_size <= io_resid
             * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
            if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
                total_size -= start_offset;
            xfer_resid = total_size;

            retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);

            io_resid    -= (total_size - xfer_resid);
            total_size   = xfer_resid;
            start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
            upl_f_offset = uio->uio_offset - start_offset;

            if (total_size == 0) {
                 * the write did not finish on a page boundary
                 * which will leave upl_f_offset pointing to the
                 * beginning of the last page written instead of
                 * the page beyond it... bump it in this case
                 * so that the cluster code records the last page
                upl_f_offset += PAGE_SIZE_64;
         * compute the size of the upl needed to encompass
         * the requested write... limit each call to cluster_io
         * to the maximum UPL size... cluster_io will clip if
         * this exceeds the maximum io_size for the device,
         * make sure to account for
         * a starting offset that's not page aligned
        upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
            upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;

        pages_in_upl = upl_size / PAGE_SIZE;
        io_size      = upl_size - start_offset;

        if ((long long)io_size > total_size)
            io_size = total_size;
2048 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, io_size
, total_size
, 0, 0);
2052 * Gather the pages from the buffer cache.
2053 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2054 * that we intend to modify these pages.
2056 kret
= ubc_create_upl(vp
,
2061 UPL_SET_LITE
| UPL_WILL_MODIFY
);
2062 if (kret
!= KERN_SUCCESS
)
2063 panic("cluster_write: failed to get pagelist");
2065 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
,
2066 (int)upl
, (int)upl_f_offset
, start_offset
, 0, 0);
2068 if (start_offset
&& !upl_valid_page(pl
, 0)) {
2072 * we're starting in the middle of the first page of the upl
2073 * and the page isn't currently valid, so we're going to have
2074 * to read it in first... this is a synchronous operation
2076 read_size
= PAGE_SIZE
;
2078 if ((upl_f_offset
+ read_size
) > newEOF
)
2079 read_size
= newEOF
- upl_f_offset
;
2081 retval
= cluster_io(vp
, upl
, 0, upl_f_offset
, read_size
,
2082 CL_READ
, (buf_t
)NULL
, (struct clios
*)NULL
);
2085 * we had an error during the read which causes us to abort
2086 * the current cluster_write request... before we do, we need
2087 * to release the rest of the pages in the upl without modifying
2088 * there state and mark the failed page in error
2090 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
);
2092 if (upl_size
> PAGE_SIZE
)
2093 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
2095 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
2096 (int)upl
, 0, 0, retval
, 0);
        if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
                /*
                 * the last offset we're writing to in this upl does not end on a page
                 * boundary... if it's not beyond the old EOF, then we'll also need to
                 * pre-read this page in if it isn't already valid
                 */
                upl_offset = upl_size - PAGE_SIZE;

                if ((upl_f_offset + start_offset + io_size) < oldEOF &&
                    !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {

                        read_size = PAGE_SIZE;

                        if ((upl_f_offset + upl_offset + read_size) > newEOF)
                                read_size = newEOF - (upl_f_offset + upl_offset);

                        retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
                                            CL_READ, (buf_t)NULL, (struct clios *)NULL);
                        if (retval) {
                                /*
                                 * we had an error during the read which causes us to abort
                                 * the current cluster_write request... before we do, we
                                 * need to release the rest of the pages in the upl without
                                 * modifying their state and mark the failed page in error
                                 */
                                ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);

                                if (upl_size > PAGE_SIZE)
                                        ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

                                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                                        (int)upl, 0, 0, retval, 0);
                                break;
                        }
                }
        }
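        /*
         * note on the two pre-reads above: when the write begins or ends partway
         * through a page that isn't already valid in the cache (and isn't beyond
         * the old EOF), that page has to be read in synchronously first so the
         * bytes we don't overwrite are preserved... a read-modify-write done at
         * page granularity
         */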
        xfer_resid = io_size;
        io_offset = start_offset;

        while (zero_cnt && xfer_resid) {

                if (zero_cnt < (long long)xfer_resid)
                        bytes_to_zero = zero_cnt;
                else
                        bytes_to_zero = xfer_resid;

                if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
                        cluster_zero(upl, io_offset, bytes_to_zero, NULL);
                } else {
                        bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
                        zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);

                        if ( !upl_valid_page(pl, zero_pg_index)) {
                                cluster_zero(upl, io_offset, bytes_to_zero, NULL);

                        } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
                                   !upl_dirty_page(pl, zero_pg_index)) {
                                cluster_zero(upl, io_offset, bytes_to_zero, NULL);
                        }
                }
                xfer_resid -= bytes_to_zero;
                zero_cnt   -= bytes_to_zero;
                zero_off   += bytes_to_zero;
                io_offset  += bytes_to_zero;
        }
        if (xfer_resid && io_resid) {
                bytes_to_move = min(io_resid, xfer_resid);

                retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);

                if (retval) {
                        ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

                        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                                (int)upl, 0, 0, retval, 0);
                } else {
                        io_resid   -= bytes_to_move;
                        xfer_resid -= bytes_to_move;
                        io_offset  += bytes_to_move;
                }
        }
        while (xfer_resid && zero_cnt1 && retval == 0) {

                if (zero_cnt1 < (long long)xfer_resid)
                        bytes_to_zero = zero_cnt1;
                else
                        bytes_to_zero = xfer_resid;

                if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
                        cluster_zero(upl, io_offset, bytes_to_zero, NULL);
                } else {
                        bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
                        zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);

                        if ( !upl_valid_page(pl, zero_pg_index)) {
                                cluster_zero(upl, io_offset, bytes_to_zero, NULL);
                        } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
                                   !upl_dirty_page(pl, zero_pg_index)) {
                                cluster_zero(upl, io_offset, bytes_to_zero, NULL);
                        }
                }
                xfer_resid -= bytes_to_zero;
                zero_cnt1  -= bytes_to_zero;
                zero_off1  += bytes_to_zero;
                io_offset  += bytes_to_zero;
        }
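        /*
         * the two zeroing loops above fill the regions of the upl that lie before
         * and after the user's data (tracked by zero_cnt/zero_off and
         * zero_cnt1/zero_off1)... when IO_NOZEROVALID or IO_NOZERODIRTY is passed,
         * pages that are already valid (or already dirty) are left alone instead
         * of being re-zeroed
         */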
        io_size += start_offset;

        if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
                /*
                 * if we're extending the file with this write
                 * we'll zero fill the rest of the page so that
                 * if the file gets extended again in such a way as to leave a
                 * hole starting at this EOF, we'll have zeros in the correct spot
                 */
                cluster_zero(upl, io_size, upl_size - io_size, NULL);
        }
        if (flags & IO_SYNC)
                /*
                 * if the IO_SYNC flag is set then we need to
                 * bypass any clusters and immediately issue
                 * the I/O
                 */
                goto issue_io;
check_cluster:
        /*
         * take the lock to protect our accesses
         * of the writebehind and sparse cluster state
         */
        wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
        /*
         * calculate the last logical block number
         * that this delayed I/O encompassed
         */
        cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);

        if (wbp->cl_scmap) {

                if ( !(flags & IO_NOCACHE)) {
                        /*
                         * we've fallen into the sparse
                         * cluster method of delaying dirty pages
                         * first, we need to release the upl if we hold one
                         * since pages in it may be present in the sparse cluster map
                         * and may span 2 separate buckets there... if they do and
                         * we happen to have to flush a bucket to make room and it intersects
                         * this upl, a deadlock may result on page BUSY
                         */
                        if (upl_size)
                                ubc_upl_commit_range(upl, 0, upl_size,
                                        UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

                        sparse_cluster_add(wbp, vp, &cl, newEOF);

                        lck_mtx_unlock(&wbp->cl_lockw);

                        continue;
                }
                /*
                 * must have done cached writes that fell into
                 * the sparse cluster mechanism... we've switched
                 * to uncached writes on the file, so go ahead
                 * and push whatever's in the sparse map
                 * and switch back to normal clustering
                 *
                 * see the comment above concerning a possible deadlock...
                 */
                if (upl_size) {
                        ubc_upl_commit_range(upl, 0, upl_size,
                                UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
                        /*
                         * setting upl_size to 0 keeps us from committing a
                         * second time in the start_new_cluster path
                         */
                        upl_size = 0;
                }
                sparse_cluster_push(wbp, vp, newEOF, 1);

                /*
                 * no clusters of either type present at this point
                 * so just go directly to start_new_cluster since
                 * we know we need to delay this I/O since we've
                 * already released the pages back into the cache
                 * to avoid the deadlock with sparse_cluster_push
                 */
                goto start_new_cluster;
        }
        if (wbp->cl_number == 0)
                /*
                 * no clusters currently present
                 */
                goto start_new_cluster;
        for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
                /*
                 * check each cluster that we currently hold
                 * try to merge some or all of this write into
                 * one or more of the existing clusters... if
                 * any portion of the write remains, start a
                 * new cluster
                 */
                if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
                        /*
                         * the current write starts at or after the current cluster
                         */
                        if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
                                /*
                                 * we have a write that fits entirely
                                 * within the existing cluster limits
                                 */
                                if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
                                        /*
                                         * update our idea of where the cluster ends
                                         */
                                        wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
                                break;
                        }
                        if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
                                /*
                                 * we have a write that starts in the middle of the current cluster
                                 * but extends beyond the cluster's limit... we know this because
                                 * of the previous checks
                                 * we'll extend the current cluster to the max
                                 * and update the b_addr for the current write to reflect that
                                 * the head of it was absorbed into this cluster...
                                 * note that we'll always have a leftover tail in this case since
                                 * full absorption would have occurred in the clause above
                                 */
                                wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;

                                if (upl_size) {
                                        daddr64_t start_pg_in_upl;

                                        start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);

                                        if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
                                                intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);

                                                ubc_upl_commit_range(upl, upl_offset, intersection,
                                                        UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
                                                upl_f_offset += intersection;
                                                upl_offset   += intersection;
                                                upl_size     -= intersection;
                                        }
                                }
                                cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
                        }
                        /*
                         * we come here for the case where the current write starts
                         * beyond the limit of the existing cluster or we have a leftover
                         * tail after a partial absorption
                         *
                         * in either case, we'll check the remaining clusters before
                         * starting a new one
                         */
                } else {
                        /*
                         * the current write starts in front of the cluster we're currently considering
                         */
                        if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
                                /*
                                 * we can just merge the new request into
                                 * this cluster and leave it in the cache
                                 * since the resulting cluster is still
                                 * less than the maximum allowable size
                                 */
                                wbp->cl_clusters[cl_index].b_addr = cl.b_addr;

                                if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
                                        /*
                                         * the current write completely
                                         * envelops the existing cluster and since
                                         * each write is limited to at most MAX_UPL_TRANSFER bytes
                                         * we can just use the start and last blocknos of the write
                                         * to generate the cluster limits
                                         */
                                        wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
                                }
                                break;
                        }
                        /*
                         * if we were to combine this write with the current cluster
                         * we would exceed the cluster size limit.... so,
                         * let's see if there's any overlap of the new I/O with
                         * the cluster we're currently considering... in fact, we'll
                         * stretch the cluster out to its full limit and see if we
                         * get an intersection with the current write
                         */
                        if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
                                /*
                                 * the current write extends into the proposed cluster
                                 * clip the length of the current write after first combining its
                                 * tail with the newly shaped cluster
                                 */
                                wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;

                                if (upl_size) {
                                        intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);

                                        if (intersection > upl_size)
                                                /*
                                                 * because the current write may consist of a number of pages found in the cache
                                                 * which are not part of the UPL, we may have an intersection that exceeds
                                                 * the size of the UPL that is also part of this write
                                                 */
                                                intersection = upl_size;

                                        ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
                                                UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
                                        upl_size -= intersection;
                                }
                                cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
                        }
                        /*
                         * if we get here, there was no way to merge
                         * any portion of this write with this cluster
                         * or we could only merge part of it which
                         * will leave a tail...
                         * we'll check the remaining clusters before starting a new one
                         */
                }
        }
        if (cl_index < wbp->cl_number)
                /*
                 * we found an existing cluster(s) that we
                 * could entirely merge this I/O into
                 */
                goto delay_io;

        if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
                /*
                 * we didn't find an existing cluster to
                 * merge into, but there's room to start
                 * a new one
                 */
                goto start_new_cluster;
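        /*
         * to summarize the merge pass above: the write either fit entirely inside
         * an existing cluster (we break out and just commit the pages via
         * delay_io), was partially absorbed leaving a tail, or didn't intersect
         * anything... if a free slot remains (and the write is cacheable) we start
         * a new cluster, otherwise we fall through to pushing clusters out or
         * switching to the sparse mechanism
         */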
        /*
         * no existing cluster to merge with and no
         * room to start a new one... we'll try
         * pushing one of the existing ones... if none of
         * them are able to be pushed, we'll switch
         * to the sparse cluster mechanism
         * cluster_try_push updates cl_number to the
         * number of remaining clusters... and
         * returns the number of currently unused clusters
         */
        int ret_cluster_try_push = 0;
        /* if writes are not deferred, call cluster push immediately */
        if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
                if (flags & IO_NOCACHE)
                        can_delay = 0;
                else
                        can_delay = 1;

                ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
        }

        /* execute the following regardless of whether writes are deferred or not */
        if (ret_cluster_try_push == 0) {
                /*
                 * no more room in the normal cluster mechanism
                 * so let's switch to the more expansive but expensive
                 * sparse mechanism....
                 * first, we need to release the upl if we hold one
                 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
                 * and may span 2 separate buckets there... if they do and
                 * we happen to have to flush a bucket to make room and it intersects
                 * this upl, a deadlock may result on page BUSY
                 */
                if (upl_size)
                        ubc_upl_commit_range(upl, upl_offset, upl_size,
                                UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

                sparse_cluster_switch(wbp, vp, newEOF);
                sparse_cluster_add(wbp, vp, &cl, newEOF);

                lck_mtx_unlock(&wbp->cl_lockw);

                continue;
        }
        /*
         * we pushed one cluster successfully, so we must be sequentially writing this file
         * otherwise, we would have failed and fallen into the sparse cluster support
         * so let's take the opportunity to push out additional clusters as long as we
         * remain below the throttle... this will give us better I/O locality if we're
         * in a copy loop (i.e. we won't jump back and forth between the read and write points
         * however, we don't want to push so much out that the write throttle kicks in and
         * hangs this thread up until some of the I/O completes...
         */
        if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
                while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
                        cluster_try_push(wbp, vp, newEOF, 0, 0);
        }

start_new_cluster:
        wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
        wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;

        if (flags & IO_NOCACHE)
                wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
        else
                wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
        wbp->cl_number++;

delay_io:
        if (upl_size)
                ubc_upl_commit_range(upl, upl_offset, upl_size,
                        UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

        lck_mtx_unlock(&wbp->cl_lockw);

        continue;
issue_io:
        /*
         * we don't hold the vnode lock at this point
         *
         * because we had to ask for a UPL that provides currently non-present pages, the
         * UPL has been automatically set to clear the dirty flags (both software and hardware)
         * upon committing it... this is not the behavior we want since it's possible for
         * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
         * in order to maintain some semblance of coherency with mapped writes
         * we need to drop the current upl and pick it back up with COPYOUT_FROM set
         * so that we correctly deal with a change in state of the hardware modify bit...
         * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
         * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
         * responsible for generating the correct sized I/O(s)
         */
        ubc_upl_commit_range(upl, 0, upl_size,
                UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

        cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;

        retval = cluster_push_x(vp, &cl, newEOF, flags);
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
            retval, 0, io_resid, 0, 0);

    return (retval);
}
int
cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
{
    int retval = 0;
    int flags;

    flags = xflags;

    if (vp->v_flag & VNOCACHE_DATA)
        flags |= IO_NOCACHE;
    if (vp->v_flag & VRAOFF)
        flags |= IO_RAOFF;

    if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
        /*
         * go do a read through the cache if one of the following is true....
         *   NOCACHE is not true
         *   the uio request doesn't target USERSPACE
         */
        return (cluster_read_x(vp, uio, filesize, flags));
    }

#if LP64_DEBUG
    if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
        panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
    }
#endif /* LP64_DEBUG */

    while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
        user_size_t iov_len;
        user_addr_t iov_base;

        /*
         * we know we have a resid, so this is safe
         * skip over any empty vectors
         */
        uio_update(uio, (user_size_t)0);

        iov_len  = uio_curriovlen(uio);
        iov_base = uio_curriovbase(uio);

        upl_size  = PAGE_SIZE;
        upl_flags = UPL_QUERY_OBJECT_TYPE;

        // LP64todo - fix this!
        if ((vm_map_get_upl(current_map(),
                            (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
                            &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
            /*
             * the user app must have passed in an invalid address
             */
            return (EFAULT);
        }

        /*
         * We check every vector target but if it is physically
         * contiguous space, we skip the sanity checks.
         */
        if (upl_flags & UPL_PHYS_CONTIG) {
            retval = cluster_phys_read(vp, uio, filesize);
        }
        else if (uio_resid(uio) < PAGE_SIZE) {
            /*
             * we're here because we don't have a physically contiguous target buffer
             * go do a read through the cache if
             *   the total xfer size is less than a page...
             */
            return (cluster_read_x(vp, uio, filesize, flags));
        }
        // LP64todo - fix this!
        else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
            if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
                /*
                 * Bring the file offset read up to a pagesize boundary
                 * this will also bring the base address to a page boundary
                 * since they both are currently on the same offset within a page
                 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
                 * so the computed clip_size must always be less than the current uio_resid
                 */
                clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));

                /*
                 * Fake the resid going into the cluster_read_x call
                 * and restore it on the way out.
                 */
                prev_resid = uio_resid(uio);
                // LP64todo - fix this
                uio_setresid(uio, clip_size);

                retval = cluster_read_x(vp, uio, filesize, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
            } else {
                /*
                 * can't get both the file offset and the buffer offset aligned to a page boundary
                 * so fire an I/O through the cache for this entire vector
                 */
                // LP64todo - fix this!
                clip_size = iov_len;
                prev_resid = uio_resid(uio);
                uio_setresid(uio, clip_size);

                retval = cluster_read_x(vp, uio, filesize, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
            }
        } else {
            /*
             * If we come in here, we know the offset into
             * the file is on a pagesize boundary
             */
            max_io_size = filesize - uio->uio_offset;
            // LP64todo - fix this
            clip_size = uio_resid(uio);
            if (iov_len < clip_size)
                clip_size = iov_len;
            if (max_io_size < clip_size)
                clip_size = (int)max_io_size;

            if (clip_size < PAGE_SIZE) {
                /*
                 * Take care of the tail end of the read in this vector.
                 */
                // LP64todo - fix this
                prev_resid = uio_resid(uio);
                uio_setresid(uio, clip_size);

                retval = cluster_read_x(vp, uio, filesize, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
            } else {
                /* round clip_size down to a multiple of pagesize */
                clip_size = clip_size & ~(PAGE_MASK);
                // LP64todo - fix this
                prev_resid = uio_resid(uio);
                uio_setresid(uio, clip_size);

                retval = cluster_nocopy_read(vp, uio, filesize);

                if ((retval == 0) && uio_resid(uio))
                    retval = cluster_read_x(vp, uio, filesize, flags);

                uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
            }
        }
    }
    return (retval);
}
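/*
 * rough dispatch summary for the loop above: a physically contiguous target
 * buffer goes to cluster_phys_read; transfers smaller than a page, or ones
 * where the file offset and buffer offset can't both be brought to a page
 * boundary, are routed through the cache via cluster_read_x; page aligned
 * requests of at least a page are clipped to a page multiple and handed to
 * cluster_nocopy_read, with any unaligned remainder finished by cluster_read_x
 */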
static int
cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
{
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;
    off_t            last_ioread_offset;
    off_t            last_request_offset;
    u_int            size_of_prefetch;
    int              retval = 0;
    int              error  = 0;
    u_int            max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
    u_int            rd_ahead_enabled = 1;
    u_int            prefetch_enabled = 1;
    struct cl_readahead *rap;
    struct clios     iostate;
    struct cl_extent extent;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
            (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);

    // LP64todo - fix this
    last_request_offset = uio->uio_offset + uio_resid(uio);

    if ((flags & (IO_RAOFF | IO_NOCACHE)) ||
        ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
        rd_ahead_enabled = 0;
        rap = NULL;
    } else {
        if (cluster_hard_throttle_on(vp)) {
            rd_ahead_enabled = 0;
            prefetch_enabled = 0;

            max_rd_size = HARD_THROTTLE_MAXSIZE;
        }
        if ((rap = cluster_get_rap(vp)) == NULL)
            rd_ahead_enabled = 0;
    }
    if (last_request_offset > filesize)
        last_request_offset = filesize;

    extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
    extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;

    if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
        /*
         * determine if we already have a read-ahead in the pipe courtesy of the
         * last read system call that was issued...
         * if so, pick up its extent to determine where we should start
         * with respect to any read-ahead that might be necessary to
         * garner all the data needed to complete this read system call
         */
        last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;

        if (last_ioread_offset < uio->uio_offset)
            last_ioread_offset = (off_t)0;
        else if (last_ioread_offset > last_request_offset)
            last_ioread_offset = last_request_offset;
    } else
        last_ioread_offset = (off_t)0;
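    /*
     * cl_maxra and cl_lastr are kept in units of pages, so the expression above
     * converts the last page already read ahead into the byte offset just past
     * it... starting last_ioread_offset there lets this request piggyback on
     * I/O that a previous read system call already put in flight
     */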
    while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
        /*
         * compute the size of the upl needed to encompass
         * the requested read... limit each call to cluster_io
         * to the maximum UPL size... cluster_io will clip if
         * this exceeds the maximum io_size for the device,
         * make sure to account for
         * a starting offset that's not page aligned
         */
        start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
        upl_f_offset = uio->uio_offset - (off_t)start_offset;
        max_size     = filesize - uio->uio_offset;

        // LP64todo - fix this!
        if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
            io_size = uio_resid(uio);
        else
            io_size = max_size;

        if (!(flags & IO_NOCACHE)) {

            while (io_size) {
                u_int io_resid;
                u_int io_requested;

                /*
                 * if we keep finding the pages we need already in the cache, then
                 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
                 * to determine that we have all the pages we need... once we miss in
                 * the cache and have issued an I/O, then we'll assume that we're likely
                 * to continue to miss in the cache and it's to our advantage to try and prefetch
                 */
                if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
                    if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
                        /*
                         * we've already issued I/O for this request and
                         * there's still work to do and
                         * our prefetch stream is running dry, so issue a
                         * pre-fetch I/O... the I/O latency will overlap
                         * with the copying of the data
                         */
                        if (size_of_prefetch > max_rd_size)
                            size_of_prefetch = max_rd_size;

                        size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);

                        last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

                        if (last_ioread_offset > last_request_offset)
                            last_ioread_offset = last_request_offset;
                    }
                }
                /*
                 * limit the size of the copy we're about to do so that
                 * we can notice that our I/O pipe is running dry and
                 * get the next I/O issued before it does go dry
                 */
                if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
                    io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
                else
                    io_resid = io_size;

                io_requested = io_resid;

                retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);

                io_size -= (io_requested - io_resid);

                if (retval || io_resid)
                    /*
                     * if we run into a real error or
                     * a page that is not in the cache
                     * we need to leave streaming mode
                     */
                    break;

                if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
                    /*
                     * we're already finished the I/O for this read request
                     * let's see if we should do a read-ahead
                     */
                    cluster_rd_ahead(vp, &extent, filesize, rap);
                }
            }
            if (retval)
                break;

            if (io_size == 0) {
                if (rap != NULL) {
                    if (extent.e_addr < rap->cl_lastr)
                        rap->cl_maxra = 0;
                    rap->cl_lastr = extent.e_addr;
                }
                break;
            }
            start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
            upl_f_offset = uio->uio_offset - (off_t)start_offset;
            max_size     = filesize - uio->uio_offset;
        }
        if (io_size > max_rd_size)
            io_size = max_rd_size;
        upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
            upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
        pages_in_upl = upl_size / PAGE_SIZE;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
                (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

        kret = ubc_create_upl(vp,
                              upl_f_offset,
                              upl_size,
                              &upl,
                              &pl,
                              UPL_SET_LITE);
        if (kret != KERN_SUCCESS)
            panic("cluster_read: failed to get pagelist");

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
                (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

        /*
         * scan from the beginning of the upl looking for the first
         * non-valid page.... this will become the first page in
         * the request we're going to make to 'cluster_io'... if all
         * of the pages are valid, we won't call through to 'cluster_io'
         */
        for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
            if (!upl_valid_page(pl, start_pg))
                break;
        }

        /*
         * scan from the starting invalid page looking for a valid
         * page before the end of the upl is reached, if we
         * find one, then it will be the last page of the request to
         * 'cluster_io'
         */
        for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
            if (upl_valid_page(pl, last_pg))
                break;
        }
        iostate.io_completed = 0;
        iostate.io_issued = 0;
        iostate.io_error = 0;
        iostate.io_wanted = 0;
        if (start_pg < last_pg) {
            /*
             * we found a range of 'invalid' pages that must be filled
             * if the last page in this range is the last page of the file
             * we may have to clip the size of it to keep from reading past
             * the end of the last physical block associated with the file
             */
            upl_offset = start_pg * PAGE_SIZE;
            io_size    = (last_pg - start_pg) * PAGE_SIZE;

            if ((upl_f_offset + upl_offset + io_size) > filesize)
                io_size = filesize - (upl_f_offset + upl_offset);

            /*
             * issue an asynchronous read to cluster_io
             */
            error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
                               io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
        }
        if (error == 0) {
            /*
             * if the read completed successfully, or there was no I/O request
             * issued, then copy the data into user land via 'cluster_upl_copy_data'
             * we'll first add on any 'valid'
             * pages that were present in the upl when we acquired it.
             */
            for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
                if (!upl_valid_page(pl, uio_last))
                    break;
            }
            /*
             * compute size to transfer this round, if uio->uio_resid is
             * still non-zero after this attempt, we'll loop around and
             * set up for another I/O.
             */
            val_size = (uio_last * PAGE_SIZE) - start_offset;

            if (val_size > max_size)
                val_size = max_size;

            if (val_size > uio_resid(uio))
                // LP64todo - fix this
                val_size = uio_resid(uio);

            if (last_ioread_offset == 0)
                last_ioread_offset = uio->uio_offset + val_size;

            if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
                /*
                 * if there's still I/O left to do for this request, and...
                 * we're not in hard throttle mode, then issue a
                 * pre-fetch I/O... the I/O latency will overlap
                 * with the copying of the data
                 */
                size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);

                last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

                if (last_ioread_offset > last_request_offset)
                    last_ioread_offset = last_request_offset;

            } else if ((uio->uio_offset + val_size) == last_request_offset) {
                /*
                 * this transfer will finish this request, so...
                 * let's try to read ahead if we're in
                 * a sequential access pattern and we haven't
                 * explicitly disabled it
                 */
                if (rd_ahead_enabled)
                    cluster_rd_ahead(vp, &extent, filesize, rap);

                if (rap != NULL) {
                    if (extent.e_addr < rap->cl_lastr)
                        rap->cl_maxra = 0;
                    rap->cl_lastr = extent.e_addr;
                }
            }
            lck_mtx_lock(cl_mtxp);

            while (iostate.io_issued != iostate.io_completed) {
                iostate.io_wanted = 1;
                msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
            }
            lck_mtx_unlock(cl_mtxp);

            if (iostate.io_error)
                error = iostate.io_error;
            else
                retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
        }
        if (start_pg < last_pg) {
            /*
             * compute the range of pages that we actually issued an I/O for
             * and either commit them as valid if the I/O succeeded
             * or abort them if the I/O failed
             */
            io_size = (last_pg - start_pg) * PAGE_SIZE;

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
                    (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);

            if (error || (flags & IO_NOCACHE))
                ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
                        UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
            else
                ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
                        UPL_COMMIT_CLEAR_DIRTY |
                        UPL_COMMIT_FREE_ON_EMPTY |
                        UPL_COMMIT_INACTIVATE);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
                    (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
        }
        if ((last_pg - start_pg) < pages_in_upl) {
            /*
             * the set of pages that we issued an I/O for did not encompass
             * the entire upl... so just release these without modifying
             * their state
             */
            if (error)
                ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
            else {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
                        (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

                if (start_pg) {
                    /*
                     * we found some already valid pages at the beginning of
                     * the upl commit these back to the inactive list with
                     * reference cleared
                     */
                    for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
                        commit_flags = UPL_COMMIT_FREE_ON_EMPTY
                                     | UPL_COMMIT_INACTIVATE;

                        if (upl_dirty_page(pl, cur_pg))
                            commit_flags |= UPL_COMMIT_SET_DIRTY;

                        if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
                            ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
                                    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
                        else
                            ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
                                    PAGE_SIZE, commit_flags);
                    }
                }
                if (last_pg < uio_last) {
                    /*
                     * we found some already valid pages immediately after the
                     * pages we issued I/O for, commit these back to the
                     * inactive list with reference cleared
                     */
                    for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
                        commit_flags = UPL_COMMIT_FREE_ON_EMPTY
                                     | UPL_COMMIT_INACTIVATE;

                        if (upl_dirty_page(pl, cur_pg))
                            commit_flags |= UPL_COMMIT_SET_DIRTY;

                        if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
                            ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
                                    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
                        else
                            ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
                                    PAGE_SIZE, commit_flags);
                    }
                }
                if (uio_last < pages_in_upl) {
                    /*
                     * there were some invalid pages beyond the valid pages
                     * that we didn't issue an I/O for, just release them
                     * unchanged
                     */
                    ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
                            (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
                }

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
                        (int)upl, -1, -1, 0, 0);
            }
        }
        if (retval == 0)
            retval = error;

        if ( uio_resid(uio) ) {
            if (cluster_hard_throttle_on(vp)) {
                rd_ahead_enabled = 0;
                prefetch_enabled = 0;

                max_rd_size = HARD_THROTTLE_MAXSIZE;
            } else {
                rd_ahead_enabled = 1;
                prefetch_enabled = 1;

                max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
            }
        }
    }
    if (rap != NULL) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
                (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);

        lck_mtx_unlock(&rap->cl_lockr);
    } else {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
                (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
    }
    return (retval);
}
static int
cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
{
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;
    int              upl_needed_size;
    int              force_data_sync;
    int              no_zero_fill = 0;
    int              retval = 0;
    struct clios     iostate;
    u_int            max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
    u_int            max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
            (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);

    /*
     * When we enter this routine, we know
     *  -- the offset into the file is on a pagesize boundary
     *  -- the resid is a page multiple
     *  -- the resid will not exceed iov_len
     */

    iostate.io_completed = 0;
    iostate.io_issued = 0;
    iostate.io_error = 0;
    iostate.io_wanted = 0;

    while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
        user_addr_t iov_base;

        if (cluster_hard_throttle_on(vp)) {
            max_rd_size  = HARD_THROTTLE_MAXSIZE;
            max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
        } else {
            max_rd_size  = MAX_UPL_TRANSFER * PAGE_SIZE;
            max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 8;
        }
        max_io_size = filesize - uio->uio_offset;

        // LP64todo - fix this
        if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
            io_size = max_io_size;
        else
            io_size = uio_resid(uio);

        /*
         * First look for pages already in the cache
         * and move them to user space.
         */
        retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);

        if (retval) {
            /*
             * we may have already spun some portion of this request
             * off as async requests... we need to wait for the I/O
             * to complete before returning
             */
            goto wait_for_reads;
        }
        /*
         * If we are already finished with this read, then return
         */
        if (io_size == 0) {
            /*
             * we may have already spun some portion of this request
             * off as async requests... we need to wait for the I/O
             * to complete before returning
             */
            goto wait_for_reads;
        }
        max_io_size = io_size;

        if (max_io_size > max_rd_size)
            max_io_size = max_rd_size;

        io_size = 0;

        ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);

        if (io_size == 0) {
            /*
             * we may have already spun some portion of this request
             * off as async requests... we need to wait for the I/O
             * to complete before returning
             */
            goto wait_for_reads;
        }
        iov_base = uio_curriovbase(uio);

        // LP64todo - fix this!
        upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
        upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
                (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

        if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
            no_zero_fill = 1;
            abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
        } else {
            no_zero_fill = 0;
            abort_flag = UPL_ABORT_FREE_ON_EMPTY;
        }
        for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
            pages_in_pl = 0;
            upl_size = upl_needed_size;
            upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

            if (no_zero_fill)
                upl_flags |= UPL_NOZEROFILL;
            if (force_data_sync)
                upl_flags |= UPL_FORCE_DATA_SYNC;

            // LP64todo - fix this!
            kret = vm_map_create_upl(current_map(),
                                     (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
                                     &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);

            if (kret != KERN_SUCCESS) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
                        (int)upl_offset, upl_size, io_size, kret, 0);
                /*
                 * cluster_nocopy_read: failed to get pagelist
                 *
                 * we may have already spun some portion of this request
                 * off as async requests... we need to wait for the I/O
                 * to complete before returning
                 */
                goto wait_for_reads;
            }
            pages_in_pl = upl_size / PAGE_SIZE;
            pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

            for (i = 0; i < pages_in_pl; i++) {
                if (!upl_valid_page(pl, i))
                    break;
            }
            if (i == pages_in_pl)
                break;

            ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
        }
        if (force_data_sync >= 3) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
                    (int)upl_offset, upl_size, io_size, kret, 0);

            goto wait_for_reads;
        }
        /*
         * Consider the possibility that upl_size wasn't satisfied.
         */
        if (upl_size != upl_needed_size)
            io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

        if (io_size == 0) {
            ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
            goto wait_for_reads;
        }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
                (int)upl_offset, upl_size, io_size, kret, 0);

        /*
         * request asynchronously so that we can overlap
         * the preparation of the next I/O
         * if there are already too many outstanding reads
         * wait until some have completed before issuing the next read
         */
        lck_mtx_lock(cl_mtxp);

        while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
            iostate.io_wanted = 1;
            msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
        }
        lck_mtx_unlock(cl_mtxp);

        if (iostate.io_error) {
            /*
             * one of the earlier reads we issued ran into a hard error
             * don't issue any more reads, cleanup the UPL
             * that was just created but not used, then
             * go wait for any other reads to complete before
             * returning the error to the caller
             */
            ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);

            goto wait_for_reads;
        }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
                (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);

        retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
                            CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
                            (buf_t)NULL, &iostate);

        /*
         * update the uio structure
         */
        uio_update(uio, (user_size_t)io_size);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
                (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
    }

wait_for_reads:
    /*
     * make sure all async reads that are part of this stream
     * have completed before we return
     */
    lck_mtx_lock(cl_mtxp);

    while (iostate.io_issued != iostate.io_completed) {
        iostate.io_wanted = 1;
        msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
    }
    lck_mtx_unlock(cl_mtxp);

    if (iostate.io_error)
        retval = iostate.io_error;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
            (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);

    return (retval);
}
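/*
 * the iostate accounting above is what keeps cluster_nocopy_read from running
 * arbitrarily far ahead of the device: each cluster_io call is handed &iostate
 * so its issue and completion can be tracked, and the msleep loop blocks
 * whenever (io_issued - io_completed) exceeds max_rd_ahead, so at most that
 * many bytes of asynchronous read are ever outstanding at once
 */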
static int
cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
{
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;
    user_size_t      iov_len;
    user_addr_t      iov_base;
    int              upl_needed_size;
    int              error = 0;
    struct clios     iostate;
    int              devblocksize;

    devblocksize = vp->v_mount->mnt_devblocksize;
    /*
     * When we enter this routine, we know
     *  -- the resid will not exceed iov_len
     *  -- the target address is physically contiguous
     */

#if LP64_DEBUG
    if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
        panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
    }
#endif /* LP64_DEBUG */

    iov_len = uio_curriovlen(uio);
    iov_base = uio_curriovbase(uio);

    max_size = filesize - uio->uio_offset;

    // LP64todo - fix this!
    if (max_size < 0 || (u_int64_t)max_size > iov_len)
        io_size = iov_len;
    else
        io_size = max_size;

    // LP64todo - fix this!
    upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
    upl_needed_size = upl_offset + io_size;

    pages_in_pl = 0;
    upl_size = upl_needed_size;
    upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

    kret = vm_map_get_upl(current_map(),
                          (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
                          &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

    if (kret != KERN_SUCCESS) {
        /*
         * cluster_phys_read: failed to get pagelist
         */
        return (EINVAL);
    }
    if (upl_size < upl_needed_size) {
        /*
         * The upl_size wasn't satisfied.
         */
        ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

        return (EINVAL);
    }
    pl = ubc_upl_pageinfo(upl);

    dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;

    while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
        int head_size;

        head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));

        if (head_size > io_size)
            head_size = io_size;

        error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);

        if (error) {
            ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

            return (error);
        }
        upl_offset += head_size;
        dst_paddr  += head_size;
        io_size    -= head_size;
    }
    tail_size = io_size & (devblocksize - 1);
    io_size  -= tail_size;

    iostate.io_completed = 0;
    iostate.io_issued = 0;
    iostate.io_error = 0;
    iostate.io_wanted = 0;

    while (io_size && error == 0) {
        int xsize;

        if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
            xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
        else
            xsize = io_size;
        /*
         * request asynchronously so that we can overlap
         * the preparation of the next I/O... we'll do
         * the commit after all the I/O has completed
         * since its all issued against the same UPL
         * if there are already too many outstanding reads
         * wait until some have completed before issuing the next
         */
        lck_mtx_lock(cl_mtxp);

        while ((iostate.io_issued - iostate.io_completed) > (8 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
            iostate.io_wanted = 1;
            msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
        }
        lck_mtx_unlock(cl_mtxp);

        error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
                           CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
                           (buf_t)NULL, &iostate);
        /*
         * The cluster_io read was issued successfully,
         * update the uio structure
         */
        if (error == 0) {
            uio_update(uio, (user_size_t)xsize);

            dst_paddr  += xsize;
            upl_offset += xsize;
            io_size    -= xsize;
        }
    }
    /*
     * make sure all async reads that are part of this stream
     * have completed before we proceed
     */
    lck_mtx_lock(cl_mtxp);

    while (iostate.io_issued != iostate.io_completed) {
        iostate.io_wanted = 1;
        msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
    }
    lck_mtx_unlock(cl_mtxp);

    if (iostate.io_error)
        error = iostate.io_error;

    if (error == 0 && tail_size)
        error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);

    /*
     * just release our hold on the physically contiguous
     * region without changing any state
     */
    ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

    return (error);
}
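/*
 * worked example of the alignment handling above (illustrative numbers): with
 * devblocksize == 512 and uio->uio_offset == 700, head_size = 512 - (700 & 511)
 * = 324 bytes are moved via cluster_align_phys_io; after that the offset is
 * block aligned, tail_size = io_size & 511 is peeled off the end the same way,
 * and only the block aligned middle goes through the CL_DEV_MEMORY path
 */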
/*
 * generate advisory I/O's in the largest chunks possible
 * the completed pages will be released into the VM cache
 */
int
advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
{
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;
    int              retval = 0;

    if ( !UBCINFOEXISTS(vp))
        return (EINVAL);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
            (int)f_offset, resid, (int)filesize, 0, 0);

    while (resid && f_offset < filesize && retval == 0) {
        /*
         * compute the size of the upl needed to encompass
         * the requested read... limit each call to cluster_io
         * to the maximum UPL size... cluster_io will clip if
         * this exceeds the maximum io_size for the device,
         * make sure to account for
         * a starting offset that's not page aligned
         */
        start_offset = (int)(f_offset & PAGE_MASK_64);
        upl_f_offset = f_offset - (off_t)start_offset;
        max_size     = filesize - f_offset;

        if (resid < max_size)
            io_size = resid;
        else
            io_size = max_size;

        upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
        if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
            upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;

        skip_range = 0;
        /*
         * return the number of contiguously present pages in the cache
         * starting at upl_f_offset within the file
         */
        ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);

        if (skip_range) {
            /*
             * skip over pages already present in the cache
             */
            io_size = skip_range - start_offset;

            f_offset += io_size;
            resid    -= io_size;

            if (skip_range == upl_size)
                continue;
            /*
             * have to issue some real I/O
             * at this point, we know it's starting on a page boundary
             * because we've skipped over at least the first page in the request
             */
            start_offset = 0;
            upl_f_offset += skip_range;
            upl_size     -= skip_range;
        }
        pages_in_upl = upl_size / PAGE_SIZE;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
                (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

        kret = ubc_create_upl(vp,
                              upl_f_offset,
                              upl_size,
                              &upl,
                              &pl,
                              UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
        if (kret != KERN_SUCCESS)
            return (retval);
        issued_io = 0;

        /*
         * before we start marching forward, we must make sure we end on
         * a present page, otherwise we will be working with a freed
         * upl due to the FREE_ON_EMPTY semantics
         */
        for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
            if (upl_page_present(pl, last_pg))
                break;
        }
        pages_in_upl = last_pg + 1;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
                (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

        for (last_pg = 0; last_pg < pages_in_upl; ) {
            /*
             * scan from the beginning of the upl looking for the first
             * page that is present.... this will become the first page in
             * the request we're going to make to 'cluster_io'... if all
             * of the pages are absent, we won't call through to 'cluster_io'
             */
            for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
                if (upl_page_present(pl, start_pg))
                    break;
            }

            /*
             * scan from the starting present page looking for an absent
             * page before the end of the upl is reached, if we
             * find one, then it will terminate the range of pages being
             * presented to 'cluster_io'
             */
            for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
                if (!upl_page_present(pl, last_pg))
                    break;
            }

            if (last_pg > start_pg) {
                /*
                 * we found a range of pages that must be filled
                 * if the last page in this range is the last page of the file
                 * we may have to clip the size of it to keep from reading past
                 * the end of the last physical block associated with the file
                 */
                upl_offset = start_pg * PAGE_SIZE;
                io_size    = (last_pg - start_pg) * PAGE_SIZE;

                if ((upl_f_offset + upl_offset + io_size) > filesize)
                    io_size = filesize - (upl_f_offset + upl_offset);

                /*
                 * issue an asynchronous read to cluster_io
                 */
                retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
                                    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);

                issued_io = 1;
            }
        }
        if (issued_io == 0)
            ubc_upl_abort(upl, 0);

        io_size = upl_size - start_offset;

        if (io_size > resid)
            io_size = resid;
        f_offset += io_size;
        resid    -= io_size;
    }

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
            (int)f_offset, resid, retval, 0, 0);

    return (retval);
}
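/*
 * note: advisory_read never copies anything to user space... it simply asks
 * ubc_range_op which pages are already resident, skips those, and issues
 * asynchronous CL_READ|CL_COMMIT requests for the absent ranges so the pages
 * land in the VM cache ahead of an anticipated real read
 */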
int
cluster_push(vnode_t vp, int flags)
{
    int retval;
    struct cl_writebehind *wbp;

    if ( !UBCINFOEXISTS(vp)) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
        return (0);
    }
    /* return if deferred write is set */
    if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
        return (0);
    }
    if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
        return (0);
    }
    if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
        lck_mtx_unlock(&wbp->cl_lockw);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
        return (0);
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
            (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);

    if (wbp->cl_scmap) {
        sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);

        retval = 1;
    } else
        retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);

    lck_mtx_unlock(&wbp->cl_lockw);

    if (flags & IO_SYNC)
        (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
            (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);

    return (retval);
}
__private_extern__ void
cluster_release(struct ubc_info *ubc)
{
    struct cl_writebehind *wbp;
    struct cl_readahead   *rap;

    if ((wbp = ubc->cl_wbehind)) {

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);

        if (wbp->cl_scmap)
            vfs_drt_control(&(wbp->cl_scmap), 0);
    } else {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
    }

    rap = ubc->cl_rahead;

    if (wbp != NULL) {
        lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
        FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
    }
    if ((rap = ubc->cl_rahead)) {
        lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
        FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
    }
    ubc->cl_rahead  = NULL;
    ubc->cl_wbehind = NULL;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
}
static int
cluster_push_EOF(vnode_t vp, off_t EOF)
{
    struct cl_writebehind *wbp;

    wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
            (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);

    if (wbp->cl_scmap)
        sparse_cluster_push(wbp, vp, EOF, 1);
    else
        cluster_try_push(wbp, vp, EOF, 0, 1);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
            (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);

    lck_mtx_unlock(&wbp->cl_lockw);

    return (0);
}
static int
cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
{
    int cl_index;
    int cl_index1;
    int min_index;
    int cl_len;
    int cl_pushed = 0;
    struct cl_wextent l_clusters[MAX_CLUSTERS];

    /*
     * the write behind context exists and has
     * already been locked...
     *
     * make a local 'sorted' copy of the clusters
     * and clear wbp->cl_number so that new clusters can
     * be developed
     */
    for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
        for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
            if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
                continue;
            if (min_index == -1)
                min_index = cl_index1;
            else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
                min_index = cl_index1;
        }
        if (min_index == -1)
            break;
        l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
        l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
        l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;

        wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
    }
    cl_len = cl_index;

    if (can_delay && cl_len == MAX_CLUSTERS) {
        int i;

        /*
         * determine if we appear to be writing the file sequentially
         * if not, by returning without having pushed any clusters
         * we will cause this vnode to be pushed into the sparse cluster mechanism
         * used for managing more random I/O patterns
         *
         * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
         * that's why we're in try_push with can_delay true...
         *
         * check to make sure that all the clusters except the last one are 'full'... and that each cluster
         * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
         * so we can just make a simple pass through, up to, but not including the last one...
         * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
         * are sequential
         *
         * we let the last one be partial as long as it was adjacent to the previous one...
         * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
         * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
         */
        for (i = 0; i < MAX_CLUSTERS - 1; i++) {
            if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
                goto dont_try;
            if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
                goto dont_try;
        }
    }
    /*
     * drop the lock while we're firing off the I/Os...
     * this is safe since I'm working off of a private sorted copy
     * of the clusters, and I'm going to re-evaluate the public
     * state after I retake the lock
     */
    lck_mtx_unlock(&wbp->cl_lockw);

    for (cl_index = 0; cl_index < cl_len; cl_index++) {
        int flags;
        struct cl_extent cl;

        /*
         * try to push each cluster in turn...
         */
        if (l_clusters[cl_index].io_nocache)
            flags = IO_NOCACHE;
        else
            flags = 0;
        cl.b_addr = l_clusters[cl_index].b_addr;
        cl.e_addr = l_clusters[cl_index].e_addr;

        cluster_push_x(vp, &cl, EOF, flags);

        l_clusters[cl_index].b_addr = 0;
        l_clusters[cl_index].e_addr = 0;

        cl_pushed++;

        if (push_all == 0)
            break;
    }
    lck_mtx_lock(&wbp->cl_lockw);

dont_try:
    if (cl_len > cl_pushed) {
        /*
         * we didn't push all of the clusters, so
         * lets try to merge them back in to the vnode
         */
        if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
            /*
             * we picked up some new clusters while we were trying to
             * push the old ones... this can happen because I've dropped
             * the vnode lock... the sum of the
             * leftovers plus the new cluster count exceeds our ability
             * to represent them, so switch to the sparse cluster mechanism
             *
             * collect the active public clusters...
             */
            sparse_cluster_switch(wbp, vp, EOF);

            for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
                if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
                    continue;
                wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
                wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
                wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;

                cl_index1++;
            }
            /*
             * update the cluster count
             */
            wbp->cl_number = cl_index1;

            /*
             * and collect the original clusters that were moved into the
             * local storage for sorting purposes
             */
            sparse_cluster_switch(wbp, vp, EOF);
        } else {
            /*
             * we've got room to merge the leftovers back in
             * just append them starting at the next 'hole'
             * represented by wbp->cl_number
             */
            for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
                if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
                    continue;

                wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
                wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
                wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;

                cl_index1++;
            }
            /*
             * update the cluster count
             */
            wbp->cl_number = cl_index1;
        }
    }
    return (MAX_CLUSTERS - wbp->cl_number);
}
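/*
 * the selection sort at the top of cluster_try_push matters for the can_delay
 * case: once the local copy is ordered by b_addr, 'sequential' simply means
 * every cluster but the last is exactly MAX_UPL_TRANSFER pages long and each
 * one's e_addr equals the next one's b_addr... anything else bails out without
 * pushing, which steers the vnode toward the sparse cluster mechanism
 */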
static int
cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
{
    upl_page_info_t *pl;
    vm_offset_t      upl_offset;
    int              error = 0;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
            (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);

    if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);

        return (0);
    }
    upl_size = pages_in_upl * PAGE_SIZE;
    upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);

    if (upl_f_offset + upl_size >= EOF) {

        if (upl_f_offset >= EOF) {
            /*
             * must have truncated the file and missed
             * clearing a dangling cluster (i.e. it's completely
             * beyond the new EOF
             */
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);

            return (0);
        }
        size = EOF - upl_f_offset;

        upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
        pages_in_upl = upl_size / PAGE_SIZE;
    } else
        size = upl_size;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);

    /*
     * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
     *
     * - only pages that are currently dirty are returned... these are the ones we need to clean
     * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
     * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
     * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
     *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
     *
     * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
     */
    if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
        upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
    else
        upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;

    kret = ubc_create_upl(vp,
                          upl_f_offset,
                          upl_size,
                          &upl,
                          &pl,
                          upl_flags);
    if (kret != KERN_SUCCESS)
        panic("cluster_push: failed to get pagelist");

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);

    /*
     * since we only asked for the dirty pages back
     * it's possible that we may only get a few or even none, so...
     * before we start marching forward, we must make sure we know
     * where the last present page is in the UPL, otherwise we could
     * end up working with a freed upl due to the FREE_ON_EMPTY semantics
     * employed by commit_range and abort_range.
     */
    for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
        if (upl_page_present(pl, last_pg))
            break;
    }
    pages_in_upl = last_pg + 1;

    if (pages_in_upl == 0) {
        ubc_upl_abort(upl, 0);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
        return (0);
    }

    for (last_pg = 0; last_pg < pages_in_upl; ) {
        /*
         * find the next dirty page in the UPL
         * this will become the first page in the
         * next I/O to generate
         */
        for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
            if (upl_dirty_page(pl, start_pg))
                break;
            if (upl_page_present(pl, start_pg))
                /*
                 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
                 * just release these unchanged since we're not going
                 * to steal them or change their state
                 */
                ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
        }
        if (start_pg >= pages_in_upl)
            /*
             * done... no more dirty pages to push
             */
            break;
        if (start_pg > last_pg)
            /*
             * skipped over some non-dirty pages
             */
            size -= ((start_pg - last_pg) * PAGE_SIZE);

        /*
         * find a range of dirty pages to write
         */
        for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
            if (!upl_dirty_page(pl, last_pg))
                break;
        }
        upl_offset = start_pg * PAGE_SIZE;

        io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

        io_flags = CL_THROTTLE | CL_COMMIT;

        if ( !(flags & IO_SYNC))
            io_flags |= CL_ASYNC;

        retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
                            io_flags, (buf_t)NULL, (struct clios *)NULL);

        if (error == 0 && retval)
            error = retval;

        size -= io_size;
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);

    return (error);
}
/*
 * sparse_cluster_switch is called with the write behind lock held
 */
static void
sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
{
	int	cl_index;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);

	if (wbp->cl_scmap == NULL)
		wbp->cl_scdirty = 0;

	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		int	flags;
		struct cl_extent cl;

		for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {

			if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
				if (flags & UPL_POP_DIRTY) {
					cl.e_addr = cl.b_addr + 1;

					sparse_cluster_add(wbp, vp, &cl, EOF);
				}
			}
		}
	}
	wbp->cl_number = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
}
/*
 * sparse_cluster_push is called with the write behind lock held
 */
static void
sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
{
	struct cl_extent cl;
	off_t		offset;
	u_int		length;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);

	if (push_all)
		vfs_drt_control(&(wbp->cl_scmap), 1);

	for (;;) {
		if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
			break;

		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);

		wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);

		cluster_push_x(vp, &cl, EOF, 0);

		if (push_all == 0)
			break;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
}
/*
 * sparse_cluster_add is called with the write behind lock held
 */
static void
sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
{
	u_int	length;
	off_t	offset;
	int	new_dirty;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);

	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;

	while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */
		wbp->cl_scdirty += new_dirty;

		sparse_cluster_push(wbp, vp, EOF, 0);

		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	wbp->cl_scdirty += new_dirty;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
}
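/*
 * cluster_align_phys_io() handles the page-unaligned head or tail of a
 * physically-addressed transfer: the partial page is staged through a single
 * buffer cache page (read in first if it isn't already valid), copied to or
 * from the caller's physical buffer with copypv(), and written back out when
 * the copy may have dirtied it.
 */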
static int
cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
{
	upl_page_info_t  *pl;
	upl_t            upl;
	addr64_t         ubc_paddr;
	kern_return_t    kret;
	int              error = 0;
	int              did_read = 0;
	int              abort_flags;
	int              upl_flags;

	upl_flags = UPL_SET_LITE;
	if (! (flags & CL_READ)) {
		/*
		 * "write" operation:  let the UPL subsystem know
		 * that we intend to modify the buffer cache pages
		 * we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	}
	kret = ubc_create_upl(vp,
			      uio->uio_offset & ~PAGE_MASK_64,
			      PAGE_SIZE,
			      &upl,
			      &pl,
			      upl_flags);

	if (kret != KERN_SUCCESS)
		return(EINVAL);

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
				   CL_READ, (buf_t)NULL, (struct clios *)NULL);
		if (error) {
			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

			return(error);
		}
		did_read = 1;
	}
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

/*
 *	NOTE:  There is no prototype for the following in BSD. It, and the definitions
 *	of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
 *	osfmk/ppc/mappings.h.  They are not included here because there appears to be no
 *	way to do so without exporting them to kexts as well.
 */
	if (flags & CL_READ)
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4);		/* Copy physical to physical and flush the destination */
	else
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8);		/* Copy physical to physical and flush the source */

	if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
				   0, (buf_t)NULL, (struct clios *)NULL);
	}
	if (error == 0)
		uio_update(uio, (user_size_t)xsize);

	if (did_read)
		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	else
		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);

	return (error);
}
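/*
 * cluster_copy_upl_data() moves up to xsize bytes between a uio and the pages
 * of an already-created UPL, walking the UPL one page at a time and handing
 * physical page addresses to uiomove64().
 */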
int
cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
{
	int       pg_offset;
	int       pg_index;
	int       csize;
	int       segflg;
	int       retval = 0;
	upl_page_info_t *pl;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);

	segflg = uio->uio_segflg;

	switch(segflg) {

	  case UIO_USERSPACE32:
	  case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	  case UIO_USERSPACE:
	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	  case UIO_USERSPACE64:
	  case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	  case UIO_SYSSPACE32:
		uio->uio_segflg = UIO_PHYS_SYSSPACE32;
		break;

	  case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;

	  case UIO_SYSSPACE64:
		uio->uio_segflg = UIO_PHYS_SYSSPACE64;
		break;
	}
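	/*
	 * The segment flag was switched to its UIO_PHYS_* counterpart above
	 * because the copy loop below hands uiomove64() a physical address
	 * built from upl_phys_page(); the caller's original flag is restored
	 * from segflg once the copy is done.
	 */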
	pl = ubc_upl_pageinfo(upl);

	pg_index  = upl_offset / PAGE_SIZE;
	pg_offset = upl_offset & PAGE_MASK;
	csize     = min(PAGE_SIZE - pg_offset, xsize);

	while (xsize && retval == 0) {
		addr64_t  paddr;

		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;

		retval = uiomove64(paddr, csize, uio);

		pg_index += 1;
		pg_offset = 0;
		xsize    -= csize;
		csize     = min(PAGE_SIZE, xsize);
	}
	uio->uio_segflg = segflg;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);

	return (retval);
}
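/*
 * cluster_copy_ubc_data() tries to satisfy a copy directly against the pages
 * already resident in the UBC for this vnode via memory_object_control_uiomove(),
 * marking them dirty on writes when mark_dirty is set; on return *io_resid has
 * been reduced by the number of bytes actually moved.
 */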
int
cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
{
	int       segflg;
	int       io_size;
	int       xsize;
	int       start_offset;
	int       retval = 0;
	memory_object_control_t	 control;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);

	control = ubc_getobject(vp, UBC_FLAGS_NONE);
	if (control == MEMORY_OBJECT_CONTROL_NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
			     (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);

		return(0);
	}
	segflg = uio->uio_segflg;

	switch(segflg) {

	  case UIO_USERSPACE32:
	  case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	  case UIO_USERSPACE64:
	  case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	  case UIO_SYSSPACE32:
		uio->uio_segflg = UIO_PHYS_SYSSPACE32;
		break;

	  case UIO_SYSSPACE64:
		uio->uio_segflg = UIO_PHYS_SYSSPACE64;
		break;

	  case UIO_USERSPACE:
	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	  case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}

	if ( (io_size = *io_resid) ) {
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		xsize = uio_resid(uio);

		retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
						       uio, start_offset, io_size, mark_dirty);
		xsize -= uio_resid(uio);
		io_size -= xsize;
	}
	uio->uio_segflg = segflg;
	*io_resid       = io_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);

	return(retval);
}
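/*
 * is_file_clean() walks the file a page at a time, asking the UBC whether each
 * resident page is dirty; a single dirty page is enough to report the file as
 * not clean.
 */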
int
is_file_clean(vnode_t vp, off_t filesize)
{
	off_t f_offset;
	int   flags;
	int   total_dirty = 0;

	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
		if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
			if (flags & UPL_POP_DIRTY) {
				total_dirty++;
			}
		}
	}
	if (total_dirty)
		return(EINVAL);

	return (0);
}
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */

/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES		256

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
 */
#define DRT_ADDRESS_MASK		(~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)
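/*
 * For example, with 4 KB pages each hashtable entry covers
 * 256 * 4096 bytes = 1 MB (1 << 20), so the mask clears the low 20 bits of a
 * file offset and DRT_ALIGN_ADDRESS(0x523000) yields 0x500000.
 */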
/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space: because addresses are aligned to DRT_ADDRESS_MASK, the low
 * bits are free to hold the bucket's dirty-page count.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)											\
	do {														\
		(scm)->scm_hashtable[(i)].dhe_control =									\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
	} while (0)
#define DRT_HASH_CLEAR(scm, i)						\
	do {								\
		(scm)->scm_hashtable[(i)].dhe_control = 0;		\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
	} while (0)
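/*
 * With DRT_BITVECTOR_PAGES == 256 the per-bucket page count must span 0..256,
 * which needs nine bits; DRT_HASH_COUNT_MASK (0x1ff == 511) provides them, and
 * the all-ones count value doubles as the "vacant bucket" sentinel used by
 * DRT_HASH_VACATE/DRT_HASH_VACANT.
 */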
/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
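/*
 * Size check: each vfs_drt_hashentry is 8 bytes of dhe_control plus a
 * (256 / 32) * 4 = 32 byte bitvector, i.e. 40 bytes.  23 * 40 = 920 and
 * 401 * 40 = 16040, leaving the 104 and 344 spare bytes noted above within
 * the 1024 and 16384 byte allocations (part of which holds the clustermap
 * header fields).
 */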
/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */

#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
	u_int32_t	dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};

/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry	scm_hashtable[0];
};


#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)
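/*
 * For example, a dirty page at file offset 0x503000 (4 KB pages) aligns to the
 * 1 MB boundary 0x500000, lands in bucket DRT_HASH(scm, 0x500000), and is
 * recorded by setting bit (0x503000 - 0x500000) / PAGE_SIZE == 3 in that
 * bucket's bitvector.
 */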
/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
							    * dirty */
							   /* 0, setcount */
							   /* 1 (clean, no map) */
							   /* 2 (map alloc fail) */
							   /* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
							    * lastclean, iskips */


static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
	u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
	u_int64_t offset, int *indexp, int recursed);
static kern_return_t	vfs_drt_do_mark_pages(
	void		**cmapp,
	u_int64_t	offset,
	u_int		length,
	int		*setcountp,
	int		dirty);
static void		vfs_drt_trace(
	struct vfs_drt_clustermap *cmap,
	int code,
	int arg1,
	int arg2,
	int arg3,
	int arg4);
/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap, *ocmap;
	kern_return_t	kret;
	u_int64_t	offset;
	int		nsize, i, active_buckets, index, copycount;

	ocmap = NULL;
	if (cmapp != NULL)
		ocmap = *cmapp;

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		nsize = DRT_HASH_SMALL_MODULUS;
	} else {
		/* count the number of active buckets in the old map */
		active_buckets = 0;
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0))
				active_buckets++;
		}
		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/* if the ring is nearly full */
			if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
				nsize = DRT_HASH_LARGE_MODULUS;
			} else {
				nsize = DRT_HASH_SMALL_MODULUS;
			}
		} else {
			/* already using the large modulus */
			nsize = DRT_HASH_LARGE_MODULUS;
			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * array and return.
			 */
			if (active_buckets >= DRT_HASH_LARGE_MODULUS)
				return(KERN_SUCCESS);
		}
	}

	/*
	 * Allocate and initialise the new map.
	 */

	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
	    (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	if (kret != KERN_SUCCESS)
		return(kret);
	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = nsize;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;
	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}

	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	copycount = 0;
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0))
				continue;
			/* get new index */
			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
			}
			/* copy */
			DRT_HASH_COPY(ocmap, i, cmap, index);
			copycount++;
		}
	}

	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	*cmapp = cmap;

	/* we're now done with the old map */
	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
			      ocmap->scm_modulus,
			      ocmap->scm_buckets,
			      ocmap->scm_lastclean,
			      ocmap->scm_iskips);

		vfs_drt_free_map(ocmap);
	}
	return(KERN_SUCCESS);
}
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
	kmem_free(kernel_map, (vm_offset_t)cmap,
		  (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	return(KERN_SUCCESS);
}
/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
	int		index;
	int		i;

	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* traverse the hashtable */
	for (i = 0; i < cmap->scm_modulus; i++) {

		/*
		 * If the slot is vacant, we can stop.
		 */
		if (DRT_HASH_VACANT(cmap, index))
			break;

		/*
		 * If the address matches our offset, we have success.
		 */
		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
			*indexp = index;
			return(KERN_SUCCESS);
		}

		/*
		 * Move to the next slot, try again.
		 */
		index = DRT_HASH_NEXT(cmap, index);
	}
	/*
	 * It's not there.
	 */
	return(KERN_FAILURE);
}
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t	kret;
	int		index;
	int		i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS)
		return(kret);

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/* slot vacant? */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean)
				cmap->scm_lastclean = index;
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;

			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return(KERN_SUCCESS);
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed)
		return(KERN_FAILURE);
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return(kret);
}
/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void		**private,
	u_int64_t	offset,
	u_int		length,
	int		*setcountp,
	int		dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t	kret;
	int		i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL)
		*setcountp = 0;

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return(KERN_SUCCESS);
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return(kret);
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;	/* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL)
				*setcountp = setcount;
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return(kret);
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL)
		*setcountp = setcount;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return(KERN_SUCCESS);
}
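/*
 * For example, marking 3 MB of dirty data starting at offset 0x500000 (4 KB
 * pages) touches three consecutive 1 MB buckets: each pass of the loop above
 * sets at most pgcount = min(length / PAGE_SIZE, DRT_BITVECTOR_PAGES - pgoff)
 * = 256 bits, then advances offset by 1 MB and repeats.
 */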
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
{
	/* XXX size unused, drop from interface */
	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
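/*
 * Illustrative sketch, not compiled: the typical life cycle of a sparse
 * cluster map as driven by the vfs_drt_* interfaces, mirroring what
 * sparse_cluster_add() and sparse_cluster_push() do above.  file_offset and
 * nbytes are hypothetical, caller-supplied values.
 */
#if 0
	void	*scmap = NULL;
	off_t	offset;
	u_int	length;
	int	new_dirty;

	/* record a page-aligned dirty range; new_dirty reports pages newly marked */
	vfs_drt_mark_pages(&scmap, file_offset, nbytes, &new_dirty);

	/* drain the map one dirty cluster at a time; the map is freed when empty */
	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS) {
		/* clean the pages covering [offset, offset + length) */
	}
#endif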
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 *
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t	offset;
	u_int		length;
	int		index, i, j, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
			continue;

		/* scan the bitfield for a string of bits */
		fs = -1;

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/*  didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map");
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i))
				break;
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return(KERN_SUCCESS);
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * then free it
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		      cmap->scm_modulus,
		      cmap->scm_buckets,
		      cmap->scm_lastclean,
		      cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return(KERN_FAILURE);
}
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	switch (op_type) {
	case 0:
		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
			      cmap->scm_modulus,
			      cmap->scm_buckets,
			      cmap->scm_lastclean,
			      cmap->scm_iskips);

		vfs_drt_free_map(cmap);
		*cmapp = NULL;
		break;

	case 1:
		cmap->scm_lastclean = 0;
		break;
	}
	return(KERN_SUCCESS);
}
/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
	      __unused int arg1, __unused int arg2, __unused int arg3,
	      __unused int arg4)
{
}
#endif
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index))
			continue;

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i))
				bits_on++;
		}
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
			panic("bits_on = %d,  index = %d\n", bits_on, index);
	}
}