/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>
#define CL_READ		0x01
#define CL_ASYNC	0x02
#define CL_COMMIT	0x04
#define CL_PAGEOUT	0x10
#define CL_NOZERO	0x80
#define CL_PAGEIN	0x100
#define CL_DEV_MEMORY	0x200
#define CL_PRESERVE	0x400
#define CL_THROTTLE	0x800
#define CL_KEEPCACHED	0x1000
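/*
 * How the entry points below combine these flags (as seen in this file):
 * cluster_pageout builds CL_PAGEOUT | CL_THROTTLE (CL_THROTTLE is dropped for
 * virtual devices), adds CL_ASYNC unless UPL_IOSYNC is set, CL_COMMIT unless
 * UPL_NOCOMMIT is set, and CL_KEEPCACHED when UPL_KEEPCACHED is set;
 * cluster_pagein passes CL_READ | CL_PAGEIN with the same CL_ASYNC/CL_COMMIT
 * translation; cluster_nocopy_write issues its chunks with
 * CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE; and cluster_phys_write
 * uses CL_DEV_MEMORY for the physically contiguous case.
 */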
struct clios {
        u_int	io_completed;	/* amount of io that has currently completed */
        u_int	io_issued;	/* amount of io that was successfully issued */
        int	io_error;	/* error code of first error encountered */
        int	io_wanted;	/* someone is sleeping waiting for a change in state */
};
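/*
 * Protocol for a struct clios (all counters are updated under cl_mtxp):
 * cluster_io adds each chunk it issues to io_issued; cluster_iodone adds the
 * completed byte count to io_completed and records the first failure in
 * io_error.  A caller that needs to wait (e.g. cluster_nocopy_write) sets
 * io_wanted = 1 and msleep()s on &iostate.io_wanted; the completion path
 * clears io_wanted and wakeup()s that channel once the counters change.
 */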
static lck_grp_t	*cl_mtx_grp;
static lck_attr_t	*cl_mtx_attr;
static lck_grp_attr_t	*cl_mtx_grp_attr;
static lck_mtx_t	*cl_mtxp;
static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
		      int flags, buf_t real_bp, struct clios *iostate);
static int cluster_iodone(buf_t bp, void *dummy);
static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
static int cluster_hard_throttle_on(vnode_t vp);

static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
			   off_t headOff, off_t tailOff, int flags);
static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);

static void cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);

static int cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
static void cluster_push_EOF(vnode_t vp, off_t EOF);

static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);

static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int	is_file_clean(vnode_t, off_t);
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define HARD_THROTTLE_MAXCNT	0
#define HARD_THROTTLE_MAXSIZE	(64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;
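/*
 * Effect of these limits in cluster_io (see the CL_THROTTLE handling there):
 * when CL_THROTTLE is set, the request is not a pageout, and
 * cluster_hard_throttle_on() returns true, each chunk is clipped to
 * HARD_THROTTLE_MAXSIZE (64KB) and async_throttle is set to
 * HARD_THROTTLE_MAXCNT (0), so vnode_waitforwrites() allows no outstanding
 * async writes; otherwise async_throttle is VNODE_ASYNC_THROTTLE.
 */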
void
cluster_init(void)
{
        /*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	//lck_grp_attr_setstat(cl_mtx_grp_attr);
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();
	//lck_attr_setdebug(cl_mtx_attr);

	/*
	 * allocate and initialize mutex's used to protect updates and waits
	 * on the cluster_io context
	 */
	cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_mtxp == NULL)
	        panic("cluster_init: failed to allocate cl_mtxp");
}
#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02

/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
        struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

        if ((rap = ubc->cl_rahead) == NULL) {
	        MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
		        ubc->cl_rahead = rap;
		else {
		        lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
		        FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
	        return (rap);

	return ((struct cl_readahead *)NULL);
}
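/*
 * Typical caller pattern (see cluster_pagein below): the returned pointer is
 * either NULL (read-ahead disabled for this read because another reader owns
 * the context) or a cl_readahead whose cl_lockr is already held; the caller
 * updates cl_lastr and must drop the lock itself:
 *
 *	rap = cluster_get_rap(vp);
 *	if (rap != NULL) {
 *		... cluster_rd_ahead(vp, &extent, filesize, rap); ...
 *		rap->cl_lastr = extent.e_addr;
 *		lck_mtx_unlock(&rap->cl_lockr);
 *	}
 */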
/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */
static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
        struct ubc_info		*ubc;
	struct cl_writebehind	*wbp;

	ubc = vp->v_ubcinfo;

        if ((wbp = ubc->cl_wbehind) == NULL) {

	        if ( !(flags & CLW_ALLOCATE))
		        return ((struct cl_writebehind *)NULL);

	        MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL)
		        ubc->cl_wbehind = wbp;
		else {
		        lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
		        FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED)
	        lck_mtx_lock(&wbp->cl_lockw);

	return (wbp);
}
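/*
 * Typical caller pattern (see cluster_nocopy_write below): a caller that only
 * wants the context if it already exists passes flags of 0 and checks for
 * NULL; a caller that needs it locked passes CLW_RETURNLOCKED and later drops
 * cl_lockw itself:
 *
 *	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
 *		cluster_try_push(wbp, vp, newEOF, 0, 1);
 *		lck_mtx_unlock(&wbp->cl_lockw);
 *	}
 */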
static int
cluster_hard_throttle_on(vnode_t vp)
{
        static struct timeval hard_throttle_maxelapsed = { 0, 200000 };

	if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
	        struct timeval elapsed;

		if (hard_throttle_on_root)
		        return (1);

		microuptime(&elapsed);
		timevalsub(&elapsed, &priority_IO_timestamp_for_root);

		if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
		        return (1);
	}
	return (0);
}
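/*
 * The predicate above only ever throttles I/O aimed at the root device:
 * either the hard_throttle_on_root flag is set, or less than
 * hard_throttle_maxelapsed (200 msec) has passed since
 * priority_IO_timestamp_for_root was last recorded, meaning a priority I/O
 * to the root device is still considered recent.
 */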
static int
cluster_iodone(buf_t bp, __unused void *dummy)
{
        int	b_flags;
        int	error;
	int	total_size;
	int	total_resid;
	int	upl_offset;
	int	zero_offset;
	int	pg_offset = 0;
	int	commit_size = 0;
	upl_t	upl;
	buf_t	cbp;
	buf_t	cbp_head;
	buf_t	cbp_next;
	buf_t	real_bp;
	struct clios *iostate;

	cbp_head = (buf_t)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
	        /*
		 * all I/O requests that are part of this transaction
		 * have to complete before we can process it
		 */
	        if ( !(cbp->b_flags & B_DONE)) {

		        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);

			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp         = cbp_head;
	upl_offset  = cbp->b_uploffset;
	upl         = cbp->b_upl;
	b_flags     = cbp->b_flags;
	real_bp     = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate     = (struct clios *)cbp->b_iostate;

	if (real_bp)
	        real_bp->b_dev = cbp->b_dev;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
		        error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		free_io_buf(cbp);

		cbp = cbp_next;
	}
	if (zero_offset)
	        cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	if (iostate) {
	        int need_wakeup = 0;

	        /*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
	        lck_mtx_lock(cl_mtxp);

	        if (error && iostate->io_error == 0)
		        iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
		        /*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
		        iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(cl_mtxp);

		if (need_wakeup)
		        wakeup((caddr_t)&iostate->io_wanted);
	}
	if ((b_flags & B_NEED_IODONE) && real_bp) {
		if (error) {
		        real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	if (error == 0 && total_resid)
	        error = EIO;

	if (b_flags & B_COMMIT_UPL) {
	        pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error || (b_flags & B_NOCACHE)) {
		        int upl_abort_code;
			int page_in  = 0;
			int page_out = 0;

			if (b_flags & B_PAGEIO) {
			        if (b_flags & B_READ)
				        page_in  = 1;
				else
				        page_out = 1;
			}
			if (b_flags & B_CACHE)		/* leave pages in the cache unchanged on error */
			        upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (page_out && (error != ENXIO))	/* transient error */
			        upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (page_in)
			        upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
			        upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
					    upl_abort_code);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     0x80000000|upl_abort_code, 0);
		} else {
		        int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ))
			        upl_commit_flags |= UPL_COMMIT_SET_DIRTY;

			if (b_flags & B_AGE)
			        upl_commit_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
					     upl_commit_flags);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     upl_commit_flags, 0);
		}
	} else {
	        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			     (int)upl, upl_offset, 0, error, 0);
	}
	return (error);
}
void
cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
{
        upl_page_info_t *pl;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, (int)bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {

	        pl = ubc_upl_pageinfo(upl);

		while (size) {
		        int		page_index;
			int		page_offset;
			int		zero_cnt;
			addr64_t	zero_addr;

			page_index  = upl_offset / PAGE_SIZE;
			page_offset = upl_offset & PAGE_MASK;

			zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
			zero_cnt  = min(PAGE_SIZE - page_offset, size);

			bzero_phys(zero_addr, zero_cnt);

			size       -= zero_cnt;
			upl_offset += zero_cnt;
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}
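/*
 * Worked example of the physical-zero loop above (PAGE_SIZE = 4KB): with
 * upl_offset = 0x1e00 and size = 0x600, the first pass uses page_index 1,
 * page_offset 0xe00 and zero_cnt = min(0x1000 - 0xe00, 0x600) = 0x200,
 * zeroing to the end of that page; the second pass then zeroes the remaining
 * 0x400 bytes at the start of page_index 2.  The bp->b_datap path is a plain
 * bzero because that buffer is already mapped.
 */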
static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	   int flags, buf_t real_bp, struct clios *iostate)
{
	buf_t	cbp_head = NULL;
	buf_t	cbp_tail = NULL;
	u_int	size;
	u_int	max_iosize;
	u_int	max_vectors;
	int	pg_offset;
	int	bmap_flags;
	int	async_throttle = 0;
	mount_t	mp;

	mp = vp->v_mount;

	if (mp->mnt_devblocksize > 1) {
	        /*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zero's that are sitting beyond the EOF
		 * so that in case the filesystem doesn't explicitly zero this area
		 * if a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
	        pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	} else {
	        /*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size
		 * AFP is one such filesystem/device
		 */
	        size = non_rounded_size;
	}
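	/*
	 * Worked example of the rounding above (PAGE_SIZE = 4KB): a request of
	 * non_rounded_size = 0x1100 bytes at upl_offset = 0xe00 gives
	 * pg_offset = 0xe00 and
	 * size = (((0x1100 + 0xe00) + 0xfff) & ~0xfff) - 0xe00 = 0x2000 - 0xe00 = 0x1200,
	 * i.e. the transfer is grown by 0x100 bytes so that upl_offset + size
	 * ends exactly on a page boundary; non_rounded_size can therefore go
	 * negative in the accounting further down, which the
	 * "non_rounded_size <= 0" checks treat as the request being fully
	 * satisfied.
	 */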
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
		     (int)f_offset, size, upl_offset, flags, 0);

	if (flags & CL_READ) {
	        bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
	        bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	if (flags & CL_THROTTLE) {
	        if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
		        if (max_iosize > HARD_THROTTLE_MAXSIZE)
			        max_iosize = HARD_THROTTLE_MAXSIZE;
			async_throttle = HARD_THROTTLE_MAXCNT;
		} else
		        async_throttle = VNODE_ASYNC_THROTTLE;
	}
590 io_flags
|= B_NOCACHE
;
591 if (flags
& (CL_PAGEIN
| CL_PAGEOUT
))
592 io_flags
|= B_PAGEIO
;
593 if (flags
& CL_COMMIT
)
594 io_flags
|= B_COMMIT_UPL
;
595 if (flags
& CL_PRESERVE
)
597 if (flags
& CL_KEEPCACHED
)
600 if ((flags
& CL_READ
) && ((upl_offset
+ non_rounded_size
) & PAGE_MASK
) && (!(flags
& CL_NOZERO
))) {
602 * then we are going to end up
603 * with a page that we can't complete (the file size wasn't a multiple
604 * of PAGE_SIZE and we're trying to read to the end of the file
605 * so we'll go ahead and zero out the portion of the page we can't
606 * read in from the file
608 zero_offset
= upl_offset
+ non_rounded_size
;
615 if (size
> max_iosize
)
616 io_size
= max_iosize
;
620 if ((error
= VNOP_BLOCKMAP(vp
, f_offset
, io_size
, &blkno
, (size_t *)&io_size
, NULL
, bmap_flags
, NULL
))) {
623 if (real_bp
&& (real_bp
->b_blkno
== real_bp
->b_lblkno
))
624 real_bp
->b_blkno
= blkno
;
626 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 24)) | DBG_FUNC_NONE
,
627 (int)f_offset
, (int)blkno
, io_size
, zero_offset
, 0);
631 * vnop_blockmap didn't return an error... however, it did
632 * return an extent size of 0 which means we can't
633 * make forward progress on this I/O... a hole in the
634 * file would be returned as a blkno of -1 with a non-zero io_size
635 * a real extent is returned with a blkno != -1 and a non-zero io_size
640 if ( !(flags
& CL_READ
) && blkno
== -1) {
644 * we're writing into a 'hole'
646 if (flags
& CL_PAGEOUT
) {
648 * if we got here via cluster_pageout
649 * then just error the request and return
650 * the 'hole' should already have been covered
655 if ( !(flags
& CL_COMMIT
)) {
657 * currently writes always request the commit to happen
658 * as part of the io completion... however, if the CL_COMMIT
659 * flag isn't specified, than we can't issue the abort_range
660 * since the call site is going to abort or commit the same upl..
661 * in this case we can only return an error
667 * we can get here if the cluster code happens to
668 * pick up a page that was dirtied via mmap vs
669 * a 'write' and the page targets a 'hole'...
670 * i.e. the writes to the cluster were sparse
671 * and the file was being written for the first time
673 * we can also get here if the filesystem supports
674 * 'holes' that are less than PAGE_SIZE.... because
675 * we can't know if the range in the page that covers
676 * the 'hole' has been dirtied via an mmap or not,
677 * we have to assume the worst and try to push the
678 * entire page to storage.
680 * Try paging out the page individually before
681 * giving up entirely and dumping it (the pageout
682 * path will insure that the zero extent accounting
683 * has been taken care of before we get back into cluster_io)
685 ubc_upl_abort_range(upl
, trunc_page(upl_offset
), PAGE_SIZE
, UPL_ABORT_FREE_ON_EMPTY
);
687 e_offset
= round_page_64(f_offset
+ 1);
689 if (ubc_sync_range(vp
, f_offset
, e_offset
, UBC_PUSHDIRTY
) == 0) {
693 io_size
= e_offset
- f_offset
;
696 upl_offset
+= io_size
;
703 * keep track of how much of the original request
704 * that we've actually completed... non_rounded_size
705 * may go negative due to us rounding the request
706 * to a page size multiple (i.e. size > non_rounded_size)
708 non_rounded_size
-= io_size
;
710 if (non_rounded_size
<= 0) {
712 * we've transferred all of the data in the original
713 * request, but we were unable to complete the tail
714 * of the last page because the file didn't have
715 * an allocation to back that portion... this is ok.
721 lblkno
= (daddr64_t
)(f_offset
/ PAGE_SIZE_64
);
723 * we have now figured out how much I/O we can do - this is in 'io_size'
724 * pg_offset is the starting point in the first page for the I/O
725 * pg_count is the number of full and partial pages that 'io_size' encompasses
727 pg_offset
= upl_offset
& PAGE_MASK
;
729 if (flags
& CL_DEV_MEMORY
) {
731 * currently, can't deal with reading 'holes' in file
738 * treat physical requests as one 'giant' page
742 pg_count
= (io_size
+ pg_offset
+ (PAGE_SIZE
- 1)) / PAGE_SIZE
;
744 if ((flags
& CL_READ
) && blkno
== -1) {
748 * if we're reading and blkno == -1, then we've got a
749 * 'hole' in the file that we need to deal with by zeroing
750 * out the affected area in the upl
752 if (zero_offset
&& io_size
== size
) {
754 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
755 * than 'zero_offset' will be non-zero
756 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
757 * (indicated by the io_size finishing off the I/O request for this UPL)
758 * than we're not going to issue an I/O for the
759 * last page in this upl... we need to zero both the hole and the tail
760 * of the page beyond the EOF, since the delayed zero-fill won't kick in
762 bytes_to_zero
= (((upl_offset
+ io_size
) + (PAGE_SIZE
- 1)) & ~PAGE_MASK
) - upl_offset
;
766 bytes_to_zero
= io_size
;
768 cluster_zero(upl
, upl_offset
, bytes_to_zero
, real_bp
);
772 * if there is a current I/O chain pending
773 * then the first page of the group we just zero'd
774 * will be handled by the I/O completion if the zero
775 * fill started in the middle of the page
777 pg_count
= (io_size
- pg_offset
) / PAGE_SIZE
;
780 * no pending I/O to pick up that first page
781 * so, we have to make sure it gets committed
783 * set the pg_offset to 0 so that the upl_commit_range
784 * starts with this page
786 pg_count
= (io_size
+ pg_offset
) / PAGE_SIZE
;
789 if (io_size
== size
&& ((upl_offset
+ io_size
) & PAGE_MASK
))
791 * if we're done with the request for this UPL
792 * then we have to make sure to commit the last page
793 * even if we only partially zero-filled it
799 pg_resid
= PAGE_SIZE
- pg_offset
;
803 if (flags
& CL_COMMIT
)
804 ubc_upl_commit_range(upl
,
805 (upl_offset
+ pg_resid
) & ~PAGE_MASK
,
806 pg_count
* PAGE_SIZE
,
807 UPL_COMMIT_CLEAR_DIRTY
| UPL_COMMIT_FREE_ON_EMPTY
);
809 upl_offset
+= io_size
;
813 * keep track of how much of the original request
814 * that we've actually completed... non_rounded_size
815 * may go negative due to us rounding the request
816 * to a page size multiple (i.e. size > non_rounded_size)
818 non_rounded_size
-= io_size
;
820 if (non_rounded_size
<= 0) {
822 * we've transferred all of the data in the original
823 * request, but we were unable to complete the tail
824 * of the last page because the file didn't have
825 * an allocation to back that portion... this is ok.
829 if (cbp_head
&& pg_count
)
834 if (pg_count
> max_vectors
) {
835 if (((pg_count
- max_vectors
) * PAGE_SIZE
) > io_size
) {
836 io_size
= PAGE_SIZE
- pg_offset
;
839 io_size
-= (pg_count
- max_vectors
) * PAGE_SIZE
;
840 pg_count
= max_vectors
;
844 if ( !(mp
->mnt_kern_flag
& MNTK_VIRTUALDEV
))
846 * if we're not targeting a virtual device i.e. a disk image
847 * it's safe to dip into the reserve pool since real devices
848 * can complete this I/O request without requiring additional
849 * bufs from the alloc_io_buf pool
852 else if ((flags
& CL_ASYNC
) && !(flags
& CL_PAGEOUT
))
854 * Throttle the speculative IO
860 cbp
= alloc_io_buf(vp
, priv
);
862 if (flags
& CL_PAGEOUT
) {
865 for (i
= 0; i
< pg_count
; i
++) {
866 if (buf_invalblkno(vp
, lblkno
+ i
, 0) == EBUSY
)
867 panic("BUSY bp found in cluster_io");
870 if (flags
& CL_ASYNC
) {
871 if (buf_setcallback(cbp
, (void *)cluster_iodone
, NULL
))
872 panic("buf_setcallback failed\n");
874 cbp
->b_flags
|= io_flags
;
876 cbp
->b_lblkno
= lblkno
;
877 cbp
->b_blkno
= blkno
;
878 cbp
->b_bcount
= io_size
;
880 if (buf_setupl(cbp
, upl
, upl_offset
))
881 panic("buf_setupl failed\n");
883 cbp
->b_trans_next
= (buf_t
)NULL
;
885 if ((cbp
->b_iostate
= (void *)iostate
))
887 * caller wants to track the state of this
888 * io... bump the amount issued against this stream
890 iostate
->io_issued
+= io_size
;
892 if (flags
& CL_READ
) {
893 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 26)) | DBG_FUNC_NONE
,
894 (int)cbp
->b_lblkno
, (int)cbp
->b_blkno
, upl_offset
, io_size
, 0);
897 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 27)) | DBG_FUNC_NONE
,
898 (int)cbp
->b_lblkno
, (int)cbp
->b_blkno
, upl_offset
, io_size
, 0);
902 cbp_tail
->b_trans_next
= cbp
;
908 (buf_t
)(cbp
->b_trans_head
) = cbp_head
;
911 upl_offset
+= io_size
;
915 * keep track of how much of the original request
916 * that we've actually completed... non_rounded_size
917 * may go negative due to us rounding the request
918 * to a page size multiple (i.e. size > non_rounded_size)
920 non_rounded_size
-= io_size
;
922 if (non_rounded_size
<= 0) {
924 * we've transferred all of the data in the original
925 * request, but we were unable to complete the tail
926 * of the last page because the file didn't have
927 * an allocation to back that portion... this is ok.
931 if ( (!(upl_offset
& PAGE_MASK
) && !(flags
& CL_DEV_MEMORY
) && ((flags
& CL_ASYNC
) || trans_count
> 8)) || size
== 0) {
933 * if we have no more I/O to issue or
934 * the current I/O we've prepared fully
935 * completes the last page in this request
936 * and it's either an ASYNC request or
937 * we've already accumulated more than 8 I/O's into
938 * this transaction and it's not an I/O directed to
939 * special DEVICE memory
940 * then go ahead and issue the I/O
944 cbp_head
->b_flags
|= B_NEED_IODONE
;
945 cbp_head
->b_real_bp
= real_bp
;
947 cbp_head
->b_real_bp
= (buf_t
)NULL
;
951 * we're about to issue the last I/O for this upl
952 * if this was a read to the eof and the eof doesn't
953 * finish on a page boundary, than we need to zero-fill
954 * the rest of the page....
956 cbp_head
->b_validend
= zero_offset
;
958 cbp_head
->b_validend
= 0;
960 if (flags
& CL_THROTTLE
)
961 (void)vnode_waitforwrites(vp
, async_throttle
, 0, 0, (char *)"cluster_io");
963 for (cbp
= cbp_head
; cbp
;) {
966 if ( !(io_flags
& B_READ
))
967 vnode_startwrite(vp
);
969 cbp_next
= cbp
->b_trans_next
;
971 (void) VNOP_STRATEGY(cbp
);
974 if ( !(flags
& CL_ASYNC
)) {
977 for (cbp
= cbp_head
; cbp
; cbp
= cbp
->b_trans_next
)
980 if ((error
= cluster_iodone(cbp_head
, (void *)&dummy
))) {
981 if ((flags
& (CL_PAGEOUT
| CL_KEEPCACHED
) == CL_PAGEOUT
) && (error
== ENXIO
))
982 error
= 0; /* drop the error */
990 cbp_head
= (buf_t
)NULL
;
991 cbp_tail
= (buf_t
)NULL
;
1001 for (cbp
= cbp_head
; cbp
;) {
1004 upl_offset
-= cbp
->b_bcount
;
1005 size
+= cbp
->b_bcount
;
1006 io_size
+= cbp
->b_bcount
;
1008 cbp_next
= cbp
->b_trans_next
;
1013 int need_wakeup
= 0;
1016 * update the error condition for this stream
1017 * since we never really issued the io
1018 * just go ahead and adjust it back
1020 lck_mtx_lock(cl_mtxp
);
1022 if (iostate
->io_error
== 0)
1023 iostate
->io_error
= error
;
1024 iostate
->io_issued
-= io_size
;
1026 if (iostate
->io_wanted
) {
1028 * someone is waiting for the state of
1029 * this io stream to change
1031 iostate
->io_wanted
= 0;
1034 lck_mtx_unlock(cl_mtxp
);
1037 wakeup((caddr_t
)&iostate
->io_wanted
);
1039 pg_offset
= upl_offset
& PAGE_MASK
;
1040 abort_size
= (size
+ pg_offset
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
1042 if (flags
& CL_COMMIT
) {
1045 if (flags
& CL_PRESERVE
) {
1046 ubc_upl_commit_range(upl
, upl_offset
- pg_offset
, abort_size
,
1047 UPL_COMMIT_FREE_ON_EMPTY
);
1049 if ((flags
& CL_PAGEOUT
) && (error
!= ENXIO
)) /* transient error */
1050 upl_abort_code
= UPL_ABORT_FREE_ON_EMPTY
;
1051 else if (flags
& CL_PAGEIN
)
1052 upl_abort_code
= UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_ERROR
;
1054 upl_abort_code
= UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_DUMP_PAGES
;
1056 ubc_upl_abort_range(upl
, upl_offset
- pg_offset
, abort_size
,
1059 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 28)) | DBG_FUNC_NONE
,
1060 (int)upl
, upl_offset
- pg_offset
, abort_size
, error
, 0);
1063 real_bp
->b_flags
|= B_ERROR
;
1064 real_bp
->b_error
= error
;
1066 buf_biodone(real_bp
);
1071 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 22)) | DBG_FUNC_END
,
1072 (int)f_offset
, size
, upl_offset
, retval
, 0);
static int
cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
{
	int	pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
	        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return (0);
	}
	if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
	        size = (MAX_UPL_TRANSFER * PAGE_SIZE);
	else
	        size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if ((off_t)size > (filesize - f_offset))
	        size = filesize - f_offset;
	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read(vp, filesize, f_offset, size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}
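/*
 * Example of the clamping above (PAGE_SIZE = 4KB, and assuming for
 * illustration that MAX_UPL_TRANSFER is 256 pages): a 2MB prefetch request is
 * first clipped to 256 * 4KB = 1MB; if only 10000 bytes remain before
 * filesize it is clipped again to 10000, giving pages_in_prefetch =
 * (10000 + 4095) / 4096 = 3 pages handed to advisory_read().
 */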
1111 cluster_rd_ahead(vnode_t vp
, struct cl_extent
*extent
, off_t filesize
, struct cl_readahead
*rap
)
1115 int size_of_prefetch
;
1118 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 48)) | DBG_FUNC_START
,
1119 (int)extent
->b_addr
, (int)extent
->e_addr
, (int)rap
->cl_lastr
, 0, 0);
1121 if (extent
->b_addr
== rap
->cl_lastr
&& extent
->b_addr
== extent
->e_addr
) {
1122 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 48)) | DBG_FUNC_END
,
1123 rap
->cl_ralen
, (int)rap
->cl_maxra
, (int)rap
->cl_lastr
, 0, 0);
1126 if (rap
->cl_lastr
== -1 || (extent
->b_addr
!= rap
->cl_lastr
&& extent
->b_addr
!= (rap
->cl_lastr
+ 1) &&
1127 (extent
->b_addr
!= (rap
->cl_maxra
+ 1) || rap
->cl_ralen
== 0))) {
1131 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 48)) | DBG_FUNC_END
,
1132 rap
->cl_ralen
, (int)rap
->cl_maxra
, (int)rap
->cl_lastr
, 1, 0);
1136 if (extent
->e_addr
< rap
->cl_maxra
) {
1137 if ((rap
->cl_maxra
- extent
->e_addr
) > (MAX_UPL_TRANSFER
/ 4)) {
1139 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 48)) | DBG_FUNC_END
,
1140 rap
->cl_ralen
, (int)rap
->cl_maxra
, (int)rap
->cl_lastr
, 2, 0);
1144 r_addr
= max(extent
->e_addr
, rap
->cl_maxra
) + 1;
1145 f_offset
= (off_t
)(r_addr
* PAGE_SIZE_64
);
1147 size_of_prefetch
= 0;
1149 ubc_range_op(vp
, f_offset
, f_offset
+ PAGE_SIZE_64
, UPL_ROP_PRESENT
, &size_of_prefetch
);
1151 if (size_of_prefetch
) {
1152 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 48)) | DBG_FUNC_END
,
1153 rap
->cl_ralen
, (int)rap
->cl_maxra
, (int)rap
->cl_lastr
, 3, 0);
1156 if (f_offset
< filesize
) {
1157 daddr64_t read_size
;
1159 rap
->cl_ralen
= rap
->cl_ralen
? min(MAX_UPL_TRANSFER
, rap
->cl_ralen
<< 1) : 1;
1161 read_size
= (extent
->e_addr
+ 1) - extent
->b_addr
;
1163 if (read_size
> rap
->cl_ralen
) {
1164 if (read_size
> MAX_UPL_TRANSFER
)
1165 rap
->cl_ralen
= MAX_UPL_TRANSFER
;
1167 rap
->cl_ralen
= read_size
;
1169 size_of_prefetch
= cluster_rd_prefetch(vp
, f_offset
, rap
->cl_ralen
* PAGE_SIZE
, filesize
);
1171 if (size_of_prefetch
)
1172 rap
->cl_maxra
= (r_addr
+ size_of_prefetch
) - 1;
1174 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 48)) | DBG_FUNC_END
,
1175 rap
->cl_ralen
, (int)rap
->cl_maxra
, (int)rap
->cl_lastr
, 4, 0);
1179 cluster_pageout(vnode_t vp
, upl_t upl
, vm_offset_t upl_offset
, off_t f_offset
,
1180 int size
, off_t filesize
, int flags
)
1186 struct cl_writebehind
*wbp
;
1188 if (vp
->v_mount
->mnt_kern_flag
& MNTK_VIRTUALDEV
)
1190 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1191 * then we don't want to enforce this throttle... if we do, we can
1192 * potentially deadlock since we're stalling the pageout thread at a time
1193 * when the disk image might need additional memory (which won't be available
1194 * if the pageout thread can't run)... instead we'll just depend on the throttle
1195 * that the pageout thread now has in place to deal with external files
1197 local_flags
= CL_PAGEOUT
;
1199 local_flags
= CL_PAGEOUT
| CL_THROTTLE
;
1201 if ((flags
& UPL_IOSYNC
) == 0)
1202 local_flags
|= CL_ASYNC
;
1203 if ((flags
& UPL_NOCOMMIT
) == 0)
1204 local_flags
|= CL_COMMIT
;
1205 if ((flags
& UPL_KEEPCACHED
))
1206 local_flags
|= CL_KEEPCACHED
;
1209 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 52)) | DBG_FUNC_NONE
,
1210 (int)f_offset
, size
, (int)filesize
, local_flags
, 0);
1213 * If they didn't specify any I/O, then we are done...
1214 * we can't issue an abort because we don't know how
1215 * big the upl really is
1220 if (vp
->v_mount
->mnt_flag
& MNT_RDONLY
) {
1221 if (local_flags
& CL_COMMIT
)
1222 ubc_upl_abort_range(upl
, upl_offset
, size
, UPL_ABORT_FREE_ON_EMPTY
);
1226 * can't page-in from a negative offset
1227 * or if we're starting beyond the EOF
1228 * or if the file offset isn't page aligned
1229 * or the size requested isn't a multiple of PAGE_SIZE
1231 if (f_offset
< 0 || f_offset
>= filesize
||
1232 (f_offset
& PAGE_MASK_64
) || (size
& PAGE_MASK
)) {
1233 if (local_flags
& CL_COMMIT
)
1234 ubc_upl_abort_range(upl
, upl_offset
, size
, UPL_ABORT_FREE_ON_EMPTY
);
1237 max_size
= filesize
- f_offset
;
1239 if (size
< max_size
)
1244 rounded_size
= (io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
1246 if (size
> rounded_size
) {
1247 if (local_flags
& CL_COMMIT
)
1248 ubc_upl_abort_range(upl
, upl_offset
+ rounded_size
, size
- rounded_size
,
1249 UPL_ABORT_FREE_ON_EMPTY
);
1251 if ((wbp
= cluster_get_wbp(vp
, 0)) != NULL
)
1252 wbp
->cl_hasbeenpaged
= 1;
1254 return (cluster_io(vp
, upl
, upl_offset
, f_offset
, io_size
,
1255 local_flags
, (buf_t
)NULL
, (struct clios
*)NULL
));
1259 cluster_pagein(vnode_t vp
, upl_t upl
, vm_offset_t upl_offset
, off_t f_offset
,
1260 int size
, off_t filesize
, int flags
)
1266 int local_flags
= 0;
1268 if (upl
== NULL
|| size
< 0)
1269 panic("cluster_pagein: NULL upl passed in");
1271 if ((flags
& UPL_IOSYNC
) == 0)
1272 local_flags
|= CL_ASYNC
;
1273 if ((flags
& UPL_NOCOMMIT
) == 0)
1274 local_flags
|= CL_COMMIT
;
1277 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 56)) | DBG_FUNC_NONE
,
1278 (int)f_offset
, size
, (int)filesize
, local_flags
, 0);
1281 * can't page-in from a negative offset
1282 * or if we're starting beyond the EOF
1283 * or if the file offset isn't page aligned
1284 * or the size requested isn't a multiple of PAGE_SIZE
1286 if (f_offset
< 0 || f_offset
>= filesize
||
1287 (f_offset
& PAGE_MASK_64
) || (size
& PAGE_MASK
) || (upl_offset
& PAGE_MASK
)) {
1288 if (local_flags
& CL_COMMIT
)
1289 ubc_upl_abort_range(upl
, upl_offset
, size
, UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_ERROR
);
1292 max_size
= filesize
- f_offset
;
1294 if (size
< max_size
)
1299 rounded_size
= (io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
1301 if (size
> rounded_size
&& (local_flags
& CL_COMMIT
))
1302 ubc_upl_abort_range(upl
, upl_offset
+ rounded_size
,
1303 size
- rounded_size
, UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_ERROR
);
1305 retval
= cluster_io(vp
, upl
, upl_offset
, f_offset
, io_size
,
1306 local_flags
| CL_READ
| CL_PAGEIN
, (buf_t
)NULL
, (struct clios
*)NULL
);
1308 if (retval
== 0 && !(flags
& UPL_NORDAHEAD
) && !(vp
->v_flag
& VRAOFF
)) {
1309 struct cl_readahead
*rap
;
1311 rap
= cluster_get_rap(vp
);
1314 struct cl_extent extent
;
1316 extent
.b_addr
= (daddr64_t
)(f_offset
/ PAGE_SIZE_64
);
1317 extent
.e_addr
= (daddr64_t
)((f_offset
+ ((off_t
)io_size
- 1)) / PAGE_SIZE_64
);
1319 if (rounded_size
== PAGE_SIZE
) {
1321 * we haven't read the last page in of the file yet
1322 * so let's try to read ahead if we're in
1323 * a sequential access pattern
1325 cluster_rd_ahead(vp
, &extent
, filesize
, rap
);
1327 rap
->cl_lastr
= extent
.e_addr
;
1329 lck_mtx_unlock(&rap
->cl_lockr
);
int
cluster_bp(buf_t bp)
{
        off_t	f_offset;
	int	flags;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
		     (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (bp->b_flags & B_READ)
	        flags = CL_ASYNC | CL_READ;
	else
	        flags = CL_ASYNC;

	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

	return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
}
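/*
 * cluster_bp() is the bridge from a traditional buf-style request into the
 * cluster layer: ubc_blktooff() converts the buffer's logical block number
 * into a byte offset in the file, and the buf is handed to cluster_io() as
 * the real_bp with a zero upl_offset, so completion is reported through
 * buf_biodone() on that original buffer (see cluster_iodone above).
 */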
1355 cluster_write(vnode_t vp
, struct uio
*uio
, off_t oldEOF
, off_t newEOF
, off_t headOff
, off_t tailOff
, int xflags
)
1368 if (vp
->v_flag
& VNOCACHE_DATA
)
1369 flags
|= IO_NOCACHE
;
1371 if ( (!(flags
& IO_NOCACHE
)) || (!uio
) || (!UIO_SEG_IS_USER_SPACE(uio
->uio_segflg
))) {
1373 * go do a write through the cache if one of the following is true....
1374 * NOCACHE is not true
1375 * there is no uio structure or it doesn't target USERSPACE
1377 return (cluster_write_x(vp
, uio
, oldEOF
, newEOF
, headOff
, tailOff
, flags
));
1381 if (IS_VALID_UIO_SEGFLG(uio
->uio_segflg
) == 0) {
1382 panic("%s :%d - invalid uio_segflg\n", __FILE__
, __LINE__
);
1384 #endif /* LP64_DEBUG */
1386 while (uio_resid(uio
) && uio
->uio_offset
< newEOF
&& retval
== 0) {
1391 * we know we have a resid, so this is safe
1392 * skip over any emtpy vectors
1394 iov_len
= uio_iov_len(uio
);
1396 while (iov_len
== 0) {
1399 iov_len
= uio_iov_len(uio
);
1401 iov_base
= uio_iov_base(uio
);
1403 upl_size
= PAGE_SIZE
;
1404 upl_flags
= UPL_QUERY_OBJECT_TYPE
;
1406 // LP64todo - fix this!
1407 if ((vm_map_get_upl(current_map(),
1408 CAST_DOWN(vm_offset_t
, iov_base
) & ~PAGE_MASK
,
1409 &upl_size
, &upl
, NULL
, NULL
, &upl_flags
, 0)) != KERN_SUCCESS
) {
1411 * the user app must have passed in an invalid address
1417 * We check every vector target but if it is physically
1418 * contiguous space, we skip the sanity checks.
1420 if (upl_flags
& UPL_PHYS_CONTIG
) {
1423 zflags
= flags
& ~IO_TAILZEROFILL
;
1424 zflags
|= IO_HEADZEROFILL
;
1426 if (flags
& IO_HEADZEROFILL
) {
1428 * in case we have additional vectors, we don't want to do this again
1430 flags
&= ~IO_HEADZEROFILL
;
1432 if ((retval
= cluster_write_x(vp
, (struct uio
*)0, 0, uio
->uio_offset
, headOff
, 0, zflags
)))
1435 retval
= cluster_phys_write(vp
, uio
, newEOF
);
1437 if (uio_resid(uio
) == 0 && (flags
& IO_TAILZEROFILL
)) {
1438 return (cluster_write_x(vp
, (struct uio
*)0, 0, tailOff
, uio
->uio_offset
, 0, zflags
));
1441 else if ((uio_resid(uio
) < PAGE_SIZE
) || (flags
& (IO_TAILZEROFILL
| IO_HEADZEROFILL
))) {
1443 * we're here because we're don't have a physically contiguous target buffer
1444 * go do a write through the cache if one of the following is true....
1445 * the total xfer size is less than a page...
1446 * we're being asked to ZEROFILL either the head or the tail of the I/O...
1448 return (cluster_write_x(vp
, uio
, oldEOF
, newEOF
, headOff
, tailOff
, flags
));
1450 // LP64todo - fix this!
1451 else if (((int)uio
->uio_offset
& PAGE_MASK
) || (CAST_DOWN(int, iov_base
) & PAGE_MASK
)) {
1452 if (((int)uio
->uio_offset
& PAGE_MASK
) == (CAST_DOWN(int, iov_base
) & PAGE_MASK
)) {
1454 * Bring the file offset write up to a pagesize boundary
1455 * this will also bring the base address to a page boundary
1456 * since they both are currently on the same offset within a page
1457 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1458 * so the computed clip_size must always be less than the current uio_resid
1460 clip_size
= (PAGE_SIZE
- (uio
->uio_offset
& PAGE_MASK_64
));
1463 * Fake the resid going into the cluster_write_x call
1464 * and restore it on the way out.
1466 // LP64todo - fix this
1467 prev_resid
= uio_resid(uio
);
1468 uio_setresid(uio
, clip_size
);
1470 retval
= cluster_write_x(vp
, uio
, oldEOF
, newEOF
, headOff
, tailOff
, flags
);
1472 uio_setresid(uio
, prev_resid
- (clip_size
- uio_resid(uio
)));
1475 * can't get both the file offset and the buffer offset aligned to a page boundary
1476 * so fire an I/O through the cache for this entire vector
1478 // LP64todo - fix this
1479 clip_size
= iov_len
;
1480 // LP64todo - fix this
1481 prev_resid
= uio_resid(uio
);
1482 uio_setresid(uio
, clip_size
);
1484 retval
= cluster_write_x(vp
, uio
, oldEOF
, newEOF
, headOff
, tailOff
, flags
);
1486 uio_setresid(uio
, prev_resid
- (clip_size
- uio_resid(uio
)));
1490 * If we come in here, we know the offset into
1491 * the file is on a pagesize boundary and the
1492 * target buffer address is also on a page boundary
1494 max_io_size
= newEOF
- uio
->uio_offset
;
1495 // LP64todo - fix this
1496 clip_size
= uio_resid(uio
);
1497 if (iov_len
< clip_size
)
1498 // LP64todo - fix this!
1499 clip_size
= iov_len
;
1500 if (max_io_size
< clip_size
)
1501 clip_size
= max_io_size
;
1503 if (clip_size
< PAGE_SIZE
) {
1505 * Take care of tail end of write in this vector
1507 // LP64todo - fix this
1508 prev_resid
= uio_resid(uio
);
1509 uio_setresid(uio
, clip_size
);
1511 retval
= cluster_write_x(vp
, uio
, oldEOF
, newEOF
, headOff
, tailOff
, flags
);
1513 uio_setresid(uio
, prev_resid
- (clip_size
- uio_resid(uio
)));
1515 /* round clip_size down to a multiple of pagesize */
1516 clip_size
= clip_size
& ~(PAGE_MASK
);
1517 // LP64todo - fix this
1518 prev_resid
= uio_resid(uio
);
1519 uio_setresid(uio
, clip_size
);
1521 retval
= cluster_nocopy_write(vp
, uio
, newEOF
);
1523 if ((retval
== 0) && uio_resid(uio
))
1524 retval
= cluster_write_x(vp
, uio
, oldEOF
, newEOF
, headOff
, tailOff
, flags
);
1526 uio_setresid(uio
, prev_resid
- (clip_size
- uio_resid(uio
)));
1536 cluster_nocopy_write(vnode_t vp
, struct uio
*uio
, off_t newEOF
)
1539 upl_page_info_t
*pl
;
1540 vm_offset_t upl_offset
;
1544 int upl_needed_size
;
1549 int force_data_sync
;
1551 struct clios iostate
;
1552 struct cl_writebehind
*wbp
;
1555 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 75)) | DBG_FUNC_START
,
1556 (int)uio
->uio_offset
, (int)uio_resid(uio
),
1560 * When we enter this routine, we know
1561 * -- the offset into the file is on a pagesize boundary
1562 * -- the resid is a page multiple
1563 * -- the resid will not exceed iov_len
1566 if ((wbp
= cluster_get_wbp(vp
, CLW_RETURNLOCKED
)) != NULL
) {
1568 cluster_try_push(wbp
, vp
, newEOF
, 0, 1);
1570 lck_mtx_unlock(&wbp
->cl_lockw
);
1572 iostate
.io_completed
= 0;
1573 iostate
.io_issued
= 0;
1574 iostate
.io_error
= 0;
1575 iostate
.io_wanted
= 0;
1579 while (uio_resid(uio
) && uio
->uio_offset
< newEOF
&& error
== 0) {
1580 io_size
= uio_resid(uio
);
1582 if (io_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1583 io_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
1585 // LP64todo - fix this!
1586 upl_offset
= CAST_DOWN(vm_offset_t
, iov
->iov_base
) & PAGE_MASK
;
1588 upl_needed_size
= (upl_offset
+ io_size
+ (PAGE_SIZE
-1)) & ~PAGE_MASK
;
1590 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 76)) | DBG_FUNC_START
,
1591 (int)upl_offset
, upl_needed_size
, (int)iov
->iov_base
, io_size
, 0);
1593 for (force_data_sync
= 0; force_data_sync
< 3; force_data_sync
++) {
1595 upl_size
= upl_needed_size
;
1596 upl_flags
= UPL_FILE_IO
| UPL_COPYOUT_FROM
| UPL_NO_SYNC
|
1597 UPL_CLEAN_IN_PLACE
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
1599 // LP64todo - fix this!
1600 kret
= vm_map_get_upl(current_map(),
1601 CAST_DOWN(vm_offset_t
, iov
->iov_base
) & ~PAGE_MASK
,
1609 if (kret
!= KERN_SUCCESS
) {
1610 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 76)) | DBG_FUNC_END
,
1613 * cluster_nocopy_write: failed to get pagelist
1615 * we may have already spun some portion of this request
1616 * off as async requests... we need to wait for the I/O
1617 * to complete before returning
1619 goto wait_for_writes
;
1621 pl
= UPL_GET_INTERNAL_PAGE_LIST(upl
);
1622 pages_in_pl
= upl_size
/ PAGE_SIZE
;
1624 for (i
= 0; i
< pages_in_pl
; i
++) {
1625 if (!upl_valid_page(pl
, i
))
1628 if (i
== pages_in_pl
)
1632 * didn't get all the pages back that we
1633 * needed... release this upl and try again
1635 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
,
1636 UPL_ABORT_FREE_ON_EMPTY
);
1638 if (force_data_sync
>= 3) {
1639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 76)) | DBG_FUNC_END
,
1640 i
, pages_in_pl
, upl_size
, kret
, 0);
1642 * for some reason, we couldn't acquire a hold on all
1643 * the pages needed in the user's address space
1645 * we may have already spun some portion of this request
1646 * off as async requests... we need to wait for the I/O
1647 * to complete before returning
1649 goto wait_for_writes
;
1653 * Consider the possibility that upl_size wasn't satisfied.
1655 if (upl_size
!= upl_needed_size
)
1656 io_size
= (upl_size
- (int)upl_offset
) & ~PAGE_MASK
;
1658 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 76)) | DBG_FUNC_END
,
1659 (int)upl_offset
, upl_size
, (int)iov
->iov_base
, io_size
, 0);
1662 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
,
1663 UPL_ABORT_FREE_ON_EMPTY
);
1665 * we may have already spun some portion of this request
1666 * off as async requests... we need to wait for the I/O
1667 * to complete before returning
1669 goto wait_for_writes
;
1672 * Now look for pages already in the cache
1673 * and throw them away.
1674 * uio->uio_offset is page aligned within the file
1675 * io_size is a multiple of PAGE_SIZE
1677 ubc_range_op(vp
, uio
->uio_offset
, uio
->uio_offset
+ io_size
, UPL_ROP_DUMP
, NULL
);
1680 * we want push out these writes asynchronously so that we can overlap
1681 * the preparation of the next I/O
1682 * if there are already too many outstanding writes
1683 * wait until some complete before issuing the next
1685 lck_mtx_lock(cl_mtxp
);
1687 while ((iostate
.io_issued
- iostate
.io_completed
) > (2 * MAX_UPL_TRANSFER
* PAGE_SIZE
)) {
1688 iostate
.io_wanted
= 1;
1689 msleep((caddr_t
)&iostate
.io_wanted
, cl_mtxp
, PRIBIO
+ 1, "cluster_nocopy_write", 0);
1691 lck_mtx_unlock(cl_mtxp
);
1693 if (iostate
.io_error
) {
1695 * one of the earlier writes we issued ran into a hard error
1696 * don't issue any more writes, cleanup the UPL
1697 * that was just created but not used, then
1698 * go wait for all writes that are part of this stream
1699 * to complete before returning the error to the caller
1701 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
,
1702 UPL_ABORT_FREE_ON_EMPTY
);
1704 goto wait_for_writes
;
1706 io_flag
= CL_ASYNC
| CL_PRESERVE
| CL_COMMIT
| CL_THROTTLE
;
1708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 77)) | DBG_FUNC_START
,
1709 (int)upl_offset
, (int)uio
->uio_offset
, io_size
, io_flag
, 0);
1711 error
= cluster_io(vp
, upl
, upl_offset
, uio
->uio_offset
,
1712 io_size
, io_flag
, (buf_t
)NULL
, &iostate
);
1714 iov
->iov_len
-= io_size
;
1715 ((u_int32_t
)iov
->iov_base
) += io_size
;
1716 uio_setresid(uio
, (uio_resid(uio
) - io_size
));
1717 uio
->uio_offset
+= io_size
;
1719 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 77)) | DBG_FUNC_END
,
1720 (int)upl_offset
, (int)uio
->uio_offset
, (int)uio_resid(uio
), error
, 0);
1726 * make sure all async writes issued as part of this stream
1727 * have completed before we return
1729 lck_mtx_lock(cl_mtxp
);
1731 while (iostate
.io_issued
!= iostate
.io_completed
) {
1732 iostate
.io_wanted
= 1;
1733 msleep((caddr_t
)&iostate
.io_wanted
, cl_mtxp
, PRIBIO
+ 1, "cluster_nocopy_write", 0);
1735 lck_mtx_unlock(cl_mtxp
);
1737 if (iostate
.io_error
)
1738 error
= iostate
.io_error
;
1740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 75)) | DBG_FUNC_END
,
1741 (int)uio
->uio_offset
, (int)uio
->uio_resid
, error
, 4, 0);
1748 cluster_phys_write(vnode_t vp
, struct uio
*uio
, off_t newEOF
)
1750 upl_page_info_t
*pl
;
1753 vm_offset_t upl_offset
;
1757 int upl_needed_size
;
1764 struct cl_writebehind
*wbp
;
1766 devblocksize
= vp
->v_mount
->mnt_devblocksize
;
1768 * When we enter this routine, we know
1769 * -- the resid will not exceed iov_len
1770 * -- the vector target address is physcially contiguous
1772 if ((wbp
= cluster_get_wbp(vp
, CLW_RETURNLOCKED
)) != NULL
) {
1774 cluster_try_push(wbp
, vp
, newEOF
, 0, 1);
1776 lck_mtx_unlock(&wbp
->cl_lockw
);
1779 if (IS_VALID_UIO_SEGFLG(uio
->uio_segflg
) == 0) {
1780 panic("%s :%d - invalid uio_segflg\n", __FILE__
, __LINE__
);
1782 #endif /* LP64_DEBUG */
1784 // LP64todo - fix this!
1785 io_size
= uio_iov_len(uio
);
1786 iov_base
= uio_iov_base(uio
);
1787 upl_offset
= CAST_DOWN(upl_offset_t
, iov_base
) & PAGE_MASK
;
1788 upl_needed_size
= upl_offset
+ io_size
;
1791 upl_size
= upl_needed_size
;
1792 upl_flags
= UPL_FILE_IO
| UPL_COPYOUT_FROM
| UPL_NO_SYNC
|
1793 UPL_CLEAN_IN_PLACE
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
1795 // LP64todo - fix this!
1796 kret
= vm_map_get_upl(current_map(),
1797 CAST_DOWN(upl_offset_t
, iov_base
) & ~PAGE_MASK
,
1798 &upl_size
, &upl
, NULL
, &pages_in_pl
, &upl_flags
, 0);
1800 if (kret
!= KERN_SUCCESS
) {
1802 * cluster_phys_write: failed to get pagelist
1803 * note: return kret here
1808 * Consider the possibility that upl_size wasn't satisfied.
1809 * This is a failure in the physical memory case.
1811 if (upl_size
< upl_needed_size
) {
1812 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
1815 pl
= ubc_upl_pageinfo(upl
);
1817 src_paddr
= ((addr64_t
)upl_phys_page(pl
, 0) << 12) + ((addr64_t
)(iov_base
& PAGE_MASK
));
1819 while (((uio
->uio_offset
& (devblocksize
- 1)) || io_size
< devblocksize
) && io_size
) {
1822 head_size
= devblocksize
- (int)(uio
->uio_offset
& (devblocksize
- 1));
1824 if (head_size
> io_size
)
1825 head_size
= io_size
;
1827 error
= cluster_align_phys_io(vp
, uio
, src_paddr
, head_size
, 0);
1830 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
1834 upl_offset
+= head_size
;
1835 src_paddr
+= head_size
;
1836 io_size
-= head_size
;
1838 tail_size
= io_size
& (devblocksize
- 1);
1839 io_size
-= tail_size
;
1843 * issue a synchronous write to cluster_io
1845 error
= cluster_io(vp
, upl
, upl_offset
, uio
->uio_offset
,
1846 io_size
, CL_DEV_MEMORY
, (buf_t
)NULL
, (struct clios
*)NULL
);
1850 * The cluster_io write completed successfully,
1851 * update the uio structure
1853 uio_setresid(uio
, (uio_resid(uio
) - io_size
));
1854 uio_iov_len_add(uio
, -io_size
);
1855 uio_iov_base_add(uio
, io_size
);
1856 uio
->uio_offset
+= io_size
;
1857 src_paddr
+= io_size
;
1860 error
= cluster_align_phys_io(vp
, uio
, src_paddr
, tail_size
, 0);
1863 * just release our hold on the physically contiguous
1864 * region without changing any state
1866 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
1873 cluster_write_x(vnode_t vp
, struct uio
*uio
, off_t oldEOF
, off_t newEOF
, off_t headOff
, off_t tailOff
, int flags
)
1875 upl_page_info_t
*pl
;
1877 vm_offset_t upl_offset
= 0;
1890 long long total_size
;
1893 long long zero_cnt1
;
1895 struct cl_extent cl
;
1897 struct cl_writebehind
*wbp
;
1899 if ((wbp
= cluster_get_wbp(vp
, 0)) != NULL
)
1901 if (wbp
->cl_hasbeenpaged
) {
1903 * this vnode had pages cleaned to it by
1904 * the pager which indicates that either
1905 * it's not very 'hot', or the system is
1906 * being overwhelmed by a lot of dirty
1907 * data being delayed in the VM cache...
1908 * in either event, we'll push our remaining
1909 * delayed data at this point... this will
1910 * be more efficient than paging out 1 page at
1911 * a time, and will also act as a throttle
1912 * by delaying this client from writing any
1913 * more data until all his delayed data has
1914 * at least been queued to the uderlying driver.
1916 if (wbp
->cl_number
|| wbp
->cl_scmap
)
1917 cluster_push_EOF(vp
, newEOF
);
1919 wbp
->cl_hasbeenpaged
= 0;
1923 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_START
,
1924 (int)uio
->uio_offset
, uio_resid(uio
), (int)oldEOF
, (int)newEOF
, 0);
1926 // LP64todo - fix this
1927 io_resid
= uio_resid(uio
);
1929 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_START
,
1930 0, 0, (int)oldEOF
, (int)newEOF
, 0);
1939 if (flags
& IO_HEADZEROFILL
) {
1941 * some filesystems (HFS is one) don't support unallocated holes within a file...
1942 * so we zero fill the intervening space between the old EOF and the offset
1943 * where the next chunk of real data begins.... ftruncate will also use this
1944 * routine to zero fill to the new EOF when growing a file... in this case, the
1945 * uio structure will not be provided
1948 if (headOff
< uio
->uio_offset
) {
1949 zero_cnt
= uio
->uio_offset
- headOff
;
1952 } else if (headOff
< newEOF
) {
1953 zero_cnt
= newEOF
- headOff
;
1957 if (flags
& IO_TAILZEROFILL
) {
1959 // LP64todo - fix this
1960 zero_off1
= uio
->uio_offset
+ uio_resid(uio
);
1962 if (zero_off1
< tailOff
)
1963 zero_cnt1
= tailOff
- zero_off1
;
1966 if (zero_cnt
== 0 && uio
== (struct uio
*) 0) {
1967 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_END
,
1968 retval
, 0, 0, 0, 0);
1972 while ((total_size
= (io_resid
+ zero_cnt
+ zero_cnt1
)) && retval
== 0) {
1974 * for this iteration of the loop, figure out where our starting point is
1977 start_offset
= (int)(zero_off
& PAGE_MASK_64
);
1978 upl_f_offset
= zero_off
- start_offset
;
1979 } else if (io_resid
) {
1980 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
1981 upl_f_offset
= uio
->uio_offset
- start_offset
;
1983 start_offset
= (int)(zero_off1
& PAGE_MASK_64
);
1984 upl_f_offset
= zero_off1
- start_offset
;
1986 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 46)) | DBG_FUNC_NONE
,
1987 (int)zero_off
, (int)zero_cnt
, (int)zero_off1
, (int)zero_cnt1
, 0);
1989 if (total_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
1990 total_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
1992 cl
.b_addr
= (daddr64_t
)(upl_f_offset
/ PAGE_SIZE_64
);
1994 if (uio
&& ((flags
& (IO_NOCACHE
| IO_SYNC
| IO_HEADZEROFILL
| IO_TAILZEROFILL
)) == 0)) {
1996 * assumption... total_size <= io_resid
1997 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1999 if ((start_offset
+ total_size
) > (MAX_UPL_TRANSFER
* PAGE_SIZE
))
2000 total_size
-= start_offset
;
2001 xfer_resid
= total_size
;
2003 retval
= cluster_copy_ubc_data(vp
, uio
, &xfer_resid
, 1);
2008 io_resid
-= (total_size
- xfer_resid
);
2009 total_size
= xfer_resid
;
2010 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
2011 upl_f_offset
= uio
->uio_offset
- start_offset
;
2013 if (total_size
== 0) {
2016 * the write did not finish on a page boundary
2017 * which will leave upl_f_offset pointing to the
2018 * beginning of the last page written instead of
2019 * the page beyond it... bump it in this case
2020 * so that the cluster code records the last page
2023 upl_f_offset
+= PAGE_SIZE_64
;
2031 * compute the size of the upl needed to encompass
2032 * the requested write... limit each call to cluster_io
2033 * to the maximum UPL size... cluster_io will clip if
2034 * this exceeds the maximum io_size for the device,
2035 * make sure to account for
2036 * a starting offset that's not page aligned
2038 upl_size
= (start_offset
+ total_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
2040 if (upl_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
2041 upl_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
2043 pages_in_upl
= upl_size
/ PAGE_SIZE
;
2044 io_size
= upl_size
- start_offset
;
2046 if ((long long)io_size
> total_size
)
2047 io_size
= total_size
;
2049 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, io_size
, total_size
, 0, 0);
2053 * Gather the pages from the buffer cache.
2054 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2055 * that we intend to modify these pages.
2057 kret
= ubc_create_upl(vp
,
2062 UPL_SET_LITE
| UPL_WILL_MODIFY
);
2063 if (kret
!= KERN_SUCCESS
)
2064 panic("cluster_write: failed to get pagelist");
2066 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
,
2067 (int)upl
, (int)upl_f_offset
, start_offset
, 0, 0);
2069 if (start_offset
&& !upl_valid_page(pl
, 0)) {
2073 * we're starting in the middle of the first page of the upl
2074 * and the page isn't currently valid, so we're going to have
2075 * to read it in first... this is a synchronous operation
2077 read_size
= PAGE_SIZE
;
2079 if ((upl_f_offset
+ read_size
) > newEOF
)
2080 read_size
= newEOF
- upl_f_offset
;
2082 retval
= cluster_io(vp
, upl
, 0, upl_f_offset
, read_size
,
2083 CL_READ
, (buf_t
)NULL
, (struct clios
*)NULL
);
2086 * we had an error during the read which causes us to abort
2087 * the current cluster_write request... before we do, we need
2088 * to release the rest of the pages in the upl without modifying
2089 * there state and mark the failed page in error
2091 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
);
2093 if (upl_size
> PAGE_SIZE
)
2094 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
2096 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 45)) | DBG_FUNC_NONE
,
2097 (int)upl
, 0, 0, retval
, 0);
	if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
		int	read_size;

		/*
		 * the last offset we're writing to in this upl does not end on a page
		 * boundary... if it's not beyond the old EOF, then we'll also need to
		 * pre-read this page in if it isn't already valid
		 */
		upl_offset = upl_size - PAGE_SIZE;

		if ((upl_f_offset + start_offset + io_size) < oldEOF &&
		    !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {

			read_size = PAGE_SIZE;

			if ((upl_f_offset + upl_offset + read_size) > newEOF)
				read_size = newEOF - (upl_f_offset + upl_offset);

			retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
					    CL_READ, (buf_t)NULL, (struct clios *)NULL);
			if (retval) {
				/*
				 * we had an error during the read which causes us to abort
				 * the current cluster_write request... before we do, we
				 * need to release the rest of the pages in the upl without
				 * modifying their state and mark the failed page in error
				 */
				ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);

				if (upl_size > PAGE_SIZE)
					ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
					     (int)upl, 0, 0, retval, 0);
				return (retval);
			}
		}
	}
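	/*
	 * Illustrative sketch (not part of the original source): the two
	 * pre-reads above implement a read-modify-write for partially written
	 * pages.  The hypothetical predicate below restates the test used for
	 * the trailing page (ignoring the upl_valid_page() check for brevity):
	 * a pre-read is only needed when the write ends mid-page and that page
	 * sits below the old EOF, so its existing contents still matter.
	 */
#if 0	/* example only */
	static int
	example_needs_tail_preread(off_t upl_f_offset, int start_offset, int io_size, off_t oldEOF)
	{
		/* write ends exactly on a page boundary: nothing to preserve */
		if (((start_offset + io_size) & PAGE_MASK) == 0)
			return (0);
		/* partially written page lies entirely beyond the old EOF */
		if ((upl_f_offset + start_offset + io_size) >= oldEOF)
			return (0);
		/* otherwise the page's current contents must be read in first */
		return (1);
	}
#endif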
	xfer_resid = io_size;
	io_offset = start_offset;

	while (zero_cnt && xfer_resid) {

		if (zero_cnt < (long long)xfer_resid)
			bytes_to_zero = zero_cnt;
		else
			bytes_to_zero = xfer_resid;

		if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
			cluster_zero(upl, io_offset, bytes_to_zero, NULL);
		} else {
			int zero_pg_index;

			bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
			zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);

			if ( !upl_valid_page(pl, zero_pg_index)) {
				cluster_zero(upl, io_offset, bytes_to_zero, NULL);
			} else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
				   !upl_dirty_page(pl, zero_pg_index)) {
				cluster_zero(upl, io_offset, bytes_to_zero, NULL);
			}
		}
		xfer_resid -= bytes_to_zero;
		zero_cnt   -= bytes_to_zero;
		zero_off   += bytes_to_zero;
		io_offset  += bytes_to_zero;
	}
	if (xfer_resid && io_resid) {
		bytes_to_move = min(io_resid, xfer_resid);

		retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);

		if (retval) {

			ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
				     (int)upl, 0, 0, retval, 0);
		} else {
			io_resid   -= bytes_to_move;
			xfer_resid -= bytes_to_move;
			io_offset  += bytes_to_move;
		}
	}
	while (xfer_resid && zero_cnt1 && retval == 0) {

		if (zero_cnt1 < (long long)xfer_resid)
			bytes_to_zero = zero_cnt1;
		else
			bytes_to_zero = xfer_resid;

		if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
			cluster_zero(upl, io_offset, bytes_to_zero, NULL);
		} else {
			int zero_pg_index;

			bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
			zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);

			if ( !upl_valid_page(pl, zero_pg_index)) {
				cluster_zero(upl, io_offset, bytes_to_zero, NULL);
			} else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
				   !upl_dirty_page(pl, zero_pg_index)) {
				cluster_zero(upl, io_offset, bytes_to_zero, NULL);
			}
		}
		xfer_resid -= bytes_to_zero;
		zero_cnt1  -= bytes_to_zero;
		zero_off1  += bytes_to_zero;
		io_offset  += bytes_to_zero;
	}
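	/*
	 * Illustrative sketch (not part of the original source): the two
	 * zero-fill loops above only scribble zeroes into pages whose current
	 * contents can't be trusted.  The hypothetical helper below restates
	 * that per-page decision using the same IO_NOZEROVALID/IO_NOZERODIRTY
	 * flags and the upl_valid_page()/upl_dirty_page() queries.
	 */
#if 0	/* example only */
	static int
	example_should_zero(int flags, upl_page_info_t *pl, int pg_index)
	{
		if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY)))
			return (1);	/* caller asked for unconditional zero-fill */
		if ( !upl_valid_page(pl, pg_index))
			return (1);	/* page has no valid contents yet */
		if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
		    !upl_dirty_page(pl, pg_index))
			return (1);	/* valid but clean page may still be zeroed */
		return (0);		/* leave the page's contents alone */
	}
#endif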
	io_size += start_offset;

	if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
		/*
		 * if we're extending the file with this write
		 * we'll zero fill the rest of the page so that
		 * if the file gets extended again in such a way as to leave a
		 * hole starting at this EOF, we'll have zeroes in the correct spot
		 */
		cluster_zero(upl, io_size, upl_size - io_size, NULL);
	}
	if (flags & IO_SYNC)
		/*
		 * if the IO_SYNC flag is set then we need to
		 * bypass any clusters and immediately issue
		 * the I/O
		 */
		goto issue_io;

	/*
	 * take the lock to protect our accesses
	 * of the writebehind and sparse cluster state
	 */
	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

	/*
	 * calculate the last logical block number
	 * that this delayed I/O encompassed
	 */
	cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
	if (wbp->cl_scmap) {

		if ( !(flags & IO_NOCACHE)) {
			/*
			 * we've fallen into the sparse
			 * cluster method of delaying dirty pages
			 * first, we need to release the upl if we hold one
			 * since pages in it may be present in the sparse cluster map
			 * and may span 2 separate buckets there... if they do and
			 * we happen to have to flush a bucket to make room and it intersects
			 * this upl, a deadlock may result on page BUSY
			 */
			if (upl_size)
				ubc_upl_commit_range(upl, 0, upl_size,
						     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

			sparse_cluster_add(wbp, vp, &cl, newEOF);

			lck_mtx_unlock(&wbp->cl_lockw);

			continue;
		}
		/*
		 * must have done cached writes that fell into
		 * the sparse cluster mechanism... we've switched
		 * to uncached writes on the file, so go ahead
		 * and push whatever's in the sparse map
		 * and switch back to normal clustering
		 *
		 * see the comment above concerning a possible deadlock...
		 */
		if (upl_size)
			ubc_upl_commit_range(upl, 0, upl_size,
					     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
		/*
		 * setting upl_size to 0 keeps us from committing a
		 * second time in the start_new_cluster path
		 */
		upl_size = 0;

		sparse_cluster_push(wbp, vp, newEOF, 1);

		/*
		 * no clusters of either type present at this point
		 * so just go directly to start_new_cluster since
		 * we know we need to delay this I/O since we've
		 * already released the pages back into the cache
		 * to avoid the deadlock with sparse_cluster_push
		 */
		goto start_new_cluster;
	}
	if (wbp->cl_number == 0)
		/*
		 * no clusters currently present
		 */
		goto start_new_cluster;
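	/*
	 * Illustrative sketch (not part of the original source): cl.b_addr and
	 * cl.e_addr above are page indexes, not byte offsets, and e_addr is
	 * exclusive (one past the last page touched).  The hypothetical helper
	 * below shows the byte-range to cl_extent conversion this code relies on.
	 */
#if 0	/* example only */
	static void
	example_extent_from_range(struct cl_extent *cl, off_t upl_f_offset, u_int upl_size)
	{
		/* first page covered by the range */
		cl->b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
		/* one past the last page covered by the range */
		cl->e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
	}
#endif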
2309 for (cl_index
= 0; cl_index
< wbp
->cl_number
; cl_index
++) {
2311 * check each cluster that we currently hold
2312 * try to merge some or all of this write into
2313 * one or more of the existing clusters... if
2314 * any portion of the write remains, start a
2317 if (cl
.b_addr
>= wbp
->cl_clusters
[cl_index
].b_addr
) {
2319 * the current write starts at or after the current cluster
2321 if (cl
.e_addr
<= (wbp
->cl_clusters
[cl_index
].b_addr
+ MAX_UPL_TRANSFER
)) {
2323 * we have a write that fits entirely
2324 * within the existing cluster limits
2326 if (cl
.e_addr
> wbp
->cl_clusters
[cl_index
].e_addr
)
2328 * update our idea of where the cluster ends
2330 wbp
->cl_clusters
[cl_index
].e_addr
= cl
.e_addr
;
2333 if (cl
.b_addr
< (wbp
->cl_clusters
[cl_index
].b_addr
+ MAX_UPL_TRANSFER
)) {
2335 * we have a write that starts in the middle of the current cluster
2336 * but extends beyond the cluster's limit... we know this because
2337 * of the previous checks
2338 * we'll extend the current cluster to the max
2339 * and update the b_addr for the current write to reflect that
2340 * the head of it was absorbed into this cluster...
2341 * note that we'll always have a leftover tail in this case since
2342 * full absorbtion would have occurred in the clause above
2344 wbp
->cl_clusters
[cl_index
].e_addr
= wbp
->cl_clusters
[cl_index
].b_addr
+ MAX_UPL_TRANSFER
;
2347 daddr64_t start_pg_in_upl
;
2349 start_pg_in_upl
= (daddr64_t
)(upl_f_offset
/ PAGE_SIZE_64
);
2351 if (start_pg_in_upl
< wbp
->cl_clusters
[cl_index
].e_addr
) {
2352 intersection
= (int)((wbp
->cl_clusters
[cl_index
].e_addr
- start_pg_in_upl
) * PAGE_SIZE
);
2354 ubc_upl_commit_range(upl
, upl_offset
, intersection
,
2355 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2356 upl_f_offset
+= intersection
;
2357 upl_offset
+= intersection
;
2358 upl_size
-= intersection
;
2361 cl
.b_addr
= wbp
->cl_clusters
[cl_index
].e_addr
;
2364 * we come here for the case where the current write starts
2365 * beyond the limit of the existing cluster or we have a leftover
2366 * tail after a partial absorbtion
2368 * in either case, we'll check the remaining clusters before
2369 * starting a new one
2373 * the current write starts in front of the cluster we're currently considering
2375 if ((wbp
->cl_clusters
[cl_index
].e_addr
- cl
.b_addr
) <= MAX_UPL_TRANSFER
) {
2377 * we can just merge the new request into
2378 * this cluster and leave it in the cache
2379 * since the resulting cluster is still
2380 * less than the maximum allowable size
2382 wbp
->cl_clusters
[cl_index
].b_addr
= cl
.b_addr
;
2384 if (cl
.e_addr
> wbp
->cl_clusters
[cl_index
].e_addr
) {
2386 * the current write completely
2387 * envelops the existing cluster and since
2388 * each write is limited to at most MAX_UPL_TRANSFER bytes
2389 * we can just use the start and last blocknos of the write
2390 * to generate the cluster limits
2392 wbp
->cl_clusters
[cl_index
].e_addr
= cl
.e_addr
;
2398 * if we were to combine this write with the current cluster
2399 * we would exceed the cluster size limit.... so,
2400 * let's see if there's any overlap of the new I/O with
2401 * the cluster we're currently considering... in fact, we'll
2402 * stretch the cluster out to it's full limit and see if we
2403 * get an intersection with the current write
2406 if (cl
.e_addr
> wbp
->cl_clusters
[cl_index
].e_addr
- MAX_UPL_TRANSFER
) {
2408 * the current write extends into the proposed cluster
2409 * clip the length of the current write after first combining it's
2410 * tail with the newly shaped cluster
2412 wbp
->cl_clusters
[cl_index
].b_addr
= wbp
->cl_clusters
[cl_index
].e_addr
- MAX_UPL_TRANSFER
;
2415 intersection
= (int)((cl
.e_addr
- wbp
->cl_clusters
[cl_index
].b_addr
) * PAGE_SIZE
);
2417 if (intersection
> upl_size
)
2419 * because the current write may consist of a number of pages found in the cache
2420 * which are not part of the UPL, we may have an intersection that exceeds
2421 * the size of the UPL that is also part of this write
2423 intersection
= upl_size
;
2425 ubc_upl_commit_range(upl
, upl_offset
+ (upl_size
- intersection
), intersection
,
2426 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2427 upl_size
-= intersection
;
2429 cl
.e_addr
= wbp
->cl_clusters
[cl_index
].b_addr
;
2432 * if we get here, there was no way to merge
2433 * any portion of this write with this cluster
2434 * or we could only merge part of it which
2435 * will leave a tail...
2436 * we'll check the remaining clusters before starting a new one
2440 if (cl_index
< wbp
->cl_number
)
2442 * we found an existing cluster(s) that we
2443 * could entirely merge this I/O into
2447 if (wbp
->cl_number
< MAX_CLUSTERS
&& !(flags
& IO_NOCACHE
))
2449 * we didn't find an existing cluster to
2450 * merge into, but there's room to start
2453 goto start_new_cluster
;
2456 * no exisitng cluster to merge with and no
2457 * room to start a new one... we'll try
2458 * pushing one of the existing ones... if none of
2459 * them are able to be pushed, we'll switch
2460 * to the sparse cluster mechanism
2461 * cluster_try_push updates cl_number to the
2462 * number of remaining clusters... and
2463 * returns the number of currently unused clusters
2465 int ret_cluster_try_push
= 0;
2466 /* if writes are not deferred, call cluster push immediately */
2467 if (!((unsigned int)vfs_flags(vp
->v_mount
) & MNT_DEFWRITE
)) {
2468 if (flags
& IO_NOCACHE
)
2473 ret_cluster_try_push
= cluster_try_push(wbp
, vp
, newEOF
, can_delay
, 0);
2476 /* execute following regardless writes are deferred or not */
2477 if (ret_cluster_try_push
== 0) {
2479 * no more room in the normal cluster mechanism
2480 * so let's switch to the more expansive but expensive
2481 * sparse mechanism....
2482 * first, we need to release the upl if we hold one
2483 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2484 * and may span 2 separate buckets there... if they do and
2485 * we happen to have to flush a bucket to make room and it intersects
2486 * this upl, a deadlock may result on page BUSY
2489 ubc_upl_commit_range(upl
, upl_offset
, upl_size
,
2490 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2492 sparse_cluster_switch(wbp
, vp
, newEOF
);
2493 sparse_cluster_add(wbp
, vp
, &cl
, newEOF
);
2495 lck_mtx_unlock(&wbp
->cl_lockw
);
2500 * we pushed one cluster successfully, so we must be sequentially writing this file
2501 * otherwise, we would have failed and fallen into the sparse cluster support
2502 * so let's take the opportunity to push out additional clusters as long as we
2503 * remain below the throttle... this will give us better I/O locality if we're
2504 * in a copy loop (i.e. we won't jump back and forth between the read and write points
2505 * however, we don't want to push so much out that the write throttle kicks in and
2506 * hangs this thread up until some of the I/O completes...
2508 if (!((unsigned int)vfs_flags(vp
->v_mount
) & MNT_DEFWRITE
)) {
2509 while (wbp
->cl_number
&& (vp
->v_numoutput
<= (VNODE_ASYNC_THROTTLE
/ 2)))
2510 cluster_try_push(wbp
, vp
, newEOF
, 0, 0);
2514 wbp
->cl_clusters
[wbp
->cl_number
].b_addr
= cl
.b_addr
;
2515 wbp
->cl_clusters
[wbp
->cl_number
].e_addr
= cl
.e_addr
;
2517 if (flags
& IO_NOCACHE
)
2518 wbp
->cl_clusters
[wbp
->cl_number
].io_nocache
= 1;
2520 wbp
->cl_clusters
[wbp
->cl_number
].io_nocache
= 0;
2524 ubc_upl_commit_range(upl
, upl_offset
, upl_size
,
2525 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2527 lck_mtx_unlock(&wbp
->cl_lockw
);
2532 * we don't hold the vnode lock at this point
2534 * because we had to ask for a UPL that provides currenty non-present pages, the
2535 * UPL has been automatically set to clear the dirty flags (both software and hardware)
2536 * upon committing it... this is not the behavior we want since it's possible for
2537 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
2538 * in order to maintain some semblance of coherency with mapped writes
2539 * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2540 * so that we correctly deal with a change in state of the hardware modify bit...
2541 * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
2542 * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
2543 * responsible for generating the correct sized I/O(s)
2545 ubc_upl_commit_range(upl
, 0, upl_size
,
2546 UPL_COMMIT_SET_DIRTY
| UPL_COMMIT_INACTIVATE
| UPL_COMMIT_FREE_ON_EMPTY
);
2548 cl
.e_addr
= (upl_f_offset
+ (off_t
)upl_size
) / PAGE_SIZE_64
;
2550 retval
= cluster_push_x(vp
, &cl
, newEOF
, flags
);
2553 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 40)) | DBG_FUNC_END
,
2554 retval
, 0, io_resid
, 0, 0);
2560 cluster_read(vnode_t vp
, struct uio
*uio
, off_t filesize
, int xflags
)
2573 if (vp
->v_flag
& VNOCACHE_DATA
)
2574 flags
|= IO_NOCACHE
;
2575 if (vp
->v_flag
& VRAOFF
)
2578 if (!((flags
& IO_NOCACHE
) && UIO_SEG_IS_USER_SPACE(uio
->uio_segflg
))) {
2580 * go do a read through the cache if one of the following is true....
2581 * NOCACHE is not true
2582 * the uio request doesn't target USERSPACE
2584 return (cluster_read_x(vp
, uio
, filesize
, flags
));
2588 if (IS_VALID_UIO_SEGFLG(uio
->uio_segflg
) == 0) {
2589 panic("%s :%d - invalid uio_segflg\n", __FILE__
, __LINE__
);
2591 #endif /* LP64_DEBUG */
2593 while (uio_resid(uio
) && uio
->uio_offset
< filesize
&& retval
== 0) {
2598 * we know we have a resid, so this is safe
2599 * skip over any emtpy vectors
2601 iov_len
= uio_iov_len(uio
);
2603 while (iov_len
== 0) {
2606 iov_len
= uio_iov_len(uio
);
2608 iov_base
= uio_iov_base(uio
);
2609 upl_size
= PAGE_SIZE
;
2610 upl_flags
= UPL_QUERY_OBJECT_TYPE
;
2612 // LP64todo - fix this!
2613 if ((vm_map_get_upl(current_map(),
2614 CAST_DOWN(vm_offset_t
, iov_base
) & ~PAGE_MASK
,
2615 &upl_size
, &upl
, NULL
, NULL
, &upl_flags
, 0)) != KERN_SUCCESS
) {
2617 * the user app must have passed in an invalid address
2623 * We check every vector target but if it is physically
2624 * contiguous space, we skip the sanity checks.
2626 if (upl_flags
& UPL_PHYS_CONTIG
) {
2627 retval
= cluster_phys_read(vp
, uio
, filesize
);
2629 else if (uio_resid(uio
) < PAGE_SIZE
) {
2631 * we're here because we're don't have a physically contiguous target buffer
2632 * go do a read through the cache if
2633 * the total xfer size is less than a page...
2635 return (cluster_read_x(vp
, uio
, filesize
, flags
));
2637 // LP64todo - fix this!
2638 else if (((int)uio
->uio_offset
& PAGE_MASK
) || (CAST_DOWN(int, iov_base
) & PAGE_MASK
)) {
2639 if (((int)uio
->uio_offset
& PAGE_MASK
) == (CAST_DOWN(int, iov_base
) & PAGE_MASK
)) {
2641 * Bring the file offset read up to a pagesize boundary
2642 * this will also bring the base address to a page boundary
2643 * since they both are currently on the same offset within a page
2644 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2645 * so the computed clip_size must always be less than the current uio_resid
2647 clip_size
= (PAGE_SIZE
- (int)(uio
->uio_offset
& PAGE_MASK_64
));
2650 * Fake the resid going into the cluster_read_x call
2651 * and restore it on the way out.
2653 prev_resid
= uio_resid(uio
);
2654 // LP64todo - fix this
2655 uio_setresid(uio
, clip_size
);
2657 retval
= cluster_read_x(vp
, uio
, filesize
, flags
);
2659 uio_setresid(uio
, prev_resid
- (clip_size
- uio_resid(uio
)));
2662 * can't get both the file offset and the buffer offset aligned to a page boundary
2663 * so fire an I/O through the cache for this entire vector
2665 // LP64todo - fix this!
2666 clip_size
= iov_len
;
2667 prev_resid
= uio_resid(uio
);
2668 uio_setresid(uio
, clip_size
);
2670 retval
= cluster_read_x(vp
, uio
, filesize
, flags
);
2672 uio_setresid(uio
, prev_resid
- (clip_size
- uio_resid(uio
)));
2676 * If we come in here, we know the offset into
2677 * the file is on a pagesize boundary
2679 max_io_size
= filesize
- uio
->uio_offset
;
2680 // LP64todo - fix this
2681 clip_size
= uio_resid(uio
);
2682 if (iov_len
< clip_size
)
2683 clip_size
= iov_len
;
2684 if (max_io_size
< clip_size
)
2685 clip_size
= (int)max_io_size
;
2687 if (clip_size
< PAGE_SIZE
) {
2689 * Take care of the tail end of the read in this vector.
2691 // LP64todo - fix this
2692 prev_resid
= uio_resid(uio
);
2693 uio_setresid(uio
, clip_size
);
2695 retval
= cluster_read_x(vp
, uio
, filesize
, flags
);
2697 uio_setresid(uio
, prev_resid
- (clip_size
- uio_resid(uio
)));
2699 /* round clip_size down to a multiple of pagesize */
2700 clip_size
= clip_size
& ~(PAGE_MASK
);
2701 // LP64todo - fix this
2702 prev_resid
= uio_resid(uio
);
2703 uio_setresid(uio
, clip_size
);
2705 retval
= cluster_nocopy_read(vp
, uio
, filesize
);
2707 if ((retval
==0) && uio_resid(uio
))
2708 retval
= cluster_read_x(vp
, uio
, filesize
, flags
);
2710 uio_setresid(uio
, prev_resid
- (clip_size
- uio_resid(uio
)));
2719 cluster_read_x(vnode_t vp
, struct uio
*uio
, off_t filesize
, int flags
)
2721 upl_page_info_t
*pl
;
2723 vm_offset_t upl_offset
;
2732 off_t last_ioread_offset
;
2733 off_t last_request_offset
;
2734 u_int size_of_prefetch
;
2739 u_int max_rd_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
2740 u_int rd_ahead_enabled
= 1;
2741 u_int prefetch_enabled
= 1;
2742 struct cl_readahead
* rap
;
2743 struct clios iostate
;
2744 struct cl_extent extent
;
2746 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_START
,
2747 (int)uio
->uio_offset
, uio_resid(uio
), (int)filesize
, 0, 0);
2749 // LP64todo - fix this
2750 last_request_offset
= uio
->uio_offset
+ uio_resid(uio
);
2752 if ((flags
& (IO_RAOFF
|IO_NOCACHE
)) ||
2753 ((last_request_offset
& ~PAGE_MASK_64
) == (uio
->uio_offset
& ~PAGE_MASK_64
))) {
2754 rd_ahead_enabled
= 0;
2757 if (cluster_hard_throttle_on(vp
)) {
2758 rd_ahead_enabled
= 0;
2759 prefetch_enabled
= 0;
2761 max_rd_size
= HARD_THROTTLE_MAXSIZE
;
2763 if ((rap
= cluster_get_rap(vp
)) == NULL
)
2764 rd_ahead_enabled
= 0;
2766 if (last_request_offset
> filesize
)
2767 last_request_offset
= filesize
;
2768 extent
.b_addr
= uio
->uio_offset
/ PAGE_SIZE_64
;
2769 extent
.e_addr
= (last_request_offset
- 1) / PAGE_SIZE_64
;
2771 if (rap
!= NULL
&& rap
->cl_ralen
&& (rap
->cl_lastr
== extent
.b_addr
|| (rap
->cl_lastr
+ 1) == extent
.b_addr
)) {
2773 * determine if we already have a read-ahead in the pipe courtesy of the
2774 * last read systemcall that was issued...
2775 * if so, pick up it's extent to determine where we should start
2776 * with respect to any read-ahead that might be necessary to
2777 * garner all the data needed to complete this read systemcall
2779 last_ioread_offset
= (rap
->cl_maxra
* PAGE_SIZE_64
) + PAGE_SIZE_64
;
2781 if (last_ioread_offset
< uio
->uio_offset
)
2782 last_ioread_offset
= (off_t
)0;
2783 else if (last_ioread_offset
> last_request_offset
)
2784 last_ioread_offset
= last_request_offset
;
2786 last_ioread_offset
= (off_t
)0;
2788 while (uio_resid(uio
) && uio
->uio_offset
< filesize
&& retval
== 0) {
2790 * compute the size of the upl needed to encompass
2791 * the requested read... limit each call to cluster_io
2792 * to the maximum UPL size... cluster_io will clip if
2793 * this exceeds the maximum io_size for the device,
2794 * make sure to account for
2795 * a starting offset that's not page aligned
2797 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
2798 upl_f_offset
= uio
->uio_offset
- (off_t
)start_offset
;
2799 max_size
= filesize
- uio
->uio_offset
;
2801 // LP64todo - fix this!
2802 if ((off_t
)((unsigned int)uio_resid(uio
)) < max_size
)
2803 io_size
= uio_resid(uio
);
2807 if (!(flags
& IO_NOCACHE
)) {
2814 * if we keep finding the pages we need already in the cache, then
2815 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2816 * to determine that we have all the pages we need... once we miss in
2817 * the cache and have issued an I/O, than we'll assume that we're likely
2818 * to continue to miss in the cache and it's to our advantage to try and prefetch
2820 if (last_request_offset
&& last_ioread_offset
&& (size_of_prefetch
= (last_request_offset
- last_ioread_offset
))) {
2821 if ((last_ioread_offset
- uio
->uio_offset
) <= max_rd_size
&& prefetch_enabled
) {
2823 * we've already issued I/O for this request and
2824 * there's still work to do and
2825 * our prefetch stream is running dry, so issue a
2826 * pre-fetch I/O... the I/O latency will overlap
2827 * with the copying of the data
2829 if (size_of_prefetch
> max_rd_size
)
2830 size_of_prefetch
= max_rd_size
;
2832 size_of_prefetch
= cluster_rd_prefetch(vp
, last_ioread_offset
, size_of_prefetch
, filesize
);
2834 last_ioread_offset
+= (off_t
)(size_of_prefetch
* PAGE_SIZE
);
2836 if (last_ioread_offset
> last_request_offset
)
2837 last_ioread_offset
= last_request_offset
;
2841 * limit the size of the copy we're about to do so that
2842 * we can notice that our I/O pipe is running dry and
2843 * get the next I/O issued before it does go dry
2845 if (last_ioread_offset
&& io_size
> ((MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4))
2846 io_resid
= ((MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4);
2850 io_requested
= io_resid
;
2852 retval
= cluster_copy_ubc_data(vp
, uio
, &io_resid
, 0);
2854 io_size
-= (io_requested
- io_resid
);
2856 if (retval
|| io_resid
)
2858 * if we run into a real error or
2859 * a page that is not in the cache
2860 * we need to leave streaming mode
2864 if ((io_size
== 0 || last_ioread_offset
== last_request_offset
) && rd_ahead_enabled
) {
2866 * we're already finished the I/O for this read request
2867 * let's see if we should do a read-ahead
2869 cluster_rd_ahead(vp
, &extent
, filesize
, rap
);
2876 if (extent
.e_addr
< rap
->cl_lastr
)
2878 rap
->cl_lastr
= extent
.e_addr
;
2882 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
2883 upl_f_offset
= uio
->uio_offset
- (off_t
)start_offset
;
2884 max_size
= filesize
- uio
->uio_offset
;
2886 if (io_size
> max_rd_size
)
2887 io_size
= max_rd_size
;
2889 upl_size
= (start_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
2891 if (upl_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4)
2892 upl_size
= (MAX_UPL_TRANSFER
* PAGE_SIZE
) / 4;
2893 pages_in_upl
= upl_size
/ PAGE_SIZE
;
2895 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 33)) | DBG_FUNC_START
,
2896 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
2898 kret
= ubc_create_upl(vp
,
2904 if (kret
!= KERN_SUCCESS
)
2905 panic("cluster_read: failed to get pagelist");
2907 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 33)) | DBG_FUNC_END
,
2908 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
2911 * scan from the beginning of the upl looking for the first
2912 * non-valid page.... this will become the first page in
2913 * the request we're going to make to 'cluster_io'... if all
2914 * of the pages are valid, we won't call through to 'cluster_io'
2916 for (start_pg
= 0; start_pg
< pages_in_upl
; start_pg
++) {
2917 if (!upl_valid_page(pl
, start_pg
))
2922 * scan from the starting invalid page looking for a valid
2923 * page before the end of the upl is reached, if we
2924 * find one, then it will be the last page of the request to
2927 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
2928 if (upl_valid_page(pl
, last_pg
))
2931 iostate
.io_completed
= 0;
2932 iostate
.io_issued
= 0;
2933 iostate
.io_error
= 0;
2934 iostate
.io_wanted
= 0;
2936 if (start_pg
< last_pg
) {
2938 * we found a range of 'invalid' pages that must be filled
2939 * if the last page in this range is the last page of the file
2940 * we may have to clip the size of it to keep from reading past
2941 * the end of the last physical block associated with the file
2943 upl_offset
= start_pg
* PAGE_SIZE
;
2944 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
2946 if ((upl_f_offset
+ upl_offset
+ io_size
) > filesize
)
2947 io_size
= filesize
- (upl_f_offset
+ upl_offset
);
2950 * issue an asynchronous read to cluster_io
2953 error
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
,
2954 io_size
, CL_READ
| CL_ASYNC
, (buf_t
)NULL
, &iostate
);
2958 * if the read completed successfully, or there was no I/O request
2959 * issued, than copy the data into user land via 'cluster_upl_copy_data'
2960 * we'll first add on any 'valid'
2961 * pages that were present in the upl when we acquired it.
2965 for (uio_last
= last_pg
; uio_last
< pages_in_upl
; uio_last
++) {
2966 if (!upl_valid_page(pl
, uio_last
))
2970 * compute size to transfer this round, if uio->uio_resid is
2971 * still non-zero after this attempt, we'll loop around and
2972 * set up for another I/O.
2974 val_size
= (uio_last
* PAGE_SIZE
) - start_offset
;
2976 if (val_size
> max_size
)
2977 val_size
= max_size
;
2979 if (val_size
> uio_resid(uio
))
2980 // LP64todo - fix this
2981 val_size
= uio_resid(uio
);
2983 if (last_ioread_offset
== 0)
2984 last_ioread_offset
= uio
->uio_offset
+ val_size
;
2986 if ((size_of_prefetch
= (last_request_offset
- last_ioread_offset
)) && prefetch_enabled
) {
2988 * if there's still I/O left to do for this request, and...
2989 * we're not in hard throttle mode, then issue a
2990 * pre-fetch I/O... the I/O latency will overlap
2991 * with the copying of the data
2993 size_of_prefetch
= cluster_rd_prefetch(vp
, last_ioread_offset
, size_of_prefetch
, filesize
);
2995 last_ioread_offset
+= (off_t
)(size_of_prefetch
* PAGE_SIZE
);
2997 if (last_ioread_offset
> last_request_offset
)
2998 last_ioread_offset
= last_request_offset
;
3000 } else if ((uio
->uio_offset
+ val_size
) == last_request_offset
) {
3002 * this transfer will finish this request, so...
3003 * let's try to read ahead if we're in
3004 * a sequential access pattern and we haven't
3005 * explicitly disabled it
3007 if (rd_ahead_enabled
)
3008 cluster_rd_ahead(vp
, &extent
, filesize
, rap
);
3011 if (extent
.e_addr
< rap
->cl_lastr
)
3013 rap
->cl_lastr
= extent
.e_addr
;
3016 lck_mtx_lock(cl_mtxp
);
3018 while (iostate
.io_issued
!= iostate
.io_completed
) {
3019 iostate
.io_wanted
= 1;
3020 msleep((caddr_t
)&iostate
.io_wanted
, cl_mtxp
, PRIBIO
+ 1, "cluster_read_x", 0);
3022 lck_mtx_unlock(cl_mtxp
);
3024 if (iostate
.io_error
)
3025 error
= iostate
.io_error
;
3027 retval
= cluster_copy_upl_data(uio
, upl
, start_offset
, val_size
);
3029 if (start_pg
< last_pg
) {
3031 * compute the range of pages that we actually issued an I/O for
3032 * and either commit them as valid if the I/O succeeded
3033 * or abort them if the I/O failed
3035 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
3037 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_START
,
3038 (int)upl
, start_pg
* PAGE_SIZE
, io_size
, error
, 0);
3040 if (error
|| (flags
& IO_NOCACHE
))
3041 ubc_upl_abort_range(upl
, start_pg
* PAGE_SIZE
, io_size
,
3042 UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
3044 ubc_upl_commit_range(upl
, start_pg
* PAGE_SIZE
, io_size
,
3045 UPL_COMMIT_CLEAR_DIRTY
|
3046 UPL_COMMIT_FREE_ON_EMPTY
|
3047 UPL_COMMIT_INACTIVATE
);
3049 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_END
,
3050 (int)upl
, start_pg
* PAGE_SIZE
, io_size
, error
, 0);
3052 if ((last_pg
- start_pg
) < pages_in_upl
) {
3057 * the set of pages that we issued an I/O for did not encompass
3058 * the entire upl... so just release these without modifying
3062 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
3064 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_START
,
3065 (int)upl
, -1, pages_in_upl
- (last_pg
- start_pg
), 0, 0);
3069 * we found some already valid pages at the beginning of
3070 * the upl commit these back to the inactive list with
3073 for (cur_pg
= 0; cur_pg
< start_pg
; cur_pg
++) {
3074 commit_flags
= UPL_COMMIT_FREE_ON_EMPTY
3075 | UPL_COMMIT_INACTIVATE
;
3077 if (upl_dirty_page(pl
, cur_pg
))
3078 commit_flags
|= UPL_COMMIT_SET_DIRTY
;
3080 if ( !(commit_flags
& UPL_COMMIT_SET_DIRTY
) && (flags
& IO_NOCACHE
))
3081 ubc_upl_abort_range(upl
, cur_pg
* PAGE_SIZE
, PAGE_SIZE
,
3082 UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
3084 ubc_upl_commit_range(upl
, cur_pg
* PAGE_SIZE
,
3085 PAGE_SIZE
, commit_flags
);
3088 if (last_pg
< uio_last
) {
3090 * we found some already valid pages immediately after the
3091 * pages we issued I/O for, commit these back to the
3092 * inactive list with reference cleared
3094 for (cur_pg
= last_pg
; cur_pg
< uio_last
; cur_pg
++) {
3095 commit_flags
= UPL_COMMIT_FREE_ON_EMPTY
3096 | UPL_COMMIT_INACTIVATE
;
3098 if (upl_dirty_page(pl
, cur_pg
))
3099 commit_flags
|= UPL_COMMIT_SET_DIRTY
;
3101 if ( !(commit_flags
& UPL_COMMIT_SET_DIRTY
) && (flags
& IO_NOCACHE
))
3102 ubc_upl_abort_range(upl
, cur_pg
* PAGE_SIZE
, PAGE_SIZE
,
3103 UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
3105 ubc_upl_commit_range(upl
, cur_pg
* PAGE_SIZE
,
3106 PAGE_SIZE
, commit_flags
);
3109 if (uio_last
< pages_in_upl
) {
3111 * there were some invalid pages beyond the valid pages
3112 * that we didn't issue an I/O for, just release them
3115 ubc_upl_abort_range(upl
, uio_last
* PAGE_SIZE
,
3116 (pages_in_upl
- uio_last
) * PAGE_SIZE
, UPL_ABORT_FREE_ON_EMPTY
);
3119 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 35)) | DBG_FUNC_END
,
3120 (int)upl
, -1, -1, 0, 0);
3126 if ( uio_resid(uio
) ) {
3127 if (cluster_hard_throttle_on(vp
)) {
3128 rd_ahead_enabled
= 0;
3129 prefetch_enabled
= 0;
3131 max_rd_size
= HARD_THROTTLE_MAXSIZE
;
3134 rd_ahead_enabled
= 1;
3135 prefetch_enabled
= 1;
3137 max_rd_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
3142 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_END
,
3143 (int)uio
->uio_offset
, uio_resid(uio
), rap
->cl_lastr
, retval
, 0);
3145 lck_mtx_unlock(&rap
->cl_lockr
);
3147 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_END
,
3148 (int)uio
->uio_offset
, uio_resid(uio
), 0, retval
, 0);
3156 cluster_nocopy_read(vnode_t vp
, struct uio
*uio
, off_t filesize
)
3159 upl_page_info_t
*pl
;
3160 vm_offset_t upl_offset
;
3164 int upl_needed_size
;
3170 int force_data_sync
;
3172 int no_zero_fill
= 0;
3174 struct clios iostate
;
3175 u_int max_rd_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
3176 u_int max_rd_ahead
= MAX_UPL_TRANSFER
* PAGE_SIZE
* 2;
3179 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_START
,
3180 (int)uio
->uio_offset
, uio_resid(uio
), (int)filesize
, 0, 0);
3183 * When we enter this routine, we know
3184 * -- the offset into the file is on a pagesize boundary
3185 * -- the resid is a page multiple
3186 * -- the resid will not exceed iov_len
3189 iostate
.io_completed
= 0;
3190 iostate
.io_issued
= 0;
3191 iostate
.io_error
= 0;
3192 iostate
.io_wanted
= 0;
3196 while (uio_resid(uio
) && uio
->uio_offset
< filesize
&& retval
== 0) {
3198 if (cluster_hard_throttle_on(vp
)) {
3199 max_rd_size
= HARD_THROTTLE_MAXSIZE
;
3200 max_rd_ahead
= HARD_THROTTLE_MAXSIZE
- 1;
3202 max_rd_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
3203 max_rd_ahead
= MAX_UPL_TRANSFER
* PAGE_SIZE
* 2;
3205 max_io_size
= filesize
- uio
->uio_offset
;
3207 // LP64todo - fix this
3208 if (max_io_size
< (off_t
)((unsigned int)uio_resid(uio
)))
3209 io_size
= max_io_size
;
3211 io_size
= uio_resid(uio
);
3214 * First look for pages already in the cache
3215 * and move them to user space.
3217 retval
= cluster_copy_ubc_data(vp
, uio
, &io_size
, 0);
3221 * we may have already spun some portion of this request
3222 * off as async requests... we need to wait for the I/O
3223 * to complete before returning
3225 goto wait_for_reads
;
3228 * If we are already finished with this read, then return
3232 * we may have already spun some portion of this request
3233 * off as async requests... we need to wait for the I/O
3234 * to complete before returning
3236 goto wait_for_reads
;
3238 max_io_size
= io_size
;
3240 if (max_io_size
> max_rd_size
)
3241 max_io_size
= max_rd_size
;
3245 ubc_range_op(vp
, uio
->uio_offset
, uio
->uio_offset
+ max_io_size
, UPL_ROP_ABSENT
, &io_size
);
3249 * we may have already spun some portion of this request
3250 * off as async requests... we need to wait for the I/O
3251 * to complete before returning
3253 goto wait_for_reads
;
3255 // LP64todo - fix this!
3256 upl_offset
= CAST_DOWN(vm_offset_t
, iov
->iov_base
) & PAGE_MASK
;
3257 upl_needed_size
= (upl_offset
+ io_size
+ (PAGE_SIZE
-1)) & ~PAGE_MASK
;
3259 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_START
,
3260 (int)upl_offset
, upl_needed_size
, (int)iov
->iov_base
, io_size
, 0);
3262 if (upl_offset
== 0 && ((io_size
& PAGE_MASK
) == 0)) {
3264 abort_flag
= UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
;
3267 abort_flag
= UPL_ABORT_FREE_ON_EMPTY
;
3269 for (force_data_sync
= 0; force_data_sync
< 3; force_data_sync
++) {
3271 upl_size
= upl_needed_size
;
3272 upl_flags
= UPL_FILE_IO
| UPL_NO_SYNC
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
3275 upl_flags
|= UPL_NOZEROFILL
;
3276 if (force_data_sync
)
3277 upl_flags
|= UPL_FORCE_DATA_SYNC
;
3279 // LP64todo - fix this!
3280 kret
= vm_map_create_upl(current_map(),
3281 (vm_map_offset_t
)(CAST_DOWN(vm_offset_t
, iov
->iov_base
) & ~PAGE_MASK
),
3282 &upl_size
, &upl
, NULL
, &pages_in_pl
, &upl_flags
);
3284 if (kret
!= KERN_SUCCESS
) {
3285 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
3286 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
3288 * cluster_nocopy_read: failed to get pagelist
3290 * we may have already spun some portion of this request
3291 * off as async requests... we need to wait for the I/O
3292 * to complete before returning
3294 goto wait_for_reads
;
3296 pages_in_pl
= upl_size
/ PAGE_SIZE
;
3297 pl
= UPL_GET_INTERNAL_PAGE_LIST(upl
);
3299 for (i
= 0; i
< pages_in_pl
; i
++) {
3300 if (!upl_valid_page(pl
, i
))
3303 if (i
== pages_in_pl
)
3306 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
, abort_flag
);
3308 if (force_data_sync
>= 3) {
3309 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
3310 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
3312 goto wait_for_reads
;
3315 * Consider the possibility that upl_size wasn't satisfied.
3317 if (upl_size
!= upl_needed_size
)
3318 io_size
= (upl_size
- (int)upl_offset
) & ~PAGE_MASK
;
3321 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
, abort_flag
);
3322 goto wait_for_reads
;
3324 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
3325 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
3328 * request asynchronously so that we can overlap
3329 * the preparation of the next I/O
3330 * if there are already too many outstanding reads
3331 * wait until some have completed before issuing the next read
3333 lck_mtx_lock(cl_mtxp
);
3335 while ((iostate
.io_issued
- iostate
.io_completed
) > max_rd_ahead
) {
3336 iostate
.io_wanted
= 1;
3337 msleep((caddr_t
)&iostate
.io_wanted
, cl_mtxp
, PRIBIO
+ 1, "cluster_nocopy_read", 0);
3339 lck_mtx_unlock(cl_mtxp
);
3341 if (iostate
.io_error
) {
3343 * one of the earlier reads we issued ran into a hard error
3344 * don't issue any more reads, cleanup the UPL
3345 * that was just created but not used, then
3346 * go wait for any other reads to complete before
3347 * returning the error to the caller
3349 ubc_upl_abort_range(upl
, (upl_offset
& ~PAGE_MASK
), upl_size
, abort_flag
);
3351 goto wait_for_reads
;
3353 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 73)) | DBG_FUNC_START
,
3354 (int)upl
, (int)upl_offset
, (int)uio
->uio_offset
, io_size
, 0);
3356 retval
= cluster_io(vp
, upl
, upl_offset
, uio
->uio_offset
, io_size
,
3357 CL_PRESERVE
| CL_COMMIT
| CL_READ
| CL_ASYNC
| CL_NOZERO
,
3358 (buf_t
)NULL
, &iostate
);
3361 * update the uio structure
3363 ((u_int32_t
)iov
->iov_base
) += io_size
;
3364 iov
->iov_len
-= io_size
;
3365 uio_setresid(uio
, (uio_resid(uio
) - io_size
));
3366 uio
->uio_offset
+= io_size
;
3368 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 73)) | DBG_FUNC_END
,
3369 (int)upl
, (int)uio
->uio_offset
, (int)uio_resid(uio
), retval
, 0);
3375 * make sure all async reads that are part of this stream
3376 * have completed before we return
3378 lck_mtx_lock(cl_mtxp
);
3380 while (iostate
.io_issued
!= iostate
.io_completed
) {
3381 iostate
.io_wanted
= 1;
3382 msleep((caddr_t
)&iostate
.io_wanted
, cl_mtxp
, PRIBIO
+ 1, "cluster_nocopy_read", 0);
3384 lck_mtx_unlock(cl_mtxp
);
3386 if (iostate
.io_error
)
3387 retval
= iostate
.io_error
;
3389 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_END
,
3390 (int)uio
->uio_offset
, (int)uio_resid(uio
), 6, retval
, 0);
3397 cluster_phys_read(vnode_t vp
, struct uio
*uio
, off_t filesize
)
3399 upl_page_info_t
*pl
;
3401 vm_offset_t upl_offset
;
3415 int upl_needed_size
;
3419 struct clios iostate
;
3423 devblocksize
= vp
->v_mount
->mnt_devblocksize
;
3425 * When we enter this routine, we know
3426 * -- the resid will not exceed iov_len
3427 * -- the target address is physically contiguous
3431 if (IS_VALID_UIO_SEGFLG(uio
->uio_segflg
) == 0) {
3432 panic("%s :%d - invalid uio_segflg\n", __FILE__
, __LINE__
);
3434 #endif /* LP64_DEBUG */
3436 iov_len
= uio_iov_len(uio
);
3437 iov_base
= uio_iov_base(uio
);
3439 max_size
= filesize
- uio
->uio_offset
;
3441 // LP64todo - fix this!
3442 if (max_size
< 0 || (u_int64_t
)max_size
> iov_len
)
3447 // LP64todo - fix this!
3448 upl_offset
= CAST_DOWN(vm_offset_t
, iov_base
) & PAGE_MASK
;
3449 upl_needed_size
= upl_offset
+ io_size
;
3453 upl_size
= upl_needed_size
;
3454 upl_flags
= UPL_FILE_IO
| UPL_NO_SYNC
| UPL_CLEAN_IN_PLACE
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
3456 kret
= vm_map_get_upl(current_map(),
3457 CAST_DOWN(vm_offset_t
, iov_base
) & ~PAGE_MASK
,
3458 &upl_size
, &upl
, NULL
, &pages_in_pl
, &upl_flags
, 0);
3460 if (kret
!= KERN_SUCCESS
) {
3462 * cluster_phys_read: failed to get pagelist
3466 if (upl_size
< upl_needed_size
) {
3468 * The upl_size wasn't satisfied.
3470 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
3474 pl
= ubc_upl_pageinfo(upl
);
3476 dst_paddr
= ((addr64_t
)upl_phys_page(pl
, 0) << 12) + ((addr64_t
)(iov_base
& PAGE_MASK
));
3478 while (((uio
->uio_offset
& (devblocksize
- 1)) || io_size
< devblocksize
) && io_size
) {
3481 head_size
= devblocksize
- (int)(uio
->uio_offset
& (devblocksize
- 1));
3483 if (head_size
> io_size
)
3484 head_size
= io_size
;
3486 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, head_size
, CL_READ
);
3489 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
3493 upl_offset
+= head_size
;
3494 dst_paddr
+= head_size
;
3495 io_size
-= head_size
;
3497 tail_size
= io_size
& (devblocksize
- 1);
3498 io_size
-= tail_size
;
3500 iostate
.io_completed
= 0;
3501 iostate
.io_issued
= 0;
3502 iostate
.io_error
= 0;
3503 iostate
.io_wanted
= 0;
3505 while (io_size
&& error
== 0) {
3508 if (io_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
3509 xsize
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
3513 * request asynchronously so that we can overlap
3514 * the preparation of the next I/O... we'll do
3515 * the commit after all the I/O has completed
3516 * since its all issued against the same UPL
3517 * if there are already too many outstanding reads
3518 * wait until some have completed before issuing the next
3520 lck_mtx_lock(cl_mtxp
);
3522 while ((iostate
.io_issued
- iostate
.io_completed
) > (2 * MAX_UPL_TRANSFER
* PAGE_SIZE
)) {
3523 iostate
.io_wanted
= 1;
3524 msleep((caddr_t
)&iostate
.io_wanted
, cl_mtxp
, PRIBIO
+ 1, "cluster_phys_read", 0);
3526 lck_mtx_unlock(cl_mtxp
);
3528 error
= cluster_io(vp
, upl
, upl_offset
, uio
->uio_offset
, xsize
,
3529 CL_READ
| CL_NOZERO
| CL_DEV_MEMORY
| CL_ASYNC
,
3530 (buf_t
)NULL
, &iostate
);
3532 * The cluster_io read was issued successfully,
3533 * update the uio structure
3536 uio_setresid(uio
, (uio_resid(uio
) - xsize
));
3537 uio_iov_base_add(uio
, xsize
);
3538 uio_iov_len_add(uio
, -xsize
);
3539 uio
->uio_offset
+= xsize
;
3541 upl_offset
+= xsize
;
3546 * make sure all async reads that are part of this stream
3547 * have completed before we proceed
3549 lck_mtx_lock(cl_mtxp
);
3551 while (iostate
.io_issued
!= iostate
.io_completed
) {
3552 iostate
.io_wanted
= 1;
3553 msleep((caddr_t
)&iostate
.io_wanted
, cl_mtxp
, PRIBIO
+ 1, "cluster_phys_read", 0);
3555 lck_mtx_unlock(cl_mtxp
);
3557 if (iostate
.io_error
)
3558 error
= iostate
.io_error
;
3560 if (error
== 0 && tail_size
)
3561 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, tail_size
, CL_READ
);
3564 * just release our hold on the physically contiguous
3565 * region without changing any state
3567 ubc_upl_abort_range(upl
, 0, upl_size
, UPL_ABORT_FREE_ON_EMPTY
);
3574 * generate advisory I/O's in the largest chunks possible
3575 * the completed pages will be released into the VM cache
3578 advisory_read(vnode_t vp
, off_t filesize
, off_t f_offset
, int resid
)
3580 upl_page_info_t
*pl
;
3582 vm_offset_t upl_offset
;
3596 if ( !UBCINFOEXISTS(vp
))
3599 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_START
,
3600 (int)f_offset
, resid
, (int)filesize
, 0, 0);
3602 while (resid
&& f_offset
< filesize
&& retval
== 0) {
3604 * compute the size of the upl needed to encompass
3605 * the requested read... limit each call to cluster_io
3606 * to the maximum UPL size... cluster_io will clip if
3607 * this exceeds the maximum io_size for the device,
3608 * make sure to account for
3609 * a starting offset that's not page aligned
3611 start_offset
= (int)(f_offset
& PAGE_MASK_64
);
3612 upl_f_offset
= f_offset
- (off_t
)start_offset
;
3613 max_size
= filesize
- f_offset
;
3615 if (resid
< max_size
)
3620 upl_size
= (start_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
3621 if (upl_size
> (MAX_UPL_TRANSFER
* PAGE_SIZE
))
3622 upl_size
= MAX_UPL_TRANSFER
* PAGE_SIZE
;
3626 * return the number of contiguously present pages in the cache
3627 * starting at upl_f_offset within the file
3629 ubc_range_op(vp
, upl_f_offset
, upl_f_offset
+ upl_size
, UPL_ROP_PRESENT
, &skip_range
);
3633 * skip over pages already present in the cache
3635 io_size
= skip_range
- start_offset
;
3637 f_offset
+= io_size
;
3640 if (skip_range
== upl_size
)
3643 * have to issue some real I/O
3644 * at this point, we know it's starting on a page boundary
3645 * because we've skipped over at least the first page in the request
3648 upl_f_offset
+= skip_range
;
3649 upl_size
-= skip_range
;
3651 pages_in_upl
= upl_size
/ PAGE_SIZE
;
3653 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_START
,
3654 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
3656 kret
= ubc_create_upl(vp
,
3661 UPL_RET_ONLY_ABSENT
| UPL_SET_LITE
);
3662 if (kret
!= KERN_SUCCESS
)
3667 * before we start marching forward, we must make sure we end on
3668 * a present page, otherwise we will be working with a freed
3671 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
3672 if (upl_page_present(pl
, last_pg
))
3675 pages_in_upl
= last_pg
+ 1;
3678 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_END
,
3679 (int)upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
3682 for (last_pg
= 0; last_pg
< pages_in_upl
; ) {
3684 * scan from the beginning of the upl looking for the first
3685 * page that is present.... this will become the first page in
3686 * the request we're going to make to 'cluster_io'... if all
3687 * of the pages are absent, we won't call through to 'cluster_io'
3689 for (start_pg
= last_pg
; start_pg
< pages_in_upl
; start_pg
++) {
3690 if (upl_page_present(pl
, start_pg
))
3695 * scan from the starting present page looking for an absent
3696 * page before the end of the upl is reached, if we
3697 * find one, then it will terminate the range of pages being
3698 * presented to 'cluster_io'
3700 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
3701 if (!upl_page_present(pl
, last_pg
))
3705 if (last_pg
> start_pg
) {
3707 * we found a range of pages that must be filled
3708 * if the last page in this range is the last page of the file
3709 * we may have to clip the size of it to keep from reading past
3710 * the end of the last physical block associated with the file
3712 upl_offset
= start_pg
* PAGE_SIZE
;
3713 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
3715 if ((upl_f_offset
+ upl_offset
+ io_size
) > filesize
)
3716 io_size
= filesize
- (upl_f_offset
+ upl_offset
);
3719 * issue an asynchronous read to cluster_io
3721 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, io_size
,
3722 CL_ASYNC
| CL_READ
| CL_COMMIT
| CL_AGE
, (buf_t
)NULL
, (struct clios
*)NULL
);
3728 ubc_upl_abort(upl
, 0);
3730 io_size
= upl_size
- start_offset
;
3732 if (io_size
> resid
)
3734 f_offset
+= io_size
;
3738 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_END
,
3739 (int)f_offset
, resid
, retval
, 0, 0);
int
cluster_push(vnode_t vp, int flags)
{
	int	retval;
	struct	cl_writebehind *wbp;

	if ( !UBCINFOEXISTS(vp)) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
		return (0);
	}
	/* return if deferred write is set */
	if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
		return (0);
	}
	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
		return (0);
	}
	if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
		lck_mtx_unlock(&wbp->cl_lockw);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
		return (0);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
		     (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);

	if (wbp->cl_scmap) {
		sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);

		retval = 1;
	} else
		retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);

	lck_mtx_unlock(&wbp->cl_lockw);

	if (flags & IO_SYNC)
		(void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
		     (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);

	return (retval);
}
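/*
 * Illustrative sketch (not part of the original source): a filesystem's
 * fsync/sync path would typically drive the routine above roughly as below.
 * The helper name and waitfor convention are hypothetical; only
 * cluster_push() and the IO_SYNC flag are taken from this file.
 */
#if 0	/* example only */
static void
example_flush_file(vnode_t vp, int waitfor)
{
	/* push any delayed-write clusters held for this vnode,
	 * waiting for the writes to drain when a synchronous flush was asked for */
	(void) cluster_push(vp, (waitfor == MNT_WAIT) ? IO_SYNC : 0);
}
#endif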
__private_extern__ void
cluster_release(struct ubc_info *ubc)
{
	struct cl_writebehind *wbp;
	struct cl_readahead   *rap;

	if ((wbp = ubc->cl_wbehind)) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);

		if (wbp->cl_scmap)
			vfs_drt_control(&(wbp->cl_scmap), 0);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
	}

	rap = ubc->cl_rahead;

	if (wbp != NULL) {
		lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
		FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
	}
	if ((rap = ubc->cl_rahead)) {
		lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
		FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
	}
	ubc->cl_rahead  = NULL;
	ubc->cl_wbehind = NULL;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
}
__private_extern__ void
cluster_push_EOF(vnode_t vp, off_t EOF)
{
	struct cl_writebehind *wbp;

	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
		     (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);

	if (wbp->cl_scmap)
		sparse_cluster_push(wbp, vp, EOF, 1);
	else
		cluster_try_push(wbp, vp, EOF, 0, 1);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
		     (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);

	lck_mtx_unlock(&wbp->cl_lockw);
}
3847 cluster_try_push(struct cl_writebehind
*wbp
, vnode_t vp
, off_t EOF
, int can_delay
, int push_all
)
3854 struct cl_wextent l_clusters
[MAX_CLUSTERS
];
3857 * the write behind context exists and has
3858 * already been locked...
3860 * make a local 'sorted' copy of the clusters
3861 * and clear wbp->cl_number so that new clusters can
3864 for (cl_index
= 0; cl_index
< wbp
->cl_number
; cl_index
++) {
3865 for (min_index
= -1, cl_index1
= 0; cl_index1
< wbp
->cl_number
; cl_index1
++) {
3866 if (wbp
->cl_clusters
[cl_index1
].b_addr
== wbp
->cl_clusters
[cl_index1
].e_addr
)
3868 if (min_index
== -1)
3869 min_index
= cl_index1
;
3870 else if (wbp
->cl_clusters
[cl_index1
].b_addr
< wbp
->cl_clusters
[min_index
].b_addr
)
3871 min_index
= cl_index1
;
3873 if (min_index
== -1)
3875 l_clusters
[cl_index
].b_addr
= wbp
->cl_clusters
[min_index
].b_addr
;
3876 l_clusters
[cl_index
].e_addr
= wbp
->cl_clusters
[min_index
].e_addr
;
3877 l_clusters
[cl_index
].io_nocache
= wbp
->cl_clusters
[min_index
].io_nocache
;
3879 wbp
->cl_clusters
[min_index
].b_addr
= wbp
->cl_clusters
[min_index
].e_addr
;
3885 if (can_delay
&& cl_len
== MAX_CLUSTERS
) {
3889 * determine if we appear to be writing the file sequentially
3890 * if not, by returning without having pushed any clusters
3891 * we will cause this vnode to be pushed into the sparse cluster mechanism
3892 * used for managing more random I/O patterns
3894 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3895 * that's why we're in try_push with can_delay true...
3897 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3898 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
3899 * so we can just make a simple pass through, up to, but not including the last one...
3900 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
3903 * we let the last one be partial as long as it was adjacent to the previous one...
3904 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3905 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3907 for (i
= 0; i
< MAX_CLUSTERS
- 1; i
++) {
3908 if ((l_clusters
[i
].e_addr
- l_clusters
[i
].b_addr
) != MAX_UPL_TRANSFER
)
3910 if (l_clusters
[i
].e_addr
!= l_clusters
[i
+1].b_addr
)
3915 * drop the lock while we're firing off the I/Os...
3916 * this is safe since I'm working off of a private sorted copy
3917 * of the clusters, and I'm going to re-evaluate the public
3918 * state after I retake the lock
3920 lck_mtx_unlock(&wbp
->cl_lockw
);
3922 for (cl_index
= 0; cl_index
< cl_len
; cl_index
++) {
3924 struct cl_extent cl
;
3927 * try to push each cluster in turn...
3929 if (l_clusters
[cl_index
].io_nocache
)
3933 cl
.b_addr
= l_clusters
[cl_index
].b_addr
;
3934 cl
.e_addr
= l_clusters
[cl_index
].e_addr
;
3936 cluster_push_x(vp
, &cl
, EOF
, flags
);
3938 l_clusters
[cl_index
].b_addr
= 0;
3939 l_clusters
[cl_index
].e_addr
= 0;
3946 lck_mtx_lock(&wbp
->cl_lockw
);
3949 if (cl_len
> cl_pushed
) {
3951 * we didn't push all of the clusters, so
3952 * lets try to merge them back in to the vnode
3954 if ((MAX_CLUSTERS
- wbp
->cl_number
) < (cl_len
- cl_pushed
)) {
3956 * we picked up some new clusters while we were trying to
3957 * push the old ones... this can happen because I've dropped
3958 * the vnode lock... the sum of the
3959 * leftovers plus the new cluster count exceeds our ability
3960 * to represent them, so switch to the sparse cluster mechanism
3962 * collect the active public clusters...
3964 sparse_cluster_switch(wbp
, vp
, EOF
);
3966 for (cl_index
= 0, cl_index1
= 0; cl_index
< cl_len
; cl_index
++) {
3967 if (l_clusters
[cl_index
].b_addr
== l_clusters
[cl_index
].e_addr
)
3969 wbp
->cl_clusters
[cl_index1
].b_addr
= l_clusters
[cl_index
].b_addr
;
3970 wbp
->cl_clusters
[cl_index1
].e_addr
= l_clusters
[cl_index
].e_addr
;
3971 wbp
->cl_clusters
[cl_index1
].io_nocache
= l_clusters
[cl_index
].io_nocache
;
3976 * update the cluster count
3978 wbp
->cl_number
= cl_index1
;
3981 * and collect the original clusters that were moved into the
3982 * local storage for sorting purposes
3984 sparse_cluster_switch(wbp
, vp
, EOF
);
3988 * we've got room to merge the leftovers back in
3989 * just append them starting at the next 'hole'
3990 * represented by wbp->cl_number
3992 for (cl_index
= 0, cl_index1
= wbp
->cl_number
; cl_index
< cl_len
; cl_index
++) {
3993 if (l_clusters
[cl_index
].b_addr
== l_clusters
[cl_index
].e_addr
)
3996 wbp
->cl_clusters
[cl_index1
].b_addr
= l_clusters
[cl_index
].b_addr
;
3997 wbp
->cl_clusters
[cl_index1
].e_addr
= l_clusters
[cl_index
].e_addr
;
3998 wbp
->cl_clusters
[cl_index1
].io_nocache
= l_clusters
[cl_index
].io_nocache
;
4003 * update the cluster count
4005 wbp
->cl_number
= cl_index1
;
4008 return(MAX_CLUSTERS
- wbp
->cl_number
);
4014 cluster_push_x(vnode_t vp
, struct cl_extent
*cl
, off_t EOF
, int flags
)
4016 upl_page_info_t
*pl
;
4018 vm_offset_t upl_offset
;
4033 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_START
,
4034 (int)cl
->b_addr
, (int)cl
->e_addr
, (int)EOF
, flags
, 0);
4036 if ((pages_in_upl
= (int)(cl
->e_addr
- cl
->b_addr
)) == 0) {
4037 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 0, 0, 0, 0);
4041 upl_size
= pages_in_upl
* PAGE_SIZE
;
4042 upl_f_offset
= (off_t
)(cl
->b_addr
* PAGE_SIZE_64
);
4044 if (upl_f_offset
+ upl_size
>= EOF
) {
4046 if (upl_f_offset
>= EOF
) {
4048 * must have truncated the file and missed
4049 * clearing a dangling cluster (i.e. it's completely
4050 * beyond the new EOF
4052 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 1, 0, 0, 0);
4056 size
= EOF
- upl_f_offset
;
4058 upl_size
= (size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
4059 pages_in_upl
= upl_size
/ PAGE_SIZE
;
4063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, size
, 0, 0, 0);
4066 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4068 * - only pages that are currently dirty are returned... these are the ones we need to clean
4069 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4070 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4071 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4072 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
4074 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4077 if ((vp
->v_flag
& VNOCACHE_DATA
) || (flags
& IO_NOCACHE
))
4078 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
| UPL_WILL_BE_DUMPED
;
4080 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
;
4082 kret
= ubc_create_upl(vp
,
4088 if (kret
!= KERN_SUCCESS
)
4089 panic("cluster_push: failed to get pagelist");
4091 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
, (int)upl
, upl_f_offset
, 0, 0, 0);
4094 * since we only asked for the dirty pages back
4095 * it's possible that we may only get a few or even none, so...
4096 * before we start marching forward, we must make sure we know
4097 * where the last present page is in the UPL, otherwise we could
4098 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4099 * employed by commit_range and abort_range.
4101 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
4102 if (upl_page_present(pl
, last_pg
))
4105 pages_in_upl
= last_pg
+ 1;
4107 if (pages_in_upl
== 0) {
4108 ubc_upl_abort(upl
, 0);
4110 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 2, 0, 0, 0);
	for (last_pg = 0; last_pg < pages_in_upl; ) {
		/*
		 * find the next dirty page in the UPL
		 * this will become the first page in the
		 * next I/O to generate
		 */
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
			if (upl_dirty_page(pl, start_pg))

			if (upl_page_present(pl, start_pg))
				/*
				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
				 * just release these unchanged since we're not going
				 * to steal them or change their state
				 */
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);

		if (start_pg >= pages_in_upl)
			/*
			 * done... no more dirty pages to push
			 */

		if (start_pg > last_pg)
			/*
			 * skipped over some non-dirty pages
			 */
			size -= ((start_pg - last_pg) * PAGE_SIZE);

		/*
		 * find a range of dirty pages to write
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (!upl_dirty_page(pl, last_pg))

		upl_offset = start_pg * PAGE_SIZE;

		io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

		io_flags = CL_THROTTLE | CL_COMMIT;

		if ( !(flags & IO_SYNC))
			io_flags |= CL_ASYNC;

		retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
				    io_flags, (buf_t)NULL, (struct clios *)NULL);

		if (error == 0 && retval)

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
/*
 * sparse_cluster_switch is called with the write behind lock held
 */
sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);

	if (wbp->cl_scmap == NULL)
		wbp->cl_scdirty = 0;

	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		struct cl_extent cl;

		for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {

			if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
				if (flags & UPL_POP_DIRTY) {
					cl.e_addr = cl.b_addr + 1;

					sparse_cluster_add(wbp, vp, &cl, EOF);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
/*
 * sparse_cluster_push is called with the write behind lock held
 */
sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
{
	struct cl_extent cl;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);

		vfs_drt_control(&(wbp->cl_scmap), 1);

		if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)

		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);

		wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);

		cluster_push_x(vp, &cl, EOF, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
/*
 * sparse_cluster_add is called with the write behind lock held
 */
sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);

	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;

	while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */
		wbp->cl_scdirty += new_dirty;

		sparse_cluster_push(wbp, vp, EOF, 0);

		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);

	wbp->cl_scdirty += new_dirty;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
{
	upl_page_info_t	*pl;

	upl_flags = UPL_SET_LITE;

	if (! (flags & CL_READ)) {
		/*
		 * "write" operation: let the UPL subsystem know
		 * that we intend to modify the buffer cache pages
		 */
		upl_flags |= UPL_WILL_MODIFY;

	kret = ubc_create_upl(vp,
			      uio->uio_offset & ~PAGE_MASK_64,

	if (kret != KERN_SUCCESS)

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
				   CL_READ, (buf_t)NULL, (struct clios *)NULL);

			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

	/*
	 * NOTE:  There is no prototype for the following in BSD. It, and the definitions
	 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	 * osfmk/ppc/mappings.h.  They are not included here because there appears to be no
	 * way to do so without exporting them to kexts as well.
	 */
	if (flags & CL_READ)
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4);	/* Copy physical to physical and flush the destination */
	else
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8);	/* Copy physical to physical and flush the source */
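	/*
	 * Editorial note (added; not part of the original source): the literal
	 * flag values passed to copypv() above are assumed to correspond to the
	 * cppv* defines mentioned in the NOTE, inferred only from the
	 * commented-out symbolic calls: the common "2 | 1" maps to
	 * cppvPsrc | cppvPsnk, while 4 maps to cppvFsnk (flush the destination)
	 * and 8 to cppvFsrc (flush the source).  Which of cppvPsrc/cppvPsnk is
	 * 1 and which is 2 cannot be determined from this file alone.
	 */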
	if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
				   0, (buf_t)NULL, (struct clios *)NULL);

	uio->uio_offset += xsize;
	uio_iov_base_add(uio, xsize);
	uio_iov_len_add(uio, -xsize);
	uio_setresid(uio, (uio_resid(uio) - xsize));

		abort_flags = UPL_ABORT_FREE_ON_EMPTY;

		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
{
	upl_page_info_t	*pl;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);

	segflg = uio->uio_segflg;

	  case UIO_USERSPACE32:
	  case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;

	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;

	  case UIO_USERSPACE64:
	  case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;

	  case UIO_SYSSPACE32:
		uio->uio_segflg = UIO_PHYS_SYSSPACE32;

		uio->uio_segflg = UIO_PHYS_SYSSPACE;

	  case UIO_SYSSPACE64:
		uio->uio_segflg = UIO_PHYS_SYSSPACE64;

	pl = ubc_upl_pageinfo(upl);

	pg_index  = upl_offset / PAGE_SIZE;
	pg_offset = upl_offset & PAGE_MASK;
	csize     = min(PAGE_SIZE - pg_offset, xsize);

	while (xsize && retval == 0) {

		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;

		retval = uiomove64(paddr, csize, uio);

		csize = min(PAGE_SIZE, xsize);

	uio->uio_segflg = segflg;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
{
	memory_object_control_t	control;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);

	control = ubc_getobject(vp, UBC_FLAGS_NONE);
	if (control == MEMORY_OBJECT_CONTROL_NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
			     (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);

	segflg = uio->uio_segflg;

	  case UIO_USERSPACE32:
	  case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;

	  case UIO_USERSPACE64:
	  case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;

	  case UIO_SYSSPACE32:
		uio->uio_segflg = UIO_PHYS_SYSSPACE32;

	  case UIO_SYSSPACE64:
		uio->uio_segflg = UIO_PHYS_SYSSPACE64;

	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;

		uio->uio_segflg = UIO_PHYS_SYSSPACE;

	if ( (io_size = *io_resid) ) {
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		xsize = uio_resid(uio);

		retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
						       uio, start_offset, io_size, mark_dirty);
		xsize -= uio_resid(uio);

	uio->uio_segflg = segflg;
	*io_resid       = io_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);
is_file_clean(vnode_t vp, off_t filesize)
{
	int total_dirty = 0;

	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
		if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
			if (flags & UPL_POP_DIRTY) {
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */

/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES		256

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK		(~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)
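/*
 * Worked example (added for clarity, assuming a 4K PAGE_SIZE): each hash
 * entry covers DRT_BITVECTOR_PAGES * PAGE_SIZE = 256 * 4096 = 0x100000
 * bytes (1MB) of the file, so DRT_ADDRESS_MASK is ~((1 << 20) - 1), i.e.
 * the low 20 bits are cleared, and DRT_ALIGN_ADDRESS() rounds a file
 * offset down to the nearest 1MB boundary; for instance, offset 0x123456
 * aligns to 0x100000.
 */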
/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)								\
	do {											\
		(scm)->scm_hashtable[(i)].dhe_control =						\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)								\
	do {											\
		(scm)->scm_hashtable[(i)].dhe_control =						\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
	} while (0)
#define DRT_HASH_CLEAR(scm, i)									\
	do {											\
		(scm)->scm_hashtable[(i)].dhe_control = 0;					\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)								\
	do {											\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);						\
	} while (0)
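/*
 * Worked example (added for clarity): because the addresses stored in
 * dhe_control are 1MB aligned, their low 20 bits are always zero, which
 * leaves room for the 9-bit page count (DRT_HASH_COUNT_MASK == 0x1ff).
 * The count must represent 0 through DRT_BITVECTOR_PAGES (257 distinct
 * values) plus the all-ones "vacant" sentinel used by DRT_HASH_VACATE
 * and DRT_HASH_VACANT, which fits comfortably in 9 bits.
 */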
/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
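/*
 * Worked arithmetic (added for clarity): a vfs_drt_hashentry is an 8-byte
 * control word plus a 256-bit bitvector (8 words of 4 bytes = 32 bytes),
 * i.e. 40 bytes per entry.  23 * 40 = 920 bytes leaves 104 bytes of the
 * 1024-byte small allocation spare, and 401 * 40 = 16040 bytes leaves 344
 * bytes of the 16384-byte large allocation spare, matching the comments
 * on the defines above; the scm_* header fields live in that slack.
 */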
/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */

#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
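/*
 * Worked example (added for clarity): page 77 within a bucket lives in
 * 32-bit word 77 / 32 = 2 of dhe_bitvector at bit position 77 % 32 = 13,
 * so DRT_HASH_SET_BIT(scm, i, 77) ORs (1 << 13) into dhe_bitvector[2]
 * and DRT_HASH_TEST_BIT checks that same bit.
 */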
struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
	u_int32_t	dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};

/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};

#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)
/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82))	/* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83))	/* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84))	/* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85))	/* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86))	/* offset, length, ... */
								/* 1 (clean, no map) */
								/* 2 (map alloc fail) */
								/* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88))	/* modulus, buckets,
								 * lastclean, iskips */
static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
			    u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
static kern_return_t	vfs_drt_do_mark_pages(
static void		vfs_drt_trace(
			    struct vfs_drt_clustermap *cmap,
/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap, *ocmap;
	int	nsize, i, active_buckets, index, copycount;

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		nsize = DRT_HASH_SMALL_MODULUS;

		/* count the number of active buckets in the old map */
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0))

		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/* if the ring is nearly full */
			if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
				nsize = DRT_HASH_LARGE_MODULUS;

				nsize = DRT_HASH_SMALL_MODULUS;

			/* already using the large modulus */
			nsize = DRT_HASH_LARGE_MODULUS;

			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * map.
			 */
			if (active_buckets >= DRT_HASH_LARGE_MODULUS)
				return(KERN_SUCCESS);

	/*
	 * Allocate and initialise the new map.
	 */
	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
	    (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	if (kret != KERN_SUCCESS)

	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = nsize;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;

	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}

	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0))

			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
			}
			DRT_HASH_COPY(ocmap, i, cmap, index);

	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
		    ocmap->scm_lastclean,

		vfs_drt_free_map(ocmap);

	return(KERN_SUCCESS);
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
	kmem_free(kernel_map, (vm_offset_t)cmap,
	    (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	return(KERN_SUCCESS);
}
/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* traverse the hashtable */
	for (i = 0; i < cmap->scm_modulus; i++) {

		/*
		 * If the slot is vacant, we can stop.
		 */
		if (DRT_HASH_VACANT(cmap, index))

		/*
		 * If the address matches our offset, we have success.
		 */
		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {

			return(KERN_SUCCESS);

		/*
		 * Move to the next slot, try again.
		 */
		index = DRT_HASH_NEXT(cmap, index);

	return(KERN_FAILURE);
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS)

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean)
				cmap->scm_lastclean = index;
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);

			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return(KERN_SUCCESS);
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed)
		return(KERN_FAILURE);
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	struct vfs_drt_clustermap *cmap, **cmapp;
	int	i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL)

	/* allocate a cluster map if we don't already have one */

		/* no cluster map, nothing to clean */

			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return(KERN_SUCCESS);

		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;	/* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL)
				*setcountp = setcount;
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

		/*
		 * Work out how many pages we're modifying in this
		 */
		pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);

			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);

		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL)
		*setcountp = setcount;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return(KERN_SUCCESS);
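/*
 * Worked example (added for clarity, assuming a 4K PAGE_SIZE): marking
 * offset 0x105000 for length 0x3000 dirty lands in the bucket whose
 * aligned address is 0x100000, giving pgoff = 0x5000 / PAGE_SIZE = 5 and
 * pgcount = min(0x3000 / PAGE_SIZE, 256 - 5) = 3, so bits 5, 6 and 7 of
 * that bucket's bitvector are set and its summary count grows by the
 * number of bits that were not already set.
 */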
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
{
	/* XXX size unused, drop from interface */
	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}
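/*
 * Illustrative usage sketch (added for clarity; not part of the original
 * source and not compiled).  A caller keeps an opaque map pointer that
 * starts out NULL (the sparse cluster code above keeps it in
 * wbp->cl_scmap) and lets vfs_drt_mark_pages allocate the map on first
 * use; a non-KERN_SUCCESS return means the map filled up and only
 * "new_dirty" pages of the request were recorded, which is what the
 * retry loop in sparse_cluster_add handles.
 */
#if 0
	void	*scmap = NULL;		/* hypothetical private map pointer */
	int	 new_dirty = 0;

	if (vfs_drt_mark_pages(&scmap, (off_t)(5 * PAGE_SIZE_64),
			       3 * PAGE_SIZE, &new_dirty) != KERN_SUCCESS) {
		/* partial update: push some dirty pages out and retry */
	}
#endif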
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	int	index, i, j, fs, ls;

	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))

		/* scan the bitfield for a string of bits */

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {

		/* didn't find any bits set */
		panic("vfs_drt: entry summary count > 0 but no bits set in map");

		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i))

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return(KERN_SUCCESS);
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * free the map
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	    cmap->scm_lastclean,

	vfs_drt_free_map(cmap);

	return(KERN_FAILURE);
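/*
 * Illustrative usage sketch (added for clarity; not part of the original
 * source and not compiled).  Dirty extents are drained by calling
 * vfs_drt_get_cluster repeatedly until it returns KERN_FAILURE; per the
 * interface comment above, the private storage is released once no dirty
 * pages remain, so the caller must stop using the map pointer at that
 * point.  This mirrors the loop in sparse_cluster_push.
 */
#if 0
	off_t	offset;
	u_int	length;

	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS) {
		/* write back the pages covering [offset, offset + length) */
	}
	/* map has been freed; treat scmap as invalid from here on */
#endif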
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);

		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		    cmap->scm_lastclean,

		vfs_drt_free_map(cmap);

		cmap->scm_lastclean = 0;

	return(KERN_SUCCESS);
}
/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);

vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
	      __unused int arg1, __unused int arg2, __unused int arg3,
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index))

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i))

		if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
			panic("bits_on = %d, index = %d\n", bits_on, index);