/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>

#include <kern/task.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT

#define CL_READ		0x01
#define CL_WRITE	0x02
#define CL_ASYNC	0x04
#define CL_COMMIT	0x08
#define CL_PAGEOUT	0x10
#define CL_NOZERO	0x40
#define CL_PAGEIN	0x80
#define CL_DEV_MEMORY	0x100
#define CL_PRESERVE	0x200
#define CL_THROTTLE	0x400
#define CL_KEEPCACHED	0x800
#define CL_DIRECT_IO	0x1000
#define CL_PASSIVE	0x2000
#define CL_IOSTREAMING	0x4000
#define CL_CLOSE	0x8000
#define CL_ENCRYPTED	0x10000
#define CL_RAW_ENCRYPTED	0x20000
#define CL_NOCACHE	0x40000

#define MAX_VECTOR_UPL_ELEMENTS	8
#define MAX_VECTOR_UPL_SIZE	(2 * MAX_UPL_SIZE) * PAGE_SIZE
extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
extern void vector_upl_set_pagelist(upl_t);
extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
struct clios {
	lck_mtx_t io_mtxp;
	u_int  io_completed;       /* amount of io that has currently completed */
	u_int  io_issued;          /* amount of io that was successfully issued */
	int    io_error;           /* error code of first error encountered */
	int    io_wanted;          /* someone is sleeping waiting for a change in state */
};
static lck_grp_t	*cl_mtx_grp;
static lck_attr_t	*cl_mtx_attr;
static lck_grp_attr_t	*cl_mtx_grp_attr;
static lck_mtx_t	*cl_transaction_mtxp;
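
/*
 * cl_transaction_mtxp serializes completion handling of the buf_t chains
 * ('transactions') built by cluster_io, while the read-ahead and write-behind
 * contexts hung off the ubc_info carry their own per-vnode mutexes, all
 * allocated from cl_mtx_grp in cluster_init() below.
 */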
#define	PUSH_DELAY	0x01
#define	PUSH_ALL	0x02
#define	PUSH_SYNC	0x04
static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
		      int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags);
static int cluster_hard_throttle_on(vnode_t vp, uint32_t);

static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg);

static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
			     int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int (*)(buf_t, void *), void *callback_arg, int flags);

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
			      off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
				int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
				int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void	cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int	cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg);

static void	sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
static void	sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg);
static void	sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);
/*
 * For throttled IO to check whether
 * a block is cached by the boot cache
 * and thus it can avoid delaying the IO.
 *
 * bootcache_contains_block is initially
 * NULL. The BootCache will set it while
 * the cache is active and clear it when
 * the cache is jettisoned.
 *
 * Returns 0 if the block is not
 * contained in the cache, 1 if it is
 * contained in the cache.
 *
 * The function pointer remains valid
 * after the cache has been evicted even
 * if bootcache_contains_block has been
 * cleared.
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE	(1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE	(MAX_UPL_SIZE * PAGE_SIZE)
#define MIN_DIRECT_WRITE_SIZE	(4 * PAGE_SIZE)

#define WRITE_THROTTLE		6
#define WRITE_THROTTLE_SSD	2
#define WRITE_BEHIND		1
#define WRITE_BEHIND_SSD	1

#if CONFIG_EMBEDDED
#define PREFETCH		1
#define PREFETCH_SSD		1
uint32_t speculative_prefetch_max = 512;		/* maximum number of pages to use for a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use for a speculative read-ahead */
#else
#define PREFETCH		3
#define PREFETCH_SSD		1
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3);
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use for a speculative read-ahead on SSDs*/
#endif

#define IO_SCALE(vp, base)		(vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)		(cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)	(size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH)))

int	ignore_is_ssd = 0;
int	speculative_reads_disabled = 0;
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define HARD_THROTTLE_MAXCNT	0
#define HARD_THROTTLE_MAX_IOSIZE (128 * 1024)
#define LEGACY_HARD_THROTTLE_MAX_IOSIZE	(512 * 1024)

extern int32_t throttle_legacy_process_count;
int hard_throttle_on_root = 0;
uint32_t hard_throttle_max_iosize = HARD_THROTTLE_MAX_IOSIZE;
uint32_t legacy_hard_throttle_max_iosize = LEGACY_HARD_THROTTLE_MAX_IOSIZE;
struct timeval priority_IO_timestamp_for_root;

#if CONFIG_EMBEDDED
#define THROTTLE_MAX_IOSIZE (hard_throttle_max_iosize)
#else
#define THROTTLE_MAX_IOSIZE (throttle_legacy_process_count == 0 ? hard_throttle_max_iosize : legacy_hard_throttle_max_iosize)
#endif

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &hard_throttle_max_iosize, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &legacy_hard_throttle_max_iosize, 0, "");
void
cluster_init(void) {
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();

	cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_transaction_mtxp == NULL)
		panic("cluster_init: failed to allocate cl_transaction_mtxp");
}
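
/*
 * Compute the largest I/O this mount point can take in a single request:
 * the device's scatter/gather segment count (capped at what a UPL can map)
 * times the page size, further limited by the device's max byte count, and
 * never smaller than the old fixed MAX_UPL_TRANSFER limit.
 */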
uint32_t
cluster_max_io_size(mount_t mp, int type)
{
	uint32_t	max_io_size;
	uint32_t	segcnt;
	uint32_t	maxcnt;

	switch (type) {

	case CL_READ:
		segcnt = mp->mnt_segreadcnt;
		maxcnt = mp->mnt_maxreadcnt;
		break;
	case CL_WRITE:
		segcnt = mp->mnt_segwritecnt;
		maxcnt = mp->mnt_maxwritecnt;
		break;
	default:
		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
		break;
	}
	if (segcnt > MAX_UPL_SIZE) {
		/*
		 * don't allow a size beyond the max UPL size we can create
		 */
		segcnt = MAX_UPL_SIZE;
	}
	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

	if (max_io_size < (MAX_UPL_TRANSFER * PAGE_SIZE)) {
		/*
		 * don't allow a size smaller than the old fixed limit
		 */
		max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE);
	} else {
		/*
		 * make sure the size specified is a multiple of PAGE_SIZE
		 */
		max_io_size &= ~PAGE_MASK;
	}
	return (max_io_size);
}
#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02
#define CLW_IONOCACHE		0x04
#define CLW_IOPASSIVE		0x08
/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
			ubc->cl_rahead = rap;
		else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
		return(rap);

	return ((struct cl_readahead *)NULL);
}
/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */
static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
	struct ubc_info		*ubc;
	struct cl_writebehind	*wbp;

	ubc = vp->v_ubcinfo;

	if ((wbp = ubc->cl_wbehind) == NULL) {

		if ( !(flags & CLW_ALLOCATE))
			return ((struct cl_writebehind *)NULL);

		MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL)
			ubc->cl_wbehind = wbp;
		else {
			lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
			FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED)
		lck_mtx_lock(&wbp->cl_lockw);

	return (wbp);
}
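
/*
 * Push any delayed-write clusters held in the vnode's write behind context
 * out to disk and wait for them (PUSH_ALL | PUSH_SYNC) before returning.
 */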
static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg)
{
	struct cl_writebehind *wbp;

	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {

		if (wbp->cl_number) {
			lck_mtx_lock(&wbp->cl_lockw);

			cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, 0, callback, callback_arg);

			lck_mtx_unlock(&wbp->cl_lockw);
		}
	}
}
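
/*
 * Ask the BootCache (via the bootcache_contains_block hook above, when the
 * cache is active) whether the block backing 'f_offset' is already cached,
 * so a throttled read can skip the throttling delay.
 */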
static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
	daddr64_t blkno;
	size_t	  io_size;
	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

	if (bootcache_check_fn) {
		if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL))
			return(0);

		if (io_size == 0)
			return(0);

		if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
			return(1);
	}
	return(0);
}
static int
cluster_hard_throttle_on(vnode_t vp, uint32_t hard_throttle)
{
	int throttle_type = 0;

	if ( (throttle_type = throttle_io_will_be_throttled(-1, vp->v_mount)) )
		return(throttle_type);

	if (hard_throttle && (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
		static struct timeval hard_throttle_maxelapsed = { 0, 100000 };
		struct timeval elapsed;

		if (hard_throttle_on_root)
			return(1);

		microuptime(&elapsed);
		timevalsub(&elapsed, &priority_IO_timestamp_for_root);

		if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
			return(1);
	}
	return(0);
}
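
/*
 * Wait until the amount of I/O outstanding on this stream
 * (io_issued - io_completed) drops to 'target' bytes.
 */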
static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{
	lck_mtx_lock(&iostate->io_mtxp);

	while ((iostate->io_issued - iostate->io_completed) > target) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);

		iostate->io_wanted = 1;
		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);
	}
	lck_mtx_unlock(&iostate->io_mtxp);
}
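
/*
 * Decide how to dispose of the UPL pages covering a failed I/O: direct
 * transfers are committed, cached pageins/pageouts are aborted with flags
 * chosen from the buf's io_flags.  Returns the abort code that was used
 * (0 if the range was committed instead).
 */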
static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags)
{
	int upl_abort_code = 0;
	int page_in  = 0;
	int page_out = 0;

	if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE))
		/*
		 * direct write of any flavor, or a direct read that wasn't aligned
		 */
		ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	else {
		if (io_flags & B_PAGEIO) {
			if (io_flags & B_READ)
				page_in  = 1;
			else
				page_out = 1;
		}
		if (io_flags & B_CACHE)
			/*
			 * leave pages in the cache unchanged on error
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_out && (error != ENXIO))
			/*
			 * transient error... leave pages unchanged
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_in)
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
		else
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

		ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	}
	return (upl_abort_code);
}
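
/*
 * I/O completion handler for each buf_t issued by cluster_io.  The last
 * buf in a transaction to complete tears the chain down: it totals up the
 * resid/size, calls the caller's completion callback if one was supplied,
 * zero-fills any tail beyond EOF, commits or aborts the UPL, updates the
 * clios state for streaming callers, and finishes off any 'real' buf.
 */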
static int
cluster_iodone(buf_t bp, void *callback_arg)
{
	int	b_flags;
	int	error;
	int	total_size;
	int	total_resid;
	int	upl_offset;
	int	zero_offset;
	int	pg_offset = 0;
	int	commit_size = 0;
	int	upl_flags = 0;
	int	transaction_size = 0;
	upl_t	upl;
	buf_t	cbp;
	buf_t	cbp_head;
	buf_t	cbp_next;
	buf_t	real_bp;
	struct	clios *iostate;
	boolean_t	transaction_complete = FALSE;

	cbp_head = (buf_t)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
		boolean_t	need_wakeup = FALSE;

		lck_mtx_lock_spin(cl_transaction_mtxp);

		bp->b_flags |= B_TDONE;

		if (bp->b_flags & B_TWANTED) {
			CLR(bp->b_flags, B_TWANTED);
			need_wakeup = TRUE;
		}
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			/*
			 * all I/O requests that are part of this transaction
			 * have to complete before we can process it
			 */
			if ( !(cbp->b_flags & B_TDONE)) {

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
					     cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);

				if (need_wakeup == TRUE)
					wakeup(bp);

				return 0;
			}
			if (cbp->b_flags & B_EOT)
				transaction_complete = TRUE;
		}
		lck_mtx_unlock(cl_transaction_mtxp);

		if (need_wakeup == TRUE)
			wakeup(bp);

		if (transaction_complete == FALSE) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     cbp_head, 0, 0, 0, 0);
			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp        = cbp_head;
	upl_offset = cbp->b_uploffset;
	upl        = cbp->b_upl;
	b_flags    = cbp->b_flags;
	real_bp    = cbp->b_real_bp;
	zero_offset= cbp->b_validend;
	iostate    = (struct clios *)cbp->b_iostate;

	if (real_bp)
		real_bp->b_dev = cbp->b_dev;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		if (cbp_next == NULL)
			/*
			 * compute the overall size of the transaction
			 * in case we created one that has 'holes' in it
			 * 'total_size' represents the amount of I/O we
			 * did, not the span of the transaction w/r to the UPL
			 */
			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;

		if (cbp != cbp_head)
			free_io_buf(cbp);

		cbp = cbp_next;
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (error == 0) {
		int	(*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

		if (cliodone_func != NULL) {
			cbp_head->b_bcount = transaction_size;

			error = (*cliodone_func)(cbp_head, callback_arg);
		}
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	free_io_buf(cbp_head);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(&iostate->io_mtxp);

		if (need_wakeup)
			wakeup((caddr_t)&iostate->io_wanted);
	}
	if (b_flags & B_COMMIT_UPL) {

		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error)
			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags);
		else {
			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ))
				upl_flags |= UPL_COMMIT_SET_DIRTY;

			if (b_flags & B_AGE)
				upl_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
		}
	}
	if (real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
		     upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

	return (error);
}
uint32_t
cluster_hard_throttle_limit(vnode_t vp, uint32_t *limit, uint32_t hard_throttle)
{
	if (cluster_hard_throttle_on(vp, hard_throttle)) {
		*limit = THROTTLE_MAX_IOSIZE;
		return 1;
	}
	return 0;
}
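
/*
 * Zero 'size' bytes of the upl starting at 'upl_offset'... if the caller's
 * buf has a kernel mapping we can simply bzero through it, otherwise the
 * pages are zeroed physically, one page (or one device page) at a time.
 */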
void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {
		upl_page_info_t *pl;
		addr64_t	zero_addr;

		pl = ubc_upl_pageinfo(upl);

		if (upl_device_page(pl) == TRUE) {
			zero_addr = ((addr64_t)upl_phys_page(pl, 0) << 12) + upl_offset;

			bzero_phys_nc(zero_addr, size);
		} else {
			while (size) {
				int	page_offset;
				int	page_index;
				int	zero_cnt;

				page_index  = upl_offset / PAGE_SIZE;
				page_offset = upl_offset & PAGE_MASK;

				zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
				zero_cnt  = min(PAGE_SIZE - page_offset, size);

				bzero_phys(zero_addr, zero_cnt);

				size       -= zero_cnt;
				upl_offset += zero_cnt;
			}
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}
static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
	cbp_head->b_validend = zero_offset;
	cbp_tail->b_flags |= B_EOT;
}
static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	buf_t	cbp;

	if (async) {
		/*
		 * async callback completion will not normally
		 * generate a wakeup upon I/O completion...
		 * by setting B_TWANTED, we will force a wakeup
		 * to occur as any outstanding I/Os complete...
		 * I/Os already completed will have B_TDONE already
		 * set, so they won't cause us to block
		 * note that we're actually waiting for the bp to have
		 * completed the callback function... only then
		 * can we safely take back ownership of the bp
		 */
		lck_mtx_lock_spin(cl_transaction_mtxp);

		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
			cbp->b_flags |= B_TWANTED;

		lck_mtx_unlock(cl_transaction_mtxp);
	}
	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {

		if (async) {
			while (!ISSET(cbp->b_flags, B_TDONE)) {

				lck_mtx_lock_spin(cl_transaction_mtxp);

				if (!ISSET(cbp->b_flags, B_TDONE)) {
					DTRACE_IO1(wait__start, buf_t, cbp);
					(void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL);
					DTRACE_IO1(wait__done, buf_t, cbp);
				} else
					lck_mtx_unlock(cl_transaction_mtxp);
			}
		} else
			buf_biowait(cbp);
	}
}
static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
	buf_t	cbp;
	int	error;

	/*
	 * cluster_complete_transaction will
	 * only be called if we've issued a complete chain in synchronous mode
	 * or, we've already done a cluster_wait_IO on an incomplete chain
	 */
	if (needwait) {
		for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
			buf_biowait(cbp);
	}
	/*
	 * we've already waited on all of the I/Os in this transaction,
	 * so mark all of the buf_t's in this transaction as B_TDONE
	 * so that cluster_iodone sees the transaction as completed
	 */
	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
		cbp->b_flags |= B_TDONE;

	error = cluster_iodone(*cbp_head, callback_arg);

	if ( !(flags & CL_ASYNC) && error && *retval == 0) {
		if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
			*retval = error;
	}
	*cbp_head = (buf_t)NULL;
}
static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	   int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	buf_t	cbp;
	u_int	size;
	u_int	io_size;
	int	io_flags;
	int	bmap_flags;
	int	error = 0;
	int	retval = 0;
	buf_t	cbp_head = NULL;
	buf_t	cbp_tail = NULL;
	int	trans_count = 0;
	int	max_trans_count = 8;
	u_int	pg_count;
	int	pg_offset;
	u_int	max_iosize;
	u_int	max_vectors;
	int	priv;
	int	zero_offset = 0;
	int	async_throttle = 0;
	mount_t	mp;
	vm_offset_t upl_end_offset;
	boolean_t   need_EOT = FALSE;

	/*
	 * we currently don't support buffers larger than a page
	 */
	if (real_bp && non_rounded_size > PAGE_SIZE)
		panic("%s(): Called with real buffer of size %d bytes which "
		      "is greater than the maximum allowed size of "
		      "%d bytes (the system PAGE_SIZE).\n",
		      __FUNCTION__, non_rounded_size, PAGE_SIZE);

	mp = vp->v_mount;

	/*
	 * we don't want to do any funny rounding of the size for IO requests
	 * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
	 * belong to us... we can't extend (nor do we need to) the I/O to fill
	 * out a page
	 */
	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
		/*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zero's that are sitting beyond the EOF
		 * so that in case the filesystem doesn't explicitly zero this area
		 * if a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
		pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	} else {
		/*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size
		 * AFP is one such filesystem/device
		 */
		size = non_rounded_size;
	}
	upl_end_offset = upl_offset + size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);

	/*
	 * Set the maximum transaction size to the maximum desired number of
	 * buffers.
	 */
	if (flags & CL_DEV_MEMORY)
		max_trans_count = 16;

	if (flags & CL_READ) {
		io_flags = B_READ;
		bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
		io_flags = B_WRITE;
		bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	/*
	 * Ensure the maximum iosize is sensible.
	 */
	if (!max_iosize)
		max_iosize = PAGE_SIZE;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp, 1)) {
			if (max_iosize > THROTTLE_MAX_IOSIZE)
				max_iosize = THROTTLE_MAX_IOSIZE;
			async_throttle = HARD_THROTTLE_MAXCNT;
		} else {
			if ( (flags & CL_DEV_MEMORY) )
				async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
			else {
				u_int max_cluster;
				u_int max_cluster_size;
				u_int scale;

				max_cluster_size = MAX_CLUSTER_SIZE(vp);

				if (max_iosize > max_cluster_size)
					max_cluster = max_cluster_size;
				else
					max_cluster = max_iosize;

				if (size < max_cluster)
					max_cluster = size;

				if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
					scale = WRITE_THROTTLE_SSD;
				else
					scale = WRITE_THROTTLE;

				if (flags & CL_CLOSE)
					scale += MAX_CLUSTERS;

				async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
			}
		}
	}
	if (flags & (CL_PAGEIN | CL_PAGEOUT))
		io_flags |= B_PAGEIO;
	if (flags & (CL_IOSTREAMING))
		io_flags |= B_IOSTREAMING;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_DIRECT_IO)
		io_flags |= B_PHYS;
	if (flags & (CL_PRESERVE | CL_KEEPCACHED))
		io_flags |= B_CACHE;
	if (flags & CL_PASSIVE)
		io_flags |= B_PASSIVE;
	if (flags & CL_ENCRYPTED)
		io_flags |= B_ENCRYPTED_IO;
	if (vp->v_flag & VSYSTEM)
		io_flags |= B_META;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
	while (size) {
		daddr64_t blkno;
		daddr64_t lblkno;
		u_int	io_size_wanted;
		size_t	io_size_tmp;

		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		io_size_wanted = io_size;
		io_size_tmp = (size_t)io_size;

		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
			break;

		if (io_size_tmp > io_size_wanted)
			io_size = io_size_wanted;
		else
			io_size = (u_int)io_size_tmp;

		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
			real_bp->b_blkno = blkno;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);

		if (io_size == 0) {
			/*
			 * vnop_blockmap didn't return an error... however, it did
			 * return an extent size of 0 which means we can't
			 * make forward progress on this I/O... a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if ( !(flags & CL_READ) && blkno == -1) {
			off_t	e_offset;
			int	pageout_flags;

			if (upl_get_internal_vectorupl(upl))
				panic("Vector UPLs should not take this code-path\n");
			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return...
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will insure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 *
			 * go direct to vnode_pageout so that we don't have to
			 * unbusy the page from the UPL... we used to do this
			 * so that we could call ubc_sync_range, but that results
			 * in a potential deadlock if someone else races us to acquire
			 * that page and wins and in addition needs one of the pages
			 * we're continuing to hold in the UPL
			 */
			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

			if ( !(flags & CL_ASYNC))
				pageout_flags |= UPL_IOSYNC;
			if ( !(flags & CL_COMMIT))
				pageout_flags |= UPL_NOCOMMIT;

			if (cbp_head) {
				buf_t last_cbp;

				/*
				 * first we have to wait for the current outstanding I/Os
				 * to complete... EOT hasn't been set yet on this transaction
				 * so the pages won't be released just because all of the current
				 * I/O linked to this transaction has completed...
				 */
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				/*
				 * we've got a transaction that
				 * includes the page we're about to push out through vnode_pageout...
				 * find the last bp in the list which will be the one that
				 * includes the head of this page and round its iosize down
				 * to a page boundary...
				 */
				for (last_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
					last_cbp = cbp;

				cbp->b_bcount &= ~PAGE_MASK;

				if (cbp->b_bcount == 0) {
					/*
					 * this buf no longer has any I/O associated with it
					 */
					free_io_buf(cbp);

					if (cbp == cbp_head) {
						/*
						 * the buf we just freed was the only buf in
						 * this transaction... so there's no I/O to do
						 */
						cbp_head = NULL;
					} else {
						/*
						 * remove the buf we just freed from
						 * the transaction list
						 */
						last_cbp->b_trans_next = NULL;
						cbp_tail = last_cbp;
					}
				}
				if (cbp_head) {
					/*
					 * there was more to the current transaction
					 * than just the page we are pushing out via vnode_pageout...
					 * mark it as finished and complete it... we've already
					 * waited for the I/Os to complete above in the call to cluster_wait_IO
					 */
					cluster_EOT(cbp_head, cbp_tail, 0);

					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

					trans_count = 0;
				}
			}
			if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
				error = EINVAL;
			}
			e_offset = round_page_64(f_offset + 1);
			io_size = e_offset - f_offset;

			f_offset   += io_size;
			upl_offset += io_size;

			if (size >= io_size)
				size -= io_size;
			else
				size = 0;
			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e.  size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (error) {
				if (size == 0)
					flags &= ~CL_COMMIT;
				break;
			}
			continue;
		}
		lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && blkno == -1) {
			vm_offset_t commit_offset;
			int	bytes_to_zero;
			int	complete_transaction_now = 0;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (io_size >= (u_int)non_rounded_size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = non_rounded_size;
				if (!(flags & CL_NOZERO))
					bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			pg_count = 0;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head) {
				int	pg_resid;

				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

				pg_resid = commit_offset - upl_offset;

				if (bytes_to_zero >= pg_resid) {
					/*
					 * the last page of the current I/O
					 * has been completed...
					 * compute the number of fully zero'd
					 * pages that are beyond it
					 * plus the last page if it's partial
					 * and we have no more I/O to issue...
					 * otherwise a partial page is left
					 * to begin the next I/O
					 */
					if ((int)io_size >= non_rounded_size)
						pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
					else
						pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;

					complete_transaction_now = 1;
				}
			} else {
				/*
				 * no pending I/O to deal with
				 * so, commit all of the fully zero'd pages
				 * plus the last page if it's partial
				 * and we have no more I/O to issue...
				 * otherwise a partial page is left
				 * to begin the next I/O
				 */
				if ((int)io_size >= non_rounded_size)
					pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
				else
					pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;

				commit_offset = upl_offset & ~PAGE_MASK;
			}
			if ( (flags & CL_COMMIT) && pg_count) {
				ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
						     UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;

			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e.  size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (cbp_head && (complete_transaction_now || size == 0)) {
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

				trans_count = 0;
			}
			continue;
		}
		if (pg_count > max_vectors) {
			if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
				io_size = PAGE_SIZE - pg_offset;
				pg_count = 1;
			} else {
				io_size -= (pg_count - max_vectors) * PAGE_SIZE;
				pg_count = max_vectors;
			}
		}
		/*
		 * If the transaction is going to reach the maximum number of
		 * desired elements, truncate the i/o to the nearest page so
		 * that the actual i/o is initiated after this buffer is
		 * created and added to the i/o chain.
		 *
		 * I/O directed to physically contiguous memory
		 * doesn't have a requirement to make sure we 'fill' a page
		 */
		if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
				((upl_offset + io_size) & PAGE_MASK)) {
			vm_offset_t aligned_ofs;

			aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
			/*
			 * If the io_size does not actually finish off even a
			 * single page we have to keep adding buffers to the
			 * transaction despite having reached the desired limit.
			 *
			 * Eventually we get here with the page being finished
			 * off (and exceeded) and then we truncate the size of
			 * this i/o request so that it is page aligned so that
			 * we can finally issue the i/o on the transaction.
			 */
			if (aligned_ofs > upl_offset) {
				io_size = aligned_ofs - upl_offset;
				pg_count--;
			}
		}
		if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
			/*
			 * if we're not targeting a virtual device i.e. a disk image
			 * it's safe to dip into the reserve pool since real devices
			 * can complete this I/O request without requiring additional
			 * bufs from the alloc_io_buf pool
			 */
			priv = 1;
		else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
			/*
			 * Throttle the speculative IO
			 */
			priv = 0;
		else
			priv = 1;

		cbp = alloc_io_buf(vp, priv);

		if (flags & CL_PAGEOUT) {
			u_int i;

			for (i = 0; i < pg_count; i++) {
				if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
					panic("BUSY bp found in cluster_io");
			}
		}
		if (flags & CL_ASYNC) {
			if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
				panic("buf_setcallback failed\n");
		}
		cbp->b_cliodone = (void *)callback;
		cbp->b_flags |= io_flags;
		if (flags & CL_NOCACHE)
			cbp->b_attr.ba_flags |= BA_NOCACHE;

		cbp->b_lblkno = lblkno;
		cbp->b_blkno  = blkno;
		cbp->b_bcount = io_size;

		if (buf_setupl(cbp, upl, upl_offset))
			panic("buf_setupl failed\n");

		cbp->b_trans_next = (buf_t)NULL;

		if ((cbp->b_iostate = (void *)iostate))
			/*
			 * caller wants to track the state of this
			 * io... bump the amount issued against this stream
			 */
			iostate->io_issued += io_size;

		if (flags & CL_READ) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		} else {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}
		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;

			if ( (cbp_head->b_real_bp = real_bp) )
				real_bp = (buf_t)NULL;
		}
		*(buf_t *)(&cbp->b_trans_head) = cbp_head;

		trans_count++;

		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;
		/*
		 * keep track of how much of the original request
		 * that we've actually completed... non_rounded_size
		 * may go negative due to us rounding the request
		 * to a page size multiple (i.e.  size > non_rounded_size)
		 */
		non_rounded_size -= io_size;

		if (non_rounded_size <= 0) {
			/*
			 * we've transferred all of the data in the original
			 * request, but we were unable to complete the tail
			 * of the last page because the file didn't have
			 * an allocation to back that portion... this is ok.
			 */
			size = 0;
		}
		if (size == 0) {
			/*
			 * we have no more I/O to issue, so go
			 * finish the final transaction
			 */
			need_EOT = TRUE;
		} else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
			    ((flags & CL_ASYNC) || trans_count > max_trans_count) ) {
			/*
			 * I/O directed to physically contiguous memory...
			 * which doesn't have a requirement to make sure we 'fill' a page
			 * or...
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and...
			 * it's either an ASYNC request or
			 * we've already accumulated more than 8 I/O's into
			 * this transaction so mark it as complete so that
			 * it can finish asynchronously or via the cluster_complete_transaction
			 * below if the request is synchronous
			 */
			need_EOT = TRUE;
		}
		if (need_EOT == TRUE)
			cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

		if (flags & CL_THROTTLE)
			(void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");

		if ( !(io_flags & B_READ))
			vnode_startwrite(vp);

		if (flags & CL_RAW_ENCRYPTED) {
			/*
			 * User requested raw encrypted bytes.
			 * Twiddle the bit in the ba_flags for the buffer
			 */
			cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
		}
		(void) VNOP_STRATEGY(cbp);

		if (need_EOT == TRUE) {
			if ( !(flags & CL_ASYNC))
				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);

			need_EOT = FALSE;
			trans_count = 0;
			cbp_head = NULL;
		}
	}
	if (error) {
		int abort_size;

		io_size = 0;

		if (cbp_head) {
			/*
			 * first wait until all of the outstanding I/O
			 * for this partial transaction has completed
			 */
			cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

			/*
			 * Rewind the upl offset to the beginning of the
			 * transaction.
			 */
			upl_offset = cbp_head->b_uploffset;

			for (cbp = cbp_head; cbp;) {
				buf_t	cbp_next;

				size    += cbp->b_bcount;
				io_size += cbp->b_bcount;

				cbp_next = cbp->b_trans_next;
				free_io_buf(cbp);
				cbp = cbp_next;
			}
		}
		if (iostate) {
			int need_wakeup = 0;

			/*
			 * update the error condition for this stream
			 * since we never really issued the io
			 * just go ahead and adjust it back
			 */
			lck_mtx_lock_spin(&iostate->io_mtxp);

			if (iostate->io_error == 0)
				iostate->io_error = error;
			iostate->io_issued -= io_size;

			if (iostate->io_wanted) {
				/*
				 * someone is waiting for the state of
				 * this io stream to change
				 */
				iostate->io_wanted = 0;
				need_wakeup = 1;
			}
			lck_mtx_unlock(&iostate->io_mtxp);

			if (need_wakeup)
				wakeup((caddr_t)&iostate->io_wanted);
		}
		if (flags & CL_COMMIT) {
			int	upl_flags;

			pg_offset  = upl_offset & PAGE_MASK;
			abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;

			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
				     upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
		}
		if (retval == 0)
			retval = error;
	} else if (cbp_head)
		panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);

	if (real_bp) {
		/*
		 * can get here if we either encountered an error
		 * or we completely zero-filled the request and
		 * no I/O was issued
		 */
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}
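
/*
 * Vectored UPL support for the direct I/O paths: several sub-UPLs (roughly
 * one per uio vector) are accumulated into a single vector UPL so they can
 * be issued with one cluster_io call rather than one per vector.
 */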
#define reset_vector_run_state()						\
	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;

static int
vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
	   int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	vector_upl_set_pagelist(vector_upl);

	if(io_flag & CL_READ) {
		if(vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK)==0))
			io_flag &= ~CL_PRESERVE; /*don't zero fill*/
		else
			io_flag |= CL_PRESERVE; /*zero fill*/
	}
	return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg));
}
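
/*
 * Speculative read-ahead: cluster_read_prefetch issues an advisory read for
 * up to 'size' bytes beyond the current request, and cluster_read_ahead
 * decides when to do so... the window starts at one page, doubles on each
 * sequential hit (tracked through cl_lastr/cl_maxra in the readahead context)
 * and is clamped by MAX_PREFETCH and the speculative_prefetch_max globals.
 */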
static int
cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	int	pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return(0);
	}
	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;
	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}
static void
cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
		   int bflag)
{
	daddr64_t	r_addr;
	off_t		f_offset;
	int		size_of_prefetch;
	u_int		max_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
		return;
	}
	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
		rap->cl_ralen = 0;
		rap->cl_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);

		return;
	}
	max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD));

	if ((max_prefetch / PAGE_SIZE) > speculative_prefetch_max)
		max_prefetch = (speculative_prefetch_max * PAGE_SIZE);

	if (max_prefetch <= PAGE_SIZE) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
		return;
	}
	if (extent->e_addr < rap->cl_maxra) {
		if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
			return;
		}
	}
	r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
	f_offset = (off_t)(r_addr * PAGE_SIZE_64);

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		daddr64_t read_size;

		rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;

		read_size = (extent->e_addr + 1) - extent->b_addr;

		if (read_size > rap->cl_ralen) {
			if (read_size > max_prefetch / PAGE_SIZE)
				rap->cl_ralen = max_prefetch / PAGE_SIZE;
			else
				rap->cl_ralen = read_size;
		}
		size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);

		if (size_of_prefetch)
			rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
}
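
/*
 * cluster_pageout / cluster_pageout_ext implement the common VNOP_PAGEOUT
 * back-end: validate the request against the EOF, trim and abort any excess
 * pages, then push the rest through cluster_io with CL_PAGEOUT set.
 *
 * Typical use (sketch, names are illustrative): a filesystem's pageout vnop
 * can simply forward its arguments, e.g.
 *
 *	return (cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
 *	                        ap->a_size, myfs_filesize(vp), ap->a_flags));
 *
 * where myfs_filesize() stands in for however the filesystem tracks its EOF.
 */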
int
cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags)
{
	return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
}


int
cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	int	io_size;
	int	rounded_size;
	off_t	max_size;
	int	local_flags;

	local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;
	if ((flags & UPL_KEEPCACHED))
		local_flags |= CL_KEEPCACHED;
	if (flags & UPL_PAGING_ENCRYPTED)
		local_flags |= CL_ENCRYPTED;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (size <= 0)
		return (EINVAL);

	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EROFS);
	}
	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	   (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
					    UPL_ABORT_FREE_ON_EMPTY);
	}
	return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
			   local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
}
int
cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	       int size, off_t filesize, int flags)
{
	return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
}


int
cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	       int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	int	io_size;
	int	rounded_size;
	off_t	max_size;
	int	retval;
	int	local_flags = 0;

	if (upl == NULL || size < 0)
		panic("cluster_pagein: NULL upl passed in");

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;
	if (flags & UPL_IOSTREAMING)
		local_flags |= CL_IOSTREAMING;
	if (flags & UPL_PAGING_ENCRYPTED)
		local_flags |= CL_ENCRYPTED;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	   (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size && (local_flags & CL_COMMIT))
		ubc_upl_abort_range(upl, upl_offset + rounded_size,
				    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
			    local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

	return (retval);
}
int
cluster_bp(buf_t bp)
{
	return cluster_bp_ext(bp, NULL, NULL);
}


int
cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
{
	off_t	f_offset;
	int	flags;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
		     bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (bp->b_flags & B_READ)
		flags = CL_ASYNC | CL_READ;
	else
		flags = CL_ASYNC;
	if (bp->b_flags & B_PASSIVE)
		flags |= CL_PASSIVE;

	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

	return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg));
}
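
/*
 * cluster_write / cluster_write_ext dispatch a write request to one of three
 * engines based on the first uio vector and the IO_* flags: the buffered
 * copy path (cluster_write_copy), the user-mapped direct path
 * (cluster_write_direct, IO_NOCACHE without IO_NODIRECT), or the physically
 * contiguous path (cluster_write_contig), taking care of any head/tail
 * zero-fill requests along the way.
 */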
int
cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
{
	return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
}


int
cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
		  int xflags, int (*callback)(buf_t, void *), void *callback_arg)
{
	user_ssize_t	cur_resid;
	int		retval = 0;
	int		flags;
	int		zflags;
	int		bflag;
	int		write_type = IO_COPY;
	u_int32_t	write_length;

	flags = xflags;

	if (flags & IO_PASSIVE)
		bflag = CL_PASSIVE;
	else
		bflag = 0;

	if (vp->v_flag & VNOCACHE_DATA){
		flags |= IO_NOCACHE;
		bflag |= CL_NOCACHE;
	}
	if (uio == NULL) {
		/*
		 * no user data...
		 * this call is being made to zero-fill some range in the file
		 */
		retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);

		return(retval);
	}
	/*
	 * do a write through the cache if one of the following is true....
	 *   NOCACHE is not true or NODIRECT is true
	 *   the uio request doesn't target USERSPACE
	 * otherwise, find out if we want the direct or contig variant for
	 * the first vector in the uio request
	 */
	if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
		retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);

	if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
		/*
		 * must go through the cached variant in this case
		 */
		write_type = IO_COPY;
	}
	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {

		switch (write_type) {

		case IO_COPY:
			/*
			 * make sure the uio_resid isn't too big...
			 * internally, we want to handle all of the I/O in
			 * chunk sizes that fit in a 32 bit int
			 */
			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
				/*
				 * we're going to have to call cluster_write_copy
				 * more than once...
				 *
				 * only want the last call to cluster_write_copy to
				 * have the IO_TAILZEROFILL flag set and only the
				 * first call should have IO_HEADZEROFILL
				 */
				zflags = flags & ~IO_TAILZEROFILL;
				flags &= ~IO_HEADZEROFILL;

				write_length = MAX_IO_REQUEST_SIZE;
			} else {
				/*
				 * last call to cluster_write_copy
				 */
				zflags = flags;

				write_length = (u_int32_t)cur_resid;
			}
			retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
			break;

		case IO_CONTIG:
			zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);

			if (flags & IO_HEADZEROFILL) {
				/*
				 * only do this once per request
				 */
				flags &= ~IO_HEADZEROFILL;

				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
							    headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
			}
			retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);

			if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
				/*
				 * we're done with the data from the user specified buffer(s)
				 * and we've been requested to zero fill at the tail
				 * treat this as an IO_HEADZEROFILL which doesn't require a uio
				 * by rearranging the args and passing in IO_HEADZEROFILL
				 */
				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
							    (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
			}
			break;

		case IO_DIRECT:
			/*
			 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
			 */
			retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
			break;

		case IO_UNKNOWN:
			retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
			break;
		}
		/*
		 * in case we end up calling cluster_write_copy (from cluster_write_direct)
		 * multiple times to service a multi-vector request that is not aligned properly
		 * we need to update the oldEOF so that we
		 * don't zero-fill the head of a page if we've successfully written
		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
		 * page that is beyond the oldEOF if the write is unaligned... we only
		 * want that to happen for the very first page of the cluster_write,
		 * NOT the first page of each vector making up a multi-vector write.
		 */
		if (uio->uio_offset > oldEOF)
			oldEOF = uio->uio_offset;
	}
	return (retval);
}
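/*
 * A minimal sketch (illustrative assumptions only) of how a filesystem's
 * VNOP_WRITE might drive cluster_write(): the filesystem supplies the old
 * and new EOF plus any head/tail ranges it needs zero-filled, and the
 * cluster layer chooses between the copy, direct and contig paths.  The
 * examplefs_* names are hypothetical.
 */
#if 0	/* illustrative sketch only -- not compiled */
static int
examplefs_vnop_write(vnode_t vp, struct uio *uio, int ioflag)
{
	off_t	oldEOF = examplefs_get_filesize(vp);	/* hypothetical helper */
	off_t	newEOF = MAX(oldEOF, uio->uio_offset + uio_resid(uio));
	int	flags  = ioflag & (IO_SYNC | IO_NOCACHE);

	/*
	 * headOff/tailOff of 0 means no zero-filling is requested; a
	 * filesystem that can't represent holes would instead pass the
	 * ranges it wants zeroed along with IO_HEADZEROFILL/IO_TAILZEROFILL.
	 */
	return (cluster_write(vp, uio, oldEOF, newEOF, 0, 0, flags));
}
#endif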
static int
cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
		     int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_t		 upl;
	upl_page_info_t	 *pl;
	vm_offset_t	 upl_offset;
	vm_offset_t	 vector_upl_offset = 0;
	u_int32_t	 io_req_size;
	u_int32_t	 offset_in_file;
	u_int32_t	 offset_in_iovbase;
	u_int32_t	 io_size;
	int		 io_flag = 0;
	upl_size_t	 upl_size, vector_upl_size = 0;
	vm_size_t	 upl_needed_size;
	mach_msg_type_number_t	pages_in_pl;
	int		 upl_flags;
	kern_return_t	 kret;
	mach_msg_type_number_t	i;
	int		 force_data_sync;
	int		 retval = 0;
	struct clios	 iostate;
	user_addr_t	 iov_base;
	u_int32_t	 mem_alignment_mask;
	u_int32_t	 devblocksize;
	u_int32_t	 max_io_size;
	u_int32_t	 max_upl_size;
	u_int32_t	 max_vector_size;
	boolean_t	 io_throttled = FALSE;

	u_int32_t	 vector_upl_iosize = 0;
	int		 issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1);
	off_t		 v_upl_uio_offset = 0;
	int		 vector_upl_index = 0;
	upl_t		 vector_upl = NULL;

	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 */
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
		     (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

	max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

	io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;

	if (flags & IO_PASSIVE)
		io_flag |= CL_PASSIVE;

	if (flags & IO_NOCACHE)
		io_flag |= CL_NOCACHE;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;

	if (devblocksize == 1) {
		/*
		 * the AFP client advertises a devblocksize of 1
		 * however, its BLOCKMAP routine maps to physical
		 * blocks that are PAGE_SIZE in size...
		 * therefore we can't ask for I/Os that aren't page aligned
		 * or aren't multiples of PAGE_SIZE in size
		 * by setting devblocksize to PAGE_SIZE, we re-instate
		 * the old behavior we had before the mem_alignment_mask
		 * changes went in...
		 */
		devblocksize = PAGE_SIZE;
	}

next_dwrite:
	io_req_size = *write_length;
	iov_base = uio_curriovbase(uio);

	offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;

	if (offset_in_file || offset_in_iovbase) {
		/*
		 * one of the 2 important offsets is misaligned
		 * so fire an I/O through the cache for this entire vector
		 */
		goto wait_for_dwrites;
	}
	if (iov_base & (devblocksize - 1)) {
		/*
		 * the offset in memory must be on a device block boundary
		 * so that we can guarantee that we can generate an
		 * I/O that ends on a page boundary in cluster_io
		 */
		goto wait_for_dwrites;
	}
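	/*
	 * for example (assuming a 4K page and a devblocksize of 512): a write
	 * whose file offset is page aligned and whose user buffer address is
	 * 512-byte aligned can stay on the direct path, while a buffer ending
	 * in ...0x201 fails the check above and is bounced to the cached copy
	 * path via wait_for_dwrites.
	 */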
	while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
		int	throttle_type;

		if ( (throttle_type = cluster_hard_throttle_on(vp, 1)) ) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == 2) {
				/*
				 * we're in the throttle window and at least 1 I/O
				 * has already been issued by a throttleable thread
				 * in this window, so return with EAGAIN to indicate
				 * to the FS issuing the cluster_write call that it
				 * should now throttle after dropping any locks
				 */
				throttle_info_update_by_mount(vp->v_mount);

				io_throttled = TRUE;
				goto wait_for_dwrites;
			}
			max_vector_size = THROTTLE_MAX_IOSIZE;
			max_io_size = THROTTLE_MAX_IOSIZE;
		} else {
			max_vector_size = MAX_VECTOR_UPL_SIZE;
			max_io_size = max_upl_size;
		}

		cluster_syncup(vp, newEOF, callback, callback_arg);

		io_size  = io_req_size & ~PAGE_MASK;
		iov_base = uio_curriovbase(uio);

		if (io_size > max_io_size)
			io_size = max_io_size;

		if(useVectorUPL && (iov_base & PAGE_MASK)) {
			/*
			 * We have an iov_base that's not page-aligned.
			 * Issue all I/O's that have been collected within
			 * this Vectored UPL.
			 */
			if(vector_upl_index) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}

			/*
			 * After this point, if we are using the Vector UPL path and the base is
			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
			 */
		}

		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
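		/*
		 * the loop below makes up to 3 attempts (force_data_sync 0..2)
		 * to wire the user pages behind iov_base into a UPL; if some
		 * of the pages come back non-valid the UPL is aborted and the
		 * attempt is retried with a stronger data sync before giving
		 * up and falling back to wait_for_dwrites.
		 */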
		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
				    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			kret = vm_map_get_upl(current_map(),
					      (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
					      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
					     0, 0, 0, kret, 0);
				/*
				 * failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_dwrites;
			}
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			pages_in_pl = upl_size / PAGE_SIZE;

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			/*
			 * didn't get all the pages back that we
			 * needed... release this upl and try again
			 */
			ubc_upl_abort(upl, 0);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				     i, pages_in_pl, upl_size, kret, 0);
			/*
			 * for some reason, we couldn't acquire a hold on all
			 * the pages needed in the user's address space
			 *
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dwrites;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size < upl_needed_size) {
			if (upl_size && upl_offset == 0)
				io_size = upl_size;
			else
				io_size = 0;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, (int)iov_base, io_size, 0);

		if (io_size == 0) {
			ubc_upl_abort(upl, 0);
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dwrites;
		}

		if (useVectorUPL) {
			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
			if (end_off)
				issueVectorUPL = 1;
			/*
			 * After this point, if we are using a vector UPL, then
			 * either all the UPL elements end on a page boundary OR
			 * this UPL is the last element because it does not end
			 * on a page boundary.
			 */
		}

		/*
		 * Now look for pages already in the cache
		 * and throw them away.
		 * uio->uio_offset is page aligned within the file
		 * io_size is a multiple of PAGE_SIZE
		 */
		ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);

		/*
		 * we want push out these writes asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding writes
		 * wait until some complete before issuing the next
		 */
		if (iostate.io_issued > iostate.io_completed)
			cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct");

		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			ubc_upl_abort(upl, 0);

			goto wait_for_dwrites;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
			     (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

		if (!useVectorUPL)
			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
					    io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		else {
			if (!vector_upl_index) {
				vector_upl = vector_upl_create(upl_offset);
				v_upl_uio_offset = uio->uio_offset;
				vector_upl_offset = upl_offset;
			}

			vector_upl_set_subupl(vector_upl, upl, upl_size);
			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
			vector_upl_index++;
			vector_upl_iosize += io_size;
			vector_upl_size += upl_size;

			if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}
		}

		/*
		 * update the uio structure to
		 * reflect the I/O that we just issued
		 */
		uio_update(uio, (user_size_t)io_size);

		/*
		 * in case we end up calling through to cluster_write_copy to finish
		 * the tail of this request, we need to update the oldEOF so that we
		 * don't zero-fill the head of a page if we've successfully written
		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
		 * page that is beyond the oldEOF if the write is unaligned... we only
		 * want that to happen for the very first page of the cluster_write,
		 * NOT the first page of each vector making up a multi-vector write.
		 */
		if (uio->uio_offset > oldEOF)
			oldEOF = uio->uio_offset;

		io_req_size -= io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
			     (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);

	} /* end while */

	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {

		retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);

		if (retval == 0 && *write_type == IO_DIRECT) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
				     (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

			goto next_dwrite;
		}
	}

wait_for_dwrites:

	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		reset_vector_run_state();
	}

	if (iostate.io_issued > iostate.io_completed) {
		/*
		 * make sure all async writes issued as part of this stream
		 * have completed before we return
		 */
		cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
	}
	if (iostate.io_error)
		retval = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	if (io_throttled == TRUE && retval == 0)
		retval = EAGAIN;

	if (io_req_size && retval == 0) {
		/*
		 * we couldn't handle the tail of this request in DIRECT mode
		 * so fire it through the copy path
		 *
		 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
		 * so we can just pass 0 in for the headOff and tailOff
		 */
		if (uio->uio_offset > oldEOF)
			oldEOF = uio->uio_offset;

		retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);

		*write_type = IO_UNKNOWN;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
		     (int)uio->uio_offset, io_req_size, retval, 4, 0);

	return (retval);
}
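/*
 * note: the direct-write path above requires the file offset to be page
 * aligned and the user buffer to be aligned to the device block size; any
 * vector that fails those tests, and any sub-page tail, is handed to
 * cluster_write_copy, so correctness never depends on the caller's alignment.
 */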
static int
cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
		     int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	upl_page_info_t *pl;
	addr64_t	 src_paddr = 0;
	upl_t		 upl[MAX_VECTS];
	vm_offset_t	 upl_offset;
	u_int32_t	 tail_size = 0;
	u_int32_t	 io_size;
	u_int32_t	 xsize;
	upl_size_t	 upl_size;
	vm_size_t	 upl_needed_size;
	mach_msg_type_number_t	pages_in_pl;
	int		 upl_flags;
	kern_return_t	 kret;
	struct clios	 iostate;
	int		 error = 0;
	int		 cur_upl = 0;
	int		 num_upl = 0;
	int		 n;
	user_addr_t	 iov_base;
	u_int32_t	 devblocksize;
	u_int32_t	 mem_alignment_mask;

	/*
	 * When we enter this routine, we know
	 *  -- the io_req_size will not exceed iov_len
	 *  -- the target address is physically contiguous
	 */
	cluster_syncup(vp, newEOF, callback, callback_arg);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

next_cwrite:
	io_size = *write_length;

	iov_base = uio_curriovbase(uio);

	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	upl_needed_size = upl_offset + io_size;

	pages_in_pl = 0;
	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
		    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	kret = vm_map_get_upl(current_map(),
			      (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
			      &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * failed to get pagelist
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}
	num_upl++;

	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 */
	if (upl_size < upl_needed_size) {
		/*
		 * This is a failure in the physical memory case.
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}
	pl = ubc_upl_pageinfo(upl[cur_upl]);

	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
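	/*
	 * src_paddr is the 64-bit physical address of the start of the user
	 * buffer: the physical page number of the first page of the run
	 * shifted into a byte address, plus the offset of iov_base within
	 * that page.
	 */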
	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		u_int32_t   head_size;

		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);

		if (error)
			goto wait_for_cwrites;

		upl_offset += head_size;
		src_paddr  += head_size;
		io_size    -= head_size;

		iov_base   += head_size;
	}
	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/*
		 * request doesn't set up on a memory boundary
		 * the underlying DMA engine can handle...
		 * return an error instead of going through
		 * the slow copy path since the intent of this
		 * path is direct I/O from device memory
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}

	tail_size = io_size & (devblocksize - 1);
	io_size  -= tail_size;

	while (io_size && error == 0) {

		if (io_size > MAX_IO_CONTIG_SIZE)
			xsize = MAX_IO_CONTIG_SIZE;
		else
			xsize = io_size;
		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since its all issued against the same UPL
		 * if there are already too many outstanding writes
		 * wait until some have completed before issuing the next
		 */
		if (iostate.io_issued > iostate.io_completed)
			cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");

		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes...
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			goto wait_for_cwrites;
		}
		/*
		 * issue an asynchronous write to cluster_io
		 */
		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
				   xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);

		if (error == 0) {
			/*
			 * The cluster_io write completed successfully,
			 * update the uio structure
			 */
			uio_update(uio, (user_size_t)xsize);

			upl_offset += xsize;
			src_paddr  += xsize;
			io_size    -= xsize;
		}
	}
	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {

		error = cluster_io_type(uio, write_type, write_length, 0);

		if (error == 0 && *write_type == IO_CONTIG) {
			cur_upl++;
			goto next_cwrite;
		}
	} else
		*write_type = IO_UNKNOWN;

wait_for_cwrites:
	/*
	 * make sure all async writes that are part of this stream
	 * have completed before we proceed
	 */
	if (iostate.io_issued > iostate.io_completed)
		cluster_iostate_wait(&iostate, 0, "cluster_write_contig");

	if (iostate.io_error)
		error = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	if (error == 0 && tail_size)
		error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);

	for (n = 0; n < num_upl; n++)
		/*
		 * just release our hold on each physically contiguous
		 * region without changing any state
		 */
		ubc_upl_abort(upl[n], 0);

	return (error);
}
/*
 * need to avoid a race between an msync of a range of pages dirtied via mmap
 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
 *
 * we should never force-zero-fill pages that are already valid in the cache...
 * the entire page contains valid data (either from disk, zero-filled or dirtied
 * via an mmap) so we can only do damage by trying to zero-fill
 */
static int
cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
{
	int zero_pg_index;
	boolean_t need_cluster_zero = TRUE;

	if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {

		bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
		zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);

		if (upl_valid_page(pl, zero_pg_index)) {
			/*
			 * never force zero valid pages - dirty or clean
			 * we'll leave these in the UPL for cluster_write_copy to deal with
			 */
			need_cluster_zero = FALSE;
		}
	}
	if (need_cluster_zero == TRUE)
		cluster_zero(upl, io_offset, bytes_to_zero, NULL);

	return (bytes_to_zero);
}
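/*
 * for example (assuming a 4K page): with zero_off pointing 0x200 bytes into
 * a page and a requested bytes_to_zero of 0x2000, the IO_NOZEROVALID /
 * IO_NOZERODIRTY case above clamps the zeroing to the 0xE00 bytes remaining
 * in that one page, and skips even that if the page is already valid in the
 * UPL.
 */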
static int
cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
		   off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t *pl;
	upl_t		 upl;
	vm_offset_t	 upl_offset = 0;
	vm_size_t	 upl_size;
	off_t		 upl_f_offset;
	int		 pages_in_upl;
	int		 start_offset;
	int		 xfer_resid;
	int		 io_size;
	int		 io_offset;
	int		 bytes_to_zero;
	int		 bytes_to_move;
	kern_return_t	 kret;
	int		 retval = 0;
	int		 io_resid;
	long long	 total_size;
	long long	 zero_cnt;
	off_t		 zero_off;
	long long	 zero_cnt1;
	off_t		 zero_off1;
	off_t		 write_off = 0;
	int		 write_cnt = 0;
	boolean_t	 first_pass = FALSE;
	struct cl_extent cl;
	struct cl_writebehind *wbp;
	int		 bflag;
	u_int		 max_cluster_pgcount;
	u_int		 max_io_size;

	if (uio) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
			     (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);

		io_resid = io_req_size;
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
			     0, 0, (int)oldEOF, (int)newEOF, 0);

		io_resid = 0;
	}
	if (flags & IO_PASSIVE)
		bflag = CL_PASSIVE;
	else
		bflag = 0;
	if (flags & IO_NOCACHE)
		bflag |= CL_NOCACHE;

	zero_cnt  = 0;
	zero_cnt1 = 0;
	zero_off  = 0;
	zero_off1 = 0;

	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

	if (flags & IO_HEADZEROFILL) {
		/*
		 * some filesystems (HFS is one) don't support unallocated holes within a file...
		 * so we zero fill the intervening space between the old EOF and the offset
		 * where the next chunk of real data begins.... ftruncate will also use this
		 * routine to zero fill to the new EOF when growing a file... in this case, the
		 * uio structure will not be provided
		 */
		if (uio) {
			if (headOff < uio->uio_offset) {
				zero_cnt = uio->uio_offset - headOff;
				zero_off = headOff;
			}
		} else if (headOff < newEOF) {
			zero_cnt = newEOF - headOff;
			zero_off = headOff;
		}
	} else {
		if (uio && uio->uio_offset > oldEOF) {
			zero_off = uio->uio_offset & ~PAGE_MASK_64;

			if (zero_off >= oldEOF) {
				zero_cnt = uio->uio_offset - zero_off;

				flags |= IO_HEADZEROFILL;
			}
		}
	}
	if (flags & IO_TAILZEROFILL) {
		if (uio) {
			zero_off1 = uio->uio_offset + io_req_size;

			if (zero_off1 < tailOff)
				zero_cnt1 = tailOff - zero_off1;
		}
	} else {
		if (uio && newEOF > oldEOF) {
			zero_off1 = uio->uio_offset + io_req_size;

			if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
				zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);

				flags |= IO_TAILZEROFILL;
			}
		}
	}
	if (zero_cnt == 0 && uio == (struct uio *) 0) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
			     retval, 0, 0, 0, 0);
		return (0);
	}
	if (uio) {
		write_off = uio->uio_offset;
		write_cnt = uio_resid(uio);
		/*
		 * delay updating the sequential write info
		 * in the control block until we've obtained
		 * the lock for it
		 */
		first_pass = TRUE;
	}
	while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
		/*
		 * for this iteration of the loop, figure out where our starting point is
		 */
		if (zero_cnt) {
			start_offset = (int)(zero_off & PAGE_MASK_64);
			upl_f_offset = zero_off - start_offset;
		} else if (io_resid) {
			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
			upl_f_offset = uio->uio_offset - start_offset;
		} else {
			start_offset = (int)(zero_off1 & PAGE_MASK_64);
			upl_f_offset = zero_off1 - start_offset;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
			     (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);

		if (total_size > max_io_size)
			total_size = max_io_size;

		cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);

		if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
			/*
			 * assumption... total_size <= io_resid
			 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
			 */
			if ((start_offset + total_size) > max_io_size)
				total_size = max_io_size - start_offset;
			xfer_resid = total_size;

			retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);

			if (retval)
				break;

			io_resid    -= (total_size - xfer_resid);
			total_size   = xfer_resid;
			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
			upl_f_offset = uio->uio_offset - start_offset;

			if (total_size == 0) {
				if (start_offset) {
					/*
					 * the write did not finish on a page boundary
					 * which will leave upl_f_offset pointing to the
					 * beginning of the last page written instead of
					 * the page beyond it... bump it in this case
					 * so that the cluster code records the last page
					 * written as dirty
					 */
					upl_f_offset += PAGE_SIZE_64;
				}
				upl_size = 0;

				goto check_cluster;
			}
		}
		/*
		 * compute the size of the upl needed to encompass
		 * the requested write... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (upl_size > max_io_size)
			upl_size = max_io_size;

		pages_in_upl = upl_size / PAGE_SIZE;
		io_size      = upl_size - start_offset;

		if ((long long)io_size > total_size)
			io_size = total_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);

		/*
		 * Gather the pages from the buffer cache.
		 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
		 * that we intend to modify these pages.
		 */
		kret = ubc_create_upl(vp,
				      upl_f_offset,
				      upl_size,
				      &upl,
				      &pl,
				      UPL_SET_LITE | (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY));
		if (kret != KERN_SUCCESS)
			panic("cluster_write_copy: failed to get pagelist");

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
			     upl, (int)upl_f_offset, start_offset, 0, 0);

		if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
			int   read_size;

			/*
			 * we're starting in the middle of the first page of the upl
			 * and the page isn't currently valid, so we're going to have
			 * to read it in first... this is a synchronous operation
			 */
			read_size = PAGE_SIZE;

			if ((upl_f_offset + read_size) > oldEOF)
				read_size = oldEOF - upl_f_offset;

			retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
					    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
			if (retval) {
				/*
				 * we had an error during the read which causes us to abort
				 * the current cluster_write request... before we do, we need
				 * to release the rest of the pages in the upl without modifying
				 * there state and mark the failed page in error
				 */
				ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);

				if (upl_size > PAGE_SIZE)
					ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
					     upl, 0, 0, retval, 0);
				break;
			}
		}
		if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
			/*
			 * the last offset we're writing to in this upl does not end on a page
			 * boundary... if it's not beyond the old EOF, then we'll also need to
			 * pre-read this page in if it isn't already valid
			 */
			upl_offset = upl_size - PAGE_SIZE;

			if ((upl_f_offset + start_offset + io_size) < oldEOF &&
			    !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
				int   read_size;

				read_size = PAGE_SIZE;

				if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF)
					read_size = oldEOF - (upl_f_offset + upl_offset);

				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
						    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
				if (retval) {
					/*
					 * we had an error during the read which causes us to abort
					 * the current cluster_write request... before we do, we
					 * need to release the rest of the pages in the upl without
					 * modifying there state and mark the failed page in error
					 */
					ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);

					if (upl_size > PAGE_SIZE)
						ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
						     upl, 0, 0, retval, 0);
					break;
				}
			}
		}
		xfer_resid = io_size;
		io_offset = start_offset;

		while (zero_cnt && xfer_resid) {

			if (zero_cnt < (long long)xfer_resid)
				bytes_to_zero = zero_cnt;
			else
				bytes_to_zero = xfer_resid;

			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);

			xfer_resid -= bytes_to_zero;
			zero_cnt   -= bytes_to_zero;
			zero_off   += bytes_to_zero;
			io_offset  += bytes_to_zero;
		}
		if (xfer_resid && io_resid) {
			u_int32_t  io_requested;

			bytes_to_move = min(io_resid, xfer_resid);
			io_requested = bytes_to_move;

			retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);

			if (retval) {
				ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
					     upl, 0, 0, retval, 0);
			} else {
				io_resid   -= bytes_to_move;
				xfer_resid -= bytes_to_move;
				io_offset  += bytes_to_move;
			}
		}
		while (xfer_resid && zero_cnt1 && retval == 0) {

			if (zero_cnt1 < (long long)xfer_resid)
				bytes_to_zero = zero_cnt1;
			else
				bytes_to_zero = xfer_resid;

			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);

			xfer_resid -= bytes_to_zero;
			zero_cnt1  -= bytes_to_zero;
			zero_off1  += bytes_to_zero;
			io_offset  += bytes_to_zero;
		}
		if (retval == 0) {
			int cl_index;
			int ret_cluster_try_push;

			io_size += start_offset;

			if ((upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
				/*
				 * if we're extending the file with this write
				 * we'll zero fill the rest of the page so that
				 * if the file gets extended again in such a way as to leave a
				 * hole starting at this EOF, we'll have zero's in the correct spot
				 */
				cluster_zero(upl, io_size, upl_size - io_size, NULL);
			}
			/*
			 * release the upl now if we hold one since...
			 * 1) pages in it may be present in the sparse cluster map
			 *    and may span 2 separate buckets there... if they do and
			 *    we happen to have to flush a bucket to make room and it intersects
			 *    this upl, a deadlock may result on page BUSY
			 * 2) we're delaying the I/O... from this point forward we're just updating
			 *    the cluster state... no need to hold the pages, so commit them
			 * 3) IO_SYNC is set...
			 *    because we had to ask for a UPL that provides currenty non-present pages, the
			 *    UPL has been automatically set to clear the dirty flags (both software and hardware)
			 *    upon committing it... this is not the behavior we want since it's possible for
			 *    pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
			 *    we'll pick these pages back up later with the correct behavior specified.
			 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
			 *    of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
			 *    we hold since the flushing context is holding the cluster lock.
			 */
			ubc_upl_commit_range(upl, 0, upl_size,
					     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

check_cluster:
			/*
			 * calculate the last logical block number
			 * that this delayed I/O encompassed
			 */
			cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);

			if (flags & IO_SYNC) {
				/*
				 * if the IO_SYNC flag is set than we need to
				 * bypass any clusters and immediately issue
				 * the I/O
				 */
				goto issue_io;
			}
			/*
			 * take the lock to protect our accesses
			 * of the writebehind and sparse cluster state
			 */
			wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

			if (wbp->cl_scmap) {

				if ( !(flags & IO_NOCACHE)) {
					/*
					 * we've fallen into the sparse
					 * cluster method of delaying dirty pages
					 */
					sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);

					lck_mtx_unlock(&wbp->cl_lockw);

					continue;
				}
				/*
				 * must have done cached writes that fell into
				 * the sparse cluster mechanism... we've switched
				 * to uncached writes on the file, so go ahead
				 * and push whatever's in the sparse map
				 * and switch back to normal clustering
				 */
				wbp->cl_number = 0;

				sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg);
				/*
				 * no clusters of either type present at this point
				 * so just go directly to start_new_cluster since
				 * we know we need to delay this I/O since we've
				 * already released the pages back into the cache
				 * to avoid the deadlock with sparse_cluster_push
				 */
				goto start_new_cluster;
			}
			if (first_pass) {
				if (write_off == wbp->cl_last_write)
					wbp->cl_seq_written += write_cnt;
				else
					wbp->cl_seq_written = write_cnt;

				wbp->cl_last_write = write_off + write_cnt;

				first_pass = FALSE;
			}
			if (wbp->cl_number == 0)
				/*
				 * no clusters currently present
				 */
				goto start_new_cluster;

			for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
				/*
				 * check each cluster that we currently hold
				 * try to merge some or all of this write into
				 * one or more of the existing clusters... if
				 * any portion of the write remains, start a
				 * new cluster
				 */
				if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
					/*
					 * the current write starts at or after the current cluster
					 */
					if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
						/*
						 * we have a write that fits entirely
						 * within the existing cluster limits
						 */
						if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
							/*
							 * update our idea of where the cluster ends
							 */
							wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
						break;
					}
					if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
						/*
						 * we have a write that starts in the middle of the current cluster
						 * but extends beyond the cluster's limit... we know this because
						 * of the previous checks
						 * we'll extend the current cluster to the max
						 * and update the b_addr for the current write to reflect that
						 * the head of it was absorbed into this cluster...
						 * note that we'll always have a leftover tail in this case since
						 * full absorbtion would have occurred in the clause above
						 */
						wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;

						cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
					}
					/*
					 * we come here for the case where the current write starts
					 * beyond the limit of the existing cluster or we have a leftover
					 * tail after a partial absorbtion
					 *
					 * in either case, we'll check the remaining clusters before
					 * starting a new one
					 */
				} else {
					/*
					 * the current write starts in front of the cluster we're currently considering
					 */
					if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) {
						/*
						 * we can just merge the new request into
						 * this cluster and leave it in the cache
						 * since the resulting cluster is still
						 * less than the maximum allowable size
						 */
						wbp->cl_clusters[cl_index].b_addr = cl.b_addr;

						if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
							/*
							 * the current write completely
							 * envelops the existing cluster and since
							 * each write is limited to at most max_cluster_pgcount pages
							 * we can just use the start and last blocknos of the write
							 * to generate the cluster limits
							 */
							wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
						}
						break;
					}
					/*
					 * if we were to combine this write with the current cluster
					 * we would exceed the cluster size limit.... so,
					 * let's see if there's any overlap of the new I/O with
					 * the cluster we're currently considering... in fact, we'll
					 * stretch the cluster out to it's full limit and see if we
					 * get an intersection with the current write
					 */
					if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
						/*
						 * the current write extends into the proposed cluster
						 * clip the length of the current write after first combining it's
						 * tail with the newly shaped cluster
						 */
						wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;

						cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
					}
					/*
					 * if we get here, there was no way to merge
					 * any portion of this write with this cluster
					 * or we could only merge part of it which
					 * will leave a tail...
					 * we'll check the remaining clusters before starting a new one
					 */
				}
			}
			if (cl_index < wbp->cl_number)
				/*
				 * we found an existing cluster(s) that we
				 * could entirely merge this I/O into
				 */
				goto delay_io;

			if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) &&
			    wbp->cl_number == MAX_CLUSTERS &&
			    wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
				uint32_t n;

				if (vp->v_mount->mnt_kern_flag & MNTK_SSD)
					n = WRITE_BEHIND_SSD;
				else
					n = WRITE_BEHIND;

				while (n--)
					cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg);
			}
			if (wbp->cl_number < MAX_CLUSTERS) {
				/*
				 * we didn't find an existing cluster to
				 * merge into, but there's room to start
				 * a new one
				 */
				goto start_new_cluster;
			}
			/*
			 * no exisitng cluster to merge with and no
			 * room to start a new one... we'll try
			 * pushing one of the existing ones... if none of
			 * them are able to be pushed, we'll switch
			 * to the sparse cluster mechanism
			 * cluster_try_push updates cl_number to the
			 * number of remaining clusters... and
			 * returns the number of currently unused clusters
			 */
			ret_cluster_try_push = 0;

			/*
			 * if writes are not deferred, call cluster push immediately
			 */
			if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
				ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg);
			}
			/*
			 * execute following regardless of writes being deferred or not
			 */
			if (ret_cluster_try_push == 0) {
				/*
				 * no more room in the normal cluster mechanism
				 * so let's switch to the more expansive but expensive
				 * sparse mechanism....
				 */
				sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg);
				sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);

				lck_mtx_unlock(&wbp->cl_lockw);

				continue;
			}
start_new_cluster:
			wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
			wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;

			wbp->cl_clusters[wbp->cl_number].io_flags = 0;

			if (flags & IO_NOCACHE)
				wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;

			if (bflag & CL_PASSIVE)
				wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;

			wbp->cl_number++;
delay_io:
			lck_mtx_unlock(&wbp->cl_lockw);

			continue;
issue_io:
			/*
			 * we don't hold the lock at this point
			 *
			 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
			 * so that we correctly deal with a change in state of the hardware modify bit...
			 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
			 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
			 * responsible for generating the correct sized I/O(s)
			 */
			retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg);
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);

	return (retval);
}
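/*
 * summary of the delayed-write bookkeeping above: cached writes are
 * committed to the UBC immediately (marked dirty), and only their page
 * ranges are remembered -- either in one of the MAX_CLUSTERS write-behind
 * clusters hanging off the vnode's cl_writebehind state, or, once those fill
 * up and can't be pushed, in the sparse cluster map.  The actual disk I/O is
 * issued later by cluster_try_push/sparse_cluster_push, or immediately via
 * cluster_push_now when IO_SYNC is set.
 */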
int
cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
{
	return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
}
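/*
 * A minimal sketch (illustrative assumptions only) of how a filesystem's
 * VNOP_READ might hand a request to the cluster layer; beyond the uio, the
 * filesystem only has to supply the current file size so the cluster code
 * can clip the transfer and drive read-ahead.  The examplefs_* names are
 * hypothetical.
 */
#if 0	/* illustrative sketch only -- not compiled */
static int
examplefs_vnop_read(vnode_t vp, struct uio *uio, int ioflag)
{
	off_t	filesize = examplefs_get_filesize(vp);	/* hypothetical helper */

	return (cluster_read(vp, uio, filesize, ioflag & IO_NOCACHE));
}
#endif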
int
cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
{
	int		retval = 0;
	int		flags;
	user_ssize_t	cur_resid;
	u_int32_t	io_size;
	u_int32_t	read_length = 0;
	int		read_type = IO_COPY;

	flags = xflags;

	if (vp->v_flag & VNOCACHE_DATA)
		flags |= IO_NOCACHE;
	if ((vp->v_flag & VRAOFF) || speculative_reads_disabled)
		flags |= IO_RAOFF;

	/*
	 * If we're doing an encrypted IO, then first check to see
	 * if the IO requested was page aligned.  If not, then bail
	 * out immediately.
	 */
	if (flags & IO_ENCRYPTED) {
		if (read_length & PAGE_MASK) {
			retval = EINVAL;
			return retval;
		}
	}

	/*
	 * do a read through the cache if one of the following is true....
	 *   NOCACHE is not true
	 *   the uio request doesn't target USERSPACE
	 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
	 * Reading encrypted data from a CP filesystem should never result in the data touching
	 * the UBC.
	 *
	 * otherwise, find out if we want the direct or contig variant for
	 * the first vector in the uio request
	 */
	if (((flags & IO_NOCACHE) || (flags & IO_ENCRYPTED)) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
		retval = cluster_io_type(uio, &read_type, &read_length, 0);
	}

	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {

		switch (read_type) {

		case IO_COPY:
			/*
			 * make sure the uio_resid isn't too big...
			 * internally, we want to handle all of the I/O in
			 * chunk sizes that fit in a 32 bit int
			 */
			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE))
				io_size = MAX_IO_REQUEST_SIZE;
			else
				io_size = (u_int32_t)cur_resid;

			retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
			break;

		case IO_DIRECT:
			retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
			break;

		case IO_CONTIG:
			retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
			break;

		case IO_UNKNOWN:
			retval = cluster_io_type(uio, &read_type, &read_length, 0);
			break;
		}
	}
	return (retval);
}
static void
cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
{
	int range;
	int abort_flags = UPL_ABORT_FREE_ON_EMPTY;

	if ((range = last_pg - start_pg)) {
		if (take_reference)
			abort_flags |= UPL_ABORT_REFERENCE;

		ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
	}
}
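/*
 * note: when take_reference is set the pages are released with
 * UPL_ABORT_REFERENCE, which should leave them looking recently used to the
 * pageout machinery; cached reads use this so that data a reader just
 * touched isn't the first thing reclaimed, while NOCACHE and throttled
 * readers skip the reference.
 */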
static int
cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t *pl;
	upl_t		 upl;
	vm_offset_t	 upl_offset;
	u_int32_t	 upl_size;
	off_t		 upl_f_offset;
	int		 start_offset;
	int		 start_pg;
	int		 last_pg;
	int		 uio_last = 0;
	int		 pages_in_upl;
	off_t		 max_size;
	off_t		 last_ioread_offset;
	off_t		 last_request_offset;
	kern_return_t	 kret;
	int		 error = 0;
	int		 retval = 0;
	u_int32_t	 size_of_prefetch;
	u_int32_t	 xsize;
	u_int32_t	 io_size;
	u_int32_t	 max_rd_size;
	u_int32_t	 max_io_size;
	u_int32_t	 max_prefetch;
	u_int		 rd_ahead_enabled = 1;
	u_int		 prefetch_enabled = 1;
	struct cl_readahead *	rap;
	struct clios		iostate;
	struct cl_extent	extent;
	int		 bflag;
	int		 take_reference = 1;
	int		 policy = IOPOL_DEFAULT;
	boolean_t	 iolock_inited = FALSE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
		     (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);

	if (flags & IO_ENCRYPTED) {
		panic ("encrypted blocks will hit UBC!");
	}

	policy = proc_get_task_selfdiskacc();

	if (policy == IOPOL_THROTTLE || policy == IOPOL_UTILITY || (flags & IO_NOCACHE))
		take_reference = 0;

	if (flags & IO_PASSIVE)
		bflag = CL_PASSIVE;
	else
		bflag = 0;

	if (flags & IO_NOCACHE)
		bflag |= CL_NOCACHE;

	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
	max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD));
	max_rd_size = max_prefetch;

	last_request_offset = uio->uio_offset + io_req_size;

	if (last_request_offset > filesize)
		last_request_offset = filesize;

	if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
		rd_ahead_enabled = 0;
		rap = NULL;
	} else {
		if (cluster_hard_throttle_on(vp, 1)) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			rd_ahead_enabled = 0;
			prefetch_enabled = 0;

			max_rd_size = THROTTLE_MAX_IOSIZE;
		}
		if ((rap = cluster_get_rap(vp)) == NULL)
			rd_ahead_enabled = 0;
		else {
			extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
			extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
		}
	}
	if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
		/*
		 * determine if we already have a read-ahead in the pipe courtesy of the
		 * last read systemcall that was issued...
		 * if so, pick up it's extent to determine where we should start
		 * with respect to any read-ahead that might be necessary to
		 * garner all the data needed to complete this read systemcall
		 */
		last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;

		if (last_ioread_offset < uio->uio_offset)
			last_ioread_offset = (off_t)0;
		else if (last_ioread_offset > last_request_offset)
			last_ioread_offset = last_request_offset;
	} else
		last_ioread_offset = (off_t)0;

	while (io_req_size && uio->uio_offset < filesize && retval == 0) {

		max_size = filesize - uio->uio_offset;

		if ((off_t)(io_req_size) < max_size)
			io_size = io_req_size;
		else
			io_size = max_size;

		if (!(flags & IO_NOCACHE)) {

			while (io_size) {
				u_int32_t io_resid;
				u_int32_t io_requested;

				/*
				 * if we keep finding the pages we need already in the cache, then
				 * don't bother to call cluster_read_prefetch since it costs CPU cycles
				 * to determine that we have all the pages we need... once we miss in
				 * the cache and have issued an I/O, than we'll assume that we're likely
				 * to continue to miss in the cache and it's to our advantage to try and prefetch
				 */
				if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
					if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
						/*
						 * we've already issued I/O for this request and
						 * there's still work to do and
						 * our prefetch stream is running dry, so issue a
						 * pre-fetch I/O... the I/O latency will overlap
						 * with the copying of the data
						 */
						if (size_of_prefetch > max_rd_size)
							size_of_prefetch = max_rd_size;

						size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

						last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

						if (last_ioread_offset > last_request_offset)
							last_ioread_offset = last_request_offset;
					}
				}
				/*
				 * limit the size of the copy we're about to do so that
				 * we can notice that our I/O pipe is running dry and
				 * get the next I/O issued before it does go dry
				 */
				if (last_ioread_offset && io_size > (max_io_size / 4))
					io_resid = (max_io_size / 4);
				else
					io_resid = io_size;

				io_requested = io_resid;

				retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);

				xsize = io_requested - io_resid;

				io_size -= xsize;
				io_req_size -= xsize;

				if (retval || io_resid)
					/*
					 * if we run into a real error or
					 * a page that is not in the cache
					 * we need to leave streaming mode
					 */
					break;

				if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
					/*
					 * we're already finished the I/O for this read request
					 * let's see if we should do a read-ahead
					 */
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
				}
			}
			if (retval)
				break;
			if (io_size == 0) {
				if (rap != NULL) {
					if (extent.e_addr < rap->cl_lastr)
						rap->cl_maxra = 0;
					rap->cl_lastr = extent.e_addr;
				}
				break;
			}
			/*
			 * recompute max_size since cluster_copy_ubc_data_internal
			 * may have advanced uio->uio_offset
			 */
			max_size = filesize - uio->uio_offset;
		}
		iostate.io_completed = 0;
		iostate.io_issued = 0;
		iostate.io_error = 0;
		iostate.io_wanted = 0;

		if ( (flags & IO_RETURN_ON_THROTTLE) ) {
			if (cluster_hard_throttle_on(vp, 0) == 2) {
				if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
					/*
					 * we're in the throttle window and at least 1 I/O
					 * has already been issued by a throttleable thread
					 * in this window, so return with EAGAIN to indicate
					 * to the FS issuing the cluster_read call that it
					 * should now throttle after dropping any locks
					 */
					throttle_info_update_by_mount(vp->v_mount);

					retval = EAGAIN;
					break;
				}
			}
		}
		/*
		 * compute the size of the upl needed to encompass
		 * the requested read... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		upl_f_offset = uio->uio_offset - (off_t)start_offset;

		if (io_size > max_rd_size)
			io_size = max_rd_size;

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (flags & IO_NOCACHE) {
			if (upl_size > max_io_size)
				upl_size = max_io_size;
		} else {
			if (upl_size > max_io_size / 4)
				upl_size = max_io_size / 4;
		}
		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl(vp,
				      upl_f_offset,
				      upl_size,
				      &upl,
				      &pl,
				      UPL_FILE_IO | UPL_SET_LITE);
		if (kret != KERN_SUCCESS)
			panic("cluster_read_copy: failed to get pagelist");

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);

		/*
		 * scan from the beginning of the upl looking for the first
		 * non-valid page.... this will become the first page in
		 * the request we're going to make to 'cluster_io'... if all
		 * of the pages are valid, we won't call through to 'cluster_io'
		 */
		for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
			if (!upl_valid_page(pl, start_pg))
				break;
		}

		/*
		 * scan from the starting invalid page looking for a valid
		 * page before the end of the upl is reached, if we
		 * find one, then it will be the last page of the request to
		 * 'cluster_io'
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (upl_valid_page(pl, last_pg))
				break;
		}

		if (start_pg < last_pg) {
			/*
			 * we found a range of 'invalid' pages that must be filled
			 * if the last page in this range is the last page of the file
			 * we may have to clip the size of it to keep from reading past
			 * the end of the last physical block associated with the file
			 */
			if (iolock_inited == FALSE) {
				lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

				iolock_inited = TRUE;
			}
			upl_offset = start_pg * PAGE_SIZE;
			io_size    = (last_pg - start_pg) * PAGE_SIZE;

			if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
				io_size = filesize - (upl_f_offset + upl_offset);

			/*
			 * issue an asynchronous read to cluster_io
			 */
			error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
					   io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);

			if (rap) {
				if (extent.e_addr < rap->cl_maxra) {
					/*
					 * we've just issued a read for a block that should have been
					 * in the cache courtesy of the read-ahead engine... something
					 * has gone wrong with the pipeline, so reset the read-ahead
					 * logic which will cause us to restart from scratch
					 */
					rap->cl_maxra = 0;
				}
			}
		}
		if (error == 0) {
			/*
			 * if the read completed successfully, or there was no I/O request
			 * issued, than copy the data into user land via 'cluster_upl_copy_data'
			 * we'll first add on any 'valid'
			 * pages that were present in the upl when we acquired it.
			 */
			u_int  val_size;

			for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
				if (!upl_valid_page(pl, uio_last))
					break;
			}
			if (uio_last < pages_in_upl) {
				/*
				 * there were some invalid pages beyond the valid pages
				 * that we didn't issue an I/O for, just release them
				 * unchanged now, so that any prefetch/readahed can
				 * include them
				 */
				ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
						    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
			}
			/*
			 * compute size to transfer this round,  if io_req_size is
			 * still non-zero after this attempt, we'll loop around and
			 * set up for another I/O.
			 */
			val_size = (uio_last * PAGE_SIZE) - start_offset;

			if (val_size > max_size)
				val_size = max_size;

			if (val_size > io_req_size)
				val_size = io_req_size;

			if ((uio->uio_offset + val_size) > last_ioread_offset)
				last_ioread_offset = uio->uio_offset + val_size;

			if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {

				if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
					/*
					 * if there's still I/O left to do for this request, and...
					 * we're not in hard throttle mode, and...
					 * we're close to using up the previous prefetch, then issue a
					 * new pre-fetch I/O... the I/O latency will overlap
					 * with the copying of the data
					 */
					if (size_of_prefetch > max_rd_size)
						size_of_prefetch = max_rd_size;

					size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

					last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

					if (last_ioread_offset > last_request_offset)
						last_ioread_offset = last_request_offset;
				}

			} else if ((uio->uio_offset + val_size) == last_request_offset) {
				/*
				 * this transfer will finish this request, so...
				 * let's try to read ahead if we're in
				 * a sequential access pattern and we haven't
				 * explicitly disabled it
				 */
				if (rd_ahead_enabled)
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);

				if (rap != NULL) {
					if (extent.e_addr < rap->cl_lastr)
						rap->cl_maxra = 0;
					rap->cl_lastr = extent.e_addr;
				}
			}
			if (iostate.io_issued > iostate.io_completed)
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");

			if (iostate.io_error)
				error = iostate.io_error;
			else {
				u_int32_t io_requested;

				io_requested = val_size;

				retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);

				io_req_size -= (val_size - io_requested);
			}
		} else {
			if (iostate.io_issued > iostate.io_completed)
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
		}
		if (start_pg < last_pg) {
			/*
			 * compute the range of pages that we actually issued an I/O for
			 * and either commit them as valid if the I/O succeeded
			 * or abort them if the I/O failed or we're not supposed to
			 * keep them in the cache
			 */
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);

			if (error || (flags & IO_NOCACHE))
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
						    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
			else {
				int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;

				if (take_reference)
					commit_flags |= UPL_COMMIT_INACTIVATE;
				else
					commit_flags |= UPL_COMMIT_SPECULATE;

				ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
		}
		if ((last_pg - start_pg) < pages_in_upl) {
			/*
			 * the set of pages that we issued an I/O for did not encompass
			 * the entire upl... so just release these without modifying
			 * their state
			 */
			if (error)
				ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
			else {

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
					     upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

				/*
				 * handle any valid pages at the beginning of
				 * the upl... release these appropriately
				 */
				cluster_read_upl_release(upl, 0, start_pg, take_reference);

				/*
				 * handle any valid pages immediately after the
				 * pages we issued I/O for... ... release these appropriately
				 */
				cluster_read_upl_release(upl, last_pg, uio_last, take_reference);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
			}
		}
		if (retval == 0)
			retval = error;

		if (io_req_size) {
			if (cluster_hard_throttle_on(vp, 1)) {
				/*
				 * we're in the throttle window, at the very least
				 * we want to limit the size of the I/O we're about
				 * to issue
				 */
				rd_ahead_enabled = 0;
				prefetch_enabled = 0;
				max_rd_size = THROTTLE_MAX_IOSIZE;
			} else {
				if (max_rd_size == THROTTLE_MAX_IOSIZE) {
					/*
					 * coming out of throttled state
					 */
					if (policy != IOPOL_THROTTLE && policy != IOPOL_UTILITY) {
						if (rap != NULL)
							rd_ahead_enabled = 1;
						prefetch_enabled = 1;
					}
					max_rd_size = max_prefetch;
					last_ioread_offset = 0;
				}
			}
		}
	}
	if (iolock_inited == TRUE) {
		if (iostate.io_issued > iostate.io_completed) {
			/*
			 * cluster_io returned an error after it
			 * had already issued some I/O.  we need
			 * to wait for that I/O to complete before
			 * we can destroy the iostate mutex...
			 * 'retval' already contains the early error
			 * so no need to pick it up from iostate.io_error
			 */
			cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
		}
		lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	}
	if (rap != NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
			     (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);

		lck_mtx_unlock(&rap->cl_lockr);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
			     (int)uio->uio_offset, io_req_size, 0, retval, 0);
	}

	return (retval);
}
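/*
 * note on the streaming heuristics above: as long as copies keep being
 * satisfied from the UBC no prefetch is issued, but once a miss forces an
 * I/O the code tries to keep last_ioread_offset comfortably ahead of the
 * user's copy position via cluster_read_prefetch, and hands the sequential
 * pattern to cluster_read_ahead when the request is about to complete.
 */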
static int
cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
		    int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_t		 upl;
	upl_page_info_t	 *pl;
	off_t		 max_io_size;
	vm_offset_t	 upl_offset, vector_upl_offset = 0;
	upl_size_t	 upl_size, vector_upl_size = 0;
	vm_size_t	 upl_needed_size;
	unsigned int	 pages_in_pl;
	int		 upl_flags;
	kern_return_t	 kret;
	unsigned int	 i;
	int		 force_data_sync;
	int		 retval = 0;
	int		 no_zero_fill = 0;
	int		 io_flag = 0;
	int		 misaligned = 0;
	struct clios	 iostate;
	user_addr_t	 iov_base;
	u_int32_t	 io_req_size;
	u_int32_t	 offset_in_file;
	u_int32_t	 offset_in_iovbase;
	u_int32_t	 io_size;
	u_int32_t	 io_start;
	u_int32_t	 io_min;
	u_int32_t	 xsize;
	u_int32_t	 devblocksize;
	u_int32_t	 mem_alignment_mask;
	u_int32_t	 max_upl_size;
	u_int32_t	 max_rd_size;
	u_int32_t	 max_rd_ahead;
	u_int32_t	 max_vector_size;
	boolean_t	 strict_uncached_IO = FALSE;
	boolean_t	 io_throttled = FALSE;

	u_int32_t	 vector_upl_iosize = 0;
	int		 issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
	off_t		 v_upl_uio_offset = 0;
	int		 vector_upl_index = 0;
	upl_t		 vector_upl = NULL;
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
		     (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);

	max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);

	max_rd_size = max_upl_size;
	max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);

	io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;

	if (flags & IO_PASSIVE)
		io_flag |= CL_PASSIVE;

	if (flags & IO_ENCRYPTED) {
		io_flag |= CL_RAW_ENCRYPTED;
	}

	if (flags & IO_NOCACHE) {
		io_flag |= CL_NOCACHE;
	}

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
		     (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);

	if (devblocksize == 1) {
		/*
		 * the AFP client advertises a devblocksize of 1
		 * however, its BLOCKMAP routine maps to physical
		 * blocks that are PAGE_SIZE in size...
		 * therefore we can't ask for I/Os that aren't page aligned
		 * or aren't multiples of PAGE_SIZE in size
		 * by setting devblocksize to PAGE_SIZE, we re-instate
		 * the old behavior we had before the mem_alignment_mask
		 * changes went in...
		 */
		devblocksize = PAGE_SIZE;
	}

	strict_uncached_IO = ubc_strict_uncached_IO(vp);
next_dread:
	io_req_size = *read_length;
	iov_base = uio_curriovbase(uio);

	max_io_size = filesize - uio->uio_offset;

	if ((off_t)io_req_size > max_io_size)
		io_req_size = max_io_size;

	offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;

	if (offset_in_file || offset_in_iovbase) {
		/*
		 * one of the 2 important offsets is misaligned
		 * so fire an I/O through the cache for this entire vector
		 */
		misaligned = 1;
	}
	if (iov_base & (devblocksize - 1)) {
		/*
		 * the offset in memory must be on a device block boundary
		 * so that we can guarantee that we can generate an
		 * I/O that ends on a page boundary in cluster_io
		 */
		misaligned = 1;
	}

	/*
	 * The user must request IO in aligned chunks.  If the
	 * offset into the file is bad, or the userland pointer
	 * is non-aligned, then we cannot service the encrypted IO request.
	 */
	if ((flags & IO_ENCRYPTED) && (misaligned)) {
		retval = EINVAL;
	}

	/*
	 * When we get to this point, we know...
	 * -- the offset into the file is on a devblocksize boundary
	 */
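/*
 * Illustrative sketch, not part of the kernel code: the two misalignment
 * tests above reduce to simple mask arithmetic -- the direct path is usable
 * only when the file offset is a multiple of the device block size and the
 * user buffer satisfies the mount's memory alignment mask.  The block size
 * and mask values below are assumptions chosen for the example, not values
 * taken from any particular mount.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static int
sketch_is_misaligned(uint64_t file_offset, uint64_t iov_base,
    uint32_t devblocksize, uint32_t mem_alignment_mask)
{
	uint32_t offset_in_file    = (uint32_t)file_offset & (devblocksize - 1);
	uint32_t offset_in_iovbase = (uint32_t)iov_base & mem_alignment_mask;

	return (offset_in_file || offset_in_iovbase);
}

int
main(void)
{
	/* assumed: 4 KiB device blocks, 8-byte memory alignment (mask 0x7) */
	printf("%d\n", sketch_is_misaligned(8192, 4096, 4096, 7));	/* 0: aligned            */
	printf("%d\n", sketch_is_misaligned(8200, 4096, 4096, 7));	/* 1: bad file offset    */
	printf("%d\n", sketch_is_misaligned(8192, 4097, 4096, 7));	/* 1: bad buffer address */
	return (0);
}
#endif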
	while (io_req_size && retval == 0) {

		if (cluster_hard_throttle_on(vp, 1)) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			max_rd_size = THROTTLE_MAX_IOSIZE;
			max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
			max_vector_size = THROTTLE_MAX_IOSIZE;
		} else {
			max_rd_size = max_upl_size;
			max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
			max_vector_size = MAX_VECTOR_UPL_SIZE;
		}
		io_start = io_size = io_req_size;
		/*
		 * First look for pages already in the cache
		 * and move them to user space.  But only do this
		 * check if we are not retrieving encrypted data directly
		 * from the filesystem;  those blocks should never
		 * be in the UBC.
		 *
		 * cluster_copy_ubc_data returns the resid
		 * in io_size
		 */
		if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
			retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
		}
		/*
		 * calculate the number of bytes actually copied
		 * starting size - residual
		 */
		xsize = io_start - io_size;

		io_req_size -= xsize;

		if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
			/*
			 * We found something in the cache or we have an iov_base that's not
			 * page-aligned.
			 *
			 * Issue all I/O's that have been collected within this Vectored UPL.
			 */
			if (vector_upl_index) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}

			if (retval)
				break;

			/*
			 * After this point, if we are using the Vector UPL path and the base is
			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
			 */
		}

		/*
		 * check to see if we are finished with this request.
		 *
		 * If we satisfied this IO already, then io_req_size will be 0.
		 * Otherwise, see if the IO was mis-aligned and needs to go through
		 * the UBC to deal with the 'tail'.
		 */
		if (io_req_size == 0 || (misaligned)) {
			/*
			 * see if there's another uio vector to
			 * process that's of type IO_DIRECT
			 *
			 * break out of while loop to get there
			 */
			break;
		}
		/*
		 * assume the request ends on a device block boundary
		 */
		io_min = devblocksize;

		/*
		 * we can handle I/O's in multiples of the device block size
		 * however, if io_size isn't a multiple of devblocksize we
		 * want to clip it back to the nearest page boundary since
		 * we are going to have to go through cluster_read_copy to
		 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
		 * multiple, we avoid asking the drive for the same physical
		 * blocks twice.. once for the partial page at the end of the
		 * request and a 2nd time for the page we read into the cache
		 * (which overlaps the end of the direct read) in order to
		 * get at the overhang bytes
		 */
		if (io_size & (devblocksize - 1)) {
			if (flags & IO_ENCRYPTED) {
				/*
				 * Normally, we'd round down to the previous page boundary to
				 * let the UBC manage the zero-filling of the file past the EOF.
				 * But if we're doing encrypted IO, we can't let any of
				 * the data hit the UBC.  This means we have to do the full
				 * IO to the upper block boundary of the device block that
				 * contains the EOF.  The user will be responsible for not
				 * interpreting data PAST the EOF in its buffer.
				 *
				 * So just bump the IO back up to a multiple of devblocksize
				 */
				io_size = ((io_size + devblocksize) & ~(devblocksize - 1));
				io_min = io_size;
			} else {
				/*
				 * Clip the request to the previous page size boundary
				 * since request does NOT end on a device block boundary
				 */
				io_size &= ~PAGE_MASK;
				io_min = PAGE_SIZE;
			}
		}
		if (retval || io_size < io_min) {
			/*
			 * either an error or we only have the tail left to
			 * complete via the copy path...
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dreads;
		}
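		/*
		 * Illustrative sketch, not part of the kernel code: the two
		 * roundings applied above when io_size is not a multiple of the
		 * device block size -- round *up* to the next device block for
		 * raw-encrypted I/O (the data may not touch the UBC), or clip
		 * *down* to a page boundary so the overhang goes through the
		 * copy path.  A 4 KiB page and 512-byte device block are
		 * assumed for the example.
		 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE	4096u
#define SKETCH_PAGE_MASK	(SKETCH_PAGE_SIZE - 1)

int
main(void)
{
	uint32_t devblocksize = 512;
	uint32_t io_size = 10000;	/* not a multiple of 512 */

	uint32_t rounded_up   = (io_size + devblocksize) & ~(devblocksize - 1);
	uint32_t clipped_down = io_size & ~SKETCH_PAGE_MASK;

	printf("round up to devblock: %u\n", rounded_up);	/* 10240 */
	printf("clip down to page:    %u\n", clipped_down);	/* 8192  */
	return (0);
}
#endif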
		/*
		 * Don't re-check the UBC data if we are looking for uncached IO
		 * or asking for encrypted blocks.
		 */
		if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {

			if ((xsize = io_size) > max_rd_size)
				xsize = max_rd_size;

			io_size = 0;

			ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);

			if (io_size == 0) {
				/*
				 * a page must have just come into the cache
				 * since the first page in this range is no
				 * longer absent, go back and re-evaluate
				 */
				continue;
			}
		}
		if ( (flags & IO_RETURN_ON_THROTTLE) ) {
			if (cluster_hard_throttle_on(vp, 0) == 2) {
				if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
					/*
					 * we're in the throttle window and at least 1 I/O
					 * has already been issued by a throttleable thread
					 * in this window, so return with EAGAIN to indicate
					 * to the FS issuing the cluster_read call that it
					 * should now throttle after dropping any locks
					 */
					throttle_info_update_by_mount(vp->v_mount);

					io_throttled = TRUE;
					goto wait_for_dreads;
				}
			}
		}
		if (io_size > max_rd_size)
			io_size = max_rd_size;

		iov_base = uio_curriovbase(uio);

		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

		if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0))
			no_zero_fill = 1;
		else
			no_zero_fill = 0;
		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			pages_in_pl = 0;
			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			if (no_zero_fill)
				upl_flags |= UPL_NOZEROFILL;
			if (force_data_sync)
				upl_flags |= UPL_FORCE_DATA_SYNC;

			kret = vm_map_create_upl(current_map(),
						 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
						 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
					     (int)upl_offset, upl_size, io_size, kret, 0);
				/*
				 * failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_dreads;
			}
			pages_in_pl = upl_size / PAGE_SIZE;
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_page_present(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			ubc_upl_abort(upl, 0);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
				     (int)upl_offset, upl_size, io_size, kret, 0);

			goto wait_for_dreads;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size < upl_needed_size) {
			if (upl_size && upl_offset == 0)
				io_size = upl_size;
			else
				io_size = 0;
		}
		if (io_size == 0) {
			ubc_upl_abort(upl, 0);
			goto wait_for_dreads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, io_size, kret, 0);
		if (useVectorUPL) {
			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
			if (end_off)
				issueVectorUPL = 1;
			/*
			 * After this point, if we are using a vector UPL, then
			 * either all the UPL elements end on a page boundary OR
			 * this UPL is the last element because it does not end
			 * on a page boundary.
			 */
		}

		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next read
		 */
		if (iostate.io_issued > iostate.io_completed)
			cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");

		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error
			 * don't issue any more reads, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			ubc_upl_abort(upl, 0);

			goto wait_for_dreads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
			     upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);

		if (!useVectorUPL) {
			if (no_zero_fill)
				io_flag &= ~CL_PRESERVE;
			else
				io_flag |= CL_PRESERVE;

			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);

		} else {

			if (!vector_upl_index) {
				vector_upl = vector_upl_create(upl_offset);
				v_upl_uio_offset = uio->uio_offset;
				vector_upl_offset = upl_offset;
			}

			vector_upl_set_subupl(vector_upl, upl, upl_size);
			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
			vector_upl_index++;
			vector_upl_size += upl_size;
			vector_upl_iosize += io_size;
			if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}
		}
		/*
		 * update the uio structure
		 */
		if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
			uio_update(uio, (user_size_t)max_io_size);
		}
		else {
			uio_update(uio, (user_size_t)io_size);
		}
		/*
		 * Under normal circumstances, the io_size should not be
		 * bigger than the io_req_size, but we may have had to round up
		 * to the end of the page in the encrypted IO case.  In that case only,
		 * ensure that we only decrement io_req_size to 0.
		 */
		if ((flags & IO_ENCRYPTED) && (io_size > io_req_size)) {
			io_req_size = 0;
		}
		else {
			io_req_size -= io_size;
		}

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
			     upl, (int)uio->uio_offset, io_req_size, retval, 0);
	}
	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {

		retval = cluster_io_type(uio, read_type, read_length, 0);

		if (retval == 0 && *read_type == IO_DIRECT) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
				     (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);

			goto next_dread;
		}
	}

wait_for_dreads:

	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		reset_vector_run_state();
	}
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we return
	 */
	if (iostate.io_issued > iostate.io_completed)
		cluster_iostate_wait(&iostate, 0, "cluster_read_direct");

	if (iostate.io_error)
		retval = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	if (io_throttled == TRUE && retval == 0)
		retval = EAGAIN;

	if (io_req_size && retval == 0) {
		/*
		 * we couldn't handle the tail of this request in DIRECT mode
		 * so fire it through the copy path
		 */
		retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);

		*read_type = IO_UNKNOWN;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);

	return (retval);
}
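/*
 * Illustrative sketch, not part of the kernel code: the vectored UPL path
 * above batches per-iovec UPLs and issues them as a single I/O once the
 * batch reaches an element or byte limit, or once an element cannot be
 * followed by another (it ends mid-page).  The skeleton below shows that
 * accumulate-then-flush shape with made-up limits and a stubbed "issue"
 * step; it is not the kernel API.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define SKETCH_MAX_ELEMENTS	8
#define SKETCH_MAX_BYTES	(1024 * 1024)

struct sketch_batch {
	int		nelems;
	uint32_t	nbytes;
};

static void
sketch_issue(struct sketch_batch *b)
{
	printf("issue %d elements, %u bytes\n", b->nelems, b->nbytes);
	b->nelems = 0;
	b->nbytes = 0;
}

static void
sketch_add(struct sketch_batch *b, uint32_t iosize, int ends_mid_page)
{
	b->nelems++;
	b->nbytes += iosize;

	/* flush when full, or when this element cannot be followed by another */
	if (ends_mid_page || b->nelems == SKETCH_MAX_ELEMENTS ||
	    b->nbytes >= SKETCH_MAX_BYTES)
		sketch_issue(b);
}

int
main(void)
{
	struct sketch_batch b = { 0, 0 };

	sketch_add(&b, 256 * 1024, 0);
	sketch_add(&b, 256 * 1024, 0);
	sketch_add(&b, 600 * 1024, 0);	/* crosses the byte limit -> flush */
	sketch_add(&b, 64 * 1024, 1);	/* ends mid-page -> flush          */
	if (b.nelems)
		sketch_issue(&b);
	return (0);
}
#endif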
4532 cluster_read_contig(vnode_t vp
, struct uio
*uio
, off_t filesize
, int *read_type
, u_int32_t
*read_length
,
4533 int (*callback
)(buf_t
, void *), void *callback_arg
, int flags
)
4535 upl_page_info_t
*pl
;
4536 upl_t upl
[MAX_VECTS
];
4537 vm_offset_t upl_offset
;
4538 addr64_t dst_paddr
= 0;
4539 user_addr_t iov_base
;
4541 upl_size_t upl_size
;
4542 vm_size_t upl_needed_size
;
4543 mach_msg_type_number_t pages_in_pl
;
4546 struct clios iostate
;
4553 u_int32_t devblocksize
;
4554 u_int32_t mem_alignment_mask
;
4555 u_int32_t tail_size
= 0;
4558 if (flags
& IO_PASSIVE
)
4563 if (flags
& IO_NOCACHE
)
4564 bflag
|= CL_NOCACHE
;
4567 * When we enter this routine, we know
4568 * -- the read_length will not exceed the current iov_len
4569 * -- the target address is physically contiguous for read_length
4571 cluster_syncup(vp
, filesize
, callback
, callback_arg
);
4573 devblocksize
= (u_int32_t
)vp
->v_mount
->mnt_devblocksize
;
4574 mem_alignment_mask
= (u_int32_t
)vp
->v_mount
->mnt_alignmentmask
;
4576 iostate
.io_completed
= 0;
4577 iostate
.io_issued
= 0;
4578 iostate
.io_error
= 0;
4579 iostate
.io_wanted
= 0;
4581 lck_mtx_init(&iostate
.io_mtxp
, cl_mtx_grp
, cl_mtx_attr
);
4584 io_size
= *read_length
;
4586 max_size
= filesize
- uio
->uio_offset
;
4588 if (io_size
> max_size
)
4591 iov_base
= uio_curriovbase(uio
);
4593 upl_offset
= (vm_offset_t
)((u_int32_t
)iov_base
& PAGE_MASK
);
4594 upl_needed_size
= upl_offset
+ io_size
;
4597 upl_size
= upl_needed_size
;
4598 upl_flags
= UPL_FILE_IO
| UPL_NO_SYNC
| UPL_CLEAN_IN_PLACE
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
4601 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 92)) | DBG_FUNC_START
,
4602 (int)upl_offset
, (int)upl_size
, (int)iov_base
, io_size
, 0);
4604 kret
= vm_map_get_upl(current_map(),
4605 (vm_map_offset_t
)(iov_base
& ~((user_addr_t
)PAGE_MASK
)),
4606 &upl_size
, &upl
[cur_upl
], NULL
, &pages_in_pl
, &upl_flags
, 0);
4608 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 92)) | DBG_FUNC_END
,
4609 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
4611 if (kret
!= KERN_SUCCESS
) {
4613 * failed to get pagelist
4616 goto wait_for_creads
;
4620 if (upl_size
< upl_needed_size
) {
4622 * The upl_size wasn't satisfied.
4625 goto wait_for_creads
;
4627 pl
= ubc_upl_pageinfo(upl
[cur_upl
]);
4629 dst_paddr
= ((addr64_t
)upl_phys_page(pl
, 0) << 12) + (addr64_t
)upl_offset
;
4631 while (((uio
->uio_offset
& (devblocksize
- 1)) || io_size
< devblocksize
) && io_size
) {
4632 u_int32_t head_size
;
4634 head_size
= devblocksize
- (u_int32_t
)(uio
->uio_offset
& (devblocksize
- 1));
4636 if (head_size
> io_size
)
4637 head_size
= io_size
;
4639 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, head_size
, CL_READ
, callback
, callback_arg
);
4642 goto wait_for_creads
;
4644 upl_offset
+= head_size
;
4645 dst_paddr
+= head_size
;
4646 io_size
-= head_size
;
4648 iov_base
+= head_size
;
4650 if ((u_int32_t
)iov_base
& mem_alignment_mask
) {
4652 * request doesn't set up on a memory boundary
4653 * the underlying DMA engine can handle...
4654 * return an error instead of going through
4655 * the slow copy path since the intent of this
4656 * path is direct I/O to device memory
4659 goto wait_for_creads
;
4662 tail_size
= io_size
& (devblocksize
- 1);
4664 io_size
-= tail_size
;
4666 while (io_size
&& error
== 0) {
4668 if (io_size
> MAX_IO_CONTIG_SIZE
)
4669 xsize
= MAX_IO_CONTIG_SIZE
;
4673 * request asynchronously so that we can overlap
4674 * the preparation of the next I/O... we'll do
4675 * the commit after all the I/O has completed
4676 * since its all issued against the same UPL
4677 * if there are already too many outstanding reads
4678 * wait until some have completed before issuing the next
4680 if (iostate
.io_issued
> iostate
.io_completed
)
4681 cluster_iostate_wait(&iostate
, MAX_IO_CONTIG_SIZE
* IO_SCALE(vp
, 2), "cluster_read_contig");
4683 if (iostate
.io_error
) {
4685 * one of the earlier reads we issued ran into a hard error
4686 * don't issue any more reads...
4687 * go wait for any other reads to complete before
4688 * returning the error to the caller
4690 goto wait_for_creads
;
4692 error
= cluster_io(vp
, upl
[cur_upl
], upl_offset
, uio
->uio_offset
, xsize
,
4693 CL_READ
| CL_NOZERO
| CL_DEV_MEMORY
| CL_ASYNC
| bflag
,
4694 (buf_t
)NULL
, &iostate
, callback
, callback_arg
);
4696 * The cluster_io read was issued successfully,
4697 * update the uio structure
4700 uio_update(uio
, (user_size_t
)xsize
);
4703 upl_offset
+= xsize
;
4707 if (error
== 0 && iostate
.io_error
== 0 && tail_size
== 0 && num_upl
< MAX_VECTS
&& uio
->uio_offset
< filesize
) {
4709 error
= cluster_io_type(uio
, read_type
, read_length
, 0);
4711 if (error
== 0 && *read_type
== IO_CONTIG
) {
4716 *read_type
= IO_UNKNOWN
;
4720 * make sure all async reads that are part of this stream
4721 * have completed before we proceed
4723 if (iostate
.io_issued
> iostate
.io_completed
)
4724 cluster_iostate_wait(&iostate
, 0, "cluster_read_contig");
4726 if (iostate
.io_error
)
4727 error
= iostate
.io_error
;
4729 lck_mtx_destroy(&iostate
.io_mtxp
, cl_mtx_grp
);
4731 if (error
== 0 && tail_size
)
4732 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, tail_size
, CL_READ
, callback
, callback_arg
);
4734 for (n
= 0; n
< num_upl
; n
++)
4736 * just release our hold on each physically contiguous
4737 * region without changing any state
4739 ubc_upl_abort(upl
[n
], 0);
static int
cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
{
	user_size_t	 iov_len;
	user_addr_t	 iov_base = 0;
	upl_t		 upl;
	upl_size_t	 upl_size;
	int		 upl_flags;
	int		 retval = 0;

	/*
	 * skip over any empty vectors
	 */
	uio_update(uio, (user_size_t)0);

	iov_len = uio_curriovlen(uio);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);

	if (iov_len) {
		iov_base = uio_curriovbase(uio);
		/*
		 * make sure the size of the vector isn't too big...
		 * internally, we want to handle all of the I/O in
		 * chunk sizes that fit in a 32 bit int
		 */
		if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE)
			upl_size = MAX_IO_REQUEST_SIZE;
		else
			upl_size = (u_int32_t)iov_len;

		upl_flags = UPL_QUERY_OBJECT_TYPE;

		if ((vm_map_get_upl(current_map(),
				    (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
				    &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
			/*
			 * the user app must have passed in an invalid address
			 */
			retval = EFAULT;
		}

		*io_length = upl_size;

		if (upl_flags & UPL_PHYS_CONTIG)
			*io_type = IO_CONTIG;
		else if (iov_len >= min_length)
			*io_type = IO_DIRECT;
		else
			*io_type = IO_COPY;
	} else {
		/*
		 * nothing left to do for this uio
		 */
		*io_length = 0;
		*io_type   = IO_UNKNOWN;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);

	return (retval);
}
/*
 * generate advisory I/O's in the largest chunks possible
 * the completed pages will be released into the VM cache
 */
int
advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
{
	return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
}
4821 advisory_read_ext(vnode_t vp
, off_t filesize
, off_t f_offset
, int resid
, int (*callback
)(buf_t
, void *), void *callback_arg
, int bflag
)
4823 upl_page_info_t
*pl
;
4825 vm_offset_t upl_offset
;
4838 uint32_t max_io_size
;
4841 if ( !UBCINFOEXISTS(vp
))
4847 max_io_size
= cluster_max_io_size(vp
->v_mount
, CL_READ
);
4850 if (max_io_size
> speculative_prefetch_max_iosize
)
4851 max_io_size
= speculative_prefetch_max_iosize
;
4853 if ((vp
->v_mount
->mnt_kern_flag
& MNTK_SSD
) && !ignore_is_ssd
) {
4854 if (max_io_size
> speculative_prefetch_max_iosize
)
4855 max_io_size
= speculative_prefetch_max_iosize
;
4859 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_START
,
4860 (int)f_offset
, resid
, (int)filesize
, 0, 0);
4862 while (resid
&& f_offset
< filesize
&& retval
== 0) {
4864 * compute the size of the upl needed to encompass
4865 * the requested read... limit each call to cluster_io
4866 * to the maximum UPL size... cluster_io will clip if
4867 * this exceeds the maximum io_size for the device,
4868 * make sure to account for
4869 * a starting offset that's not page aligned
4871 start_offset
= (int)(f_offset
& PAGE_MASK_64
);
4872 upl_f_offset
= f_offset
- (off_t
)start_offset
;
4873 max_size
= filesize
- f_offset
;
4875 if (resid
< max_size
)
4880 upl_size
= (start_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
4881 if ((uint32_t)upl_size
> max_io_size
)
4882 upl_size
= max_io_size
;
4886 * return the number of contiguously present pages in the cache
4887 * starting at upl_f_offset within the file
4889 ubc_range_op(vp
, upl_f_offset
, upl_f_offset
+ upl_size
, UPL_ROP_PRESENT
, &skip_range
);
4893 * skip over pages already present in the cache
4895 io_size
= skip_range
- start_offset
;
4897 f_offset
+= io_size
;
4900 if (skip_range
== upl_size
)
4903 * have to issue some real I/O
4904 * at this point, we know it's starting on a page boundary
4905 * because we've skipped over at least the first page in the request
4908 upl_f_offset
+= skip_range
;
4909 upl_size
-= skip_range
;
4911 pages_in_upl
= upl_size
/ PAGE_SIZE
;
4913 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_START
,
4914 upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
4916 kret
= ubc_create_upl(vp
,
4921 UPL_RET_ONLY_ABSENT
| UPL_SET_LITE
);
4922 if (kret
!= KERN_SUCCESS
)
4927 * before we start marching forward, we must make sure we end on
4928 * a present page, otherwise we will be working with a freed
4931 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
4932 if (upl_page_present(pl
, last_pg
))
4935 pages_in_upl
= last_pg
+ 1;
4938 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_END
,
4939 upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
4942 for (last_pg
= 0; last_pg
< pages_in_upl
; ) {
4944 * scan from the beginning of the upl looking for the first
4945 * page that is present.... this will become the first page in
4946 * the request we're going to make to 'cluster_io'... if all
4947 * of the pages are absent, we won't call through to 'cluster_io'
4949 for (start_pg
= last_pg
; start_pg
< pages_in_upl
; start_pg
++) {
4950 if (upl_page_present(pl
, start_pg
))
4955 * scan from the starting present page looking for an absent
4956 * page before the end of the upl is reached, if we
4957 * find one, then it will terminate the range of pages being
4958 * presented to 'cluster_io'
4960 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
4961 if (!upl_page_present(pl
, last_pg
))
4965 if (last_pg
> start_pg
) {
4967 * we found a range of pages that must be filled
4968 * if the last page in this range is the last page of the file
4969 * we may have to clip the size of it to keep from reading past
4970 * the end of the last physical block associated with the file
4972 upl_offset
= start_pg
* PAGE_SIZE
;
4973 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
4975 if ((off_t
)(upl_f_offset
+ upl_offset
+ io_size
) > filesize
)
4976 io_size
= filesize
- (upl_f_offset
+ upl_offset
);
4979 * issue an asynchronous read to cluster_io
4981 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, io_size
,
4982 CL_ASYNC
| CL_READ
| CL_COMMIT
| CL_AGE
| bflag
, (buf_t
)NULL
, (struct clios
*)NULL
, callback
, callback_arg
);
4988 ubc_upl_abort(upl
, 0);
4990 io_size
= upl_size
- start_offset
;
4992 if (io_size
> resid
)
4994 f_offset
+= io_size
;
4998 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_END
,
4999 (int)f_offset
, resid
, retval
, 0, 0);
int
cluster_push(vnode_t vp, int flags)
{
	return cluster_push_ext(vp, flags, NULL, NULL);
}
5013 cluster_push_ext(vnode_t vp
, int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
5016 int my_sparse_wait
= 0;
5017 struct cl_writebehind
*wbp
;
5019 if ( !UBCINFOEXISTS(vp
)) {
5020 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_NONE
, vp
, flags
, 0, -1, 0);
5023 /* return if deferred write is set */
5024 if (((unsigned int)vfs_flags(vp
->v_mount
) & MNT_DEFWRITE
) && (flags
& IO_DEFWRITE
)) {
5027 if ((wbp
= cluster_get_wbp(vp
, CLW_RETURNLOCKED
)) == NULL
) {
5028 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_NONE
, vp
, flags
, 0, -2, 0);
5031 if (wbp
->cl_number
== 0 && wbp
->cl_scmap
== NULL
) {
5032 lck_mtx_unlock(&wbp
->cl_lockw
);
5034 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_NONE
, vp
, flags
, 0, -3, 0);
5037 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_START
,
5038 wbp
->cl_scmap
, wbp
->cl_number
, flags
, 0, 0);
5041 * if we have an fsync in progress, we don't want to allow any additional
5042 * sync/fsync/close(s) to occur until it finishes.
5043 * note that its possible for writes to continue to occur to this file
5044 * while we're waiting and also once the fsync starts to clean if we're
5045 * in the sparse map case
5047 while (wbp
->cl_sparse_wait
) {
5048 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 97)) | DBG_FUNC_START
, vp
, 0, 0, 0, 0);
5050 msleep((caddr_t
)&wbp
->cl_sparse_wait
, &wbp
->cl_lockw
, PRIBIO
+ 1, "cluster_push_ext", NULL
);
5052 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 97)) | DBG_FUNC_END
, vp
, 0, 0, 0, 0);
5054 if (flags
& IO_SYNC
) {
5056 wbp
->cl_sparse_wait
= 1;
5059 * this is an fsync (or equivalent)... we must wait for any existing async
5060 * cleaning operations to complete before we evaulate the current state
5061 * and finish cleaning... this insures that all writes issued before this
5062 * fsync actually get cleaned to the disk before this fsync returns
5064 while (wbp
->cl_sparse_pushes
) {
5065 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 98)) | DBG_FUNC_START
, vp
, 0, 0, 0, 0);
5067 msleep((caddr_t
)&wbp
->cl_sparse_pushes
, &wbp
->cl_lockw
, PRIBIO
+ 1, "cluster_push_ext", NULL
);
5069 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 98)) | DBG_FUNC_END
, vp
, 0, 0, 0, 0);
5072 if (wbp
->cl_scmap
) {
5075 if (wbp
->cl_sparse_pushes
< SPARSE_PUSH_LIMIT
) {
5077 scmap
= wbp
->cl_scmap
;
5078 wbp
->cl_scmap
= NULL
;
5080 wbp
->cl_sparse_pushes
++;
5082 lck_mtx_unlock(&wbp
->cl_lockw
);
5084 sparse_cluster_push(&scmap
, vp
, ubc_getsize(vp
), PUSH_ALL
, flags
| IO_PASSIVE
, callback
, callback_arg
);
5086 lck_mtx_lock(&wbp
->cl_lockw
);
5088 wbp
->cl_sparse_pushes
--;
5090 if (wbp
->cl_sparse_wait
&& wbp
->cl_sparse_pushes
== 0)
5091 wakeup((caddr_t
)&wbp
->cl_sparse_pushes
);
5093 sparse_cluster_push(&(wbp
->cl_scmap
), vp
, ubc_getsize(vp
), PUSH_ALL
, flags
| IO_PASSIVE
, callback
, callback_arg
);
5097 retval
= cluster_try_push(wbp
, vp
, ubc_getsize(vp
), PUSH_ALL
, flags
| IO_PASSIVE
, callback
, callback_arg
);
5099 lck_mtx_unlock(&wbp
->cl_lockw
);
5101 if (flags
& IO_SYNC
)
5102 (void)vnode_waitforwrites(vp
, 0, 0, 0, "cluster_push");
5104 if (my_sparse_wait
) {
5106 * I'm the owner of the serialization token
5107 * clear it and wakeup anyone that is waiting
5110 lck_mtx_lock(&wbp
->cl_lockw
);
5112 wbp
->cl_sparse_wait
= 0;
5113 wakeup((caddr_t
)&wbp
->cl_sparse_wait
);
5115 lck_mtx_unlock(&wbp
->cl_lockw
);
5117 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_END
,
5118 wbp
->cl_scmap
, wbp
->cl_number
, retval
, 0, 0);
__private_extern__ void
cluster_release(struct ubc_info *ubc)
{
	struct cl_writebehind *wbp;
	struct cl_readahead   *rap;

	if ((wbp = ubc->cl_wbehind)) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);

		if (wbp->cl_scmap)
			vfs_drt_control(&(wbp->cl_scmap), 0);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
	}

	rap = ubc->cl_rahead;

	if (wbp != NULL) {
		lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
		FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
	}
	if ((rap = ubc->cl_rahead)) {
		lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
		FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
	}
	ubc->cl_rahead  = NULL;
	ubc->cl_wbehind = NULL;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
}
5158 cluster_try_push(struct cl_writebehind
*wbp
, vnode_t vp
, off_t EOF
, int push_flag
, int io_flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
5165 struct cl_wextent l_clusters
[MAX_CLUSTERS
];
5166 u_int max_cluster_pgcount
;
5169 max_cluster_pgcount
= MAX_CLUSTER_SIZE(vp
) / PAGE_SIZE
;
5171 * the write behind context exists and has
5172 * already been locked...
5174 if (wbp
->cl_number
== 0)
5176 * no clusters to push
5177 * return number of empty slots
5179 return (MAX_CLUSTERS
);
5182 * make a local 'sorted' copy of the clusters
5183 * and clear wbp->cl_number so that new clusters can
5186 for (cl_index
= 0; cl_index
< wbp
->cl_number
; cl_index
++) {
5187 for (min_index
= -1, cl_index1
= 0; cl_index1
< wbp
->cl_number
; cl_index1
++) {
5188 if (wbp
->cl_clusters
[cl_index1
].b_addr
== wbp
->cl_clusters
[cl_index1
].e_addr
)
5190 if (min_index
== -1)
5191 min_index
= cl_index1
;
5192 else if (wbp
->cl_clusters
[cl_index1
].b_addr
< wbp
->cl_clusters
[min_index
].b_addr
)
5193 min_index
= cl_index1
;
5195 if (min_index
== -1)
5198 l_clusters
[cl_index
].b_addr
= wbp
->cl_clusters
[min_index
].b_addr
;
5199 l_clusters
[cl_index
].e_addr
= wbp
->cl_clusters
[min_index
].e_addr
;
5200 l_clusters
[cl_index
].io_flags
= wbp
->cl_clusters
[min_index
].io_flags
;
5202 wbp
->cl_clusters
[min_index
].b_addr
= wbp
->cl_clusters
[min_index
].e_addr
;
5208 if ( (push_flag
& PUSH_DELAY
) && cl_len
== MAX_CLUSTERS
) {
5212 * determine if we appear to be writing the file sequentially
5213 * if not, by returning without having pushed any clusters
5214 * we will cause this vnode to be pushed into the sparse cluster mechanism
5215 * used for managing more random I/O patterns
5217 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
5218 * that's why we're in try_push with PUSH_DELAY...
5220 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
5221 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
5222 * so we can just make a simple pass through, up to, but not including the last one...
5223 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
5226 * we let the last one be partial as long as it was adjacent to the previous one...
5227 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
5228 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
5230 for (i
= 0; i
< MAX_CLUSTERS
- 1; i
++) {
5231 if ((l_clusters
[i
].e_addr
- l_clusters
[i
].b_addr
) != max_cluster_pgcount
)
5233 if (l_clusters
[i
].e_addr
!= l_clusters
[i
+1].b_addr
)
5237 for (cl_index
= 0; cl_index
< cl_len
; cl_index
++) {
5239 struct cl_extent cl
;
5241 flags
= io_flags
& (IO_PASSIVE
|IO_CLOSE
);
5244 * try to push each cluster in turn...
5246 if (l_clusters
[cl_index
].io_flags
& CLW_IONOCACHE
)
5247 flags
|= IO_NOCACHE
;
5249 if (l_clusters
[cl_index
].io_flags
& CLW_IOPASSIVE
)
5250 flags
|= IO_PASSIVE
;
5252 if (push_flag
& PUSH_SYNC
)
5255 cl
.b_addr
= l_clusters
[cl_index
].b_addr
;
5256 cl
.e_addr
= l_clusters
[cl_index
].e_addr
;
5258 cluster_push_now(vp
, &cl
, EOF
, flags
, callback
, callback_arg
);
5260 l_clusters
[cl_index
].b_addr
= 0;
5261 l_clusters
[cl_index
].e_addr
= 0;
5265 if ( !(push_flag
& PUSH_ALL
) )
5269 if (cl_len
> cl_pushed
) {
5271 * we didn't push all of the clusters, so
5272 * lets try to merge them back in to the vnode
5274 if ((MAX_CLUSTERS
- wbp
->cl_number
) < (cl_len
- cl_pushed
)) {
5276 * we picked up some new clusters while we were trying to
5277 * push the old ones... this can happen because I've dropped
5278 * the vnode lock... the sum of the
5279 * leftovers plus the new cluster count exceeds our ability
5280 * to represent them, so switch to the sparse cluster mechanism
5282 * collect the active public clusters...
5284 sparse_cluster_switch(wbp
, vp
, EOF
, callback
, callback_arg
);
5286 for (cl_index
= 0, cl_index1
= 0; cl_index
< cl_len
; cl_index
++) {
5287 if (l_clusters
[cl_index
].b_addr
== l_clusters
[cl_index
].e_addr
)
5289 wbp
->cl_clusters
[cl_index1
].b_addr
= l_clusters
[cl_index
].b_addr
;
5290 wbp
->cl_clusters
[cl_index1
].e_addr
= l_clusters
[cl_index
].e_addr
;
5291 wbp
->cl_clusters
[cl_index1
].io_flags
= l_clusters
[cl_index
].io_flags
;
5296 * update the cluster count
5298 wbp
->cl_number
= cl_index1
;
5301 * and collect the original clusters that were moved into the
5302 * local storage for sorting purposes
5304 sparse_cluster_switch(wbp
, vp
, EOF
, callback
, callback_arg
);
5308 * we've got room to merge the leftovers back in
5309 * just append them starting at the next 'hole'
5310 * represented by wbp->cl_number
5312 for (cl_index
= 0, cl_index1
= wbp
->cl_number
; cl_index
< cl_len
; cl_index
++) {
5313 if (l_clusters
[cl_index
].b_addr
== l_clusters
[cl_index
].e_addr
)
5316 wbp
->cl_clusters
[cl_index1
].b_addr
= l_clusters
[cl_index
].b_addr
;
5317 wbp
->cl_clusters
[cl_index1
].e_addr
= l_clusters
[cl_index
].e_addr
;
5318 wbp
->cl_clusters
[cl_index1
].io_flags
= l_clusters
[cl_index
].io_flags
;
5323 * update the cluster count
5325 wbp
->cl_number
= cl_index1
;
5328 return (MAX_CLUSTERS
- wbp
->cl_number
);
5334 cluster_push_now(vnode_t vp
, struct cl_extent
*cl
, off_t EOF
, int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
5336 upl_page_info_t
*pl
;
5338 vm_offset_t upl_offset
;
5353 if (flags
& IO_PASSIVE
)
5358 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_START
,
5359 (int)cl
->b_addr
, (int)cl
->e_addr
, (int)EOF
, flags
, 0);
5361 if ((pages_in_upl
= (int)(cl
->e_addr
- cl
->b_addr
)) == 0) {
5362 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 0, 0, 0, 0);
5366 upl_size
= pages_in_upl
* PAGE_SIZE
;
5367 upl_f_offset
= (off_t
)(cl
->b_addr
* PAGE_SIZE_64
);
5369 if (upl_f_offset
+ upl_size
>= EOF
) {
5371 if (upl_f_offset
>= EOF
) {
5373 * must have truncated the file and missed
5374 * clearing a dangling cluster (i.e. it's completely
5375 * beyond the new EOF
5377 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 1, 0, 0, 0);
5381 size
= EOF
- upl_f_offset
;
5383 upl_size
= (size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
5384 pages_in_upl
= upl_size
/ PAGE_SIZE
;
5388 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, size
, 0, 0, 0);
5391 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
5393 * - only pages that are currently dirty are returned... these are the ones we need to clean
5394 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
5395 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
5396 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
5397 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
5399 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
5402 if ((vp
->v_flag
& VNOCACHE_DATA
) || (flags
& IO_NOCACHE
))
5403 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
| UPL_WILL_BE_DUMPED
;
5405 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
;
5407 kret
= ubc_create_upl(vp
,
5413 if (kret
!= KERN_SUCCESS
)
5414 panic("cluster_push: failed to get pagelist");
5416 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
, upl
, upl_f_offset
, 0, 0, 0);
5419 * since we only asked for the dirty pages back
5420 * it's possible that we may only get a few or even none, so...
5421 * before we start marching forward, we must make sure we know
5422 * where the last present page is in the UPL, otherwise we could
5423 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
5424 * employed by commit_range and abort_range.
5426 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
5427 if (upl_page_present(pl
, last_pg
))
5430 pages_in_upl
= last_pg
+ 1;
5432 if (pages_in_upl
== 0) {
5433 ubc_upl_abort(upl
, 0);
5435 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 2, 0, 0, 0);
5439 for (last_pg
= 0; last_pg
< pages_in_upl
; ) {
5441 * find the next dirty page in the UPL
5442 * this will become the first page in the
5443 * next I/O to generate
5445 for (start_pg
= last_pg
; start_pg
< pages_in_upl
; start_pg
++) {
5446 if (upl_dirty_page(pl
, start_pg
))
5448 if (upl_page_present(pl
, start_pg
))
5450 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
5451 * just release these unchanged since we're not going
5452 * to steal them or change their state
5454 ubc_upl_abort_range(upl
, start_pg
* PAGE_SIZE
, PAGE_SIZE
, UPL_ABORT_FREE_ON_EMPTY
);
5456 if (start_pg
>= pages_in_upl
)
5458 * done... no more dirty pages to push
5461 if (start_pg
> last_pg
)
5463 * skipped over some non-dirty pages
5465 size
-= ((start_pg
- last_pg
) * PAGE_SIZE
);
5468 * find a range of dirty pages to write
5470 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
5471 if (!upl_dirty_page(pl
, last_pg
))
5474 upl_offset
= start_pg
* PAGE_SIZE
;
5476 io_size
= min(size
, (last_pg
- start_pg
) * PAGE_SIZE
);
5478 io_flags
= CL_THROTTLE
| CL_COMMIT
| CL_AGE
| bflag
;
5480 if ( !(flags
& IO_SYNC
))
5481 io_flags
|= CL_ASYNC
;
5483 if (flags
& IO_CLOSE
)
5484 io_flags
|= CL_CLOSE
;
5486 if (flags
& IO_NOCACHE
)
5487 io_flags
|= CL_NOCACHE
;
5489 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, io_size
,
5490 io_flags
, (buf_t
)NULL
, (struct clios
*)NULL
, callback
, callback_arg
);
5492 if (error
== 0 && retval
)
5497 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 3, 0, 0, 0);
/*
 * sparse_cluster_switch is called with the write behind lock held
 */
static void
sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
{
	int	cl_index;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, vp, wbp->cl_scmap, 0, 0, 0);

	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		int	flags;
		struct cl_extent cl;

		for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {

			if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
				if (flags & UPL_POP_DIRTY) {
					cl.e_addr = cl.b_addr + 1;

					sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg);
				}
			}
		}
	}
	wbp->cl_number = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, vp, wbp->cl_scmap, 0, 0, 0);
}
/*
 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
 * still associated with the write-behind context... however, if the scmap has been disassociated
 * from the write-behind context (the cluster_push case), the wb lock is not held
 */
static void
sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	struct cl_extent cl;
	off_t		offset;
	u_int		length;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, vp, (*scmap), 0, push_flag, 0);

	if (push_flag & PUSH_ALL)
		vfs_drt_control(scmap, 1);

	for (;;) {
		if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS)
			break;

		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);

		cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg);

		if ( !(push_flag & PUSH_ALL) )
			break;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0);
}
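/*
 * Illustrative sketch, not part of the kernel code: the sparse push above
 * converts the byte-granular (offset, length) pair handed back by the
 * dirty-region tracker into an exclusive page-number extent before handing
 * it to cluster_push_now.  The arithmetic is just a divide by the page
 * size; 4 KiB pages are assumed for the example.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE	4096ULL

int
main(void)
{
	uint64_t offset = 3 * SKETCH_PAGE_SIZE;	/* dirty run starts at page 3 */
	uint64_t length = 5 * SKETCH_PAGE_SIZE;	/* and covers 5 pages         */

	uint64_t b_addr = offset / SKETCH_PAGE_SIZE;
	uint64_t e_addr = (offset + length) / SKETCH_PAGE_SIZE;	/* exclusive */

	printf("pages [%llu, %llu)\n",
	    (unsigned long long)b_addr, (unsigned long long)e_addr);	/* [3, 8) */
	return (0);
}
#endif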
/*
 * sparse_cluster_add is called with the write behind lock held
 */
static void
sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
{
	u_int	new_dirty;
	u_int	length;
	off_t	offset;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);

	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;

	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */
		sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg);

		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0);
}
5598 cluster_align_phys_io(vnode_t vp
, struct uio
*uio
, addr64_t usr_paddr
, u_int32_t xsize
, int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
5600 upl_page_info_t
*pl
;
5610 if (flags
& IO_PASSIVE
)
5615 if (flags
& IO_NOCACHE
)
5616 bflag
|= CL_NOCACHE
;
5618 upl_flags
= UPL_SET_LITE
;
5620 if ( !(flags
& CL_READ
) ) {
5622 * "write" operation: let the UPL subsystem know
5623 * that we intend to modify the buffer cache pages
5626 upl_flags
|= UPL_WILL_MODIFY
;
5629 * indicate that there is no need to pull the
5630 * mapping for this page... we're only going
5631 * to read from it, not modify it.
5633 upl_flags
|= UPL_FILE_IO
;
5635 kret
= ubc_create_upl(vp
,
5636 uio
->uio_offset
& ~PAGE_MASK_64
,
5642 if (kret
!= KERN_SUCCESS
)
5645 if (!upl_valid_page(pl
, 0)) {
5647 * issue a synchronous read to cluster_io
5649 error
= cluster_io(vp
, upl
, 0, uio
->uio_offset
& ~PAGE_MASK_64
, PAGE_SIZE
,
5650 CL_READ
| bflag
, (buf_t
)NULL
, (struct clios
*)NULL
, callback
, callback_arg
);
5652 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, UPL_ABORT_DUMP_PAGES
| UPL_ABORT_FREE_ON_EMPTY
);
5658 ubc_paddr
= ((addr64_t
)upl_phys_page(pl
, 0) << 12) + (addr64_t
)(uio
->uio_offset
& PAGE_MASK_64
);
5661 * NOTE: There is no prototype for the following in BSD. It, and the definitions
5662 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
5663 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
5664 * way to do so without exporting them to kexts as well.
5666 if (flags
& CL_READ
)
5667 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
5668 copypv(ubc_paddr
, usr_paddr
, xsize
, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
5670 // copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
5671 copypv(usr_paddr
, ubc_paddr
, xsize
, 2 | 1 | 8); /* Copy physical to physical and flush the source */
5673 if ( !(flags
& CL_READ
) || (upl_valid_page(pl
, 0) && upl_dirty_page(pl
, 0))) {
5675 * issue a synchronous write to cluster_io
5677 error
= cluster_io(vp
, upl
, 0, uio
->uio_offset
& ~PAGE_MASK_64
, PAGE_SIZE
,
5678 bflag
, (buf_t
)NULL
, (struct clios
*)NULL
, callback
, callback_arg
);
5681 uio_update(uio
, (user_size_t
)xsize
);
5684 abort_flags
= UPL_ABORT_FREE_ON_EMPTY
;
5686 abort_flags
= UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_DUMP_PAGES
;
5688 ubc_upl_abort_range(upl
, 0, PAGE_SIZE
, abort_flags
);
5696 cluster_copy_upl_data(struct uio
*uio
, upl_t upl
, int upl_offset
, int *io_resid
)
5704 upl_page_info_t
*pl
;
5708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_START
,
5709 (int)uio
->uio_offset
, upl_offset
, xsize
, 0, 0);
5711 segflg
= uio
->uio_segflg
;
5715 case UIO_USERSPACE32
:
5716 case UIO_USERISPACE32
:
5717 uio
->uio_segflg
= UIO_PHYS_USERSPACE32
;
5721 case UIO_USERISPACE
:
5722 uio
->uio_segflg
= UIO_PHYS_USERSPACE
;
5725 case UIO_USERSPACE64
:
5726 case UIO_USERISPACE64
:
5727 uio
->uio_segflg
= UIO_PHYS_USERSPACE64
;
5731 uio
->uio_segflg
= UIO_PHYS_SYSSPACE
;
5735 pl
= ubc_upl_pageinfo(upl
);
5737 pg_index
= upl_offset
/ PAGE_SIZE
;
5738 pg_offset
= upl_offset
& PAGE_MASK
;
5739 csize
= min(PAGE_SIZE
- pg_offset
, xsize
);
5741 while (xsize
&& retval
== 0) {
5744 paddr
= ((addr64_t
)upl_phys_page(pl
, pg_index
) << 12) + pg_offset
;
5746 retval
= uiomove64(paddr
, csize
, uio
);
5751 csize
= min(PAGE_SIZE
, xsize
);
5755 uio
->uio_segflg
= segflg
;
5757 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_END
,
5758 (int)uio
->uio_offset
, xsize
, retval
, segflg
, 0);
int
cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
{
	return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1));
}
5773 cluster_copy_ubc_data_internal(vnode_t vp
, struct uio
*uio
, int *io_resid
, int mark_dirty
, int take_reference
)
5780 memory_object_control_t control
;
5782 io_size
= *io_resid
;
5784 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_START
,
5785 (int)uio
->uio_offset
, io_size
, mark_dirty
, take_reference
, 0);
5787 control
= ubc_getobject(vp
, UBC_FLAGS_NONE
);
5789 if (control
== MEMORY_OBJECT_CONTROL_NULL
) {
5790 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_END
,
5791 (int)uio
->uio_offset
, io_size
, retval
, 3, 0);
5795 segflg
= uio
->uio_segflg
;
5799 case UIO_USERSPACE32
:
5800 case UIO_USERISPACE32
:
5801 uio
->uio_segflg
= UIO_PHYS_USERSPACE32
;
5804 case UIO_USERSPACE64
:
5805 case UIO_USERISPACE64
:
5806 uio
->uio_segflg
= UIO_PHYS_USERSPACE64
;
5810 case UIO_USERISPACE
:
5811 uio
->uio_segflg
= UIO_PHYS_USERSPACE
;
5815 uio
->uio_segflg
= UIO_PHYS_SYSSPACE
;
5819 if ( (io_size
= *io_resid
) ) {
5820 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
5821 xsize
= uio_resid(uio
);
5823 retval
= memory_object_control_uiomove(control
, uio
->uio_offset
- start_offset
, uio
,
5824 start_offset
, io_size
, mark_dirty
, take_reference
);
5825 xsize
-= uio_resid(uio
);
5828 uio
->uio_segflg
= segflg
;
5829 *io_resid
= io_size
;
5831 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 34)) | DBG_FUNC_END
,
5832 (int)uio
->uio_offset
, io_size
, retval
, 0x80000000 | segflg
, 0);
int
is_file_clean(vnode_t vp, off_t filesize)
{
	off_t	f_offset;
	int	flags;
	int	total_dirty = 0;

	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
		if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
			if (flags & UPL_POP_DIRTY) {
				total_dirty++;
			}
		}
	}
	if (total_dirty)
		return (EINVAL);

	return (0);
}
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */

/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES		256

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~(DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
 */
#define DRT_ADDRESS_MASK		(~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)

/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
	} while (0)
#define DRT_HASH_CLEAR(scm, i)										\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = 0;						\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
	} while (0)

/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

/*
 * Physical memory required before the large hash modulus is permitted.
 *
 * On small memory systems, the large hash modulus can lead to physical
 * memory starvation, so we avoid using it there.
 */
#define DRT_HASH_LARGE_MEMORY_REQUIRED	(1024LL * 1024LL * 1024LL)	/* 1GiB */

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */

/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */
#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	      &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	      (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
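/*
 * Illustrative sketch, not part of the kernel code: how a file offset maps
 * onto the dirty-region hash.  Each hashtable entry covers
 * DRT_BITVECTOR_PAGES (256) pages -- a 1 MiB span with 4 KiB pages -- so
 * the aligned entry address is the offset rounded down to that span and
 * the page's position inside the span selects one bit of the bitvector.
 * A 4 KiB page size is assumed for the example.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE	4096ULL
#define SKETCH_BITVECTOR_PAGES	256ULL
#define SKETCH_SPAN		(SKETCH_BITVECTOR_PAGES * SKETCH_PAGE_SIZE)	/* 1 MiB */

int
main(void)
{
	uint64_t offset = (1ULL << 20) + 5 * SKETCH_PAGE_SIZE;	/* page 261 of the file */

	uint64_t entry_address = offset & ~(SKETCH_SPAN - 1);	/* 0x100000 */
	uint32_t bit_index     = (uint32_t)((offset - entry_address) / SKETCH_PAGE_SIZE);

	printf("entry 0x%llx, bit %u, word %u, mask 0x%x\n",
	    (unsigned long long)entry_address,
	    bit_index, bit_index / 32, 1u << (bit_index % 32));	/* entry 0x100000, bit 5 */
	return (0);
}
#endif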
struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
	u_int32_t	dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
};

/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};


#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)

/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86)) /* offset, length, ... */
							   /* 1 (clean, no map) */
							   /* 2 (map alloc fail) */
							   /* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
							    * lastclean, iskips */
6029 static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap
**cmapp
);
6030 static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap
*cmap
);
6031 static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap
*cmap
,
6032 u_int64_t offset
, int *indexp
);
6033 static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap
**cmapp
,
6037 static kern_return_t
vfs_drt_do_mark_pages(
6043 static void vfs_drt_trace(
6044 struct vfs_drt_clustermap
*cmap
,
6053 * Allocate and initialise a sparse cluster map.
6055 * Will allocate a new map, resize or compact an existing map.
6057 * XXX we should probably have at least one intermediate map size,
6058 * as the 1:16 ratio seems a bit drastic.
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap, *ocmap;
	kern_return_t	kret;
	u_int64_t	offset;
	u_int32_t	i;
	int		nsize, active_buckets, index, copycount;

	ocmap = NULL;
	if (cmapp != NULL)
		ocmap = *cmapp;

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		nsize = DRT_HASH_SMALL_MODULUS;
	} else {
		/* count the number of active buckets in the old map */
		active_buckets = 0;
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0))
				active_buckets++;
		}
		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/*
			 * If the ring is nearly full and we are allowed to
			 * use the large modulus, upgrade.
			 */
			if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
				nsize = DRT_HASH_LARGE_MODULUS;
			} else {
				nsize = DRT_HASH_SMALL_MODULUS;
			}
		} else {
			/* already using the large modulus */
			nsize = DRT_HASH_LARGE_MODULUS;
			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * array and return.
			 */
			if (active_buckets >= DRT_HASH_LARGE_MODULUS)
				return(KERN_SUCCESS);
		}
	}

	/*
	 * Allocate and initialise the new map.
	 */
	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
	    (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	if (kret != KERN_SUCCESS)
		return(kret);
	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = nsize;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;
	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}

	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	copycount = 0;
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0))
				continue;
			/* get the bucket's address and re-insert it into the new map */
			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
			}
			/* copy the entry's bitvector and summary count */
			DRT_HASH_COPY(ocmap, i, cmap, index);
			copycount++;
		}
	}

	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	*cmapp = cmap;

	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
			      ocmap->scm_modulus,
			      ocmap->scm_buckets,
			      ocmap->scm_lastclean,
			      ocmap->scm_iskips);

		vfs_drt_free_map(ocmap);
	}

	return(KERN_SUCCESS);
}
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
	kmem_free(kernel_map, (vm_offset_t)cmap,
	    (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	return(KERN_SUCCESS);
}
/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
	int		index;
	u_int32_t	i;

	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* traverse the hashtable */
	for (i = 0; i < cmap->scm_modulus; i++) {

		/*
		 * If the slot is vacant, we can stop.
		 */
		if (DRT_HASH_VACANT(cmap, index))
			break;

		/*
		 * If the address matches our offset, we have success.
		 */
		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
			*indexp = index;
			return(KERN_SUCCESS);
		}

		/*
		 * Move to the next slot, try again.
		 */
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * It's not there.
	 */
	return(KERN_FAILURE);
}
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t	kret;
	u_int32_t	index;
	u_int32_t	i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS)
		return(kret);

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean)
				cmap->scm_lastclean = index;
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return(KERN_SUCCESS);
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed)
		return(KERN_FAILURE);
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return(kret);
}
/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void		**private,
	u_int64_t	offset,
	u_int		length,
	u_int		*setcountp,
	int		dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t	kret;
	int		i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL)
		*setcountp = 0;

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return(KERN_SUCCESS);
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return(kret);
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;	/* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL)
				*setcountp = setcount;
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return(kret);
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL)
		*setcountp = setcount;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return(KERN_SUCCESS);
}
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
	/* XXX size unused, drop from interface */
	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
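/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * caller deferring writes might use the two wrappers above roughly as
 * follows.  The map pointer starts out NULL and is allocated on first use
 * by vfs_drt_mark_pages(); the names drt_map and newly_dirty are invented
 * for the example, which is compiled out.
 */
#if 0
static void
vfs_drt_example_mark(void)
{
	void	*drt_map = NULL;	/* opaque sparse cluster map */
	u_int	newly_dirty = 0;

	/* record four dirty pages starting at file offset 0 */
	(void) vfs_drt_mark_pages(&drt_map, (off_t)0, 4 * PAGE_SIZE, &newly_dirty);

	/* later, note that the second of those pages has been cleaned elsewhere */
	(void) vfs_drt_unmark_pages(&drt_map, (off_t)PAGE_SIZE, PAGE_SIZE);
}
#endif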
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t	offset;
	u_int		length;
	u_int32_t	j;
	int		index, i, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
			continue;

		/* scan the bitfield for a string of bits */
		fs = -1;

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/* didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map");
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i))
				break;
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return(KERN_SUCCESS);
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * then free it
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		      cmap->scm_modulus,
		      cmap->scm_buckets,
		      cmap->scm_lastclean,
		      cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return(KERN_FAILURE);
}
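/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * pageout path would drain the map by calling vfs_drt_get_cluster() until
 * it returns KERN_FAILURE, at which point the private storage has been
 * released as documented above.  The push_cluster callback is invented for
 * the example, which is compiled out.
 */
#if 0
static void
vfs_drt_example_drain(void **drt_mapp, void (*push_cluster)(off_t, u_int))
{
	off_t	clu_offset;
	u_int	clu_length;

	while (vfs_drt_get_cluster(drt_mapp, &clu_offset, &clu_length) == KERN_SUCCESS) {
		/* write back the dirty range [clu_offset, clu_offset + clu_length) */
		push_cluster(clu_offset, clu_length);
	}
}
#endif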
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	switch (op_type) {
	case 0:
		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
			      cmap->scm_modulus,
			      cmap->scm_buckets,
			      cmap->scm_lastclean,
			      cmap->scm_iskips);

		vfs_drt_free_map(cmap);
		*cmapp = NULL;
		break;

	case 1:
		/* reset the hash clean sentinel for the map */
		cmap->scm_lastclean = 0;
		break;
	}
	return(KERN_SUCCESS);
}
/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
	      __unused int arg1, __unused int arg2, __unused int arg3,
	      __unused int arg4)
{
}
#endif
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index))
			continue;

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i))
				bits_on++;
		}
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
			panic("bits_on = %d, index = %d\n", bits_on, index);
	}
}