/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <kern/kalloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>

#include <kern/task.h>
#include <kern/policy_internal.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_fault.h>

#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>

#include <vfs/vfs_disk_conditioner.h>
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT

#define CL_READ         0x01
#define CL_WRITE        0x02
#define CL_ASYNC        0x04
#define CL_COMMIT       0x08
#define CL_PAGEOUT      0x10
#define CL_AGE          0x20
#define CL_NOZERO       0x40
#define CL_PAGEIN       0x80
#define CL_DEV_MEMORY   0x100
#define CL_PRESERVE     0x200
#define CL_THROTTLE     0x400
#define CL_KEEPCACHED   0x800
#define CL_DIRECT_IO    0x1000
#define CL_PASSIVE      0x2000
#define CL_IOSTREAMING  0x4000
#define CL_CLOSE        0x8000
#define CL_ENCRYPTED    0x10000
#define CL_RAW_ENCRYPTED 0x20000
#define CL_NOCACHE      0x40000

#define MAX_VECTOR_UPL_ELEMENTS 8
#define MAX_VECTOR_UPL_SIZE     (2 * MAX_UPL_SIZE_BYTES)

#define CLUSTER_IO_WAITING      ((buf_t)1)
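
/*
 * Illustrative sketch (not compiled): the CL_* bits above are OR'd together to
 * describe one cluster_io() request.  This mirrors how the pageout path later
 * in this file builds its flags; the variable names here are only for
 * illustration, not part of this file.
 */
#if 0
    int cl_flags = CL_PAGEOUT | CL_THROTTLE;    /* a pageout, subject to I/O throttling */

    if ((upl_request_flags & UPL_IOSYNC) == 0) {
        cl_flags |= CL_ASYNC;                   /* caller does not wait for completion */
    }
    if ((upl_request_flags & UPL_NOCOMMIT) == 0) {
        cl_flags |= CL_COMMIT;                  /* cluster_io commits/aborts the UPL when done */
    }
#endif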
extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
extern void vector_upl_set_pagelist(upl_t);
extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
struct clios {
    lck_mtx_t io_mtxp;
    u_int     io_completed;     /* amount of io that has currently completed */
    u_int     io_issued;        /* amount of io that was successfully issued */
    int       io_error;         /* error code of first error encountered */
    int       io_wanted;        /* someone is sleeping waiting for a change in state */
};
struct cl_direct_read_lock {
    LIST_ENTRY(cl_direct_read_lock)     chain;
    int32_t                             ref_count;
    vnode_t                             vp;
    lck_rw_t                            rw_lock;
};

#define CL_DIRECT_READ_LOCK_BUCKETS 61

static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
    cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
static LCK_GRP_DECLARE(cl_mtx_grp, "cluster I/O");
static LCK_MTX_DECLARE(cl_transaction_mtxp, &cl_mtx_grp);
static LCK_SPIN_DECLARE(cl_direct_read_spin_lock, &cl_mtx_grp);

static ZONE_DECLARE(cl_rd_zone, "cluster_read",
    sizeof(struct cl_readahead), ZC_ZFREE_CLEARMEM | ZC_NOENCRYPT);

static ZONE_DECLARE(cl_wr_zone, "cluster_write",
    sizeof(struct cl_writebehind), ZC_ZFREE_CLEARMEM | ZC_NOENCRYPT);

#define PUSH_DELAY      0x01
#define PUSH_ALL        0x02
#define PUSH_SYNC       0x04
static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
    int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
static int cluster_is_throttled(vnode_t vp);

static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);

static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
    int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
    int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
    int (*)(buf_t, void *), void *callback_arg, int flags) __attribute__((noinline));

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
    off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
    int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
    int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag) __attribute__((noinline));

static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
    off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
    int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);

static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
    void *callback_arg, int *err, boolean_t vm_initiated);

static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
    int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
    int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);
static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
/*
 * For throttled IO to check whether
 * a block is cached by the boot cache
 * and thus it can avoid delaying the IO.
 *
 * bootcache_contains_block is initially
 * NULL. The BootCache will set it while
 * the cache is active and clear it when
 * the cache is jettisoned.
 *
 * Returns 0 if the block is not
 * contained in the cache, 1 if it is
 * contained in the cache.
 *
 * The function pointer remains valid
 * after the cache has been evicted even
 * if bootcache_contains_block has been
 * cleared.
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
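
/*
 * Illustrative sketch (not compiled): how an external cache driver would be
 * expected to publish and retract the hook above.  Only the
 * bootcache_contains_block contract is real; the my_cache_* names below are
 * assumptions made for the example.
 */
#if 0
extern int (*bootcache_contains_block)(dev_t device, u_int64_t blkno);

static int
my_cache_contains_block(dev_t device, u_int64_t blkno)
{
    /* return 1 if (device, blkno) is resident in the cache, 0 otherwise */
    return 0;
}

static void
my_cache_activate(void)
{
    bootcache_contains_block = my_cache_contains_block;
}

static void
my_cache_jettison(void)
{
    bootcache_contains_block = NULL;
}
#endif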
/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE     (1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE      MAX_UPL_SIZE_BYTES

/*
 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
 * allowing the caller to bypass the buffer cache.  For small I/Os (less than 16k),
 * we have not historically allowed the write to bypass the UBC.
 */
#define MIN_DIRECT_WRITE_SIZE   (16384)

#define WRITE_THROTTLE          6
#define WRITE_THROTTLE_SSD      2
#define WRITE_BEHIND            1
#define WRITE_BEHIND_SSD        1
#if !defined(XNU_TARGET_OS_OSX)
#define PREFETCH_SSD            1
uint32_t speculative_prefetch_max = (2048 * 1024);              /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use in a speculative read-ahead */
#else /* XNU_TARGET_OS_OSX */
#define PREFETCH_SSD            2
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);   /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use in a speculative read-ahead on SSDs */
#endif /* ! XNU_TARGET_OS_OSX */

#define IO_SCALE(vp, base)              (vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)            (cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)  (size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))

int speculative_reads_disabled = 0;
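
/*
 * Illustrative sketch (not compiled): how the knobs above bound a speculative
 * read-ahead.  This mirrors the computation done in cluster_read_ahead() later
 * in this file; "vp" is assumed to be a vnode on the mount being read.
 */
#if 0
    u_int max_prefetch;

    max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ),
        disk_conditioner_mount_is_ssd(vp->v_mount));

    if (max_prefetch > speculative_prefetch_max) {
        max_prefetch = speculative_prefetch_max;        /* global cap, in bytes */
    }
#endif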
/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define THROTTLE_MAXCNT 0

uint32_t throttle_max_iosize = (128 * 1024);

#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
void
cluster_init(void)
{
    for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
        LIST_INIT(&cl_direct_read_locks[i]);
    }
}
uint32_t
cluster_max_io_size(mount_t mp, int type)
{
    uint32_t max_io_size;
    uint32_t segcnt;
    uint32_t maxcnt;

    switch (type) {
    case CL_READ:
        segcnt = mp->mnt_segreadcnt;
        maxcnt = mp->mnt_maxreadcnt;
        break;
    case CL_WRITE:
        segcnt = mp->mnt_segwritecnt;
        maxcnt = mp->mnt_maxwritecnt;
        break;
    default:
        segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
        maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
        break;
    }
    if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
        /*
         * don't allow a size beyond the max UPL size we can create
         */
        segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
    }
    max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

    if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
        /*
         * don't allow a size smaller than the old fixed limit
         */
        max_io_size = MAX_UPL_TRANSFER_BYTES;
    } else {
        /*
         * make sure the size specified is a multiple of PAGE_SIZE
         */
        max_io_size &= ~PAGE_MASK;
    }
    return max_io_size;
}
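
/*
 * Illustrative sketch (not compiled): callers use cluster_max_io_size() to
 * bound a single transfer for a mount, e.g. the MAX_CLUSTER_SIZE() macro above
 * uses the CL_WRITE value and cluster_read_ahead() uses the CL_READ value.
 * The clipping loop below is an assumed example, not a copy of any call site;
 * start_offset and total_bytes are assumed caller-provided.
 */
#if 0
    uint32_t max_io = cluster_max_io_size(vp->v_mount, CL_READ);
    off_t    offset = start_offset;
    off_t    resid  = total_bytes;

    while (resid > 0) {
        uint32_t this_io = (resid > max_io) ? max_io : (uint32_t)resid;

        /* issue an I/O of at most max_io bytes starting at offset... */
        offset += this_io;
        resid  -= this_io;
    }
#endif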
#define CLW_ALLOCATE            0x01
#define CLW_RETURNLOCKED        0x02
#define CLW_IONOCACHE           0x04
#define CLW_IOPASSIVE           0x08

/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
    struct ubc_info         *ubc;
    struct cl_readahead     *rap;

    ubc = vp->v_ubcinfo;

    if ((rap = ubc->cl_rahead) == NULL) {
        rap = zalloc_flags(cl_rd_zone, Z_WAITOK | Z_ZERO);

        lck_mtx_init(&rap->cl_lockr, &cl_mtx_grp, LCK_ATTR_NULL);

        vnode_lock(vp);

        if (ubc->cl_rahead == NULL) {
            ubc->cl_rahead = rap;
        } else {
            lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
            zfree(cl_rd_zone, rap);
            rap = ubc->cl_rahead;
        }
        vnode_unlock(vp);
    }
    if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
        return rap;
    }

    return (struct cl_readahead *)NULL;
}
/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */
static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
    struct ubc_info         *ubc;
    struct cl_writebehind   *wbp;

    ubc = vp->v_ubcinfo;

    if ((wbp = ubc->cl_wbehind) == NULL) {
        if (!(flags & CLW_ALLOCATE)) {
            return (struct cl_writebehind *)NULL;
        }
        wbp = zalloc_flags(cl_wr_zone, Z_WAITOK | Z_ZERO);

        lck_mtx_init(&wbp->cl_lockw, &cl_mtx_grp, LCK_ATTR_NULL);

        vnode_lock(vp);

        if (ubc->cl_wbehind == NULL) {
            ubc->cl_wbehind = wbp;
        } else {
            lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
            zfree(cl_wr_zone, wbp);
            wbp = ubc->cl_wbehind;
        }
        vnode_unlock(vp);
    }
    if (flags & CLW_RETURNLOCKED) {
        lck_mtx_lock(&wbp->cl_lockw);
    }
    return wbp;
}
static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
    struct cl_writebehind *wbp;

    if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
        if (wbp->cl_number) {
            lck_mtx_lock(&wbp->cl_lockw);

            cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);

            lck_mtx_unlock(&wbp->cl_lockw);
        }
    }
}
static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
    daddr64_t blkno;
    size_t    io_size;
    int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

    if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
        if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
            return 0;
        }

        if (io_size == 0) {
            return 0;
        }

        if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
            return 1;
        }
    }
    return 0;
}
static int
cluster_is_throttled(vnode_t vp)
{
    return throttle_io_will_be_throttled(-1, vp->v_mount);
}
static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{
    lck_mtx_lock(&iostate->io_mtxp);

    while ((iostate->io_issued - iostate->io_completed) > target) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
            iostate->io_issued, iostate->io_completed, target, 0, 0);

        iostate->io_wanted = 1;
        msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
            iostate->io_issued, iostate->io_completed, target, 0, 0);
    }
    lck_mtx_unlock(&iostate->io_mtxp);
}
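
/*
 * Illustrative sketch (not compiled): how a struct clios ties an issuer to
 * cluster_iodone().  The direct read/write paths in this file follow this
 * pattern; the initialization and variable names shown here are assumptions
 * made for the example rather than a copy of any one call site.
 */
#if 0
    struct clios iostate;
    int          error;

    bzero(&iostate, sizeof(iostate));
    lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

    /* each cluster_io() call bumps iostate.io_issued by the bytes it issues   */
    /* cluster_iodone() bumps io_completed and wakes anyone sleeping on io_wanted */

    /* wait until no more than 'target' bytes remain in flight */
    cluster_iostate_wait(&iostate, target, "example_wait");

    /* drain everything, then pick up the first error seen */
    cluster_iostate_wait(&iostate, 0, "example_drain");
    error = iostate.io_error;
#endif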
static void
cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
    upl_offset_t upl_offset, upl_size_t size)
{
    upl_t associated_upl = upl_associated_upl(upl);

    if (!associated_upl) {
        return;
    }

#if 0
    printf("1: %d %d\n", upl_offset, upl_offset + size);
#endif

    /*
     * The associated UPL is page aligned to file offsets whereas the
     * UPL it's attached to has different alignment requirements.  The
     * upl_offset that we have refers to @upl.  The code that follows
     * has to deal with the first and last pages in this transaction
     * which might straddle pages in the associated UPL.  To keep
     * track of these pages, we use the mark bits: if the mark bit is
     * set, we know another transaction has completed its part of that
     * page and so we can unlock that page here.
     *
     * The following illustrates what we have to deal with:
     *
     *  MEM u <------------ 1 PAGE ------------> e
     *        +-------------+----------------------+-----------------
     *        |             |######################|#################
     *        +-------------+----------------------+-----------------
     *  FILE | <--- a ---> o <------------ 1 PAGE ------------>
     *
     * So here we show a write to offset @o.  The data that is to be
     * written is in a buffer that is not page aligned; it has offset
     * @a in the page.  The upl that carries the data starts in memory
     * at @u.  The associated upl starts in the file at offset @o.  A
     * transaction will always end on a page boundary (like @e above)
     * except for the very last transaction in the group.  We cannot
     * unlock the page at @o in the associated upl until both the
     * transaction ending at @e and the following transaction (that
     * starts at @e) has completed.
     */

    /*
     * We record whether or not the two UPLs are aligned as the mark
     * bit in the first page of @upl.
     */
    upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
    bool is_unaligned = upl_page_get_mark(pl, 0);

    if (is_unaligned) {
        upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);

        upl_offset_t upl_end = upl_offset + size;
        assert(upl_end >= PAGE_SIZE);

        upl_size_t assoc_upl_size = upl_get_size(associated_upl);

        /*
         * In the very first transaction in the group, upl_offset will
         * not be page aligned, but after that it will be and in that
         * case we want the preceding page in the associated UPL hence
         * the minus one.
         */
        if (upl_offset) {
            upl_offset = trunc_page_32(upl_offset - 1);
        }

        lck_mtx_lock_spin(&iostate->io_mtxp);

        // Look at the first page...
        if (upl_offset
            && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
            /*
             * The first page isn't marked so let another transaction
             * completion handle it.
             */
            upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
            upl_offset += PAGE_SIZE;
        }

        // And now the last page...

        /*
         * This needs to be > rather than >= because if it's equal, it
         * means there's another transaction that is sharing the last
         * page.
         */
        if (upl_end > assoc_upl_size) {
            upl_end = assoc_upl_size;
        } else {
            upl_end = trunc_page_32(upl_end);
            const int last_pg = (upl_end >> PAGE_SHIFT) - 1;

            if (!upl_page_get_mark(assoc_pl, last_pg)) {
                /*
                 * The last page isn't marked so mark the page and let another
                 * transaction completion handle it.
                 */
                upl_page_set_mark(assoc_pl, last_pg, true);
                upl_end -= PAGE_SIZE;
            }
        }

        lck_mtx_unlock(&iostate->io_mtxp);

#if 0
        printf("2: %d %d\n", upl_offset, upl_end);
#endif

        if (upl_end <= upl_offset) {
            return;
        }

        size = upl_end - upl_offset;
    } else {
        assert(!(upl_offset & PAGE_MASK));
        assert(!(size & PAGE_MASK));
    }

    boolean_t empty;

    /*
     * We can unlock these pages now and as this is for a
     * direct/uncached write, we want to dump the pages too.
     */
    kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
        UPL_ABORT_DUMP_PAGES, &empty);

    if (!kr && empty) {
        upl_set_associated_upl(upl, NULL);
        upl_deallocate(associated_upl);
    }
}
static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
{
    int upl_abort_code = 0;
    int page_in  = 0;
    int page_out = 0;

    if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
        /*
         * direct write of any flavor, or a direct read that wasn't aligned
         */
        ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
    } else {
        if (io_flags & B_PAGEIO) {
            if (io_flags & B_READ) {
                page_in  = 1;
            } else {
                page_out = 1;
            }
        }
        if (io_flags & B_CACHE) {
            /*
             * leave pages in the cache unchanged on error
             */
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
        } else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
            /*
             * transient error on pageout/write path... leave pages unchanged
             */
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
        } else if (page_in) {
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
        } else {
            upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
        }
        ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
    }
    return upl_abort_code;
}
static int
cluster_iodone(buf_t bp, void *callback_arg)
{
    int     b_flags;
    int     error;
    int     total_size;
    int     total_resid;
    int     upl_offset;
    int     zero_offset;
    int     pg_offset = 0;
    int     commit_size = 0;
    int     upl_flags = 0;
    int     transaction_size = 0;
    upl_t   upl;
    buf_t   cbp;
    buf_t   cbp_head;
    buf_t   cbp_next;
    buf_t   real_bp;
    vnode_t vp;
    struct clios *iostate;
    boolean_t transaction_complete = FALSE;

    __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
        cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

    if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
        lck_mtx_lock_spin(&cl_transaction_mtxp);

        bp->b_flags |= B_TDONE;

        for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
            /*
             * all I/O requests that are part of this transaction
             * have to complete before we can process it
             */
            if (!(cbp->b_flags & B_TDONE)) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

                lck_mtx_unlock(&cl_transaction_mtxp);

                return 0;
            }

            if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

                lck_mtx_unlock(&cl_transaction_mtxp);
                wakeup(cbp);

                return 0;
            }

            if (cbp->b_flags & B_EOT) {
                transaction_complete = TRUE;
            }
        }
        lck_mtx_unlock(&cl_transaction_mtxp);

        if (transaction_complete == FALSE) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                cbp_head, 0, 0, 0, 0);
            return 0;
        }
    }
    error       = 0;
    total_size  = 0;
    total_resid = 0;

    cbp         = cbp_head;
    vp          = cbp->b_vp;
    upl_offset  = cbp->b_uploffset;
    upl         = cbp->b_upl;
    b_flags     = cbp->b_flags;
    real_bp     = cbp->b_real_bp;
    zero_offset = cbp->b_validend;
    iostate     = (struct clios *)cbp->b_iostate;

    if (real_bp) {
        real_bp->b_dev = cbp->b_dev;
    }

    while (cbp) {
        if ((cbp->b_flags & B_ERROR) && error == 0) {
            error = cbp->b_error;
        }
        total_resid += cbp->b_resid;
        total_size  += cbp->b_bcount;

        cbp_next = cbp->b_trans_next;

        if (cbp_next == NULL) {
            /*
             * compute the overall size of the transaction
             * in case we created one that has 'holes' in it
             * 'total_size' represents the amount of I/O we
             * did, not the span of the transaction w/r to the UPL
             */
            transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
        }

        if (cbp != cbp_head) {
            free_io_buf(cbp);
        }

        cbp = cbp_next;
    }

    if (ISSET(b_flags, B_COMMIT_UPL)) {
        cluster_handle_associated_upl(iostate,
            upl, upl_offset, transaction_size);
    }

    if (error == 0 && total_resid) {
        error = EIO;
    }

    if (error == 0) {
        int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

        if (cliodone_func != NULL) {
            cbp_head->b_bcount = transaction_size;

            error = (*cliodone_func)(cbp_head, callback_arg);
        }
    }
    if (zero_offset) {
        cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
    }

    free_io_buf(cbp_head);

    if (iostate) {
        int need_wakeup = 0;

        /*
         * someone has issued multiple I/Os asynchronously
         * and is waiting for them to complete (streaming)
         */
        lck_mtx_lock_spin(&iostate->io_mtxp);

        if (error && iostate->io_error == 0) {
            iostate->io_error = error;
        }

        iostate->io_completed += total_size;

        if (iostate->io_wanted) {
            /*
             * someone is waiting for the state of
             * this io stream to change
             */
            iostate->io_wanted = 0;
            need_wakeup = 1;
        }
        lck_mtx_unlock(&iostate->io_mtxp);

        if (need_wakeup) {
            wakeup((caddr_t)&iostate->io_wanted);
        }
    }

    if (b_flags & B_COMMIT_UPL) {
        pg_offset   = upl_offset & PAGE_MASK;
        commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (error) {
            upl_set_iodone_error(upl, error);

            upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
        } else {
            upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

            if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
                upl_flags |= UPL_COMMIT_SET_DIRTY;
            }

            if (b_flags & B_AGE) {
                upl_flags |= UPL_COMMIT_INACTIVATE;
            }

            ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
        }
    }
    if (real_bp) {
        if (error) {
            real_bp->b_flags |= B_ERROR;
            real_bp->b_error = error;
        }
        real_bp->b_resid = total_resid;

        buf_biodone(real_bp);
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
        upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

    return error;
}
int
cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
{
    if (cluster_is_throttled(vp)) {
        *limit = THROTTLE_MAX_IOSIZE;
        return 1;
    }
    return 0;
}
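
/*
 * Illustrative sketch (not compiled): a filesystem can ask whether this vnode
 * is currently being throttled and clamp its own transfer size accordingly.
 * The desired_io_size variable is an assumption made for the example.
 */
#if 0
    uint32_t io_limit = 0;
    uint32_t io_size  = desired_io_size;

    if (cluster_throttle_io_limit(vp, &io_limit) && io_size > io_limit) {
        io_size = io_limit;
    }
#endif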
void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{
    upl_page_info_t *pl;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
        upl_offset, size, bp, 0, 0);

    if (bp == NULL || bp->b_datap == 0) {
        pl = ubc_upl_pageinfo(upl);

        if (upl_device_page(pl) == TRUE) {
            addr64_t zero_addr;

            zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;

            bzero_phys_nc(zero_addr, size);
        } else {
            while (size) {
                int      page_index;
                int      page_offset;
                int      zero_cnt;
                addr64_t zero_addr;

                page_index  = upl_offset / PAGE_SIZE;
                page_offset = upl_offset & PAGE_MASK;

                zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
                zero_cnt  = min(PAGE_SIZE - page_offset, size);

                bzero_phys(zero_addr, zero_cnt);

                size       -= zero_cnt;
                upl_offset += zero_cnt;
            }
        }
    } else {
        bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
    }

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
        upl_offset, size, 0, 0, 0);
}
static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
    cbp_head->b_validend = zero_offset;
    cbp_tail->b_flags |= B_EOT;
}
static void
cluster_wait_IO(buf_t cbp_head, int async)
{
    buf_t   cbp;

    if (async) {
        /*
         * Async callback completion will not normally generate a
         * wakeup upon I/O completion.  To get woken up, we set
         * b_trans_next (which is safe for us to modify) on the last
         * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
         * to wake us up when all buffers as part of this transaction
         * are completed.  This is done under the umbrella of
         * cl_transaction_mtxp which is also taken in cluster_iodone.
         */
        bool done = true;
        buf_t last = NULL;

        lck_mtx_lock_spin(&cl_transaction_mtxp);

        for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
            if (!ISSET(cbp->b_flags, B_TDONE)) {
                done = false;
            }
        }

        if (!done) {
            last->b_trans_next = CLUSTER_IO_WAITING;

            DTRACE_IO1(wait__start, buf_t, last);
            do {
                msleep(last, &cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);

                /*
                 * We should only have been woken up if all the
                 * buffers are completed, but just in case...
                 */
                done = true;
                for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
                    if (!ISSET(cbp->b_flags, B_TDONE)) {
                        done = false;
                        break;
                    }
                }
            } while (!done);
            DTRACE_IO1(wait__done, buf_t, last);

            last->b_trans_next = NULL;
        }

        lck_mtx_unlock(&cl_transaction_mtxp);
    } else { // !async
        for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
            buf_biowait(cbp);
        }
    }
}
static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
    buf_t   cbp;
    int     error;
    boolean_t isswapout = FALSE;

    /*
     * cluster_complete_transaction will
     * only be called if we've issued a complete chain in synchronous mode
     * or, we've already done a cluster_wait_IO on an incomplete chain
     */
    if (needwait) {
        for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
            buf_biowait(cbp);
        }
    }
    /*
     * we've already waited on all of the I/Os in this transaction,
     * so mark all of the buf_t's in this transaction as B_TDONE
     * so that cluster_iodone sees the transaction as completed
     */
    for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
        cbp->b_flags |= B_TDONE;
    }
    cbp = *cbp_head;

    if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
        isswapout = TRUE;
    }

    error = cluster_iodone(cbp, callback_arg);

    if (!(flags & CL_ASYNC) && error && *retval == 0) {
        if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
            *retval = error;
        } else if (isswapout == TRUE) {
            *retval = error;
        }
    }
    *cbp_head = (buf_t)NULL;
}
1093 cluster_io(vnode_t vp
, upl_t upl
, vm_offset_t upl_offset
, off_t f_offset
, int non_rounded_size
,
1094 int flags
, buf_t real_bp
, struct clios
*iostate
, int (*callback
)(buf_t
, void *), void *callback_arg
)
1103 buf_t cbp_head
= NULL
;
1104 buf_t cbp_tail
= NULL
;
1105 int trans_count
= 0;
1106 int max_trans_count
;
1112 int zero_offset
= 0;
1113 int async_throttle
= 0;
1115 vm_offset_t upl_end_offset
;
1116 boolean_t need_EOT
= FALSE
;
1119 * we currently don't support buffers larger than a page
1121 if (real_bp
&& non_rounded_size
> PAGE_SIZE
) {
1122 panic("%s(): Called with real buffer of size %d bytes which "
1123 "is greater than the maximum allowed size of "
1124 "%d bytes (the system PAGE_SIZE).\n",
1125 __FUNCTION__
, non_rounded_size
, PAGE_SIZE
);
1131 * we don't want to do any funny rounding of the size for IO requests
1132 * coming through the DIRECT or CONTIGUOUS paths... those pages don't
1133 * belong to us... we can't extend (nor do we need to) the I/O to fill
1136 if (mp
->mnt_devblocksize
> 1 && !(flags
& (CL_DEV_MEMORY
| CL_DIRECT_IO
))) {
1138 * round the requested size up so that this I/O ends on a
1139 * page boundary in case this is a 'write'... if the filesystem
1140 * has blocks allocated to back the page beyond the EOF, we want to
1141 * make sure to write out the zero's that are sitting beyond the EOF
1142 * so that in case the filesystem doesn't explicitly zero this area
1143 * if a hole is created via a lseek/write beyond the current EOF,
1144 * it will return zeros when it's read back from the disk. If the
1145 * physical allocation doesn't extend for the whole page, we'll
1146 * only write/read from the disk up to the end of this allocation
1147 * via the extent info returned from the VNOP_BLOCKMAP call.
1149 pg_offset
= upl_offset
& PAGE_MASK
;
1151 size
= (((non_rounded_size
+ pg_offset
) + (PAGE_SIZE
- 1)) & ~PAGE_MASK
) - pg_offset
;
1154 * anyone advertising a blocksize of 1 byte probably
1155 * can't deal with us rounding up the request size
1156 * AFP is one such filesystem/device
1158 size
= non_rounded_size
;
1160 upl_end_offset
= upl_offset
+ size
;
1162 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 22)) | DBG_FUNC_START
, (int)f_offset
, size
, upl_offset
, flags
, 0);
1165 * Set the maximum transaction size to the maximum desired number of
1168 max_trans_count
= 8;
1169 if (flags
& CL_DEV_MEMORY
) {
1170 max_trans_count
= 16;
1173 if (flags
& CL_READ
) {
1175 bmap_flags
= VNODE_READ
;
1177 max_iosize
= mp
->mnt_maxreadcnt
;
1178 max_vectors
= mp
->mnt_segreadcnt
;
1181 bmap_flags
= VNODE_WRITE
;
1183 max_iosize
= mp
->mnt_maxwritecnt
;
1184 max_vectors
= mp
->mnt_segwritecnt
;
1186 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 22)) | DBG_FUNC_NONE
, max_iosize
, max_vectors
, mp
->mnt_devblocksize
, 0, 0);
1189 * make sure the maximum iosize is a
1190 * multiple of the page size
1192 max_iosize
&= ~PAGE_MASK
;
1195 * Ensure the maximum iosize is sensible.
1198 max_iosize
= PAGE_SIZE
;
1201 if (flags
& CL_THROTTLE
) {
1202 if (!(flags
& CL_PAGEOUT
) && cluster_is_throttled(vp
)) {
1203 if (max_iosize
> THROTTLE_MAX_IOSIZE
) {
1204 max_iosize
= THROTTLE_MAX_IOSIZE
;
1206 async_throttle
= THROTTLE_MAXCNT
;
1208 if ((flags
& CL_DEV_MEMORY
)) {
1209 async_throttle
= IO_SCALE(vp
, VNODE_ASYNC_THROTTLE
);
1212 u_int max_cluster_size
;
1215 if (vp
->v_mount
->mnt_minsaturationbytecount
) {
1216 max_cluster_size
= vp
->v_mount
->mnt_minsaturationbytecount
;
1220 max_cluster_size
= MAX_CLUSTER_SIZE(vp
);
1222 if (disk_conditioner_mount_is_ssd(vp
->v_mount
)) {
1223 scale
= WRITE_THROTTLE_SSD
;
1225 scale
= WRITE_THROTTLE
;
1228 if (max_iosize
> max_cluster_size
) {
1229 max_cluster
= max_cluster_size
;
1231 max_cluster
= max_iosize
;
1234 if (size
< max_cluster
) {
1238 if (flags
& CL_CLOSE
) {
1239 scale
+= MAX_CLUSTERS
;
1242 async_throttle
= min(IO_SCALE(vp
, VNODE_ASYNC_THROTTLE
), ((scale
* max_cluster_size
) / max_cluster
) - 1);
1246 if (flags
& CL_AGE
) {
1249 if (flags
& (CL_PAGEIN
| CL_PAGEOUT
)) {
1250 io_flags
|= B_PAGEIO
;
1252 if (flags
& (CL_IOSTREAMING
)) {
1253 io_flags
|= B_IOSTREAMING
;
1255 if (flags
& CL_COMMIT
) {
1256 io_flags
|= B_COMMIT_UPL
;
1258 if (flags
& CL_DIRECT_IO
) {
1261 if (flags
& (CL_PRESERVE
| CL_KEEPCACHED
)) {
1262 io_flags
|= B_CACHE
;
1264 if (flags
& CL_PASSIVE
) {
1265 io_flags
|= B_PASSIVE
;
1267 if (flags
& CL_ENCRYPTED
) {
1268 io_flags
|= B_ENCRYPTED_IO
;
1271 if (vp
->v_flag
& VSYSTEM
) {
1275 if ((flags
& CL_READ
) && ((upl_offset
+ non_rounded_size
) & PAGE_MASK
) && (!(flags
& CL_NOZERO
))) {
1277 * then we are going to end up
1278 * with a page that we can't complete (the file size wasn't a multiple
1279 * of PAGE_SIZE and we're trying to read to the end of the file
1280 * so we'll go ahead and zero out the portion of the page we can't
1281 * read in from the file
1283 zero_offset
= (int)(upl_offset
+ non_rounded_size
);
1284 } else if (!ISSET(flags
, CL_READ
) && ISSET(flags
, CL_DIRECT_IO
)) {
1285 assert(ISSET(flags
, CL_COMMIT
));
1287 // For a direct/uncached write, we need to lock pages...
1292 * Create a UPL to lock the pages in the cache whilst the
1293 * write is in progress.
1295 ubc_create_upl_kernel(vp
, f_offset
, non_rounded_size
, &cached_upl
,
1296 NULL
, UPL_SET_LITE
, VM_KERN_MEMORY_FILE
);
1299 * Attach this UPL to the other UPL so that we can find it
1302 upl_set_associated_upl(upl
, cached_upl
);
1304 if (upl_offset
& PAGE_MASK
) {
1306 * The two UPLs are not aligned, so mark the first page in
1307 * @upl so that cluster_handle_associated_upl can handle
1310 upl_page_info_t
*pl
= UPL_GET_INTERNAL_PAGE_LIST(upl
);
1311 upl_page_set_mark(pl
, 0, true);
1318 u_int io_size_wanted
;
1321 if (size
> max_iosize
) {
1322 io_size
= max_iosize
;
1327 io_size_wanted
= io_size
;
1328 io_size_tmp
= (size_t)io_size
;
1330 if ((error
= VNOP_BLOCKMAP(vp
, f_offset
, io_size
, &blkno
, &io_size_tmp
, NULL
, bmap_flags
, NULL
))) {
1334 if (io_size_tmp
> io_size_wanted
) {
1335 io_size
= io_size_wanted
;
1337 io_size
= (u_int
)io_size_tmp
;
1340 if (real_bp
&& (real_bp
->b_blkno
== real_bp
->b_lblkno
)) {
1341 real_bp
->b_blkno
= blkno
;
1344 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 24)) | DBG_FUNC_NONE
,
1345 (int)f_offset
, (int)(blkno
>> 32), (int)blkno
, io_size
, 0);
1349 * vnop_blockmap didn't return an error... however, it did
1350 * return an extent size of 0 which means we can't
1351 * make forward progress on this I/O... a hole in the
1352 * file would be returned as a blkno of -1 with a non-zero io_size
1353 * a real extent is returned with a blkno != -1 and a non-zero io_size
1358 if (!(flags
& CL_READ
) && blkno
== -1) {
1362 if (upl_get_internal_vectorupl(upl
)) {
1363 panic("Vector UPLs should not take this code-path\n");
1366 * we're writing into a 'hole'
1368 if (flags
& CL_PAGEOUT
) {
1370 * if we got here via cluster_pageout
1371 * then just error the request and return
1372 * the 'hole' should already have been covered
1378 * we can get here if the cluster code happens to
1379 * pick up a page that was dirtied via mmap vs
1380 * a 'write' and the page targets a 'hole'...
1381 * i.e. the writes to the cluster were sparse
1382 * and the file was being written for the first time
1384 * we can also get here if the filesystem supports
1385 * 'holes' that are less than PAGE_SIZE.... because
1386 * we can't know if the range in the page that covers
1387 * the 'hole' has been dirtied via an mmap or not,
1388 * we have to assume the worst and try to push the
1389 * entire page to storage.
1391 * Try paging out the page individually before
1392 * giving up entirely and dumping it (the pageout
1393 * path will insure that the zero extent accounting
1394 * has been taken care of before we get back into cluster_io)
1396 * go direct to vnode_pageout so that we don't have to
1397 * unbusy the page from the UPL... we used to do this
1398 * so that we could call ubc_msync, but that results
1399 * in a potential deadlock if someone else races us to acquire
1400 * that page and wins and in addition needs one of the pages
1401 * we're continuing to hold in the UPL
1403 pageout_flags
= UPL_MSYNC
| UPL_VNODE_PAGER
| UPL_NESTED_PAGEOUT
;
1405 if (!(flags
& CL_ASYNC
)) {
1406 pageout_flags
|= UPL_IOSYNC
;
1408 if (!(flags
& CL_COMMIT
)) {
1409 pageout_flags
|= UPL_NOCOMMIT
;
1414 uint32_t bytes_in_last_page
;
1417 * first we have to wait for the the current outstanding I/Os
1418 * to complete... EOT hasn't been set yet on this transaction
1419 * so the pages won't be released
1421 cluster_wait_IO(cbp_head
, (flags
& CL_ASYNC
));
1423 bytes_in_last_page
= cbp_head
->b_uploffset
& PAGE_MASK
;
1424 for (cbp
= cbp_head
; cbp
; cbp
= cbp
->b_trans_next
) {
1425 bytes_in_last_page
+= cbp
->b_bcount
;
1427 bytes_in_last_page
&= PAGE_MASK
;
1429 while (bytes_in_last_page
) {
1431 * we've got a transcation that
1432 * includes the page we're about to push out through vnode_pageout...
1433 * find the bp's in the list which intersect this page and either
1434 * remove them entirely from the transaction (there could be multiple bp's), or
1435 * round it's iosize down to the page boundary (there can only be one)...
1437 * find the last bp in the list and act on it
1439 for (prev_cbp
= cbp
= cbp_head
; cbp
->b_trans_next
; cbp
= cbp
->b_trans_next
) {
1443 if (bytes_in_last_page
>= cbp
->b_bcount
) {
1445 * this buf no longer has any I/O associated with it
1447 bytes_in_last_page
-= cbp
->b_bcount
;
1452 if (cbp
== cbp_head
) {
1453 assert(bytes_in_last_page
== 0);
1455 * the buf we just freed was the only buf in
1456 * this transaction... so there's no I/O to do
1462 * remove the buf we just freed from
1463 * the transaction list
1465 prev_cbp
->b_trans_next
= NULL
;
1466 cbp_tail
= prev_cbp
;
1470 * this is the last bp that has I/O
1471 * intersecting the page of interest
1472 * only some of the I/O is in the intersection
1473 * so clip the size but keep it in the transaction list
1475 cbp
->b_bcount
-= bytes_in_last_page
;
1477 bytes_in_last_page
= 0;
1482 * there was more to the current transaction
1483 * than just the page we are pushing out via vnode_pageout...
1484 * mark it as finished and complete it... we've already
1485 * waited for the I/Os to complete above in the call to cluster_wait_IO
1487 cluster_EOT(cbp_head
, cbp_tail
, 0);
1489 cluster_complete_transaction(&cbp_head
, callback_arg
, &retval
, flags
, 0);
1494 if (vnode_pageout(vp
, upl
, (upl_offset_t
)trunc_page(upl_offset
), trunc_page_64(f_offset
), PAGE_SIZE
, pageout_flags
, NULL
) != PAGER_SUCCESS
) {
1497 e_offset
= round_page_64(f_offset
+ 1);
1498 io_size
= (u_int
)(e_offset
- f_offset
);
1500 f_offset
+= io_size
;
1501 upl_offset
+= io_size
;
1503 if (size
>= io_size
) {
1509 * keep track of how much of the original request
1510 * that we've actually completed... non_rounded_size
1511 * may go negative due to us rounding the request
1512 * to a page size multiple (i.e. size > non_rounded_size)
1514 non_rounded_size
-= io_size
;
1516 if (non_rounded_size
<= 0) {
1518 * we've transferred all of the data in the original
1519 * request, but we were unable to complete the tail
1520 * of the last page because the file didn't have
1521 * an allocation to back that portion... this is ok.
1527 flags
&= ~CL_COMMIT
;
1533 lblkno
= (daddr64_t
)(f_offset
/ 0x1000);
1535 * we have now figured out how much I/O we can do - this is in 'io_size'
1536 * pg_offset is the starting point in the first page for the I/O
1537 * pg_count is the number of full and partial pages that 'io_size' encompasses
1539 pg_offset
= upl_offset
& PAGE_MASK
;
1541 if (flags
& CL_DEV_MEMORY
) {
1543 * treat physical requests as one 'giant' page
1547 pg_count
= (io_size
+ pg_offset
+ (PAGE_SIZE
- 1)) / PAGE_SIZE
;
1550 if ((flags
& CL_READ
) && blkno
== -1) {
1551 vm_offset_t commit_offset
;
1553 int complete_transaction_now
= 0;
1556 * if we're reading and blkno == -1, then we've got a
1557 * 'hole' in the file that we need to deal with by zeroing
1558 * out the affected area in the upl
1560 if (io_size
>= (u_int
)non_rounded_size
) {
1562 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1563 * than 'zero_offset' will be non-zero
1564 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1565 * (indicated by the io_size finishing off the I/O request for this UPL)
1566 * than we're not going to issue an I/O for the
1567 * last page in this upl... we need to zero both the hole and the tail
1568 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1570 bytes_to_zero
= non_rounded_size
;
1571 if (!(flags
& CL_NOZERO
)) {
1572 bytes_to_zero
= (int)((((upl_offset
+ io_size
) + (PAGE_SIZE
- 1)) & ~PAGE_MASK
) - upl_offset
);
1577 bytes_to_zero
= io_size
;
1582 cluster_zero(upl
, (upl_offset_t
)upl_offset
, bytes_to_zero
, real_bp
);
1588 * if there is a current I/O chain pending
1589 * then the first page of the group we just zero'd
1590 * will be handled by the I/O completion if the zero
1591 * fill started in the middle of the page
1593 commit_offset
= (upl_offset
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
1595 pg_resid
= (int)(commit_offset
- upl_offset
);
1597 if (bytes_to_zero
>= pg_resid
) {
1599 * the last page of the current I/O
1600 * has been completed...
1601 * compute the number of fully zero'd
1602 * pages that are beyond it
1603 * plus the last page if its partial
1604 * and we have no more I/O to issue...
1605 * otherwise a partial page is left
1606 * to begin the next I/O
1608 if ((int)io_size
>= non_rounded_size
) {
1609 pg_count
= (bytes_to_zero
- pg_resid
+ (PAGE_SIZE
- 1)) / PAGE_SIZE
;
1611 pg_count
= (bytes_to_zero
- pg_resid
) / PAGE_SIZE
;
1614 complete_transaction_now
= 1;
1618 * no pending I/O to deal with
1619 * so, commit all of the fully zero'd pages
1620 * plus the last page if its partial
1621 * and we have no more I/O to issue...
1622 * otherwise a partial page is left
1623 * to begin the next I/O
1625 if ((int)io_size
>= non_rounded_size
) {
1626 pg_count
= (pg_offset
+ bytes_to_zero
+ (PAGE_SIZE
- 1)) / PAGE_SIZE
;
1628 pg_count
= (pg_offset
+ bytes_to_zero
) / PAGE_SIZE
;
1631 commit_offset
= upl_offset
& ~PAGE_MASK
;
1634 // Associated UPL is currently only used in the direct write path
1635 assert(!upl_associated_upl(upl
));
1637 if ((flags
& CL_COMMIT
) && pg_count
) {
1638 ubc_upl_commit_range(upl
, (upl_offset_t
)commit_offset
,
1639 pg_count
* PAGE_SIZE
,
1640 UPL_COMMIT_CLEAR_DIRTY
| UPL_COMMIT_FREE_ON_EMPTY
);
1642 upl_offset
+= io_size
;
1643 f_offset
+= io_size
;
1647 * keep track of how much of the original request
1648 * that we've actually completed... non_rounded_size
1649 * may go negative due to us rounding the request
1650 * to a page size multiple (i.e. size > non_rounded_size)
1652 non_rounded_size
-= io_size
;
1654 if (non_rounded_size
<= 0) {
1656 * we've transferred all of the data in the original
1657 * request, but we were unable to complete the tail
1658 * of the last page because the file didn't have
1659 * an allocation to back that portion... this is ok.
1663 if (cbp_head
&& (complete_transaction_now
|| size
== 0)) {
1664 cluster_wait_IO(cbp_head
, (flags
& CL_ASYNC
));
1666 cluster_EOT(cbp_head
, cbp_tail
, size
== 0 ? zero_offset
: 0);
1668 cluster_complete_transaction(&cbp_head
, callback_arg
, &retval
, flags
, 0);
1674 if (pg_count
> max_vectors
) {
1675 if (((pg_count
- max_vectors
) * PAGE_SIZE
) > io_size
) {
1676 io_size
= PAGE_SIZE
- pg_offset
;
1679 io_size
-= (pg_count
- max_vectors
) * PAGE_SIZE
;
1680 pg_count
= max_vectors
;
1684 * If the transaction is going to reach the maximum number of
1685 * desired elements, truncate the i/o to the nearest page so
1686 * that the actual i/o is initiated after this buffer is
1687 * created and added to the i/o chain.
1689 * I/O directed to physically contiguous memory
1690 * doesn't have a requirement to make sure we 'fill' a page
1692 if (!(flags
& CL_DEV_MEMORY
) && trans_count
>= max_trans_count
&&
1693 ((upl_offset
+ io_size
) & PAGE_MASK
)) {
1694 vm_offset_t aligned_ofs
;
1696 aligned_ofs
= (upl_offset
+ io_size
) & ~PAGE_MASK
;
1698 * If the io_size does not actually finish off even a
1699 * single page we have to keep adding buffers to the
1700 * transaction despite having reached the desired limit.
1702 * Eventually we get here with the page being finished
1703 * off (and exceeded) and then we truncate the size of
1704 * this i/o request so that it is page aligned so that
1705 * we can finally issue the i/o on the transaction.
1707 if (aligned_ofs
> upl_offset
) {
1708 io_size
= (u_int
)(aligned_ofs
- upl_offset
);
1713 if (!(mp
->mnt_kern_flag
& MNTK_VIRTUALDEV
)) {
1715 * if we're not targeting a virtual device i.e. a disk image
1716 * it's safe to dip into the reserve pool since real devices
1717 * can complete this I/O request without requiring additional
1718 * bufs from the alloc_io_buf pool
1721 } else if ((flags
& CL_ASYNC
) && !(flags
& CL_PAGEOUT
) && !cbp_head
) {
1723 * Throttle the speculative IO
1725 * We can only throttle this if it is the first iobuf
1726 * for the transaction. alloc_io_buf implements
1727 * additional restrictions for diskimages anyway.
1734 cbp
= alloc_io_buf(vp
, priv
);
1736 if (flags
& CL_PAGEOUT
) {
1740 * since blocks are in offsets of 0x1000, scale
1741 * iteration to (PAGE_SIZE * pg_count) of blks.
1743 for (i
= 0; i
< (PAGE_SIZE
* pg_count
) / 0x1000; i
++) {
1744 if (buf_invalblkno(vp
, lblkno
+ i
, 0) == EBUSY
) {
1745 panic("BUSY bp found in cluster_io");
1749 if (flags
& CL_ASYNC
) {
1750 if (buf_setcallback(cbp
, (void *)cluster_iodone
, callback_arg
)) {
1751 panic("buf_setcallback failed\n");
1754 cbp
->b_cliodone
= (void *)callback
;
1755 cbp
->b_flags
|= io_flags
;
1756 if (flags
& CL_NOCACHE
) {
1757 cbp
->b_attr
.ba_flags
|= BA_NOCACHE
;
1760 cbp
->b_lblkno
= lblkno
;
1761 cbp
->b_blkno
= blkno
;
1762 cbp
->b_bcount
= io_size
;
1764 if (buf_setupl(cbp
, upl
, (uint32_t)upl_offset
)) {
1765 panic("buf_setupl failed\n");
1768 upl_set_blkno(upl
, upl_offset
, io_size
, blkno
);
1770 cbp
->b_trans_next
= (buf_t
)NULL
;
1772 if ((cbp
->b_iostate
= (void *)iostate
)) {
1774 * caller wants to track the state of this
1775 * io... bump the amount issued against this stream
1777 iostate
->io_issued
+= io_size
;
1780 if (flags
& CL_READ
) {
1781 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 26)) | DBG_FUNC_NONE
,
1782 (int)cbp
->b_lblkno
, (int)cbp
->b_blkno
, upl_offset
, io_size
, 0);
1784 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 27)) | DBG_FUNC_NONE
,
1785 (int)cbp
->b_lblkno
, (int)cbp
->b_blkno
, upl_offset
, io_size
, 0);
1789 cbp_tail
->b_trans_next
= cbp
;
1795 if ((cbp_head
->b_real_bp
= real_bp
)) {
1796 real_bp
= (buf_t
)NULL
;
1799 *(buf_t
*)(&cbp
->b_trans_head
) = cbp_head
;
1803 upl_offset
+= io_size
;
1804 f_offset
+= io_size
;
1807 * keep track of how much of the original request
1808 * that we've actually completed... non_rounded_size
1809 * may go negative due to us rounding the request
1810 * to a page size multiple (i.e. size > non_rounded_size)
1812 non_rounded_size
-= io_size
;
1814 if (non_rounded_size
<= 0) {
1816 * we've transferred all of the data in the original
1817 * request, but we were unable to complete the tail
1818 * of the last page because the file didn't have
1819 * an allocation to back that portion... this is ok.
1825 * we have no more I/O to issue, so go
1826 * finish the final transaction
1829 } else if (((flags
& CL_DEV_MEMORY
) || (upl_offset
& PAGE_MASK
) == 0) &&
1830 ((flags
& CL_ASYNC
) || trans_count
> max_trans_count
)) {
1832 * I/O directed to physically contiguous memory...
1833 * which doesn't have a requirement to make sure we 'fill' a page
1835 * the current I/O we've prepared fully
1836 * completes the last page in this request
1838 * it's either an ASYNC request or
1839 * we've already accumulated more than 8 I/O's into
1840 * this transaction so mark it as complete so that
1841 * it can finish asynchronously or via the cluster_complete_transaction
1842 * below if the request is synchronous
1846 if (need_EOT
== TRUE
) {
1847 cluster_EOT(cbp_head
, cbp_tail
, size
== 0 ? zero_offset
: 0);
1850 if (flags
& CL_THROTTLE
) {
1851 (void)vnode_waitforwrites(vp
, async_throttle
, 0, 0, "cluster_io");
1854 if (!(io_flags
& B_READ
)) {
1855 vnode_startwrite(vp
);
1858 if (flags
& CL_RAW_ENCRYPTED
) {
1860 * User requested raw encrypted bytes.
1861 * Twiddle the bit in the ba_flags for the buffer
1863 cbp
->b_attr
.ba_flags
|= BA_RAW_ENCRYPTED_IO
;
1866 (void) VNOP_STRATEGY(cbp
);
1868 if (need_EOT
== TRUE
) {
1869 if (!(flags
& CL_ASYNC
)) {
1870 cluster_complete_transaction(&cbp_head
, callback_arg
, &retval
, flags
, 1);
1885 * Wait until all of the outstanding I/O
1886 * for this partial transaction has completed
1888 cluster_wait_IO(cbp_head
, (flags
& CL_ASYNC
));
1891 * Rewind the upl offset to the beginning of the
1894 upl_offset
= cbp_head
->b_uploffset
;
1897 if (ISSET(flags
, CL_COMMIT
)) {
1898 cluster_handle_associated_upl(iostate
, upl
,
1899 (upl_offset_t
)upl_offset
,
1900 (upl_size_t
)(upl_end_offset
- upl_offset
));
1903 // Free all the IO buffers in this transaction
1904 for (cbp
= cbp_head
; cbp
;) {
1907 size
+= cbp
->b_bcount
;
1908 io_size
+= cbp
->b_bcount
;
1910 cbp_next
= cbp
->b_trans_next
;
1916 int need_wakeup
= 0;
1919 * update the error condition for this stream
1920 * since we never really issued the io
1921 * just go ahead and adjust it back
1923 lck_mtx_lock_spin(&iostate
->io_mtxp
);
1925 if (iostate
->io_error
== 0) {
1926 iostate
->io_error
= error
;
1928 iostate
->io_issued
-= io_size
;
1930 if (iostate
->io_wanted
) {
1932 * someone is waiting for the state of
1933 * this io stream to change
1935 iostate
->io_wanted
= 0;
1938 lck_mtx_unlock(&iostate
->io_mtxp
);
1941 wakeup((caddr_t
)&iostate
->io_wanted
);
1945 if (flags
& CL_COMMIT
) {
1948 pg_offset
= upl_offset
& PAGE_MASK
;
1949 abort_size
= (int)((upl_end_offset
- upl_offset
+ PAGE_MASK
) & ~PAGE_MASK
);
1951 upl_flags
= cluster_ioerror(upl
, (int)(upl_offset
- pg_offset
),
1952 abort_size
, error
, io_flags
, vp
);
1954 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 28)) | DBG_FUNC_NONE
,
1955 upl
, upl_offset
- pg_offset
, abort_size
, (error
<< 24) | upl_flags
, 0);
1960 } else if (cbp_head
) {
1961 panic("%s(): cbp_head is not NULL.\n", __FUNCTION__
);
1966 * can get here if we either encountered an error
1967 * or we completely zero-filled the request and
1971 real_bp
->b_flags
|= B_ERROR
;
1972 real_bp
->b_error
= error
;
1974 buf_biodone(real_bp
);
1976 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 22)) | DBG_FUNC_END
, (int)f_offset
, size
, upl_offset
, retval
, 0);
#define reset_vector_run_state()                                                                                \
	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;

static int
vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
    int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
    vector_upl_set_pagelist(vector_upl);

    if (io_flag & CL_READ) {
        if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
            io_flag &= ~CL_PRESERVE; /*don't zero fill*/
        } else {
            io_flag |= CL_PRESERVE; /*zero fill*/
        }
    }
    return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
}
static int
cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
    int pages_in_prefetch;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
        (int)f_offset, size, (int)filesize, 0, 0);

    if (f_offset >= filesize) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
            (int)f_offset, 0, 0, 0, 0);
        return 0;
    }
    if ((off_t)size > (filesize - f_offset)) {
        size = (u_int)(filesize - f_offset);
    }
    pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

    advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
        (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

    return pages_in_prefetch;
}
static void
cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap,
    int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
    daddr64_t r_addr;
    off_t     f_offset;
    int       size_of_prefetch;
    u_int     max_prefetch;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
        (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

    if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
            rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
        return;
    }
    if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
        rap->cl_ralen = 0;
        rap->cl_maxra = 0;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
            rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);

        return;
    }
    max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));

    if (max_prefetch > speculative_prefetch_max) {
        max_prefetch = speculative_prefetch_max;
    }

    if (max_prefetch <= PAGE_SIZE) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
            rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
        return;
    }
    if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
        if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
                rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
            return;
        }
    }
    r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1;
    f_offset = (off_t)(r_addr * PAGE_SIZE_64);

    size_of_prefetch = 0;

    ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

    if (size_of_prefetch) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
            rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
        return;
    }
    if (f_offset < filesize) {
        daddr64_t read_size;

        rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;

        read_size = (extent->e_addr + 1) - extent->b_addr;

        if (read_size > rap->cl_ralen) {
            if (read_size > max_prefetch / PAGE_SIZE) {
                rap->cl_ralen = max_prefetch / PAGE_SIZE;
            } else {
                rap->cl_ralen = (int)read_size;
            }
        }
        size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);

        if (size_of_prefetch) {
            rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
        }
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
        rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
}
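/*
 * Illustrative sketch (not from the original sources): a worked example of
 * the read-ahead window arithmetic above, assuming 4KB pages and a
 * max_prefetch of 256KB (so max_prefetch / PAGE_SIZE == 64).
 *
 *   1st sequential miss:  cl_ralen 0 -> 1   (prefetch 1 page)
 *   2nd:                  cl_ralen 1 -> 2   (prefetch 2 pages)
 *   3rd:                  cl_ralen 2 -> 4
 *   ...
 *   after ~7 reads:       cl_ralen stays capped at min(64, previous << 1) == 64
 *
 * Each prefetch starts at r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1,
 * i.e. one page past whichever is further along: the caller's request or the
 * read-ahead already in flight; cl_maxra then records the last page issued.
 */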
int
cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags)
{
    return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
}
int
cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
    int    io_size;
    int    rounded_size;
    off_t  max_size;
    int    local_flags;

    local_flags = CL_PAGEOUT | CL_THROTTLE;

    if ((flags & UPL_IOSYNC) == 0) {
        local_flags |= CL_ASYNC;
    }
    if ((flags & UPL_NOCOMMIT) == 0) {
        local_flags |= CL_COMMIT;
    }
    if ((flags & UPL_KEEPCACHED)) {
        local_flags |= CL_KEEPCACHED;
    }
    if (flags & UPL_PAGING_ENCRYPTED) {
        local_flags |= CL_ENCRYPTED;
    }

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
        (int)f_offset, size, (int)filesize, local_flags, 0);

    /*
     * If they didn't specify any I/O, then we are done...
     * we can't issue an abort because we don't know how
     * big the upl really is
     */
    if (size <= 0) {
        return EINVAL;
    }

    if (vp->v_mount->mnt_flag & MNT_RDONLY) {
        if (local_flags & CL_COMMIT) {
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
        }
        return EROFS;
    }
    /*
     * can't page-out from a negative offset
     * or if we're starting beyond the EOF
     * or if the file offset isn't page aligned
     * or the size requested isn't a multiple of PAGE_SIZE
     */
    if (f_offset < 0 || f_offset >= filesize ||
        (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
        if (local_flags & CL_COMMIT) {
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
        }
        return EINVAL;
    }
    max_size = filesize - f_offset;

    if (size < max_size) {
        io_size = size;
    } else {
        io_size = (int)max_size;
    }

    rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

    if (size > rounded_size) {
        if (local_flags & CL_COMMIT) {
            ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
                UPL_ABORT_FREE_ON_EMPTY);
        }
    }
    return cluster_io(vp, upl, upl_offset, f_offset, io_size,
        local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
}
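/*
 * Illustrative sketch (hypothetical, not part of the original file): how a
 * filesystem's pageout path might forward a UPL to cluster_pageout().  The
 * helper name is made up; only the cluster_pageout() call reflects the
 * interface defined above, and the EOF is assumed to come from ubc_getsize().
 */
#if 0
static int
example_fs_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset,
    off_t f_offset, int size, int flags)
{
    off_t filesize = ubc_getsize(vp);   /* current EOF known to the UBC */

    /* let the cluster layer clip to EOF, throttle, and issue the I/O */
    return cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, flags);
}
#endif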
int
cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags)
{
    return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
}
int
cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
    int    io_size;
    int    rounded_size;
    off_t  max_size;
    int    retval;
    int    local_flags = 0;

    if (upl == NULL || size < 0) {
        panic("cluster_pagein: NULL upl passed in");
    }

    if ((flags & UPL_IOSYNC) == 0) {
        local_flags |= CL_ASYNC;
    }
    if ((flags & UPL_NOCOMMIT) == 0) {
        local_flags |= CL_COMMIT;
    }
    if (flags & UPL_IOSTREAMING) {
        local_flags |= CL_IOSTREAMING;
    }
    if (flags & UPL_PAGING_ENCRYPTED) {
        local_flags |= CL_ENCRYPTED;
    }

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
        (int)f_offset, size, (int)filesize, local_flags, 0);

    /*
     * can't page-in from a negative offset
     * or if we're starting beyond the EOF
     * or if the file offset isn't page aligned
     * or the size requested isn't a multiple of PAGE_SIZE
     */
    if (f_offset < 0 || f_offset >= filesize ||
        (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
        if (local_flags & CL_COMMIT) {
            ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
        }
        return EINVAL;
    }
    max_size = filesize - f_offset;

    if (size < max_size) {
        io_size = size;
    } else {
        io_size = (int)max_size;
    }

    rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

    if (size > rounded_size && (local_flags & CL_COMMIT)) {
        ubc_upl_abort_range(upl, upl_offset + rounded_size,
            size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
    }

    retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
        local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

    return retval;
}
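/*
 * Illustrative sketch (hypothetical, not part of the original file): a minimal
 * pagein-style wrapper built on cluster_pagein().  The helper name is made up;
 * the cluster_pagein() signature is the one defined above, and ubc_getsize()
 * is assumed to supply the EOF.
 */
#if 0
static int
example_fs_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset,
    off_t f_offset, int size, int flags)
{
    off_t filesize = ubc_getsize(vp);

    /* cluster_pagein validates alignment, clips to EOF, and issues CL_READ | CL_PAGEIN */
    return cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, flags);
}
#endif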
int
cluster_bp(buf_t bp)
{
    return cluster_bp_ext(bp, NULL, NULL);
}
int
cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
{
    off_t  f_offset;
    int    flags;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
        bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

    if (bp->b_flags & B_READ) {
        flags = CL_ASYNC | CL_READ;
    } else {
        flags = CL_ASYNC;
    }
    if (bp->b_flags & B_PASSIVE) {
        flags |= CL_PASSIVE;
    }

    f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

    return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset,
        bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
}
int
cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
{
    return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
}
int
cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
    int xflags, int (*callback)(buf_t, void *), void *callback_arg)
{
    user_ssize_t  cur_resid;
    int           retval = 0;
    int           flags;
    int           zflags;
    int           bflag;
    int           write_type = IO_COPY;
    u_int32_t     write_length;

    flags = xflags;

    if (flags & IO_PASSIVE) {
        bflag = CL_PASSIVE;
    } else {
        bflag = 0;
    }

    if (vp->v_flag & VNOCACHE_DATA) {
        flags |= IO_NOCACHE;
        bflag |= CL_NOCACHE;
    }
    if (uio == NULL) {
        /*
         * this call is being made to zero-fill some range in the file
         */
        retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);

        return retval;
    }
    /*
     * do a write through the cache if one of the following is true....
     *   NOCACHE is not true or NODIRECT is true
     *   the uio request doesn't target USERSPACE
     * otherwise, find out if we want the direct or contig variant for
     * the first vector in the uio request
     */
    if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
        retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
    }

    if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
        /*
         * must go through the cached variant in this case
         */
        write_type = IO_COPY;
    }

    while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
        switch (write_type) {
        case IO_COPY:
            /*
             * make sure the uio_resid isn't too big...
             * internally, we want to handle all of the I/O in
             * chunk sizes that fit in a 32 bit int
             */
            if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
                /*
                 * we're going to have to call cluster_write_copy
                 * more than once...
                 *
                 * only want the last call to cluster_write_copy to
                 * have the IO_TAILZEROFILL flag set and only the
                 * first call should have IO_HEADZEROFILL
                 */
                zflags = flags & ~IO_TAILZEROFILL;
                flags &= ~IO_HEADZEROFILL;

                write_length = MAX_IO_REQUEST_SIZE;
            } else {
                /*
                 * last call to cluster_write_copy
                 */
                zflags = flags;

                write_length = (u_int32_t)cur_resid;
            }
            retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
            break;

        case IO_CONTIG:
            zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);

            if (flags & IO_HEADZEROFILL) {
                /*
                 * only do this once per request
                 */
                flags &= ~IO_HEADZEROFILL;

                retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
                    headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);

                if (retval) {
                    break;
                }
            }
            retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);

            if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
                /*
                 * we're done with the data from the user specified buffer(s)
                 * and we've been requested to zero fill at the tail
                 * treat this as an IO_HEADZEROFILL which doesn't require a uio
                 * by rearranging the args and passing in IO_HEADZEROFILL
                 */
                retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
                    (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
            }
            break;

        case IO_DIRECT:
            /*
             * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
             */
            retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
            break;

        case IO_UNKNOWN:
            retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
            break;
        }
        /*
         * in case we end up calling cluster_write_copy (from cluster_write_direct)
         * multiple times to service a multi-vector request that is not aligned properly
         * we need to update the oldEOF so that we
         * don't zero-fill the head of a page if we've successfully written
         * data to that area... 'cluster_write_copy' will zero-fill the head of a
         * page that is beyond the oldEOF if the write is unaligned... we only
         * want that to happen for the very first page of the cluster_write,
         * NOT the first page of each vector making up a multi-vector write.
         */
        if (uio->uio_offset > oldEOF) {
            oldEOF = uio->uio_offset;
        }
    }
    return retval;
}
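/*
 * Illustrative sketch (hypothetical, not part of the original file): the
 * typical shape of a filesystem write path built on cluster_write().  The
 * helper name, locking, and block allocation are placeholders; only the
 * cluster_write() call reflects the interface defined above, and ubc_getsize()
 * is assumed to supply the current EOF.
 */
#if 0
static int
example_fs_write(vnode_t vp, struct uio *uio, int xflags)
{
    off_t oldEOF = ubc_getsize(vp);
    off_t newEOF = oldEOF;

    if (uio->uio_offset + uio_resid(uio) > newEOF) {
        newEOF = uio->uio_offset + uio_resid(uio);
    }
    /* ... extend the on-disk allocation to newEOF and ubc_setsize(vp, newEOF) ... */

    /*
     * headOff/tailOff of 0 means no explicit zero-fill ranges; xflags carries
     * IO_SYNC, IO_NOCACHE, etc. as chosen by the filesystem.
     */
    return cluster_write(vp, uio, oldEOF, newEOF, (off_t)0, (off_t)0, xflags);
}
#endif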
static int
cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
    int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
    upl_t            upl;
    upl_page_info_t  *pl;
    vm_offset_t      upl_offset;
    vm_offset_t      vector_upl_offset = 0;
    u_int32_t        io_req_size;
    u_int32_t        offset_in_file;
    u_int32_t        offset_in_iovbase;
    u_int32_t        io_size;
    int              io_flag = 0;
    int              retval = 0;
    int              first_IO = 1;
    upl_size_t       upl_size, vector_upl_size = 0;
    vm_size_t        upl_needed_size;
    mach_msg_type_number_t pages_in_pl;
    upl_control_flags_t upl_flags;
    kern_return_t    kret;
    mach_msg_type_number_t i;
    int              force_data_sync;
    struct clios     iostate;
    user_addr_t      iov_base;
    u_int32_t        mem_alignment_mask;
    u_int32_t        devblocksize;
    u_int32_t        max_io_size;
    u_int32_t        max_upl_size;
    u_int32_t        max_vector_size;
    u_int32_t        bytes_outstanding_limit;
    boolean_t        io_throttled = FALSE;

    u_int32_t        vector_upl_iosize = 0;
    int              issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
    off_t            v_upl_uio_offset = 0;
    int              vector_upl_index = 0;
    upl_t            vector_upl = NULL;

    /*
     * When we enter this routine, we know
     *  -- the resid will not exceed iov_len
     */
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
        (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

    assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);

    max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

    io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;

    if (flags & IO_PASSIVE) {
        io_flag |= CL_PASSIVE;
    }

    if (flags & IO_NOCACHE) {
        io_flag |= CL_NOCACHE;
    }

    if (flags & IO_SKIP_ENCRYPTION) {
        io_flag |= CL_ENCRYPTED;
    }

    iostate.io_completed = 0;
    iostate.io_issued = 0;
    iostate.io_error = 0;
    iostate.io_wanted = 0;

    lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

    mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
    devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;

    if (devblocksize == 1) {
        /*
         * the AFP client advertises a devblocksize of 1
         * however, its BLOCKMAP routine maps to physical
         * blocks that are PAGE_SIZE in size...
         * therefore we can't ask for I/Os that aren't page aligned
         * or aren't multiples of PAGE_SIZE in size
         * by setting devblocksize to PAGE_SIZE, we re-instate
         * the old behavior we had before the mem_alignment_mask
         * changes went in...
         */
        devblocksize = PAGE_SIZE;
    }

next_dwrite:
    io_req_size = *write_length;
    iov_base = uio_curriovbase(uio);

    offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
    offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;

    if (offset_in_file || offset_in_iovbase) {
        /*
         * one of the 2 important offsets is misaligned
         * so fire an I/O through the cache for this entire vector
         */
        goto wait_for_dwrites;
    }
    if (iov_base & (devblocksize - 1)) {
        /*
         * the offset in memory must be on a device block boundary
         * so that we can guarantee that we can generate an
         * I/O that ends on a page boundary in cluster_io
         */
        goto wait_for_dwrites;
    }

    task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
    while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
        int throttle_type;

        if ((throttle_type = cluster_is_throttled(vp))) {
            /*
             * we're in the throttle window, at the very least
             * we want to limit the size of the I/O we're about
             * to issue
             */
            if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
                /*
                 * we're in the throttle window and at least 1 I/O
                 * has already been issued by a throttleable thread
                 * in this window, so return with EAGAIN to indicate
                 * to the FS issuing the cluster_write call that it
                 * should now throttle after dropping any locks
                 */
                throttle_info_update_by_mount(vp->v_mount);

                io_throttled = TRUE;
                goto wait_for_dwrites;
            }
            max_vector_size = THROTTLE_MAX_IOSIZE;
            max_io_size = THROTTLE_MAX_IOSIZE;
        } else {
            max_vector_size = MAX_VECTOR_UPL_SIZE;
            max_io_size = max_upl_size;
        }

        if (first_IO) {
            cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
            first_IO = 0;
        }
        io_size = io_req_size & ~PAGE_MASK;
        iov_base = uio_curriovbase(uio);

        if (io_size > max_io_size) {
            io_size = max_io_size;
        }

        if (useVectorUPL && (iov_base & PAGE_MASK)) {
            /*
             * We have an iov_base that's not page-aligned.
             * Issue all I/O's that have been collected within
             * this Vectored UPL.
             */
            if (vector_upl_index) {
                retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
                reset_vector_run_state();
            }

            /*
             * After this point, if we are using the Vector UPL path and the base is
             * not page-aligned then the UPL with that base will be the first in the vector UPL.
             */
        }

        upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
        upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
            (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

        vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
        for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
            pages_in_pl = 0;
            upl_size = (upl_size_t)upl_needed_size;
            upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
                UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

            kret = vm_map_get_upl(map,
                vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
                &upl_size,
                &upl,
                NULL,
                &pages_in_pl,
                &upl_flags,
                VM_KERN_MEMORY_FILE,
                force_data_sync);

            if (kret != KERN_SUCCESS) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                    0, 0, 0, kret, 0);
                /*
                 * failed to get pagelist
                 *
                 * we may have already spun some portion of this request
                 * off as async requests... we need to wait for the I/O
                 * to complete before returning
                 */
                goto wait_for_dwrites;
            }
            pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
            pages_in_pl = upl_size / PAGE_SIZE;

            for (i = 0; i < pages_in_pl; i++) {
                if (!upl_valid_page(pl, i)) {
                    break;
                }
            }
            if (i == pages_in_pl) {
                break;
            }

            /*
             * didn't get all the pages back that we
             * needed... release this upl and try again
             */
            ubc_upl_abort(upl, 0);
        }
        if (force_data_sync >= 3) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                i, pages_in_pl, upl_size, kret, 0);
            /*
             * for some reason, we couldn't acquire a hold on all
             * the pages needed in the user's address space
             *
             * we may have already spun some portion of this request
             * off as async requests... we need to wait for the I/O
             * to complete before returning
             */
            goto wait_for_dwrites;
        }

        /*
         * Consider the possibility that upl_size wasn't satisfied.
         */
        if (upl_size < upl_needed_size) {
            if (upl_size && upl_offset == 0) {
                io_size = upl_size;
            } else {
                io_size = 0;
            }
            if (io_size == 0) {
                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
                    (int)upl_offset, upl_size, (int)iov_base, io_size, 0);

                ubc_upl_abort(upl, 0);
                /*
                 * we may have already spun some portion of this request
                 * off as async requests... we need to wait for the I/O
                 * to complete before returning
                 */
                goto wait_for_dwrites;
            }
        }

        if (useVectorUPL) {
            vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
            if (end_off) {
                issueVectorUPL = 1;
            }
            /*
             * After this point, if we are using a vector UPL, then
             * either all the UPL elements end on a page boundary OR
             * this UPL is the last element because it does not end
             * on a page boundary.
             */
        }

        /*
         * we want push out these writes asynchronously so that we can overlap
         * the preparation of the next I/O
         * if there are already too many outstanding writes
         * wait until some complete before issuing the next
         */
        if (vp->v_mount->mnt_minsaturationbytecount) {
            bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
        } else {
            bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
        }

        cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");

        if (iostate.io_error) {
            /*
             * one of the earlier writes we issued ran into a hard error
             * don't issue any more writes, cleanup the UPL
             * that was just created but not used, then
             * go wait for all writes that are part of this stream
             * to complete before returning the error to the caller
             */
            ubc_upl_abort(upl, 0);

            goto wait_for_dwrites;
        }

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
            (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

        if (!useVectorUPL) {
            retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
                io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
        } else {
            if (!vector_upl_index) {
                vector_upl = vector_upl_create(upl_offset);
                v_upl_uio_offset = uio->uio_offset;
                vector_upl_offset = upl_offset;
            }

            vector_upl_set_subupl(vector_upl, upl, upl_size);
            vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
            vector_upl_index++;
            vector_upl_iosize += io_size;
            vector_upl_size += upl_size;

            if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
                retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
                reset_vector_run_state();
            }
        }

        /*
         * update the uio structure to
         * reflect the I/O that we just issued
         */
        uio_update(uio, (user_size_t)io_size);

        /*
         * in case we end up calling through to cluster_write_copy to finish
         * the tail of this request, we need to update the oldEOF so that we
         * don't zero-fill the head of a page if we've successfully written
         * data to that area... 'cluster_write_copy' will zero-fill the head of a
         * page that is beyond the oldEOF if the write is unaligned... we only
         * want that to happen for the very first page of the cluster_write,
         * NOT the first page of each vector making up a multi-vector write.
         */
        if (uio->uio_offset > oldEOF) {
            oldEOF = uio->uio_offset;
        }

        io_req_size -= io_size;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
            (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
    }

    if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
        retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);

        if (retval == 0 && *write_type == IO_DIRECT) {
            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
                (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

            goto next_dwrite;
        }
    }

wait_for_dwrites:

    if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
        retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
        reset_vector_run_state();
    }
    /*
     * make sure all async writes issued as part of this stream
     * have completed before we return
     */
    cluster_iostate_wait(&iostate, 0, "cluster_write_direct");

    if (iostate.io_error) {
        retval = iostate.io_error;
    }

    lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

    if (io_throttled == TRUE && retval == 0) {
        retval = EAGAIN;
    }

    if (io_req_size && retval == 0) {
        /*
         * we couldn't handle the tail of this request in DIRECT mode
         * so fire it through the copy path
         *
         * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
         * so we can just pass 0 in for the headOff and tailOff
         */
        if (uio->uio_offset > oldEOF) {
            oldEOF = uio->uio_offset;
        }

        retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);

        *write_type = IO_UNKNOWN;
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
        (int)uio->uio_offset, io_req_size, retval, 4, 0);

    return retval;
}
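/*
 * Illustrative sketch (not from the original sources): a worked example of the
 * alignment gates at the top of cluster_write_direct, assuming PAGE_SIZE 4096,
 * mnt_alignmentmask 3 (4-byte DMA alignment) and devblocksize 4096:
 *
 *   uio_offset 0x10000, iov_base 0x20000000 -> offset_in_file 0, offset_in_iovbase 0,
 *       iov_base & (devblocksize - 1) == 0  -> eligible for the direct path
 *   uio_offset 0x10200                      -> offset_in_file 0x200 != 0
 *                                           -> falls through to cluster_write_copy
 *   iov_base 0x20000100                     -> iov_base & (devblocksize - 1) != 0
 *                                           -> falls through to cluster_write_copy
 */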
static int
cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
    int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
    upl_page_info_t *pl;
    addr64_t         src_paddr = 0;
    upl_t            upl[MAX_VECTS];
    vm_offset_t      upl_offset;
    u_int32_t        tail_size = 0;
    u_int32_t        io_size;
    u_int32_t        xsize;
    upl_size_t       upl_size;
    vm_size_t        upl_needed_size;
    mach_msg_type_number_t pages_in_pl;
    upl_control_flags_t upl_flags;
    kern_return_t    kret;
    struct clios     iostate;
    int              error = 0;
    int              cur_upl = 0;
    int              num_upl = 0;
    int              n;
    user_addr_t      iov_base;
    u_int32_t        devblocksize;
    u_int32_t        mem_alignment_mask;

    /*
     * When we enter this routine, we know
     *  -- the io_req_size will not exceed iov_len
     *  -- the target address is physically contiguous
     */
    cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);

    devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
    mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

    iostate.io_completed = 0;
    iostate.io_issued = 0;
    iostate.io_error = 0;
    iostate.io_wanted = 0;

    lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

next_cwrite:
    io_size = *write_length;

    iov_base = uio_curriovbase(uio);

    upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
    upl_needed_size = upl_offset + io_size;

    pages_in_pl = 0;
    upl_size = (upl_size_t)upl_needed_size;
    upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
        UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

    vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
    kret = vm_map_get_upl(map,
        vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
        &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

    if (kret != KERN_SUCCESS) {
        /*
         * failed to get pagelist
         */
        error = EINVAL;
        goto wait_for_cwrites;
    }
    num_upl++;

    /*
     * Consider the possibility that upl_size wasn't satisfied.
     */
    if (upl_size < upl_needed_size) {
        /*
         * This is a failure in the physical memory case.
         */
        error = EINVAL;
        goto wait_for_cwrites;
    }
    pl = ubc_upl_pageinfo(upl[cur_upl]);

    src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

    while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
        u_int32_t head_size;

        head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

        if (head_size > io_size) {
            head_size = io_size;
        }

        error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);

        if (error) {
            goto wait_for_cwrites;
        }

        upl_offset += head_size;
        src_paddr += head_size;
        io_size -= head_size;

        iov_base += head_size;
    }
    if ((u_int32_t)iov_base & mem_alignment_mask) {
        /*
         * request doesn't set up on a memory boundary
         * the underlying DMA engine can handle...
         * return an error instead of going through
         * the slow copy path since the intent of this
         * path is direct I/O from device memory
         */
        error = EINVAL;
        goto wait_for_cwrites;
    }

    tail_size = io_size & (devblocksize - 1);
    io_size -= tail_size;

    while (io_size && error == 0) {
        if (io_size > MAX_IO_CONTIG_SIZE) {
            xsize = MAX_IO_CONTIG_SIZE;
        } else {
            xsize = io_size;
        }
        /*
         * request asynchronously so that we can overlap
         * the preparation of the next I/O... we'll do
         * the commit after all the I/O has completed
         * since its all issued against the same UPL
         * if there are already too many outstanding writes
         * wait until some have completed before issuing the next
         */
        cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");

        if (iostate.io_error) {
            /*
             * one of the earlier writes we issued ran into a hard error
             * don't issue any more writes...
             * go wait for all writes that are part of this stream
             * to complete before returning the error to the caller
             */
            goto wait_for_cwrites;
        }
        /*
         * issue an asynchronous write to cluster_io
         */
        error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
            xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);

        if (error == 0) {
            /*
             * The cluster_io write completed successfully,
             * update the uio structure
             */
            uio_update(uio, (user_size_t)xsize);

            upl_offset += xsize;
            src_paddr += xsize;
            io_size -= xsize;
        }
    }
    if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
        error = cluster_io_type(uio, write_type, write_length, 0);

        if (error == 0 && *write_type == IO_CONTIG) {
            cur_upl++;
            goto next_cwrite;
        }
    } else {
        *write_type = IO_UNKNOWN;
    }

wait_for_cwrites:
    /*
     * make sure all async writes that are part of this stream
     * have completed before we proceed
     */
    cluster_iostate_wait(&iostate, 0, "cluster_write_contig");

    if (iostate.io_error) {
        error = iostate.io_error;
    }

    lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

    if (error == 0 && tail_size) {
        error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
    }

    for (n = 0; n < num_upl; n++) {
        /*
         * just release our hold on each physically contiguous
         * region without changing any state
         */
        ubc_upl_abort(upl[n], 0);
    }

    return error;
}
/*
 * need to avoid a race between an msync of a range of pages dirtied via mmap
 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
 *
 * we should never force-zero-fill pages that are already valid in the cache...
 * the entire page contains valid data (either from disk, zero-filled or dirtied
 * via an mmap) so we can only do damage by trying to zero-fill
 */
static int
cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
{
    int zero_pg_index;
    boolean_t need_cluster_zero = TRUE;

    if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
        bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
        zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);

        if (upl_valid_page(pl, zero_pg_index)) {
            /*
             * never force zero valid pages - dirty or clean
             * we'll leave these in the UPL for cluster_write_copy to deal with
             */
            need_cluster_zero = FALSE;
        }
    }
    if (need_cluster_zero == TRUE) {
        cluster_zero(upl, io_offset, bytes_to_zero, NULL);
    }

    return bytes_to_zero;
}
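/*
 * Illustrative sketch (not from the original sources): a worked example of the
 * clamping above.  With IO_NOZEROVALID set, a zero_off of 0x1300 (offset 0x300
 * within its page) and a requested bytes_to_zero of 0x2000 gets clamped to
 * PAGE_SIZE - 0x300 = 0xD00, so the zeroing never crosses into the next page;
 * the page is then skipped entirely if upl_valid_page() says it already holds
 * valid data.
 */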
void
cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
{
    struct cl_extent cl;
    boolean_t first_pass = TRUE;

    assert(s_offset < e_offset);
    assert((s_offset & PAGE_MASK_64) == 0);
    assert((e_offset & PAGE_MASK_64) == 0);

    cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
    cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);

    cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
        vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
}
static void
cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
    boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
    struct cl_writebehind *wbp;
    int    cl_index;
    int    ret_cluster_try_push;
    u_int  max_cluster_pgcount;

    max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;

    /*
     * take the lock to protect our accesses
     * of the writebehind and sparse cluster state
     */
    wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

    if (wbp->cl_scmap) {
        if (!(flags & IO_NOCACHE)) {
            /*
             * we've fallen into the sparse
             * cluster method of delaying dirty pages
             */
            sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);

            lck_mtx_unlock(&wbp->cl_lockw);
            return;
        }
        /*
         * must have done cached writes that fell into
         * the sparse cluster mechanism... we've switched
         * to uncached writes on the file, so go ahead
         * and push whatever's in the sparse map
         * and switch back to normal clustering
         */
        wbp->cl_number = 0;

        sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
        /*
         * no clusters of either type present at this point
         * so just go directly to start_new_cluster since
         * we know we need to delay this I/O since we've
         * already released the pages back into the cache
         * to avoid the deadlock with sparse_cluster_push
         */
        goto start_new_cluster;
    }
    if (*first_pass == TRUE) {
        if (write_off == wbp->cl_last_write) {
            wbp->cl_seq_written += write_cnt;
        } else {
            wbp->cl_seq_written = write_cnt;
        }

        wbp->cl_last_write = write_off + write_cnt;

        *first_pass = FALSE;
    }
    if (wbp->cl_number == 0) {
        /*
         * no clusters currently present
         */
        goto start_new_cluster;
    }

    for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
        /*
         * check each cluster that we currently hold
         * try to merge some or all of this write into
         * one or more of the existing clusters... if
         * any portion of the write remains, start a
         * new cluster
         */
        if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
            /*
             * the current write starts at or after the current cluster
             */
            if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
                /*
                 * we have a write that fits entirely
                 * within the existing cluster limits
                 */
                if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
                    /*
                     * update our idea of where the cluster ends
                     */
                    wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
                }
                break;
            }
            if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
                /*
                 * we have a write that starts in the middle of the current cluster
                 * but extends beyond the cluster's limit... we know this because
                 * of the previous checks
                 * we'll extend the current cluster to the max
                 * and update the b_addr for the current write to reflect that
                 * the head of it was absorbed into this cluster...
                 * note that we'll always have a leftover tail in this case since
                 * full absorbtion would have occurred in the clause above
                 */
                wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;

                cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
            }
            /*
             * we come here for the case where the current write starts
             * beyond the limit of the existing cluster or we have a leftover
             * tail after a partial absorbtion
             *
             * in either case, we'll check the remaining clusters before
             * starting a new one
             */
        } else {
            /*
             * the current write starts in front of the cluster we're currently considering
             */
            if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
                /*
                 * we can just merge the new request into
                 * this cluster and leave it in the cache
                 * since the resulting cluster is still
                 * less than the maximum allowable size
                 */
                wbp->cl_clusters[cl_index].b_addr = cl->b_addr;

                if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
                    /*
                     * the current write completely
                     * envelops the existing cluster and since
                     * each write is limited to at most max_cluster_pgcount pages
                     * we can just use the start and last blocknos of the write
                     * to generate the cluster limits
                     */
                    wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
                }
                break;
            }
            /*
             * if we were to combine this write with the current cluster
             * we would exceed the cluster size limit.... so,
             * let's see if there's any overlap of the new I/O with
             * the cluster we're currently considering... in fact, we'll
             * stretch the cluster out to it's full limit and see if we
             * get an intersection with the current write
             */
            if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
                /*
                 * the current write extends into the proposed cluster
                 * clip the length of the current write after first combining it's
                 * tail with the newly shaped cluster
                 */
                wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;

                cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
            }
            /*
             * if we get here, there was no way to merge
             * any portion of this write with this cluster
             * or we could only merge part of it which
             * will leave a tail...
             * we'll check the remaining clusters before starting a new one
             */
        }
    }
    if (cl_index < wbp->cl_number) {
        /*
         * we found an existing cluster(s) that we
         * could entirely merge this I/O into
         */
        goto delay_io;
    }

    if (defer_writes == FALSE &&
        wbp->cl_number == MAX_CLUSTERS &&
        wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
        uint32_t n;

        if (vp->v_mount->mnt_minsaturationbytecount) {
            n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);

            if (n > MAX_CLUSTERS) {
                n = MAX_CLUSTERS;
            }
        } else {
            n = 0;
        }

        if (n == 0) {
            if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
                n = WRITE_BEHIND_SSD;
            } else {
                n = WRITE_BEHIND;
            }
        }
        while (n--) {
            cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
        }
    }
    if (wbp->cl_number < MAX_CLUSTERS) {
        /*
         * we didn't find an existing cluster to
         * merge into, but there's room to start
         * a new one
         */
        goto start_new_cluster;
    }
    /*
     * no exisitng cluster to merge with and no
     * room to start a new one... we'll try
     * pushing one of the existing ones... if none of
     * them are able to be pushed, we'll switch
     * to the sparse cluster mechanism
     * cluster_try_push updates cl_number to the
     * number of remaining clusters... and
     * returns the number of currently unused clusters
     */
    ret_cluster_try_push = 0;

    /*
     * if writes are not deferred, call cluster push immediately
     */
    if (defer_writes == FALSE) {
        ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
    }
    /*
     * execute following regardless of writes being deferred or not
     */
    if (ret_cluster_try_push == 0) {
        /*
         * no more room in the normal cluster mechanism
         * so let's switch to the more expansive but expensive
         * sparse mechanism....
         */
        sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
        sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);

        lck_mtx_unlock(&wbp->cl_lockw);
        return;
    }

start_new_cluster:
    wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
    wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;

    wbp->cl_clusters[wbp->cl_number].io_flags = 0;

    if (flags & IO_NOCACHE) {
        wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
    }

    if (flags & IO_PASSIVE) {
        wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
    }

    wbp->cl_number++;
delay_io:
    lck_mtx_unlock(&wbp->cl_lockw);
}
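/*
 * Illustrative sketch (not from the original sources): a worked example of the
 * merge limits above, assuming MAX_CLUSTER_SIZE(vp) of 1MB and 4KB pages, so
 * max_cluster_pgcount == 256:
 *
 *   existing cluster [b_addr=1000, e_addr=1100)
 *   write [1050, 1200)  -> 1200 <= 1000 + 256, so it fits; the cluster
 *                          grows to [1000, 1200)
 *   write [1200, 1400)  -> 1400 > 1000 + 256, so the cluster is stretched
 *                          to its limit [1000, 1256) and the write's b_addr
 *                          is bumped to 1256, leaving a tail to try against
 *                          the other clusters or to start a new cluster with.
 */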
static int
cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
    off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
    upl_page_info_t *pl;
    upl_t            upl;
    vm_offset_t      upl_offset = 0;
    vm_size_t        upl_size;
    off_t            upl_f_offset;
    int              pages_in_upl;
    int              start_offset;
    int              xfer_resid;
    int              io_size;
    int              io_offset;
    int              read_size;
    int              bytes_to_zero;
    int              bytes_to_move;
    kern_return_t    kret;
    int              retval = 0;
    int              io_resid;
    long long        total_size;
    long long        zero_cnt;
    off_t            zero_off;
    long long        zero_cnt1;
    off_t            zero_off1;
    off_t            write_off = 0;
    int              write_cnt = 0;
    boolean_t        first_pass = FALSE;
    struct cl_extent cl;
    int              bflag;
    u_int32_t        max_io_size;

    if (uio) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
            (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);

        io_resid = io_req_size;
    } else {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
            0, 0, (int)oldEOF, (int)newEOF, 0);

        io_resid = 0;
    }
    if (flags & IO_PASSIVE) {
        bflag = CL_PASSIVE;
    } else {
        bflag = 0;
    }
    if (flags & IO_NOCACHE) {
        bflag |= CL_NOCACHE;
    }

    if (flags & IO_SKIP_ENCRYPTION) {
        bflag |= CL_ENCRYPTED;
    }

    zero_cnt = 0;
    zero_cnt1 = 0;
    zero_off = 0;
    zero_off1 = 0;

    max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

    if (flags & IO_HEADZEROFILL) {
        /*
         * some filesystems (HFS is one) don't support unallocated holes within a file...
         * so we zero fill the intervening space between the old EOF and the offset
         * where the next chunk of real data begins.... ftruncate will also use this
         * routine to zero fill to the new EOF when growing a file... in this case, the
         * uio structure will not be provided
         */
        if (uio) {
            if (headOff < uio->uio_offset) {
                zero_cnt = uio->uio_offset - headOff;
                zero_off = headOff;
            }
        } else if (headOff < newEOF) {
            zero_cnt = newEOF - headOff;
            zero_off = headOff;
        }
    } else {
        if (uio && uio->uio_offset > oldEOF) {
            zero_off = uio->uio_offset & ~PAGE_MASK_64;

            if (zero_off >= oldEOF) {
                zero_cnt = uio->uio_offset - zero_off;

                flags |= IO_HEADZEROFILL;
            }
        }
    }
    if (flags & IO_TAILZEROFILL) {
        if (uio) {
            zero_off1 = uio->uio_offset + io_req_size;

            if (zero_off1 < tailOff) {
                zero_cnt1 = tailOff - zero_off1;
            }
        }
    } else {
        if (uio && newEOF > oldEOF) {
            zero_off1 = uio->uio_offset + io_req_size;

            if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
                zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);

                flags |= IO_TAILZEROFILL;
            }
        }
    }
    if (zero_cnt == 0 && uio == (struct uio *) 0) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
            retval, 0, 0, 0, 0);
        return 0;
    }
    if (uio) {
        write_off = uio->uio_offset;
        write_cnt = (int)uio_resid(uio);
        /*
         * delay updating the sequential write info
         * in the control block until we've obtained
         * the lock for it
         */
        first_pass = TRUE;
    }
    while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
        /*
         * for this iteration of the loop, figure out where our starting point is
         */
        if (zero_cnt) {
            start_offset = (int)(zero_off & PAGE_MASK_64);
            upl_f_offset = zero_off - start_offset;
        } else if (io_resid) {
            start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
            upl_f_offset = uio->uio_offset - start_offset;
        } else {
            start_offset = (int)(zero_off1 & PAGE_MASK_64);
            upl_f_offset = zero_off1 - start_offset;
        }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
            (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);

        if (total_size > max_io_size) {
            total_size = max_io_size;
        }

        cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);

        if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
            /*
             * assumption... total_size <= io_resid
             * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
             */
            if ((start_offset + total_size) > max_io_size) {
                total_size = max_io_size - start_offset;
            }
            xfer_resid = (int)total_size;

            retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);

            if (retval) {
                break;
            }

            io_resid -= (total_size - xfer_resid);
            total_size = xfer_resid;
            start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
            upl_f_offset = uio->uio_offset - start_offset;

            if (total_size == 0) {
                if (start_offset) {
                    /*
                     * the write did not finish on a page boundary
                     * which will leave upl_f_offset pointing to the
                     * beginning of the last page written instead of
                     * the page beyond it... bump it in this case
                     * so that the cluster code records the last page
                     * written as dirty
                     */
                    upl_f_offset += PAGE_SIZE_64;
                }
                upl_size = 0;

                goto check_cluster;
            }
        }
        /*
         * compute the size of the upl needed to encompass
         * the requested write... limit each call to cluster_io
         * to the maximum UPL size... cluster_io will clip if
         * this exceeds the maximum io_size for the device,
         * make sure to account for
         * a starting offset that's not page aligned
         */
        upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (upl_size > max_io_size) {
            upl_size = max_io_size;
        }

        pages_in_upl = (int)(upl_size / PAGE_SIZE);
        io_size = (int)(upl_size - start_offset);

        if ((long long)io_size > total_size) {
            io_size = (int)total_size;
        }

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);


        /*
         * Gather the pages from the buffer cache.
         * The UPL_WILL_MODIFY flag lets the UPL subsystem know
         * that we intend to modify these pages.
         */
        kret = ubc_create_upl_kernel(vp,
            upl_f_offset,
            (int)upl_size,
            &upl,
            &pl,
            UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
            VM_KERN_MEMORY_FILE);
        if (kret != KERN_SUCCESS) {
            panic("cluster_write_copy: failed to get pagelist");
        }

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
            upl, (int)upl_f_offset, start_offset, 0, 0);

        if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
            /*
             * we're starting in the middle of the first page of the upl
             * and the page isn't currently valid, so we're going to have
             * to read it in first... this is a synchronous operation
             */
            read_size = PAGE_SIZE;

            if ((upl_f_offset + read_size) > oldEOF) {
                read_size = (int)(oldEOF - upl_f_offset);
            }

            retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
                CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
            if (retval) {
                /*
                 * we had an error during the read which causes us to abort
                 * the current cluster_write request... before we do, we need
                 * to release the rest of the pages in the upl without modifying
                 * there state and mark the failed page in error
                 */
                ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

                if (upl_size > PAGE_SIZE) {
                    ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size,
                        UPL_ABORT_FREE_ON_EMPTY);
                }

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                    upl, 0, 0, retval, 0);
                break;
            }
        }
        if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
            /*
             * the last offset we're writing to in this upl does not end on a page
             * boundary... if it's not beyond the old EOF, then we'll also need to
             * pre-read this page in if it isn't already valid
             */
            upl_offset = upl_size - PAGE_SIZE;

            if ((upl_f_offset + start_offset + io_size) < oldEOF &&
                !upl_valid_page(pl, (int)(upl_offset / PAGE_SIZE))) {
                read_size = PAGE_SIZE;

                if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
                    read_size = (int)(oldEOF - (upl_f_offset + upl_offset));
                }

                retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
                    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
                if (retval) {
                    /*
                     * we had an error during the read which causes us to abort
                     * the current cluster_write request... before we do, we
                     * need to release the rest of the pages in the upl without
                     * modifying there state and mark the failed page in error
                     */
                    ubc_upl_abort_range(upl, (upl_offset_t)upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

                    if (upl_size > PAGE_SIZE) {
                        ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
                    }

                    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                        upl, 0, 0, retval, 0);
                    break;
                }
            }
        }
        xfer_resid = io_size;
        io_offset = start_offset;

        while (zero_cnt && xfer_resid) {
            if (zero_cnt < (long long)xfer_resid) {
                bytes_to_zero = (int)zero_cnt;
            } else {
                bytes_to_zero = xfer_resid;
            }

            bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);

            xfer_resid -= bytes_to_zero;
            zero_cnt -= bytes_to_zero;
            zero_off += bytes_to_zero;
            io_offset += bytes_to_zero;
        }
        if (xfer_resid && io_resid) {
            u_int32_t io_requested;

            bytes_to_move = min(io_resid, xfer_resid);
            io_requested = bytes_to_move;

            retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);

            if (retval) {
                ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);

                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                    upl, 0, 0, retval, 0);
            } else {
                io_resid -= bytes_to_move;
                xfer_resid -= bytes_to_move;
                io_offset += bytes_to_move;
            }
        }
        while (xfer_resid && zero_cnt1 && retval == 0) {
            if (zero_cnt1 < (long long)xfer_resid) {
                bytes_to_zero = (int)zero_cnt1;
            } else {
                bytes_to_zero = xfer_resid;
            }

            bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);

            xfer_resid -= bytes_to_zero;
            zero_cnt1 -= bytes_to_zero;
            zero_off1 += bytes_to_zero;
            io_offset += bytes_to_zero;
        }
        if (retval == 0) {
            int do_zeroing = 1;

            io_size += start_offset;

            /* Force more restrictive zeroing behavior only on APFS */
            if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
                do_zeroing = 0;
            }

            if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
                /*
                 * if we're extending the file with this write
                 * we'll zero fill the rest of the page so that
                 * if the file gets extended again in such a way as to leave a
                 * hole starting at this EOF, we'll have zero's in the correct spot
                 */
                cluster_zero(upl, io_size, (int)(upl_size - io_size), NULL);
            }
            /*
             * release the upl now if we hold one since...
             * 1) pages in it may be present in the sparse cluster map
             *    and may span 2 separate buckets there... if they do and
             *    we happen to have to flush a bucket to make room and it intersects
             *    this upl, a deadlock may result on page BUSY
             * 2) we're delaying the I/O... from this point forward we're just updating
             *    the cluster state... no need to hold the pages, so commit them
             * 3) IO_SYNC is set...
             *    because we had to ask for a UPL that provides currenty non-present pages, the
             *    UPL has been automatically set to clear the dirty flags (both software and hardware)
             *    upon committing it... this is not the behavior we want since it's possible for
             *    pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
             *    we'll pick these pages back up later with the correct behavior specified.
             * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
             *    of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
             *    we hold since the flushing context is holding the cluster lock.
             */
            ubc_upl_commit_range(upl, 0, (upl_size_t)upl_size,
                UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
check_cluster:
            /*
             * calculate the last logical block number
             * that this delayed I/O encompassed
             */
            cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);

            if (flags & IO_SYNC) {
                /*
                 * if the IO_SYNC flag is set than we need to bypass
                 * any clustering and immediately issue the I/O
                 *
                 * we don't hold the lock at this point
                 *
                 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
                 * so that we correctly deal with a change in state of the hardware modify bit...
                 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
                 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
                 * responsible for generating the correct sized I/O(s)
                 */
                retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
            } else {
                boolean_t defer_writes = FALSE;

                if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
                    defer_writes = TRUE;
                }

                cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
                    write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
            }
        }
    }
    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);

    return retval;
}
int
cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
{
    return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
}
int
cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
{
    int           retval = 0;
    int           flags;
    user_ssize_t  cur_resid;
    u_int32_t     io_size;
    u_int32_t     read_length = 0;
    int           read_type = IO_COPY;

    flags = xflags;

    if (vp->v_flag & VNOCACHE_DATA) {
        flags |= IO_NOCACHE;
    }
    if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
        flags |= IO_RAOFF;
    }

    if (flags & IO_SKIP_ENCRYPTION) {
        flags |= IO_ENCRYPTED;
    }

    /*
     * do a read through the cache if one of the following is true....
     *   NOCACHE is not true
     *   the uio request doesn't target USERSPACE
     * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
     * Reading encrypted data from a CP filesystem should never result in the data touching
     * the UBC.
     *
     * otherwise, find out if we want the direct or contig variant for
     * the first vector in the uio request
     */
    if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
        retval = cluster_io_type(uio, &read_type, &read_length, 0);
    }

    while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
        switch (read_type) {
        case IO_COPY:
            /*
             * make sure the uio_resid isn't too big...
             * internally, we want to handle all of the I/O in
             * chunk sizes that fit in a 32 bit int
             */
            if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
                io_size = MAX_IO_REQUEST_SIZE;
            } else {
                io_size = (u_int32_t)cur_resid;
            }

            retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
            break;

        case IO_DIRECT:
            retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
            break;

        case IO_CONTIG:
            retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
            break;

        case IO_UNKNOWN:
            retval = cluster_io_type(uio, &read_type, &read_length, 0);
            break;
        }
    }
    return retval;
}
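/*
 * Illustrative sketch (hypothetical, not part of the original file): the
 * typical shape of a filesystem read path built on cluster_read().  The helper
 * name and any locking are placeholders; ubc_getsize() is assumed to supply
 * the EOF that cluster_read() uses to clip the transfer.
 */
#if 0
static int
example_fs_read(vnode_t vp, struct uio *uio, int xflags)
{
    off_t filesize = ubc_getsize(vp);

    /* cluster_read copies from the UBC when it can and pages in the rest */
    return cluster_read(vp, uio, filesize, xflags);
}
#endif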
static void
cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
{
    int range;
    int abort_flags = UPL_ABORT_FREE_ON_EMPTY;

    if ((range = last_pg - start_pg)) {
        if (take_reference) {
            abort_flags |= UPL_ABORT_REFERENCE;
        }

        ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
    }
}
3902 cluster_read_copy(vnode_t vp
, struct uio
*uio
, u_int32_t io_req_size
, off_t filesize
, int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
3904 upl_page_info_t
*pl
;
3906 vm_offset_t upl_offset
;
3915 off_t last_ioread_offset
;
3916 off_t last_request_offset
;
3920 u_int32_t size_of_prefetch
;
3923 u_int32_t max_rd_size
;
3924 u_int32_t max_io_size
;
3925 u_int32_t max_prefetch
;
3926 u_int rd_ahead_enabled
= 1;
3927 u_int prefetch_enabled
= 1;
3928 struct cl_readahead
* rap
;
3929 struct clios iostate
;
3930 struct cl_extent extent
;
3932 int take_reference
= 1;
3933 int policy
= IOPOL_DEFAULT
;
3934 boolean_t iolock_inited
= FALSE
;
3936 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_START
,
3937 (int)uio
->uio_offset
, io_req_size
, (int)filesize
, flags
, 0);
3939 if (flags
& IO_ENCRYPTED
) {
3940 panic("encrypted blocks will hit UBC!");
3943 policy
= throttle_get_io_policy(NULL
);
3945 if (policy
== THROTTLE_LEVEL_TIER3
|| policy
== THROTTLE_LEVEL_TIER2
|| (flags
& IO_NOCACHE
)) {
3949 if (flags
& IO_PASSIVE
) {
3955 if (flags
& IO_NOCACHE
) {
3956 bflag
|= CL_NOCACHE
;
3959 if (flags
& IO_SKIP_ENCRYPTION
) {
3960 bflag
|= CL_ENCRYPTED
;
3963 max_io_size
= cluster_max_io_size(vp
->v_mount
, CL_READ
);
3964 max_prefetch
= MAX_PREFETCH(vp
, max_io_size
, disk_conditioner_mount_is_ssd(vp
->v_mount
));
3965 max_rd_size
= max_prefetch
;
3967 last_request_offset
= uio
->uio_offset
+ io_req_size
;
3969 if (last_request_offset
> filesize
) {
3970 last_request_offset
= filesize
;
3973 if ((flags
& (IO_RAOFF
| IO_NOCACHE
)) || ((last_request_offset
& ~PAGE_MASK_64
) == (uio
->uio_offset
& ~PAGE_MASK_64
))) {
3974 rd_ahead_enabled
= 0;
3977 if (cluster_is_throttled(vp
)) {
3979 * we're in the throttle window, at the very least
3980 * we want to limit the size of the I/O we're about
3983 rd_ahead_enabled
= 0;
3984 prefetch_enabled
= 0;
3986 max_rd_size
= THROTTLE_MAX_IOSIZE
;
3988 if ((rap
= cluster_get_rap(vp
)) == NULL
) {
3989 rd_ahead_enabled
= 0;
3991 extent
.b_addr
= uio
->uio_offset
/ PAGE_SIZE_64
;
3992 extent
.e_addr
= (last_request_offset
- 1) / PAGE_SIZE_64
;
3995 if (rap
!= NULL
&& rap
->cl_ralen
&& (rap
->cl_lastr
== extent
.b_addr
|| (rap
->cl_lastr
+ 1) == extent
.b_addr
)) {
3997 * determine if we already have a read-ahead in the pipe courtesy of the
3998 * last read systemcall that was issued...
3999 * if so, pick up it's extent to determine where we should start
4000 * with respect to any read-ahead that might be necessary to
4001 * garner all the data needed to complete this read systemcall
4003 last_ioread_offset
= (rap
->cl_maxra
* PAGE_SIZE_64
) + PAGE_SIZE_64
;
4005 if (last_ioread_offset
< uio
->uio_offset
) {
4006 last_ioread_offset
= (off_t
)0;
4007 } else if (last_ioread_offset
> last_request_offset
) {
4008 last_ioread_offset
= last_request_offset
;
4011 last_ioread_offset
= (off_t
)0;
4014 while (io_req_size
&& uio
->uio_offset
< filesize
&& retval
== 0) {
4015 max_size
= filesize
- uio
->uio_offset
;
4016 bool leftover_upl_aborted
= false;
4018 if ((off_t
)(io_req_size
) < max_size
) {
4019 io_size
= io_req_size
;
4021 io_size
= (u_int32_t
)max_size
;
4024 if (!(flags
& IO_NOCACHE
)) {
4027 u_int32_t io_requested
;
4030 * if we keep finding the pages we need already in the cache, then
4031 * don't bother to call cluster_read_prefetch since it costs CPU cycles
4032 * to determine that we have all the pages we need... once we miss in
4033 * the cache and have issued an I/O, than we'll assume that we're likely
4034 * to continue to miss in the cache and it's to our advantage to try and prefetch
4036 if (last_request_offset
&& last_ioread_offset
&& (size_of_prefetch
= (u_int32_t
)(last_request_offset
- last_ioread_offset
))) {
4037 if ((last_ioread_offset
- uio
->uio_offset
) <= max_rd_size
&& prefetch_enabled
) {
4039 * we've already issued I/O for this request and
4040 * there's still work to do and
4041 * our prefetch stream is running dry, so issue a
4042 * pre-fetch I/O... the I/O latency will overlap
4043 * with the copying of the data
4045 if (size_of_prefetch
> max_rd_size
) {
4046 size_of_prefetch
= max_rd_size
;
4049 size_of_prefetch
= cluster_read_prefetch(vp
, last_ioread_offset
, size_of_prefetch
, filesize
, callback
, callback_arg
, bflag
);
4051 last_ioread_offset
+= (off_t
)(size_of_prefetch
* PAGE_SIZE
);
4053 if (last_ioread_offset
> last_request_offset
) {
4054 last_ioread_offset
= last_request_offset
;
4059 * limit the size of the copy we're about to do so that
4060 * we can notice that our I/O pipe is running dry and
4061 * get the next I/O issued before it does go dry
4063 if (last_ioread_offset
&& io_size
> (max_io_size
/ 4)) {
4064 io_resid
= (max_io_size
/ 4);
4069 io_requested
= io_resid
;
4071 retval
= cluster_copy_ubc_data_internal(vp
, uio
, (int *)&io_resid
, 0, take_reference
);
4073 xsize
= io_requested
- io_resid
;
4076 io_req_size
-= xsize
;
4078 if (retval
|| io_resid
) {
4080 * if we run into a real error or
4081 * a page that is not in the cache
4082 * we need to leave streaming mode
4087 if (rd_ahead_enabled
&& (io_size
== 0 || last_ioread_offset
== last_request_offset
)) {
4089 * we're already finished the I/O for this read request
4090 * let's see if we should do a read-ahead
4092 cluster_read_ahead(vp
, &extent
, filesize
, rap
, callback
, callback_arg
, bflag
);
4100 if (extent
.e_addr
< rap
->cl_lastr
) {
4103 rap
->cl_lastr
= extent
.e_addr
;
4108 * recompute max_size since cluster_copy_ubc_data_internal
4109 * may have advanced uio->uio_offset
4111 max_size
= filesize
- uio
->uio_offset
;
4114 iostate
.io_completed
= 0;
4115 iostate
.io_issued
= 0;
4116 iostate
.io_error
= 0;
4117 iostate
.io_wanted
= 0;
4119 if ((flags & IO_RETURN_ON_THROTTLE)) {
4120 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4121 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
4123 * we're in the throttle window and at least 1 I/O
4124 * has already been issued by a throttleable thread
4125 * in this window, so return with EAGAIN to indicate
4126 * to the FS issuing the cluster_read call that it
4127 * should now throttle after dropping any locks
4129 throttle_info_update_by_mount(vp->v_mount);
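/*
 * illustrative sketch (not part of this file): a filesystem that passes
 * IO_RETURN_ON_THROTTLE is expected to treat the EAGAIN produced here as
 * "drop your locks and wait out the throttle window", roughly:
 *
 *	error = cluster_read_ext(vp, uio, filesize, ioflag | IO_RETURN_ON_THROTTLE, callback, callback_arg);
 *	if (error == EAGAIN) {
 *		fs_unlock(cp);             // hypothetical FS-specific unlock
 *		throttle_lowpri_io(1);     // block until the throttle window passes
 *		goto retry;
 *	}
 */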
4138 * compute the size of the upl needed to encompass
4139 * the requested read... limit each call to cluster_io
4140 * to the maximum UPL size... cluster_io will clip if
4141 * this exceeds the maximum io_size for the device,
4142 * make sure to account for
4143 * a starting offset that's not page aligned
4145 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4146 upl_f_offset = uio->uio_offset - (off_t)start_offset;
4148 if (io_size > max_rd_size) {
4149 io_size = max_rd_size;
4152 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4154 if (flags & IO_NOCACHE) {
4155 if (upl_size > max_io_size) {
4156 upl_size = max_io_size;
4159 if (upl_size > max_io_size / 4) {
4160 upl_size = max_io_size / 4;
4161 upl_size &= ~PAGE_MASK;
4163 if (upl_size == 0) {
4164 upl_size = PAGE_SIZE;
4168 pages_in_upl = upl_size / PAGE_SIZE;
4170 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
4171 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4173 kret = ubc_create_upl_kernel(vp,
4178 UPL_FILE_IO | UPL_SET_LITE,
4179 VM_KERN_MEMORY_FILE);
4180 if (kret != KERN_SUCCESS) {
4181 panic("cluster_read_copy: failed to get pagelist");
4184 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
4185 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4188 * scan from the beginning of the upl looking for the first
4189 * non-valid page.... this will become the first page in
4190 * the request we're going to make to 'cluster_io'... if all
4191 * of the pages are valid, we won't call through to 'cluster_io'
4193 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
4194 if (!upl_valid_page(pl, start_pg)) {
4200 * scan from the starting invalid page looking for a valid
4201 * page before the end of the upl is reached, if we
4202 * find one, then it will be the last page of the request to
4205 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4206 if (upl_valid_page(pl, last_pg)) {
4211 if (start_pg < last_pg) {
4213 * we found a range of 'invalid' pages that must be filled
4214 * if the last page in this range is the last page of the file
4215 * we may have to clip the size of it to keep from reading past
4216 * the end of the last physical block associated with the file
4218 if (iolock_inited == FALSE) {
4219 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4221 iolock_inited = TRUE;
4223 upl_offset = start_pg * PAGE_SIZE;
4224 io_size = (last_pg - start_pg) * PAGE_SIZE;
4226 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
4227 io_size = (u_int32_t)(filesize - (upl_f_offset + upl_offset));
4231 * Find out if this needs verification; we'll have to manage the UPL
4232 * differently if so. Note that this call only lets us know if
4233 * verification is enabled on this mount point, the actual verification
4234 * is performed in the file system.
4236 size_t verify_block_size = 0;
4237 if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, VNODE_VERIFY_DEFAULT, NULL) == 0) /* && verify_block_size */) {
4238 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4239 if (!upl_valid_page(pl, uio_last)) {
4243 if (uio_last < pages_in_upl) {
4245 * there were some invalid pages beyond the valid pages
4246 * that we didn't issue an I/O for, just release them
4247 * unchanged now, so that any prefetch/read-ahead can
4250 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4251 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4252 leftover_upl_aborted = true;
4257 * issue an asynchronous read to cluster_io
4260 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
4261 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
4264 if (extent.e_addr < rap->cl_maxra) {
4266 * we've just issued a read for a block that should have been
4267 * in the cache courtesy of the read-ahead engine... something
4268 * has gone wrong with the pipeline, so reset the read-ahead
4269 * logic which will cause us to restart from scratch
4277 * if the read completed successfully, or there was no I/O request
4278 * issued, then copy the data into user land via 'cluster_upl_copy_data'
4279 * we'll first add on any 'valid'
4280 * pages that were present in the upl when we acquired it.
4284 if (!leftover_upl_aborted) {
4285 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4286 if (!upl_valid_page(pl, uio_last)) {
4290 if (uio_last < pages_in_upl) {
4292 * there were some invalid pages beyond the valid pages
4293 * that we didn't issue an I/O for, just release them
4294 * unchanged now, so that any prefetch/read-ahead can
4297 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4298 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4303 * compute size to transfer this round, if io_req_size is
4304 * still non-zero after this attempt, we'll loop around and
4305 * set up for another I/O.
4307 val_size = (uio_last * PAGE_SIZE) - start_offset;
4309 if (val_size > max_size) {
4310 val_size = (u_int)max_size;
4313 if (val_size > io_req_size) {
4314 val_size = io_req_size;
4317 if ((uio->uio_offset + val_size) > last_ioread_offset) {
4318 last_ioread_offset = uio->uio_offset + val_size;
4321 if ((size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset)) && prefetch_enabled) {
4322 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4324 * if there's still I/O left to do for this request, and...
4325 * we're not in hard throttle mode, and...
4326 * we're close to using up the previous prefetch, then issue a
4327 * new pre-fetch I/O... the I/O latency will overlap
4328 * with the copying of the data
4330 if (size_of_prefetch > max_rd_size) {
4331 size_of_prefetch = max_rd_size;
4334 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4336 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4338 if (last_ioread_offset > last_request_offset) {
4339 last_ioread_offset = last_request_offset;
4342 } else if ((uio->uio_offset + val_size) == last_request_offset) {
4344 * this transfer will finish this request, so...
4345 * let's try to read ahead if we're in
4346 * a sequential access pattern and we haven't
4347 * explicitly disabled it
4349 if (rd_ahead_enabled) {
4350 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4354 if (extent.e_addr < rap->cl_lastr) {
4357 rap->cl_lastr = extent.e_addr;
4360 if (iolock_inited == TRUE) {
4361 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4364 if (iostate.io_error) {
4365 error = iostate.io_error;
4367 u_int32_t io_requested;
4369 io_requested = val_size;
4371 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
4373 io_req_size -= (val_size - io_requested);
4376 if (iolock_inited == TRUE) {
4377 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4380 if (start_pg < last_pg) {
4382 * compute the range of pages that we actually issued an I/O for
4383 * and either commit them as valid if the I/O succeeded
4384 * or abort them if the I/O failed or we're not supposed to
4385 * keep them in the cache
4387 io_size = (last_pg - start_pg) * PAGE_SIZE;
4389 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4391 if (error || (flags & IO_NOCACHE)) {
4392 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4393 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4395 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
4397 if (take_reference) {
4398 commit_flags |= UPL_COMMIT_INACTIVATE;
4400 commit_flags |= UPL_COMMIT_SPECULATE;
4403 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
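/*
 * note on the commit above: when take_reference is set the pages are aged
 * onto the inactive queue (UPL_COMMIT_INACTIVATE); otherwise they are
 * committed as speculative (UPL_COMMIT_SPECULATE), which lets the VM reclaim
 * read-ahead pages cheaply if they are never touched again
 */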
4405 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4407 if ((last_pg - start_pg) < pages_in_upl) {
4409 * the set of pages that we issued an I/O for did not encompass
4410 * the entire upl... so just release these without modifying
4414 if (leftover_upl_aborted) {
4415 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, (uio_last - start_pg) * PAGE_SIZE,
4416 UPL_ABORT_FREE_ON_EMPTY);
4418 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
4421 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
4422 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
4425 * handle any valid pages at the beginning of
4426 * the upl... release these appropriately
4428 cluster_read_upl_release(upl, 0, start_pg, take_reference);
4431 * handle any valid pages immediately after the
4432 * pages we issued I/O for... release these appropriately
4434 cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
4436 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
4444 if (cluster_is_throttled(vp)) {
4446 * we're in the throttle window, at the very least
4447 * we want to limit the size of the I/O we're about
4450 rd_ahead_enabled = 0;
4451 prefetch_enabled = 0;
4452 max_rd_size = THROTTLE_MAX_IOSIZE;
4454 if (max_rd_size == THROTTLE_MAX_IOSIZE) {
4456 * coming out of throttled state
4458 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
4460 rd_ahead_enabled = 1;
4462 prefetch_enabled = 1;
4464 max_rd_size = max_prefetch;
4465 last_ioread_offset = 0;
4470 if (iolock_inited == TRUE) {
4472 * cluster_io returned an error after it
4473 * had already issued some I/O. we need
4474 * to wait for that I/O to complete before
4475 * we can destroy the iostate mutex...
4476 * 'retval' already contains the early error
4477 * so no need to pick it up from iostate.io_error
4479 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4481 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
4484 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4485 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
4487 lck_mtx_unlock(&rap->cl_lockr);
4489 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4490 (int)uio->uio_offset, io_req_size, 0, retval, 0);
4497 * We don't want another read/write lock for every vnode in the system
4498 * so we keep a hash of them here. There should never be very many of
4499 * these around at any point in time.
4501 cl_direct_read_lock_t *
4502 cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4504 struct cl_direct_read_locks *head
4505 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4506 % CL_DIRECT_READ_LOCK_BUCKETS];
4508 struct cl_direct_read_lock *lck, *new_lck = NULL;
4511 lck_spin_lock(&cl_direct_read_spin_lock);
4513 LIST_FOREACH(lck, head, chain) {
4514 if (lck->vp == vp) {
4516 lck_spin_unlock(&cl_direct_read_spin_lock);
4518 // Someone beat us to it, ditch the allocation
4519 lck_rw_destroy(&new_lck->rw_lock, &cl_mtx_grp);
4520 kheap_free(KHEAP_DEFAULT, new_lck, sizeof(cl_direct_read_lock_t));
4522 lck_rw_lock(&lck->rw_lock, type);
4528 // Use the lock we allocated
4529 LIST_INSERT_HEAD(head, new_lck, chain);
4530 lck_spin_unlock(&cl_direct_read_spin_lock);
4531 lck_rw_lock(&new_lck->rw_lock, type);
4535 lck_spin_unlock(&cl_direct_read_spin_lock);
4537 // Allocate a new lock
4538 new_lck = kheap_alloc(KHEAP_DEFAULT, sizeof(cl_direct_read_lock_t),
4540 lck_rw_init(&new_lck->rw_lock, &cl_mtx_grp, LCK_ATTR_NULL);
4542 new_lck->ref_count = 1;
4544 // Got to go round again
4549 cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4551 lck_rw_done(&lck->rw_lock);
4553 lck_spin_lock(&cl_direct_read_spin_lock);
4554 if (lck->ref_count == 1) {
4555 LIST_REMOVE(lck, chain);
4556 lck_spin_unlock(&cl_direct_read_spin_lock);
4557 lck_rw_destroy(&lck->rw_lock, &cl_mtx_grp);
4558 kheap_free(KHEAP_DEFAULT, lck, sizeof(cl_direct_read_lock_t));
4561 lck_spin_unlock(&cl_direct_read_spin_lock);
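/*
 * typical pairing of the two routines above (sketch of the usage that
 * appears below in cluster_read_direct):
 *
 *	cl_direct_read_lock_t *lock;
 *
 *	lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
 *	// ... check the cache with ubc_range_op() and issue the direct I/O ...
 *	cluster_unlock_direct_read(lock);
 */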
4566 cluster_read_direct(vnode_t vp
, struct uio
*uio
, off_t filesize
, int *read_type
, u_int32_t
*read_length
,
4567 int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
4570 upl_page_info_t
*pl
;
4572 vm_offset_t upl_offset
, vector_upl_offset
= 0;
4573 upl_size_t upl_size
, vector_upl_size
= 0;
4574 vm_size_t upl_needed_size
;
4575 unsigned int pages_in_pl
;
4576 upl_control_flags_t upl_flags
;
4579 int force_data_sync
;
4581 int no_zero_fill
= 0;
4584 struct clios iostate
;
4585 user_addr_t iov_base
;
4586 u_int32_t io_req_size
;
4587 u_int32_t offset_in_file
;
4588 u_int32_t offset_in_iovbase
;
4592 u_int32_t devblocksize
;
4593 u_int32_t mem_alignment_mask
;
4594 u_int32_t max_upl_size
;
4595 u_int32_t max_rd_size
;
4596 u_int32_t max_rd_ahead
;
4597 u_int32_t max_vector_size
;
4598 boolean_t io_throttled
= FALSE
;
4600 u_int32_t vector_upl_iosize
= 0;
4601 int issueVectorUPL
= 0, useVectorUPL
= (uio
->uio_iovcnt
> 1);
4602 off_t v_upl_uio_offset
= 0;
4603 int vector_upl_index
= 0;
4604 upl_t vector_upl
= NULL
;
4605 cl_direct_read_lock_t
*lock
= NULL
;
4607 user_addr_t orig_iov_base
= 0;
4608 user_addr_t last_iov_base
= 0;
4609 user_addr_t next_iov_base
= 0;
4611 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT
);
4613 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_START
,
4614 (int)uio
->uio_offset
, (int)filesize
, *read_type
, *read_length
, 0);
4616 max_upl_size
= cluster_max_io_size(vp
->v_mount
, CL_READ
);
4618 max_rd_size
= max_upl_size
;
4619 max_rd_ahead
= max_rd_size
* IO_SCALE(vp
, 2);
4621 io_flag
= CL_COMMIT
| CL_READ
| CL_ASYNC
| CL_NOZERO
| CL_DIRECT_IO
;
4623 if (flags
& IO_PASSIVE
) {
4624 io_flag
|= CL_PASSIVE
;
4627 if (flags
& IO_ENCRYPTED
) {
4628 io_flag
|= CL_RAW_ENCRYPTED
;
4631 if (flags
& IO_NOCACHE
) {
4632 io_flag
|= CL_NOCACHE
;
4635 if (flags
& IO_SKIP_ENCRYPTION
) {
4636 io_flag
|= CL_ENCRYPTED
;
4639 iostate
.io_completed
= 0;
4640 iostate
.io_issued
= 0;
4641 iostate
.io_error
= 0;
4642 iostate
.io_wanted
= 0;
4644 lck_mtx_init(&iostate
.io_mtxp
, &cl_mtx_grp
, LCK_ATTR_NULL
);
4646 devblocksize
= (u_int32_t
)vp
->v_mount
->mnt_devblocksize
;
4647 mem_alignment_mask
= (u_int32_t
)vp
->v_mount
->mnt_alignmentmask
;
4649 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_NONE
,
4650 (int)devblocksize
, (int)mem_alignment_mask
, 0, 0, 0);
4652 if (devblocksize
== 1) {
4654 * the AFP client advertises a devblocksize of 1
4655 * however, its BLOCKMAP routine maps to physical
4656 * blocks that are PAGE_SIZE in size...
4657 * therefore we can't ask for I/Os that aren't page aligned
4658 * or aren't multiples of PAGE_SIZE in size
4659 * by setting devblocksize to PAGE_SIZE, we re-instate
4660 * the old behavior we had before the mem_alignment_mask
4661 * changes went in...
4663 devblocksize
= PAGE_SIZE
;
4666 orig_iov_base
= uio_curriovbase(uio
);
4667 last_iov_base
= orig_iov_base
;
4670 io_req_size
= *read_length
;
4671 iov_base
= uio_curriovbase(uio
);
4673 offset_in_file
= (u_int32_t
)uio
->uio_offset
& (devblocksize
- 1);
4674 offset_in_iovbase
= (u_int32_t
)iov_base
& mem_alignment_mask
;
4676 if (vm_map_page_mask(current_map()) < PAGE_MASK
) {
4679 * Direct I/O might not work as expected from a 16k kernel space
4680 * to a 4k user space because each 4k chunk might point to
4681 * a different 16k physical page...
4682 * Let's go the "misaligned" way.
4685 DEBUG4K_VFS("forcing misaligned\n");
4690 if (offset_in_file
|| offset_in_iovbase
) {
4692 * one of the 2 important offsets is misaligned
4693 * so fire an I/O through the cache for this entire vector
4697 if (iov_base
& (devblocksize
- 1)) {
4699 * the offset in memory must be on a device block boundary
4700 * so that we can guarantee that we can generate an
4701 * I/O that ends on a page boundary in cluster_io
4706 max_io_size
= filesize
- uio
->uio_offset
;
4709 * The user must request IO in aligned chunks. If the
4710 * offset into the file is bad, or the userland pointer
4711 * is non-aligned, then we cannot service the encrypted IO request.
4713 if (flags
& IO_ENCRYPTED
) {
4714 if (misaligned
|| (io_req_size
& (devblocksize
- 1))) {
4718 max_io_size
= roundup(max_io_size
, devblocksize
);
4721 if ((off_t
)io_req_size
> max_io_size
) {
4722 io_req_size
= (u_int32_t
)max_io_size
;
4726 * When we get to this point, we know...
4727 * -- the offset into the file is on a devblocksize boundary
4730 while (io_req_size
&& retval
== 0) {
4733 if (cluster_is_throttled(vp
)) {
4735 * we're in the throttle window, at the very least
4736 * we want to limit the size of the I/O we're about
4739 max_rd_size
= THROTTLE_MAX_IOSIZE
;
4740 max_rd_ahead
= THROTTLE_MAX_IOSIZE
- 1;
4741 max_vector_size
= THROTTLE_MAX_IOSIZE
;
4743 max_rd_size
= max_upl_size
;
4744 max_rd_ahead
= max_rd_size
* IO_SCALE(vp
, 2);
4745 max_vector_size
= MAX_VECTOR_UPL_SIZE
;
4747 io_start
= io_size
= io_req_size
;
4750 * First look for pages already in the cache
4751 * and move them to user space. But only do this
4752 * check if we are not retrieving encrypted data directly
4753 * from the filesystem; those blocks should never
4756 * cluster_copy_ubc_data returns the resid
4759 if ((flags
& IO_ENCRYPTED
) == 0) {
4760 retval
= cluster_copy_ubc_data_internal(vp
, uio
, (int *)&io_size
, 0, 0);
4763 * calculate the number of bytes actually copied
4764 * starting size - residual
4766 xsize
= io_start
- io_size
;
4768 io_req_size
-= xsize
;
4770 if (useVectorUPL
&& (xsize
|| (iov_base
& PAGE_MASK
))) {
4772 * We found something in the cache or we have an iov_base that's not
4775 * Issue all I/O's that have been collected within this Vectored UPL.
4777 if (vector_upl_index
) {
4778 retval
= vector_cluster_io(vp
, vector_upl
, vector_upl_offset
, v_upl_uio_offset
, vector_upl_iosize
, io_flag
, (buf_t
)NULL
, &iostate
, callback
, callback_arg
);
4779 reset_vector_run_state();
4787 * After this point, if we are using the Vector UPL path and the base is
4788 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4793 * check to see if we are finished with this request.
4795 * If we satisfied this IO already, then io_req_size will be 0.
4796 * Otherwise, see if the IO was mis-aligned and needs to go through
4797 * the UBC to deal with the 'tail'.
4800 if (io_req_size
== 0 || (misaligned
)) {
4802 * see if there's another uio vector to
4803 * process that's of type IO_DIRECT
4805 * break out of while loop to get there
4810 * assume the request ends on a device block boundary
4812 io_min
= devblocksize
;
4815 * we can handle I/O's in multiples of the device block size
4816 * however, if io_size isn't a multiple of devblocksize we
4817 * want to clip it back to the nearest page boundary since
4818 * we are going to have to go through cluster_read_copy to
4819 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4820 * multiple, we avoid asking the drive for the same physical
4821 * blocks twice.. once for the partial page at the end of the
4822 * request and a 2nd time for the page we read into the cache
4823 * (which overlaps the end of the direct read) in order to
4824 * get at the overhang bytes
4826 if (io_size
& (devblocksize
- 1)) {
4827 assert(!(flags
& IO_ENCRYPTED
));
4829 * Clip the request to the previous page size boundary
4830 * since request does NOT end on a device block boundary
4832 io_size
&= ~PAGE_MASK
;
4835 if (retval
|| io_size
< io_min
) {
4837 * either an error or we only have the tail left to
4838 * complete via the copy path...
4839 * we may have already spun some portion of this request
4840 * off as async requests... we need to wait for the I/O
4841 * to complete before returning
4843 goto wait_for_dreads
;
4847 * Don't re-check the UBC data if we are looking for uncached IO
4848 * or asking for encrypted blocks.
4850 if ((flags
& IO_ENCRYPTED
) == 0) {
4851 if ((xsize
= io_size
) > max_rd_size
) {
4852 xsize
= max_rd_size
;
4859 * We hold a lock here between the time we check the
4860 * cache and the time we issue I/O. This saves us
4861 * from having to lock the pages in the cache. Not
4862 * all clients will care about this lock but some
4863 * clients may want to guarantee stability between
4864 * here and when the I/O is issued in which case they
4865 * will take the lock exclusively.
4867 lock
= cluster_lock_direct_read(vp
, LCK_RW_TYPE_SHARED
);
4870 ubc_range_op(vp
, uio
->uio_offset
, uio
->uio_offset
+ xsize
, UPL_ROP_ABSENT
, (int *)&io_size
);
4874 * a page must have just come into the cache
4875 * since the first page in this range is no
4876 * longer absent, go back and re-evaluate
4881 if ((flags
& IO_RETURN_ON_THROTTLE
)) {
4882 if (cluster_is_throttled(vp
) == THROTTLE_NOW
) {
4883 if (!cluster_io_present_in_BC(vp
, uio
->uio_offset
)) {
4885 * we're in the throttle window and at least 1 I/O
4886 * has already been issued by a throttleable thread
4887 * in this window, so return with EAGAIN to indicate
4888 * to the FS issuing the cluster_read call that it
4889 * should now throttle after dropping any locks
4891 throttle_info_update_by_mount(vp
->v_mount
);
4893 io_throttled
= TRUE
;
4894 goto wait_for_dreads
;
4898 if (io_size
> max_rd_size
) {
4899 io_size
= max_rd_size
;
4902 iov_base
= uio_curriovbase(uio
);
4904 upl_offset
= (vm_offset_t
)((u_int32_t
)iov_base
& PAGE_MASK
);
4905 upl_needed_size
= (upl_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
4907 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_START
,
4908 (int)upl_offset
, upl_needed_size
, (int)iov_base
, io_size
, 0);
4910 if (upl_offset
== 0 && ((io_size
& PAGE_MASK
) == 0)) {
4916 vm_map_t map
= UIO_SEG_IS_USER_SPACE(uio
->uio_segflg
) ? current_map() : kernel_map
;
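/*
 * the loop below makes up to three attempts to build a UPL in which every
 * page is present; when a page is missing it aborts the UPL and retries
 * with UPL_FORCE_DATA_SYNC set, and after the third failure it gives up
 * and falls through to wait_for_dreads
 */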
4917 for (force_data_sync
= 0; force_data_sync
< 3; force_data_sync
++) {
4919 upl_size
= (upl_size_t
)upl_needed_size
;
4920 upl_flags
= UPL_FILE_IO
| UPL_NO_SYNC
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
4922 upl_flags
|= UPL_NOZEROFILL
;
4924 if (force_data_sync
) {
4925 upl_flags
|= UPL_FORCE_DATA_SYNC
;
4928 kret
= vm_map_create_upl(map
,
4929 (vm_map_offset_t
)(iov_base
& ~((user_addr_t
)PAGE_MASK
)),
4930 &upl_size
, &upl
, NULL
, &pages_in_pl
, &upl_flags
, VM_KERN_MEMORY_FILE
);
4932 if (kret
!= KERN_SUCCESS
) {
4933 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
4934 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
4936 * failed to get pagelist
4938 * we may have already spun some portion of this request
4939 * off as async requests... we need to wait for the I/O
4940 * to complete before returning
4942 goto wait_for_dreads
;
4944 pages_in_pl
= upl_size
/ PAGE_SIZE
;
4945 pl
= UPL_GET_INTERNAL_PAGE_LIST(upl
);
4947 for (i
= 0; i
< pages_in_pl
; i
++) {
4948 if (!upl_page_present(pl
, i
)) {
4952 if (i
== pages_in_pl
) {
4956 ubc_upl_abort(upl
, 0);
4958 if (force_data_sync
>= 3) {
4959 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
4960 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
4962 goto wait_for_dreads
;
4965 * Consider the possibility that upl_size wasn't satisfied.
4967 if (upl_size
< upl_needed_size
) {
4968 if (upl_size
&& upl_offset
== 0) {
4975 ubc_upl_abort(upl
, 0);
4976 goto wait_for_dreads
;
4978 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 72)) | DBG_FUNC_END
,
4979 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
4982 vm_offset_t end_off
= ((iov_base
+ io_size
) & PAGE_MASK
);
4987 * After this point, if we are using a vector UPL, then
4988 * either all the UPL elements end on a page boundary OR
4989 * this UPL is the last element because it does not end
4990 * on a page boundary.
4995 * request asynchronously so that we can overlap
4996 * the preparation of the next I/O
4997 * if there are already too many outstanding reads
4998 * wait until some have completed before issuing the next read
5000 cluster_iostate_wait(&iostate
, max_rd_ahead
, "cluster_read_direct");
5002 if (iostate
.io_error
) {
5004 * one of the earlier reads we issued ran into a hard error
5005 * don't issue any more reads, cleanup the UPL
5006 * that was just created but not used, then
5007 * go wait for any other reads to complete before
5008 * returning the error to the caller
5010 ubc_upl_abort(upl
, 0);
5012 goto wait_for_dreads
;
5014 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 73)) | DBG_FUNC_START
,
5015 upl
, (int)upl_offset
, (int)uio
->uio_offset
, io_size
, 0);
5017 if (!useVectorUPL
) {
5019 io_flag
&= ~CL_PRESERVE
;
5021 io_flag
|= CL_PRESERVE
;
5024 retval
= cluster_io(vp
, upl
, upl_offset
, uio
->uio_offset
, io_size
, io_flag
, (buf_t
)NULL
, &iostate
, callback
, callback_arg
);
5026 if (!vector_upl_index
) {
5027 vector_upl
= vector_upl_create(upl_offset
);
5028 v_upl_uio_offset
= uio
->uio_offset
;
5029 vector_upl_offset
= upl_offset
;
5032 vector_upl_set_subupl(vector_upl
, upl
, upl_size
);
5033 vector_upl_set_iostate(vector_upl
, upl
, vector_upl_size
, upl_size
);
5035 vector_upl_size
+= upl_size
;
5036 vector_upl_iosize
+= io_size
;
5038 if (issueVectorUPL
|| vector_upl_index
== MAX_VECTOR_UPL_ELEMENTS
|| vector_upl_size
>= max_vector_size
) {
5039 retval
= vector_cluster_io(vp
, vector_upl
, vector_upl_offset
, v_upl_uio_offset
, vector_upl_iosize
, io_flag
, (buf_t
)NULL
, &iostate
, callback
, callback_arg
);
5040 reset_vector_run_state();
5043 last_iov_base
= iov_base
+ io_size
;
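/*
 * remember how far into the user buffer this pass reached; the range
 * [orig_iov_base, last_iov_base) is pre-faulted once the I/O is done
 * (see the vm_pre_fault() loop below) for pmap accounting purposes
 */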
5046 // We don't need to wait for the I/O to complete
5047 cluster_unlock_direct_read(lock
);
5052 * update the uio structure
5054 if ((flags
& IO_ENCRYPTED
) && (max_io_size
< io_size
)) {
5055 uio_update(uio
, (user_size_t
)max_io_size
);
5057 uio_update(uio
, (user_size_t
)io_size
);
5060 io_req_size
-= io_size
;
5062 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 73)) | DBG_FUNC_END
,
5063 upl
, (int)uio
->uio_offset
, io_req_size
, retval
, 0);
5066 if (retval
== 0 && iostate
.io_error
== 0 && io_req_size
== 0 && uio
->uio_offset
< filesize
) {
5067 retval
= cluster_io_type(uio
, read_type
, read_length
, 0);
5069 if (retval
== 0 && *read_type
== IO_DIRECT
) {
5070 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_NONE
,
5071 (int)uio
->uio_offset
, (int)filesize
, *read_type
, *read_length
, 0);
5079 if (retval
== 0 && iostate
.io_error
== 0 && useVectorUPL
&& vector_upl_index
) {
5080 retval
= vector_cluster_io(vp
, vector_upl
, vector_upl_offset
, v_upl_uio_offset
, vector_upl_iosize
, io_flag
, (buf_t
)NULL
, &iostate
, callback
, callback_arg
);
5081 reset_vector_run_state();
5084 // We don't need to wait for the I/O to complete
5086 cluster_unlock_direct_read(lock
);
5090 * make sure all async reads that are part of this stream
5091 * have completed before we return
5093 cluster_iostate_wait(&iostate
, 0, "cluster_read_direct");
5095 if (iostate
.io_error
) {
5096 retval
= iostate
.io_error
;
5099 lck_mtx_destroy(&iostate
.io_mtxp
, &cl_mtx_grp
);
5101 if (io_throttled
== TRUE
&& retval
== 0) {
5105 vm_map_offset_t current_page_size
, current_page_mask
;
5106 current_page_size
= vm_map_page_size(current_map());
5107 current_page_mask
= vm_map_page_mask(current_map());
5108 for (next_iov_base
= orig_iov_base
;
5109 next_iov_base
< last_iov_base
;
5110 next_iov_base
+= current_page_size
) {
5112 * This is specifically done for pmap accounting purposes.
5113 * vm_pre_fault() will call vm_fault() to enter the page into
5114 * the pmap if there isn't _a_ physical page for that VA already.
5116 vm_pre_fault(vm_map_trunc_page(next_iov_base
, current_page_mask
), VM_PROT_READ
);
5119 if (io_req_size
&& retval
== 0) {
5121 * we couldn't handle the tail of this request in DIRECT mode
5122 * so fire it through the copy path
5124 if (flags
& IO_ENCRYPTED
) {
5126 * We cannot fall back to the copy path for encrypted I/O. If this
5127 * happens, there is something wrong with the user buffer passed
5132 retval
= cluster_read_copy(vp
, uio
, io_req_size
, filesize
, flags
, callback
, callback_arg
);
5135 *read_type
= IO_UNKNOWN
;
5137 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 70)) | DBG_FUNC_END
,
5138 (int)uio
->uio_offset
, (int)uio_resid(uio
), io_req_size
, retval
, 0);
5145 cluster_read_contig(vnode_t vp
, struct uio
*uio
, off_t filesize
, int *read_type
, u_int32_t
*read_length
,
5146 int (*callback
)(buf_t
, void *), void *callback_arg
, int flags
)
5148 upl_page_info_t
*pl
;
5149 upl_t upl
[MAX_VECTS
];
5150 vm_offset_t upl_offset
;
5151 addr64_t dst_paddr
= 0;
5152 user_addr_t iov_base
;
5154 upl_size_t upl_size
;
5155 vm_size_t upl_needed_size
;
5156 mach_msg_type_number_t pages_in_pl
;
5157 upl_control_flags_t upl_flags
;
5159 struct clios iostate
;
5166 u_int32_t devblocksize
;
5167 u_int32_t mem_alignment_mask
;
5168 u_int32_t tail_size
= 0;
5171 if (flags
& IO_PASSIVE
) {
5177 if (flags
& IO_NOCACHE
) {
5178 bflag
|= CL_NOCACHE
;
5182 * When we enter this routine, we know
5183 * -- the read_length will not exceed the current iov_len
5184 * -- the target address is physically contiguous for read_length
5186 cluster_syncup(vp
, filesize
, callback
, callback_arg
, PUSH_SYNC
);
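/*
 * note: the cluster_syncup() call above synchronously pushes any pending
 * write-behind data for this vnode, so the device-level read that follows
 * observes the current contents of the file
 */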
5188 devblocksize
= (u_int32_t
)vp
->v_mount
->mnt_devblocksize
;
5189 mem_alignment_mask
= (u_int32_t
)vp
->v_mount
->mnt_alignmentmask
;
5191 iostate
.io_completed
= 0;
5192 iostate
.io_issued
= 0;
5193 iostate
.io_error
= 0;
5194 iostate
.io_wanted
= 0;
5196 lck_mtx_init(&iostate
.io_mtxp
, &cl_mtx_grp
, LCK_ATTR_NULL
);
5199 io_size
= *read_length
;
5201 max_size
= filesize
- uio
->uio_offset
;
5203 if (io_size
> max_size
) {
5204 io_size
= (u_int32_t
)max_size
;
5207 iov_base
= uio_curriovbase(uio
);
5209 upl_offset
= (vm_offset_t
)((u_int32_t
)iov_base
& PAGE_MASK
);
5210 upl_needed_size
= upl_offset
+ io_size
;
5213 upl_size
= (upl_size_t
)upl_needed_size
;
5214 upl_flags
= UPL_FILE_IO
| UPL_NO_SYNC
| UPL_CLEAN_IN_PLACE
| UPL_SET_INTERNAL
| UPL_SET_LITE
| UPL_SET_IO_WIRE
;
5217 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 92)) | DBG_FUNC_START
,
5218 (int)upl_offset
, (int)upl_size
, (int)iov_base
, io_size
, 0);
5220 vm_map_t map
= UIO_SEG_IS_USER_SPACE(uio
->uio_segflg
) ? current_map() : kernel_map
;
5221 kret
= vm_map_get_upl(map
,
5222 vm_map_trunc_page(iov_base
, vm_map_page_mask(map
)),
5223 &upl_size
, &upl
[cur_upl
], NULL
, &pages_in_pl
, &upl_flags
, VM_KERN_MEMORY_FILE
, 0);
5225 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 92)) | DBG_FUNC_END
,
5226 (int)upl_offset
, upl_size
, io_size
, kret
, 0);
5228 if (kret
!= KERN_SUCCESS
) {
5230 * failed to get pagelist
5233 goto wait_for_creads
;
5237 if (upl_size
< upl_needed_size
) {
5239 * The upl_size wasn't satisfied.
5242 goto wait_for_creads
;
5244 pl
= ubc_upl_pageinfo(upl
[cur_upl
]);
5246 dst_paddr
= ((addr64_t
)upl_phys_page(pl
, 0) << PAGE_SHIFT
) + (addr64_t
)upl_offset
;
5248 while (((uio
->uio_offset
& (devblocksize
- 1)) || io_size
< devblocksize
) && io_size
) {
5249 u_int32_t head_size
;
5251 head_size
= devblocksize
- (u_int32_t
)(uio
->uio_offset
& (devblocksize
- 1));
5253 if (head_size
> io_size
) {
5254 head_size
= io_size
;
5257 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, head_size
, CL_READ
, callback
, callback_arg
);
5260 goto wait_for_creads
;
5263 upl_offset
+= head_size
;
5264 dst_paddr
+= head_size
;
5265 io_size
-= head_size
;
5267 iov_base
+= head_size
;
5269 if ((u_int32_t
)iov_base
& mem_alignment_mask
) {
5271 * request isn't set up on a memory boundary
5272 * that the underlying DMA engine can handle...
5273 * return an error instead of going through
5274 * the slow copy path since the intent of this
5275 * path is direct I/O to device memory
5278 goto wait_for_creads;
5281 tail_size
= io_size
& (devblocksize
- 1);
5283 io_size
-= tail_size
;
5285 while (io_size
&& error
== 0) {
5286 if (io_size
> MAX_IO_CONTIG_SIZE
) {
5287 xsize
= MAX_IO_CONTIG_SIZE
;
5292 * request asynchronously so that we can overlap
5293 * the preparation of the next I/O... we'll do
5294 * the commit after all the I/O has completed
5295 * since it's all issued against the same UPL
5296 * if there are already too many outstanding reads
5297 * wait until some have completed before issuing the next
5299 cluster_iostate_wait(&iostate
, MAX_IO_CONTIG_SIZE
* IO_SCALE(vp
, 2), "cluster_read_contig");
5301 if (iostate
.io_error
) {
5303 * one of the earlier reads we issued ran into a hard error
5304 * don't issue any more reads...
5305 * go wait for any other reads to complete before
5306 * returning the error to the caller
5308 goto wait_for_creads
;
5310 error
= cluster_io(vp
, upl
[cur_upl
], upl_offset
, uio
->uio_offset
, xsize
,
5311 CL_READ
| CL_NOZERO
| CL_DEV_MEMORY
| CL_ASYNC
| bflag
,
5312 (buf_t
)NULL
, &iostate
, callback
, callback_arg
);
5314 * The cluster_io read was issued successfully,
5315 * update the uio structure
5318 uio_update(uio
, (user_size_t
)xsize
);
5321 upl_offset
+= xsize
;
5325 if (error
== 0 && iostate
.io_error
== 0 && tail_size
== 0 && num_upl
< MAX_VECTS
&& uio
->uio_offset
< filesize
) {
5326 error
= cluster_io_type(uio
, read_type
, read_length
, 0);
5328 if (error
== 0 && *read_type
== IO_CONTIG
) {
5333 *read_type
= IO_UNKNOWN
;
5338 * make sure all async reads that are part of this stream
5339 * have completed before we proceed
5341 cluster_iostate_wait(&iostate
, 0, "cluster_read_contig");
5343 if (iostate
.io_error
) {
5344 error
= iostate
.io_error
;
5347 lck_mtx_destroy(&iostate
.io_mtxp
, &cl_mtx_grp
);
5349 if (error
== 0 && tail_size
) {
5350 error
= cluster_align_phys_io(vp
, uio
, dst_paddr
, tail_size
, CL_READ
, callback
, callback_arg
);
5353 for (n
= 0; n
< num_upl
; n
++) {
5355 * just release our hold on each physically contiguous
5356 * region without changing any state
5358 ubc_upl_abort(upl
[n
], 0);
5366 cluster_io_type(struct uio
*uio
, int *io_type
, u_int32_t
*io_length
, u_int32_t min_length
)
5368 user_size_t iov_len
;
5369 user_addr_t iov_base
= 0;
5371 upl_size_t upl_size
;
5372 upl_control_flags_t upl_flags
;
5376 * skip over any empty vectors
5378 uio_update(uio, (user_size_t)0);
5380 iov_len
= uio_curriovlen(uio
);
5382 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 94)) | DBG_FUNC_START
, uio
, (int)iov_len
, 0, 0, 0);
5385 iov_base = uio_curriovbase(uio);
5387 * make sure the size of the vector isn't too big...
5388 * internally, we want to handle all of the I/O in
5389 * chunk sizes that fit in a 32 bit int
5391 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5392 upl_size = MAX_IO_REQUEST_SIZE;
5394 upl_size = (u_int32_t)iov_len;
5397 upl_flags
= UPL_QUERY_OBJECT_TYPE
;
5399 vm_map_t map
= UIO_SEG_IS_USER_SPACE(uio
->uio_segflg
) ? current_map() : kernel_map
;
5400 if ((vm_map_get_upl(map
,
5401 vm_map_trunc_page(iov_base
, vm_map_page_mask(map
)),
5402 &upl_size
, &upl
, NULL
, NULL
, &upl_flags
, VM_KERN_MEMORY_FILE
, 0)) != KERN_SUCCESS
) {
5404 * the user app must have passed in an invalid address
5408 if (upl_size == 0) {
5412 *io_length = upl_size;
5414 if (upl_flags & UPL_PHYS_CONTIG) {
5415 *io_type = IO_CONTIG;
5416 } else if (iov_len >= min_length) {
5417 *io_type = IO_DIRECT;
5423 * nothing left to do for this uio
5426 *io_type = IO_UNKNOWN;
5428 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5430 if (*io_type == IO_DIRECT &&
5431 vm_map_page_shift(current_map()) < PAGE_SHIFT) {
5432 /* no direct I/O for sub-page-size address spaces */
5433 DEBUG4K_VFS("io_type IO_DIRECT -> IO_COPY\n");
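/*
 * illustrative sketch of how the result is consumed (mirroring the dispatch
 * pattern used by cluster_read_ext earlier in this file):
 *
 *	retval = cluster_io_type(uio, &read_type, &read_length, 0);
 *	switch (read_type) {
 *	case IO_DIRECT:
 *		retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
 *		break;
 *	case IO_CONTIG:
 *		retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
 *		break;
 *	default:
 *		retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
 *		break;
 *	}
 */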
5442 * generate advisory I/O's in the largest chunks possible
5443 * the completed pages will be released into the VM cache
5446 advisory_read(vnode_t vp
, off_t filesize
, off_t f_offset
, int resid
)
5448 return advisory_read_ext(vp
, filesize
, f_offset
, resid
, NULL
, NULL
, CL_PASSIVE
);
5452 advisory_read_ext(vnode_t vp
, off_t filesize
, off_t f_offset
, int resid
, int (*callback
)(buf_t
, void *), void *callback_arg
, int bflag
)
5454 upl_page_info_t
*pl
;
5456 vm_offset_t upl_offset
;
5469 uint32_t max_io_size
;
5472 if (!UBCINFOEXISTS(vp
)) {
5476 if (f_offset
< 0 || resid
< 0) {
5480 max_io_size
= cluster_max_io_size(vp
->v_mount
, CL_READ
);
5482 if (disk_conditioner_mount_is_ssd(vp
->v_mount
)) {
5483 if (max_io_size
> speculative_prefetch_max_iosize
) {
5484 max_io_size
= speculative_prefetch_max_iosize
;
5488 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_START
,
5489 (int)f_offset
, resid
, (int)filesize
, 0, 0);
5491 while (resid
&& f_offset
< filesize
&& retval
== 0) {
5493 * compute the size of the upl needed to encompass
5494 * the requested read... limit each call to cluster_io
5495 * to the maximum UPL size... cluster_io will clip if
5496 * this exceeds the maximum io_size for the device,
5497 * make sure to account for
5498 * a starting offset that's not page aligned
5500 start_offset
= (int)(f_offset
& PAGE_MASK_64
);
5501 upl_f_offset
= f_offset
- (off_t
)start_offset
;
5502 max_size
= filesize
- f_offset
;
5504 if (resid
< max_size
) {
5507 io_size
= (int)max_size
;
5510 upl_size
= (start_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
5511 if ((uint32_t)upl_size
> max_io_size
) {
5512 upl_size
= max_io_size
;
5517 * return the number of contiguously present pages in the cache
5518 * starting at upl_f_offset within the file
5520 ubc_range_op(vp
, upl_f_offset
, upl_f_offset
+ upl_size
, UPL_ROP_PRESENT
, &skip_range
);
5524 * skip over pages already present in the cache
5526 io_size
= skip_range
- start_offset
;
5528 f_offset
+= io_size
;
5531 if (skip_range
== upl_size
) {
5535 * have to issue some real I/O
5536 * at this point, we know it's starting on a page boundary
5537 * because we've skipped over at least the first page in the request
5540 upl_f_offset
+= skip_range
;
5541 upl_size
-= skip_range
;
5543 pages_in_upl
= upl_size
/ PAGE_SIZE
;
5545 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_START
,
5546 upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
5548 kret
= ubc_create_upl_kernel(vp
,
5553 UPL_RET_ONLY_ABSENT
| UPL_SET_LITE
,
5554 VM_KERN_MEMORY_FILE
);
5555 if (kret
!= KERN_SUCCESS
) {
5561 * before we start marching forward, we must make sure we end on
5562 * a present page, otherwise we will be working with a freed
5565 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
5566 if (upl_page_present(pl
, last_pg
)) {
5570 pages_in_upl
= last_pg
+ 1;
5573 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 61)) | DBG_FUNC_END
,
5574 upl
, (int)upl_f_offset
, upl_size
, start_offset
, 0);
5577 for (last_pg
= 0; last_pg
< pages_in_upl
;) {
5579 * scan from the beginning of the upl looking for the first
5580 * page that is present.... this will become the first page in
5581 * the request we're going to make to 'cluster_io'... if all
5582 * of the pages are absent, we won't call through to 'cluster_io'
5584 for (start_pg
= last_pg
; start_pg
< pages_in_upl
; start_pg
++) {
5585 if (upl_page_present(pl
, start_pg
)) {
5591 * scan from the starting present page looking for an absent
5592 * page before the end of the upl is reached, if we
5593 * find one, then it will terminate the range of pages being
5594 * presented to 'cluster_io'
5596 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
5597 if (!upl_page_present(pl
, last_pg
)) {
5602 if (last_pg
> start_pg
) {
5604 * we found a range of pages that must be filled
5605 * if the last page in this range is the last page of the file
5606 * we may have to clip the size of it to keep from reading past
5607 * the end of the last physical block associated with the file
5609 upl_offset
= start_pg
* PAGE_SIZE
;
5610 io_size
= (last_pg
- start_pg
) * PAGE_SIZE
;
5612 if ((off_t
)(upl_f_offset
+ upl_offset
+ io_size
) > filesize
) {
5613 io_size
= (int)(filesize
- (upl_f_offset
+ upl_offset
));
5617 * issue an asynchronous read to cluster_io
5619 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, io_size
,
5620 CL_ASYNC
| CL_READ
| CL_COMMIT
| CL_AGE
| bflag
, (buf_t
)NULL
, (struct clios
*)NULL
, callback
, callback_arg
);
5625 if (issued_io
== 0) {
5626 ubc_upl_abort(upl
, 0);
5629 io_size
= upl_size
- start_offset
;
5631 if (io_size
> resid
) {
5634 f_offset
+= io_size
;
5638 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 60)) | DBG_FUNC_END
,
5639 (int)f_offset
, resid
, retval
, 0, 0);
5646 cluster_push(vnode_t vp
, int flags
)
5648 return cluster_push_ext(vp
, flags
, NULL
, NULL
);
5653 cluster_push_ext(vnode_t vp
, int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
5655 return cluster_push_err(vp
, flags
, callback
, callback_arg
, NULL
);
5658 /* write errors via err, but return the number of clusters written */
5660 cluster_push_err(vnode_t vp
, int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
, int *err
)
5663 int my_sparse_wait
= 0;
5664 struct cl_writebehind
*wbp
;
5671 if (!UBCINFOEXISTS(vp
)) {
5672 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_NONE
, kdebug_vnode(vp
), flags
, 0, -1, 0);
5675 /* return if deferred write is set */
5676 if (((unsigned int)vfs_flags(vp
->v_mount
) & MNT_DEFWRITE
) && (flags
& IO_DEFWRITE
)) {
5679 if ((wbp
= cluster_get_wbp(vp
, CLW_RETURNLOCKED
)) == NULL
) {
5680 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_NONE
, kdebug_vnode(vp
), flags
, 0, -2, 0);
5683 if (!ISSET(flags
, IO_SYNC
) && wbp
->cl_number
== 0 && wbp
->cl_scmap
== NULL
) {
5684 lck_mtx_unlock(&wbp
->cl_lockw
);
5686 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_NONE
, kdebug_vnode(vp
), flags
, 0, -3, 0);
5689 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_START
,
5690 wbp
->cl_scmap
, wbp
->cl_number
, flags
, 0, 0);
5693 * if we have an fsync in progress, we don't want to allow any additional
5694 * sync/fsync/close(s) to occur until it finishes.
5695 * note that it's possible for writes to continue to occur to this file
5696 * while we're waiting and also once the fsync starts to clean if we're
5697 * in the sparse map case
5699 while (wbp
->cl_sparse_wait
) {
5700 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 97)) | DBG_FUNC_START
, kdebug_vnode(vp
), 0, 0, 0, 0);
5702 msleep((caddr_t
)&wbp
->cl_sparse_wait
, &wbp
->cl_lockw
, PRIBIO
+ 1, "cluster_push_ext", NULL
);
5704 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 97)) | DBG_FUNC_END
, kdebug_vnode(vp
), 0, 0, 0, 0);
5706 if (flags
& IO_SYNC
) {
5708 wbp
->cl_sparse_wait
= 1;
5711 * this is an fsync (or equivalent)... we must wait for any existing async
5712 * cleaning operations to complete before we evaluate the current state
5713 * and finish cleaning... this ensures that all writes issued before this
5714 * fsync actually get cleaned to the disk before this fsync returns
5716 while (wbp
->cl_sparse_pushes
) {
5717 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 98)) | DBG_FUNC_START
, kdebug_vnode(vp
), 0, 0, 0, 0);
5719 msleep((caddr_t
)&wbp
->cl_sparse_pushes
, &wbp
->cl_lockw
, PRIBIO
+ 1, "cluster_push_ext", NULL
);
5721 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 98)) | DBG_FUNC_END
, kdebug_vnode(vp
), 0, 0, 0, 0);
5724 if (wbp
->cl_scmap
) {
5727 if (wbp
->cl_sparse_pushes
< SPARSE_PUSH_LIMIT
) {
5728 scmap
= wbp
->cl_scmap
;
5729 wbp
->cl_scmap
= NULL
;
5731 wbp
->cl_sparse_pushes
++;
5733 lck_mtx_unlock(&wbp
->cl_lockw
);
5735 retval
= sparse_cluster_push(wbp
, &scmap
, vp
, ubc_getsize(vp
), PUSH_ALL
, flags
, callback
, callback_arg
, FALSE
);
5737 lck_mtx_lock(&wbp
->cl_lockw
);
5739 wbp
->cl_sparse_pushes
--;
5742 if (wbp
->cl_scmap
!= NULL
) {
5743 panic("cluster_push_err: Expected NULL cl_scmap\n");
5746 wbp
->cl_scmap
= scmap
;
5749 if (wbp
->cl_sparse_wait
&& wbp
->cl_sparse_pushes
== 0) {
5750 wakeup((caddr_t
)&wbp
->cl_sparse_pushes
);
5753 retval
= sparse_cluster_push(wbp
, &(wbp
->cl_scmap
), vp
, ubc_getsize(vp
), PUSH_ALL
, flags
, callback
, callback_arg
, FALSE
);
5763 retval
= cluster_try_push(wbp
, vp
, ubc_getsize(vp
), PUSH_ALL
, flags
, callback
, callback_arg
, &local_err
, FALSE
);
5768 lck_mtx_unlock(&wbp
->cl_lockw
);
5770 if (flags
& IO_SYNC
) {
5771 (void)vnode_waitforwrites(vp
, 0, 0, 0, "cluster_push");
5774 if (my_sparse_wait
) {
5776 * I'm the owner of the serialization token
5777 * clear it and wakeup anyone that is waiting
5780 lck_mtx_lock(&wbp
->cl_lockw
);
5782 wbp
->cl_sparse_wait
= 0;
5783 wakeup((caddr_t
)&wbp
->cl_sparse_wait
);
5785 lck_mtx_unlock(&wbp
->cl_lockw
);
5787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 53)) | DBG_FUNC_END
,
5788 wbp
->cl_scmap
, wbp
->cl_number
, retval
, local_err
, 0);
5794 __private_extern__ void
5795 cluster_release(struct ubc_info *ubc)
5797 struct cl_writebehind *wbp;
5798 struct cl_readahead *rap;
5800 if ((wbp = ubc->cl_wbehind)) {
5801 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
5803 if (wbp->cl_scmap) {
5804 vfs_drt_control(&(wbp->cl_scmap), 0);
5806 lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
5807 zfree(cl_wr_zone, wbp);
5808 ubc->cl_wbehind = NULL;
5810 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
5813 if ((rap = ubc->cl_rahead)) {
5814 lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
5815 zfree(cl_rd_zone, rap);
5816 ubc->cl_rahead = NULL;
5819 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
5824 cluster_try_push(struct cl_writebehind
*wbp
, vnode_t vp
, off_t EOF
, int push_flag
, int io_flags
, int (*callback
)(buf_t
, void *), void *callback_arg
, int *err
, boolean_t vm_initiated
)
5831 struct cl_wextent l_clusters
[MAX_CLUSTERS
];
5832 u_int max_cluster_pgcount
;
5835 max_cluster_pgcount
= MAX_CLUSTER_SIZE(vp
) / PAGE_SIZE
;
5837 * the write behind context exists and has
5838 * already been locked...
5840 if (wbp
->cl_number
== 0) {
5842 * no clusters to push
5843 * return number of empty slots
5845 return MAX_CLUSTERS
;
5849 * make a local 'sorted' copy of the clusters
5850 * and clear wbp->cl_number so that new clusters can
5853 for (cl_index
= 0; cl_index
< wbp
->cl_number
; cl_index
++) {
5854 for (min_index
= -1, cl_index1
= 0; cl_index1
< wbp
->cl_number
; cl_index1
++) {
5855 if (wbp
->cl_clusters
[cl_index1
].b_addr
== wbp
->cl_clusters
[cl_index1
].e_addr
) {
5858 if (min_index
== -1) {
5859 min_index
= cl_index1
;
5860 } else if (wbp
->cl_clusters
[cl_index1
].b_addr
< wbp
->cl_clusters
[min_index
].b_addr
) {
5861 min_index
= cl_index1
;
5864 if (min_index
== -1) {
5868 l_clusters
[cl_index
].b_addr
= wbp
->cl_clusters
[min_index
].b_addr
;
5869 l_clusters
[cl_index
].e_addr
= wbp
->cl_clusters
[min_index
].e_addr
;
5870 l_clusters
[cl_index
].io_flags
= wbp
->cl_clusters
[min_index
].io_flags
;
5872 wbp
->cl_clusters
[min_index
].b_addr
= wbp
->cl_clusters
[min_index
].e_addr
;
5878 /* skip switching to the sparse cluster mechanism if on diskimage */
5879 if (((push_flag
& PUSH_DELAY
) && cl_len
== MAX_CLUSTERS
) &&
5880 !(vp
->v_mount
->mnt_kern_flag
& MNTK_VIRTUALDEV
)) {
5884 * determine if we appear to be writing the file sequentially
5885 * if not, by returning without having pushed any clusters
5886 * we will cause this vnode to be pushed into the sparse cluster mechanism
5887 * used for managing more random I/O patterns
5889 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
5890 * that's why we're in try_push with PUSH_DELAY...
5892 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
5893 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
5894 * so we can just make a simple pass through, up to, but not including the last one...
5895 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
5898 * we let the last one be partial as long as it was adjacent to the previous one...
5899 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
5900 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
5902 for (i
= 0; i
< MAX_CLUSTERS
- 1; i
++) {
5903 if ((l_clusters
[i
].e_addr
- l_clusters
[i
].b_addr
) != max_cluster_pgcount
) {
5906 if (l_clusters
[i
].e_addr
!= l_clusters
[i
+ 1].b_addr
) {
5911 if (vm_initiated
== TRUE
) {
5912 lck_mtx_unlock(&wbp
->cl_lockw
);
5915 for (cl_index
= 0; cl_index
< cl_len
; cl_index
++) {
5917 struct cl_extent cl
;
5920 flags
= io_flags
& (IO_PASSIVE
| IO_CLOSE
);
5923 * try to push each cluster in turn...
5925 if (l_clusters
[cl_index
].io_flags
& CLW_IONOCACHE
) {
5926 flags
|= IO_NOCACHE
;
5929 if (l_clusters
[cl_index
].io_flags
& CLW_IOPASSIVE
) {
5930 flags
|= IO_PASSIVE
;
5933 if (push_flag
& PUSH_SYNC
) {
5937 cl
.b_addr
= l_clusters
[cl_index
].b_addr
;
5938 cl
.e_addr
= l_clusters
[cl_index
].e_addr
;
5940 retval
= cluster_push_now(vp
, &cl
, EOF
, flags
, callback
, callback_arg
, vm_initiated
);
5945 l_clusters
[cl_index
].b_addr
= 0;
5946 l_clusters
[cl_index
].e_addr
= 0;
5947 } else if (error
== 0) {
5951 if (!(push_flag
& PUSH_ALL
)) {
5955 if (vm_initiated
== TRUE
) {
5956 lck_mtx_lock(&wbp
->cl_lockw
);
5964 if (cl_len
> cl_pushed
) {
5966 * we didn't push all of the clusters, so
5967 * let's try to merge them back into the vnode
5969 if ((MAX_CLUSTERS
- wbp
->cl_number
) < (cl_len
- cl_pushed
)) {
5971 * we picked up some new clusters while we were trying to
5972 * push the old ones... this can happen because I've dropped
5973 * the vnode lock... the sum of the
5974 * leftovers plus the new cluster count exceeds our ability
5975 * to represent them, so switch to the sparse cluster mechanism
5977 * collect the active public clusters...
5979 sparse_cluster_switch(wbp
, vp
, EOF
, callback
, callback_arg
, vm_initiated
);
5981 for (cl_index
= 0, cl_index1
= 0; cl_index
< cl_len
; cl_index
++) {
5982 if (l_clusters
[cl_index
].b_addr
== l_clusters
[cl_index
].e_addr
) {
5985 wbp
->cl_clusters
[cl_index1
].b_addr
= l_clusters
[cl_index
].b_addr
;
5986 wbp
->cl_clusters
[cl_index1
].e_addr
= l_clusters
[cl_index
].e_addr
;
5987 wbp
->cl_clusters
[cl_index1
].io_flags
= l_clusters
[cl_index
].io_flags
;
5992 * update the cluster count
5994 wbp
->cl_number
= cl_index1
;
5997 * and collect the original clusters that were moved into the
5998 * local storage for sorting purposes
6000 sparse_cluster_switch(wbp
, vp
, EOF
, callback
, callback_arg
, vm_initiated
);
6003 * we've got room to merge the leftovers back in
6004 * just append them starting at the next 'hole'
6005 * represented by wbp->cl_number
6007 for (cl_index
= 0, cl_index1
= wbp
->cl_number
; cl_index
< cl_len
; cl_index
++) {
6008 if (l_clusters
[cl_index
].b_addr
== l_clusters
[cl_index
].e_addr
) {
6012 wbp
->cl_clusters
[cl_index1
].b_addr
= l_clusters
[cl_index
].b_addr
;
6013 wbp
->cl_clusters
[cl_index1
].e_addr
= l_clusters
[cl_index
].e_addr
;
6014 wbp
->cl_clusters
[cl_index1
].io_flags
= l_clusters
[cl_index
].io_flags
;
6019 * update the cluster count
6021 wbp
->cl_number
= cl_index1
;
6024 return MAX_CLUSTERS
- wbp
->cl_number
;
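/*
 * the value returned above is the number of cluster slots now free in the
 * write-behind context (MAX_CLUSTERS minus the clusters still outstanding),
 * matching the early return of MAX_CLUSTERS when there was nothing to push
 */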
6030 cluster_push_now(vnode_t vp
, struct cl_extent
*cl
, off_t EOF
, int flags
,
6031 int (*callback
)(buf_t
, void *), void *callback_arg
, boolean_t vm_initiated
)
6033 upl_page_info_t
*pl
;
6035 vm_offset_t upl_offset
;
6050 if (flags
& IO_PASSIVE
) {
6056 if (flags
& IO_SKIP_ENCRYPTION
) {
6057 bflag
|= CL_ENCRYPTED
;
6060 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_START
,
6061 (int)cl
->b_addr
, (int)cl
->e_addr
, (int)EOF
, flags
, 0);
6063 if ((pages_in_upl
= (int)(cl
->e_addr
- cl
->b_addr
)) == 0) {
6064 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 0, 0, 0, 0);
6068 upl_size
= pages_in_upl
* PAGE_SIZE
;
6069 upl_f_offset
= (off_t
)(cl
->b_addr
* PAGE_SIZE_64
);
6071 if (upl_f_offset
+ upl_size
>= EOF
) {
6072 if (upl_f_offset
>= EOF
) {
6074 * must have truncated the file and missed
6075 * clearing a dangling cluster (i.e. it's completely
6076 * beyond the new EOF
6078 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 1, 0, 0, 0);
6082 size
= (int)(EOF
- upl_f_offset
);
6084 upl_size
= (size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
6085 pages_in_upl
= upl_size
/ PAGE_SIZE
;
6092 vnode_pageout(vp
, NULL
, (upl_offset_t
)0, upl_f_offset
, (upl_size_t
)upl_size
,
6093 UPL_MSYNC
| UPL_VNODE_PAGER
| UPL_KEEPCACHED
, &error
);
6097 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_START
, upl_size
, size
, 0, 0, 0);
6100 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
6102 * - only pages that are currently dirty are returned... these are the ones we need to clean
6103 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
6104 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
6105 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
6106 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
6108 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
6111 if ((vp
->v_flag
& VNOCACHE_DATA
) || (flags
& IO_NOCACHE
)) {
6112 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
| UPL_WILL_BE_DUMPED
;
6114 upl_flags
= UPL_COPYOUT_FROM
| UPL_RET_ONLY_DIRTY
| UPL_SET_LITE
;
6117 kret
= ubc_create_upl_kernel(vp
,
6123 VM_KERN_MEMORY_FILE
);
6124 if (kret
!= KERN_SUCCESS
) {
6125 panic("cluster_push: failed to get pagelist");
6128 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 41)) | DBG_FUNC_END
, upl
, upl_f_offset
, 0, 0, 0);
6131 * since we only asked for the dirty pages back
6132 * it's possible that we may only get a few or even none, so...
6133 * before we start marching forward, we must make sure we know
6134 * where the last present page is in the UPL, otherwise we could
6135 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
6136 * employed by commit_range and abort_range.
6138 for (last_pg
= pages_in_upl
- 1; last_pg
>= 0; last_pg
--) {
6139 if (upl_page_present(pl
, last_pg
)) {
6143 pages_in_upl
= last_pg
+ 1;
6145 if (pages_in_upl
== 0) {
6146 ubc_upl_abort(upl
, 0);
6148 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 2, 0, 0, 0);
6152 for (last_pg
= 0; last_pg
< pages_in_upl
;) {
6154 * find the next dirty page in the UPL
6155 * this will become the first page in the
6156 * next I/O to generate
6158 for (start_pg
= last_pg
; start_pg
< pages_in_upl
; start_pg
++) {
6159 if (upl_dirty_page(pl
, start_pg
)) {
6162 if (upl_page_present(pl
, start_pg
)) {
6164 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
6165 * just release these unchanged since we're not going
6166 * to steal them or change their state
6168 ubc_upl_abort_range(upl
, start_pg
* PAGE_SIZE
, PAGE_SIZE
, UPL_ABORT_FREE_ON_EMPTY
);
6171 if (start_pg
>= pages_in_upl
) {
6173 * done... no more dirty pages to push
6177 if (start_pg
> last_pg
) {
6179 * skipped over some non-dirty pages
6181 size
-= ((start_pg
- last_pg
) * PAGE_SIZE
);
6185 * find a range of dirty pages to write
6187 for (last_pg
= start_pg
; last_pg
< pages_in_upl
; last_pg
++) {
6188 if (!upl_dirty_page(pl
, last_pg
)) {
6192 upl_offset
= start_pg
* PAGE_SIZE
;
6194 io_size
= min(size
, (last_pg
- start_pg
) * PAGE_SIZE
);
6196 io_flags
= CL_THROTTLE
| CL_COMMIT
| CL_AGE
| bflag
;
6198 if (!(flags
& IO_SYNC
)) {
6199 io_flags
|= CL_ASYNC
;
6202 if (flags
& IO_CLOSE
) {
6203 io_flags
|= CL_CLOSE
;
6206 if (flags
& IO_NOCACHE
) {
6207 io_flags
|= CL_NOCACHE
;
6210 retval
= cluster_io(vp
, upl
, upl_offset
, upl_f_offset
+ upl_offset
, io_size
,
6211 io_flags
, (buf_t
)NULL
, (struct clios
*)NULL
, callback
, callback_arg
);
6213 if (error
== 0 && retval
) {
6219 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 51)) | DBG_FUNC_END
, 1, 3, error
, 0, 0);
6226 * sparse_cluster_switch is called with the write behind lock held
6229 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6234 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
6236 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6238 struct cl_extent cl;
6240 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
6241 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
6242 if (flags & UPL_POP_DIRTY) {
6243 cl.e_addr = cl.b_addr + 1;
6245 error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
6254 wbp->cl_number -= cl_index;
6256 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
/*
 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
 * still associated with the write-behind context... however, if the scmap has been disassociated
 * from the write-behind context (the cluster_push case), the wb lock is not held
 */
static int
sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
    int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	struct cl_extent cl;
	off_t   offset;
	u_int   length;
	void    *l_scmap;
	int     error = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);

	if (push_flag & PUSH_ALL) {
		vfs_drt_control(scmap, 1);
	}

	l_scmap = *scmap;

	for (;;) {
		int retval;

		if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
			break;
		}

		if (vm_initiated == TRUE) {
			lck_mtx_unlock(&wbp->cl_lockw);
		}

		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);

		retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
		if (error == 0 && retval) {
			error = retval;
		}

		if (vm_initiated == TRUE) {
			lck_mtx_lock(&wbp->cl_lockw);

			if (*scmap != l_scmap) {
				break;
			}
		}

		if (error) {
			if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
				panic("Failed to restore dirty state on failure\n");
			}
			break;
		}

		if (!(push_flag & PUSH_ALL)) {
			break;
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);

	return error;
}
/*
 * sparse_cluster_add is called with the write behind lock held
 */
static int
sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	u_int   new_dirty;
	u_int   length;
	off_t   offset;
	int     error = 0;
	int     push_flag = 0; /* Is this a valid value? */

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);

	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;

	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */

		if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
			push_flag = 0;
		}

		error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);

		if (error) {
			break;
		}

		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);

	return error;
}
static int
cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t  *pl;
	upl_t            upl;
	addr64_t         ubc_paddr;
	kern_return_t    kret;
	int              error = 0;
	int              did_read = 0;
	int              abort_flags;
	int              upl_flags;
	int              bflag;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	upl_flags = UPL_SET_LITE;

	if (!(flags & CL_READ)) {
		/*
		 * "write" operation:  let the UPL subsystem know
		 * that we intend to modify the buffer cache pages
		 * we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	} else {
		/*
		 * indicate that there is no need to pull the
		 * mapping for this page... we're only going
		 * to read from it, not modify it.
		 */
		upl_flags |= UPL_FILE_IO;
	}
	kret = ubc_create_upl_kernel(vp,
	    uio->uio_offset & ~PAGE_MASK_64,
	    PAGE_SIZE,
	    &upl,
	    &pl,
	    upl_flags,
	    VM_KERN_MEMORY_FILE);

	if (kret != KERN_SUCCESS) {
		return EINVAL;
	}

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
		    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
		if (error) {
			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

			return error;
		}
		did_read = 1;
	}
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

	/*
	 *	NOTE:  There is no prototype for the following in BSD. It, and the definitions
	 *	of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	 *	osfmk/ppc/mappings.h.  They are not included here because there appears to be no
	 *	way to do so without exporting them to kexts as well.
	 */
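	/*
	 * Editor's note (derived from the commented-out calls kept below): the
	 * literal flag values used here correspond to cppvPsnk == 1,
	 * cppvPsrc == 2, cppvFsnk == 4 and cppvFsrc == 8.
	 */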
	if (flags & CL_READ) {
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4);         /* Copy physical to physical and flush the destination */
	} else {
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8);         /* Copy physical to physical and flush the source */
	}

	if (!(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
		    bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	}
	if (error == 0) {
		uio_update(uio, (user_size_t)xsize);
	}

	if (did_read) {
		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	} else {
		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
	}

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);

	return error;
}
int
cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
{
	int     pg_offset;
	int     pg_index;
	int     csize;
	int     segflg;
	int     retval = 0;
	int     xsize;
	int     dirty_count = 0;
	upl_page_info_t *pl;

	xsize = *io_resid;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
	    (int)uio->uio_offset, upl_offset, xsize, 0, 0);

	segflg = uio->uio_segflg;

	switch (segflg) {
	case UIO_USERSPACE32:
	case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	case UIO_USERSPACE:
	case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	case UIO_USERSPACE64:
	case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}
	pl = ubc_upl_pageinfo(upl);

	pg_index  = upl_offset / PAGE_SIZE;
	pg_offset = upl_offset & PAGE_MASK;
	csize     = min(PAGE_SIZE - pg_offset, xsize);

	while (xsize && retval == 0) {
		addr64_t  paddr;

		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
		if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
			dirty_count++;
		}

		retval = uiomove64(paddr, csize, uio);

		pg_index += 1;
		pg_offset = 0;
		xsize    -= csize;
		csize     = min(PAGE_SIZE, xsize);
	}
	*io_resid = xsize;

	uio->uio_segflg = segflg;

	task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
	    (int)uio->uio_offset, xsize, retval, segflg, 0);

	return retval;
}
int
cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
{
	return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
}


static int
cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
{
	int     segflg;
	int     io_size;
	int     xsize;
	int     start_offset;
	int     retval = 0;
	memory_object_control_t control;

	io_size = *io_resid;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
	    (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);

	control = ubc_getobject(vp, UBC_FLAGS_NONE);

	if (control == MEMORY_OBJECT_CONTROL_NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		    (int)uio->uio_offset, io_size, retval, 3, 0);

		return 0;
	}
	segflg = uio->uio_segflg;

	switch (segflg) {
	case UIO_USERSPACE32:
	case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	case UIO_USERSPACE64:
	case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	case UIO_USERSPACE:
	case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}

	if ((io_size = *io_resid)) {
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		xsize = (int)uio_resid(uio);

		retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
		    start_offset, io_size, mark_dirty, take_reference);
		xsize -= uio_resid(uio);
		io_size -= xsize;
	}
	uio->uio_segflg = segflg;
	*io_resid = io_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
	    (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);

	return retval;
}
int
is_file_clean(vnode_t vp, off_t filesize)
{
	off_t   f_offset;
	int     flags;
	int     total_dirty = 0;

	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
		if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
			if (flags & UPL_POP_DIRTY) {
				total_dirty++;
			}
		}
	}
	if (total_dirty) {
		return EINVAL;
	}

	return 0;
}
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */

/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES             ((1024 * 256) / PAGE_SIZE)

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK                (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
#define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)
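/*
 * Illustrative note (not part of the original source): with a 4 KiB page size
 * DRT_BITVECTOR_PAGES is (1024 * 256) / 4096 = 64, and with 16 KiB pages it is
 * 16, so in either case a hashtable entry covers a 256 KiB aligned window of
 * the file.  DRT_ALIGN_ADDRESS() rounds a byte offset down to the start of its
 * window, e.g. DRT_ALIGN_ADDRESS(0x48000) == 0x40000.
 */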
/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space and improve the efficiency of the hash table.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)                                                                 \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control =                                                 \
	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
	} while (0)
#define DRT_HASH_COUNT_MASK             0x1ff
#define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)                                                                   \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control =                                                 \
	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
	} while (0)
#define DRT_HASH_CLEAR(scm, i)                                                                          \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control = 0;                                              \
	} while (0)
#define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)                                                                 \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;        \
	        DRT_BITVECTOR_COPY(oscm, oi, scm, i);                                                   \
	} while (0)
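/*
 * Illustrative note (not part of the original source): dhe_control packs both
 * fields into a single u_int64_t.  Entry addresses are aligned to
 * DRT_BITVECTOR_PAGES * PAGE_SIZE (256 KiB), so the low-order bits of the
 * address are always zero; the low 9 bits (DRT_HASH_COUNT_MASK == 0x1ff) are
 * therefore free to hold the page count 0..DRT_BITVECTOR_PAGES, with the
 * all-ones value 0x1ff reserved as the "vacant bucket" sentinel used by
 * DRT_HASH_VACATE()/DRT_HASH_VACANT().
 */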
#if !defined(XNU_TARGET_OS_OSX)
/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
 *
 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
 * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
 */

#define DRT_HASH_SMALL_MODULUS  251
#define DRT_HASH_LARGE_MODULUS  2039
#define DRT_HASH_XLARGE_MODULUS 8179

/*
 * Physical memory required before the large hash modulus is permitted.
 *
 * On small memory systems, the large hash modulus can lead to physical
 * memory starvation, so we avoid using it there.
 */
#define DRT_HASH_LARGE_MEMORY_REQUIRED  (1024LL * 1024LL * 1024LL)      /* 1GiB */
#define DRT_HASH_XLARGE_MEMORY_REQUIRED (8 * 1024LL * 1024LL * 1024LL)  /* 8GiB */

#define DRT_SMALL_ALLOCATION    4096    /* 80 bytes spare */
#define DRT_LARGE_ALLOCATION    32768   /* 144 bytes spare */
#define DRT_XLARGE_ALLOCATION   131072  /* 208 bytes spare */

#else /* XNU_TARGET_OS_OSX */
/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
 *
 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
 * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
 */

#define DRT_HASH_SMALL_MODULUS  1019
#define DRT_HASH_LARGE_MODULUS  8179
#define DRT_HASH_XLARGE_MODULUS 32749

/*
 * Physical memory required before the large hash modulus is permitted.
 *
 * On small memory systems, the large hash modulus can lead to physical
 * memory starvation, so we avoid using it there.
 */
#define DRT_HASH_LARGE_MEMORY_REQUIRED  (4 * 1024LL * 1024LL * 1024LL)  /* 4GiB */
#define DRT_HASH_XLARGE_MEMORY_REQUIRED (32 * 1024LL * 1024LL * 1024LL) /* 32GiB */

#define DRT_SMALL_ALLOCATION    16384   /* 80 bytes spare */
#define DRT_LARGE_ALLOCATION    131072  /* 208 bytes spare */
#define DRT_XLARGE_ALLOCATION   524288  /* 304 bytes spare */

#endif /* ! XNU_TARGET_OS_OSX */
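/*
 * Illustrative check (not part of the original source): each hashtable entry
 * is 16 bytes (an 8-byte dhe_control plus an 8-byte bitvector), so the moduli
 * above are the largest primes whose table still fits in the allocation with
 * room left for the struct vfs_drt_clustermap header.  For the macOS values:
 *
 *	 1019 * 16 =  16304,  16384 -  16304 =  80 bytes spare
 *	 8179 * 16 = 130864, 131072 - 130864 = 208 bytes spare
 *	32749 * 16 = 523984, 524288 - 523984 = 304 bytes spare
 */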
/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable entry.
 */
struct vfs_drt_hashentry {
	u_int64_t       dhe_control;
/*
 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
 * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
 * Since PAGE_SIZE is only known at boot time,
 *	-define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
 *	-declare dhe_bitvector array for largest possible length
 */
#define MAX_DRT_BITVECTOR_PAGES (1024 * 256) / (4 * 1024)
	u_int32_t       dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
};

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */

#define DRT_HASH_SET_BIT(scm, i, bit)                           \
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)                         \
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)                          \
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)                             \
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)                    \
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],    \
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],        \
	    (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
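/*
 * Illustrative note (not part of the original source): page 'bit' of an entry
 * lives in 32-bit word (bit / 32) at bit position (bit % 32); for example,
 * page 37 is bit 5 of dhe_bitvector[1].
 */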
/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t               scm_magic;      /* sanity/detection */
#define DRT_SCM_MAGIC           0x12020003
	u_int32_t               scm_modulus;    /* current ring size */
	u_int32_t               scm_buckets;    /* number of occupied buckets */
	u_int32_t               scm_lastclean;  /* last entry we cleaned */
	u_int32_t               scm_iskips;     /* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};


#define DRT_HASH(scm, addr)             ((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)        (((addr) + 1) % (scm)->scm_modulus)
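/*
 * Illustrative note (not part of the original source): lookups hash the
 * aligned byte offset directly, i.e. index = DRT_HASH(scm, DRT_ALIGN_ADDRESS(addr)),
 * and resolve collisions by stepping around the ring with DRT_HASH_NEXT()
 * until a matching or vacant bucket is found (see vfs_drt_search_index()).
 */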
/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
                                                            * 0, setcount */
                                                           /* 1 (clean, no map) */
                                                           /* 2 (map alloc fail) */
                                                           /* 3, resid (partial) */
#define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
                                                            * lastclean, iskips */


static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
    u_int64_t offset, int *indexp);
static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
    u_int64_t offset, int *indexp, int recursed);
static kern_return_t vfs_drt_do_mark_pages(
	void            **cmapp,
	u_int64_t       offset,
	u_int           length,
	u_int           *setcountp,
	int             dirty);
static void vfs_drt_trace(
	struct vfs_drt_clustermap *cmap,
	int code,
	int arg1,
	int arg2,
	int arg3,
	int arg4);
/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
	kern_return_t   kret = KERN_SUCCESS;
	u_int64_t       offset = 0;
	u_int32_t       i = 0;
	int             modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;

	if (cmapp != NULL) {
		ocmap = *cmapp;
	}

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		modulus_size = DRT_HASH_SMALL_MODULUS;
		map_size = DRT_SMALL_ALLOCATION;
	} else {
		/* count the number of active buckets in the old map */
		active_buckets = 0;
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
				active_buckets++;
			}
		}
		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/*
			 * If the ring is nearly full and we are allowed to
			 * use the large modulus, upgrade.
			 */
			if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
				modulus_size = DRT_HASH_LARGE_MODULUS;
				map_size = DRT_LARGE_ALLOCATION;
			} else {
				modulus_size = DRT_HASH_SMALL_MODULUS;
				map_size = DRT_SMALL_ALLOCATION;
			}
		} else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
			if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
				modulus_size = DRT_HASH_XLARGE_MODULUS;
				map_size = DRT_XLARGE_ALLOCATION;
			} else {
				/*
				 * If the ring is completely full and we can't
				 * expand, there's nothing useful for us to do.
				 * Behave as though we had compacted into the new
				 * array and return.
				 */
				return KERN_SUCCESS;
			}
		} else {
			/* already using the xlarge modulus */
			modulus_size = DRT_HASH_XLARGE_MODULUS;
			map_size = DRT_XLARGE_ALLOCATION;

			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * array and return.
			 */
			if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
				return KERN_SUCCESS;
			}
		}
	}

	/*
	 * Allocate and initialise the new map.
	 */

	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size, VM_KERN_MEMORY_FILE);
	if (kret != KERN_SUCCESS) {
		return kret;
	}
	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = modulus_size;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;
	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}

	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	copycount = 0;
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
				continue;
			}
			/* get new index */
			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
			}
			/* copy */
			DRT_HASH_COPY(ocmap, i, cmap, index);
			copycount++;
		}
	}

	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	*cmapp = cmap;

	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
		    ocmap->scm_modulus,
		    ocmap->scm_buckets,
		    ocmap->scm_lastclean,
		    ocmap->scm_iskips);

		vfs_drt_free_map(ocmap);
	}
	return KERN_SUCCESS;
}
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
	vm_size_t map_size = 0;

	if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
		map_size = DRT_SMALL_ALLOCATION;
	} else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
		map_size = DRT_LARGE_ALLOCATION;
	} else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
		map_size = DRT_XLARGE_ALLOCATION;
	} else {
		panic("vfs_drt_free_map: Invalid modulus %d\n", cmap->scm_modulus);
	}

	kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
	return KERN_SUCCESS;
}


/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
	int             index;
	u_int32_t       i;

	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* traverse the hashtable */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/*
		 * If the slot is vacant, we can stop.
		 */
		if (DRT_HASH_VACANT(cmap, index)) {
			break;
		}

		/*
		 * If the address matches our offset, we have success.
		 */
		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
			*indexp = index;
			return KERN_SUCCESS;
		}

		/*
		 * Move to the next slot, try again.
		 */
		index = DRT_HASH_NEXT(cmap, index);
	}
	/*
	 * It's not there.
	 */
	return KERN_FAILURE;
}
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t   kret;
	int             index;
	u_int32_t       i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS) {
		return kret;
	}

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/* slot vacant? */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean) {
				cmap->scm_lastclean = index;
			}
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return KERN_SUCCESS;
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed) {
		return KERN_FAILURE;
	}
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return kret;
}
/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void            **private,
	u_int64_t       offset,
	u_int           length,
	u_int           *setcountp,
	int             dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t   kret;
	int             i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL) {
		*setcountp = 0;
	}

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return KERN_SUCCESS;
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return kret;
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;  /* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL) {
				*setcountp = setcount;
			}
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return kret;
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (int)((offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE);
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					if (ecount >= DRT_BITVECTOR_PAGES) {
						panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
					}
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					if (ecount <= 0) {
						panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
					}
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL) {
		*setcountp = setcount;
	}

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return KERN_SUCCESS;
}
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
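/*
 * Usage sketch (illustrative, not part of the original source): callers such
 * as sparse_cluster_add() hand in page-aligned byte ranges and let the map be
 * allocated and grown on demand, e.g.
 *
 *	void  *scmap = NULL;
 *	u_int  new_dirty;
 *
 *	if (vfs_drt_mark_pages(&scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
 *		// map is full: push some clusters out, then retry the remainder
 *	}
 */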
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
	/* XXX size unused, drop from interface */
	return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
}

static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
}
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
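/*
 * Usage sketch (illustrative, not part of the original source): callers such
 * as sparse_cluster_push() drain the map by looping until this routine fails,
 * at which point the map has been freed and the caller's pointer cleared, e.g.
 *
 *	off_t  offset;
 *	u_int  length;
 *
 *	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS) {
 *		// write out [offset, offset + length) via cluster_push_now()
 *	}
 */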
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t       offset;
	u_int           length;
	u_int32_t       j;
	int             index, i, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL)) {
		return KERN_FAILURE;
	}
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
			continue;
		}

		/* scan the bitfield for a string of bits */
		fs = 0;

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (i == DRT_BITVECTOR_PAGES) {
			/*  didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
			    cmap, index, DRT_HASH_GET_COUNT(cmap, index));
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
				break;
			}
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return KERN_SUCCESS;
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * then free it
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	    cmap->scm_modulus,
	    cmap->scm_buckets,
	    cmap->scm_lastclean,
	    cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return KERN_FAILURE;
}
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL)) {
		return KERN_FAILURE;
	}
	cmap = *cmapp;

	switch (op_type) {
	case 0:
		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		    cmap->scm_modulus,
		    cmap->scm_buckets,
		    cmap->scm_lastclean,
		    cmap->scm_iskips);

		vfs_drt_free_map(cmap);
		*cmapp = NULL;
		break;

	case 1:
		cmap->scm_lastclean = 0;
		break;
	}
	return KERN_SUCCESS;
}


/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
    __unused int arg1, __unused int arg2, __unused int arg3,
    __unused int arg4)
{
}
#endif
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index)) {
			continue;
		}

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				bits_on++;
			}
		}
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
			panic("bits_on = %d,  index = %d\n", bits_on, index);
		}
	}
}


/*
 * Internal interface only.
 */
static kern_return_t
vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
		return KERN_FAILURE;
	}
	cmap = *cmapp;

	if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
		/*
		 * If we have a full xlarge sparse cluster,
		 * we push it out all at once so the cluster
		 * map can be available to absorb more I/Os.
		 * This is done on large memory configs so
		 * the small I/Os don't interfere with the
		 * pro workloads.
		 */
		*push_flag = PUSH_ALL;
	}
	return KERN_SUCCESS;
}