/*
 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>

#include <kern/task.h>
#include <kern/policy_internal.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_fault.h>

#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>

#include <vfs/vfs_disk_conditioner.h>
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT

#define CL_READ		0x01
#define CL_WRITE	0x02
#define CL_ASYNC	0x04
#define CL_COMMIT	0x08
#define CL_PAGEOUT	0x10
#define CL_AGE		0x20
#define CL_NOZERO	0x40
#define CL_PAGEIN	0x80
#define CL_DEV_MEMORY	0x100
#define CL_PRESERVE	0x200
#define CL_THROTTLE	0x400
#define CL_KEEPCACHED	0x800
#define CL_DIRECT_IO	0x1000
#define CL_PASSIVE	0x2000
#define CL_IOSTREAMING	0x4000
#define CL_CLOSE	0x8000
#define CL_ENCRYPTED	0x10000
#define CL_RAW_ENCRYPTED	0x20000
#define CL_NOCACHE	0x40000

#define MAX_VECTOR_UPL_ELEMENTS	8
#define MAX_VECTOR_UPL_SIZE	(2 * MAX_UPL_SIZE_BYTES)

#define CLUSTER_IO_WAITING		((buf_t)1)
extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
extern void vector_upl_set_pagelist(upl_t);
extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);

struct clios {
	lck_mtx_t io_mtxp;
	u_int  io_completed;       /* amount of io that has currently completed */
	u_int  io_issued;          /* amount of io that was successfully issued */
	int    io_error;           /* error code of first error encountered */
	int    io_wanted;          /* someone is sleeping waiting for a change in state */
};

struct cl_direct_read_lock {
	LIST_ENTRY(cl_direct_read_lock)		chain;
};

#define CL_DIRECT_READ_LOCK_BUCKETS 61

static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
	cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];

static lck_spin_t cl_direct_read_spin_lock;

static lck_grp_t	*cl_mtx_grp;
static lck_attr_t	*cl_mtx_attr;
static lck_grp_attr_t	*cl_mtx_grp_attr;
static lck_mtx_t	*cl_transaction_mtxp;
#define	PUSH_DELAY	0x01
#define PUSH_ALL	0x02
#define PUSH_SYNC	0x04


static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
		      int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
static int cluster_is_throttled(vnode_t vp);

static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);

static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
			     int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int (*)(buf_t, void *), void *callback_arg, int flags);

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
			      off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
				int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
				int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void	cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int	cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg, int *err);

static void	sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
static int	sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg);
static void	sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);
/*
 * For throttled IO to check whether
 * a block is cached by the boot cache
 * and thus it can avoid delaying the IO.
 *
 * bootcache_contains_block is initially
 * NULL. The BootCache will set it while
 * the cache is active and clear it when
 * the cache is jettisoned.
 *
 * Returns 0 if the block is not
 * contained in the cache, 1 if it is
 * contained in the cache.
 *
 * The function pointer remains valid
 * after the cache has been evicted even
 * if bootcache_contains_block has been
 * cleared.
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE	(1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE	MAX_UPL_SIZE_BYTES

/*
 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
 * allowing the caller to bypass the buffer cache.  For small I/Os (less than 16k),
 * we have not historically allowed the write to bypass the UBC.
 */
#define MIN_DIRECT_WRITE_SIZE	(16384)

#define WRITE_THROTTLE		6
#define WRITE_THROTTLE_SSD	2
#define WRITE_BEHIND		1
#define WRITE_BEHIND_SSD	1

#if CONFIG_EMBEDDED
#define PREFETCH		1
#define PREFETCH_SSD		1
uint32_t speculative_prefetch_max = (2048 * 1024);		/* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use in a speculative read-ahead */
#else
#define PREFETCH		3
#define PREFETCH_SSD		2
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);	/* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use in a speculative read-ahead on SSDs */
#endif

#define IO_SCALE(vp, base)		(vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)		(cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)	(size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))
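/*
 * Illustrative worked example (editor's sketch, not part of the original
 * source; assumes the non-embedded defaults above and a mount with
 * mnt_ioscale == 1 whose cluster_max_io_size() comes back as 1 MB):
 *
 *	MAX_PREFETCH(vp, 1MB, FALSE) == 1MB * (1 * PREFETCH)     == 3 MB
 *	MAX_PREFETCH(vp, 1MB, TRUE)  == 1MB * (1 * PREFETCH_SSD) == 2 MB
 *
 * cluster_read_ahead() further clamps whichever value it computes to
 * speculative_prefetch_max before issuing any speculative I/O.
 */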
int speculative_reads_disabled = 0;

/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define THROTTLE_MAXCNT	0

uint32_t throttle_max_iosize = (128 * 1024);

#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
void
cluster_init(void)
{
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();

	cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_transaction_mtxp == NULL)
		panic("cluster_init: failed to allocate cl_transaction_mtxp");

	lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr);

	for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i)
		LIST_INIT(&cl_direct_read_locks[i]);
}
uint32_t
cluster_max_io_size(mount_t mp, int type)
{
	uint32_t	max_io_size;
	uint32_t	segcnt;
	uint32_t	maxcnt;

	switch (type) {

	case CL_READ:
		segcnt = mp->mnt_segreadcnt;
		maxcnt = mp->mnt_maxreadcnt;
		break;
	case CL_WRITE:
		segcnt = mp->mnt_segwritecnt;
		maxcnt = mp->mnt_maxwritecnt;
		break;
	default:
		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
		break;
	}
	if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
		/*
		 * don't allow a size beyond the max UPL size we can create
		 */
		segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	}
	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

	if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
		/*
		 * don't allow a size smaller than the old fixed limit
		 */
		max_io_size = MAX_UPL_TRANSFER_BYTES;
	} else {
		/*
		 * make sure the size specified is a multiple of PAGE_SIZE
		 */
		max_io_size &= ~PAGE_MASK;
	}
	return (max_io_size);
}
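/*
 * Illustrative usage (editor's sketch, not part of the original source): a
 * caller typically sizes its cluster state from this routine, e.g.
 *
 *	uint32_t max_write = cluster_max_io_size(vp->v_mount, CL_WRITE);
 *	uint32_t max_read  = cluster_max_io_size(vp->v_mount, CL_READ);
 *
 * Either value is a page multiple, never smaller than MAX_UPL_TRANSFER_BYTES,
 * and is capped by the device's segment count and max transfer limits; the
 * MAX_CLUSTER_SIZE() macro above is simply the CL_WRITE case.
 */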
#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02
#define CLW_IONOCACHE		0x04
#define CLW_IOPASSIVE		0x08
/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
			ubc->cl_rahead = rap;
		else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
		return (rap);

	return ((struct cl_readahead *)NULL);
}
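/*
 * Illustrative calling pattern (editor's sketch, not part of the original
 * source): because a successful return hands back the context with cl_lockr
 * already held via the try-lock above, a caller looks roughly like
 *
 *	struct cl_readahead *rap = cluster_get_rap(vp);
 *	if (rap != NULL) {
 *		... consult/update rap->cl_lastr, rap->cl_maxra ...
 *		lck_mtx_unlock(&rap->cl_lockr);
 *	}
 *
 * A NULL return simply means another reader owns the context, and the current
 * read proceeds without read-ahead.
 */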
/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */
static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
	struct ubc_info		*ubc;
	struct cl_writebehind	*wbp;

	ubc = vp->v_ubcinfo;

	if ((wbp = ubc->cl_wbehind) == NULL) {

		if ( !(flags & CLW_ALLOCATE))
			return ((struct cl_writebehind *)NULL);

		MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL)
			ubc->cl_wbehind = wbp;
		else {
			lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
			FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED)
		lck_mtx_lock(&wbp->cl_lockw);

	return (wbp);
}
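/*
 * Illustrative calling pattern (editor's sketch, not part of the original
 * source): callers that intend to modify write-behind state usually ask for
 * allocation and the lock in one shot,
 *
 *	struct cl_writebehind *wbp;
 *
 *	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
 *	... manipulate wbp->cl_number and the clusters ...
 *	lck_mtx_unlock(&wbp->cl_lockw);
 *
 * while read-only probes pass 0 for 'flags' and must be prepared for a NULL
 * return, as cluster_syncup() below does.
 */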
static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
	struct cl_writebehind *wbp;

	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {

		if (wbp->cl_number) {
			lck_mtx_lock(&wbp->cl_lockw);

			cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL);

			lck_mtx_unlock(&wbp->cl_lockw);
		}
	}
}
static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
	daddr64_t blkno;
	size_t	  io_size;
	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

	if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
		if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL))
			return (0);

		if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
			return (1);
	}
	return (0);
}


static int
cluster_is_throttled(vnode_t vp)
{
	return (throttle_io_will_be_throttled(-1, vp->v_mount));
}
static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{

	lck_mtx_lock(&iostate->io_mtxp);

	while ((iostate->io_issued - iostate->io_completed) > target) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);

		iostate->io_wanted = 1;
		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);
	}
	lck_mtx_unlock(&iostate->io_mtxp);
}
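/*
 * Illustrative sketch (editor's note, not part of the original source; the
 * wait name "dwrite" below is hypothetical): a direct I/O path drives the
 * struct clios accounting roughly as
 *
 *	struct clios iostate;
 *
 *	bzero(&iostate, sizeof(iostate));
 *	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
 *
 *	... each cluster_io() call adds to iostate.io_issued, and
 *	    cluster_iodone() adds to iostate.io_completed ...
 *
 *	cluster_iostate_wait(&iostate, max_in_flight, "dwrite");  // throttle
 *	cluster_iostate_wait(&iostate, 0, "dwrite");              // drain
 *
 * i.e. the routine blocks until (io_issued - io_completed) drops to the
 * requested target.
 */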
static void cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
					  upl_offset_t upl_offset, upl_size_t size)
{
	if (!size)
		return;

	upl_t associated_upl = upl_associated_upl(upl);

	if (!associated_upl)
		return;

#if 0
	printf("1: %d %d\n", upl_offset, upl_offset + size);
#endif

	/*
	 * The associated UPL is page aligned to file offsets whereas the
	 * UPL it's attached to has different alignment requirements.  The
	 * upl_offset that we have refers to @upl.  The code that follows
	 * has to deal with the first and last pages in this transaction
	 * which might straddle pages in the associated UPL.  To keep
	 * track of these pages, we use the mark bits: if the mark bit is
	 * set, we know another transaction has completed its part of that
	 * page and so we can unlock that page here.
	 *
	 * The following illustrates what we have to deal with:
	 *
	 *    MEM u <------------ 1 PAGE ------------> e
	 *        +-------------+----------------------+-----------------
	 *        |             |######################|#################
	 *        +-------------+----------------------+-----------------
	 *   FILE | <--- a ---> o <------------ 1 PAGE ------------>
	 *
	 * So here we show a write to offset @o.  The data that is to be
	 * written is in a buffer that is not page aligned; it has offset
	 * @a in the page.  The upl that carries the data starts in memory
	 * at @u.  The associated upl starts in the file at offset @o.  A
	 * transaction will always end on a page boundary (like @e above)
	 * except for the very last transaction in the group.  We cannot
	 * unlock the page at @o in the associated upl until both the
	 * transaction ending at @e and the following transaction (that
	 * starts at @e) has completed.
	 */

	/*
	 * We record whether or not the two UPLs are aligned as the mark
	 * bit in the first page of @upl.
	 */
	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	bool is_unaligned = upl_page_get_mark(pl, 0);

	if (is_unaligned) {
		upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);

		upl_offset_t upl_end = upl_offset + size;
		assert(upl_end >= PAGE_SIZE);

		upl_size_t assoc_upl_size = upl_get_size(associated_upl);

		/*
		 * In the very first transaction in the group, upl_offset will
		 * not be page aligned, but after that it will be and in that
		 * case we want the preceding page in the associated UPL hence
		 * the minus one.
		 */
		upl_offset = trunc_page_32(upl_offset - 1);

		lck_mtx_lock_spin(&iostate->io_mtxp);

		// Look at the first page...
		if (upl_offset
		    && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
			/*
			 * The first page isn't marked so let another transaction
			 * completion handle it.
			 */
			upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
			upl_offset += PAGE_SIZE;
		}

		// And now the last page...

		/*
		 * This needs to be > rather than >= because if it's equal, it
		 * means there's another transaction that is sharing the last
		 * page.
		 */
		if (upl_end > assoc_upl_size)
			upl_end = assoc_upl_size;
		else {
			upl_end = trunc_page_32(upl_end);
			const int last_pg = (upl_end >> PAGE_SHIFT) - 1;

			if (!upl_page_get_mark(assoc_pl, last_pg)) {
				/*
				 * The last page isn't marked so mark the page and let another
				 * transaction completion handle it.
				 */
				upl_page_set_mark(assoc_pl, last_pg, true);
				upl_end -= PAGE_SIZE;
			}
		}

		lck_mtx_unlock(&iostate->io_mtxp);

#if 0
		printf("2: %d %d\n", upl_offset, upl_end);
#endif

		if (upl_end <= upl_offset)
			return;

		size = upl_end - upl_offset;
	} else {
		assert(!(upl_offset & PAGE_MASK));
		assert(!(size & PAGE_MASK));
	}

	boolean_t empty;

	/*
	 * We can unlock these pages now and as this is for a
	 * direct/uncached write, we want to dump the pages too.
	 */
	kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
					   UPL_ABORT_DUMP_PAGES, &empty);

	if (!kr && empty) {
		upl_set_associated_upl(upl, NULL);
		upl_deallocate(associated_upl);
	}
}
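/*
 * Editor's summary (descriptive, not part of the original source): the net
 * effect of the mark-bit protocol above is that a page of the associated UPL
 * which is split between two transactions is released by whichever of the
 * two completes *second* -- the first completer merely sets the mark bit and
 * shrinks its [upl_offset, upl_end) range away from that page, so the
 * upl_abort_range() call only ever covers pages this transaction fully owns.
 */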
static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
{
	int upl_abort_code = 0;
	int page_in  = 0;
	int page_out = 0;

	if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE))
		/*
		 * direct write of any flavor, or a direct read that wasn't aligned
		 */
		ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	else {
		if (io_flags & B_PAGEIO) {
			if (io_flags & B_READ)
				page_in  = 1;
			else
				page_out = 1;
		}
		if (io_flags & B_CACHE)
			/*
			 * leave pages in the cache unchanged on error
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_out && ((error != ENXIO) || vnode_isswap(vp)))
			/*
			 * transient error... leave pages unchanged
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_in)
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
		else
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

		ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	}
	return (upl_abort_code);
}
static int
cluster_iodone(buf_t bp, void *callback_arg)
{
	int	b_flags;
	int	error;
	int	total_size;
	int	total_resid;
	int	upl_offset;
	int	zero_offset;
	int	pg_offset = 0;
	int	commit_size = 0;
	int	upl_flags = 0;
	int	transaction_size = 0;
	upl_t	upl;
	buf_t	cbp;
	buf_t	cbp_head;
	buf_t	cbp_next;
	buf_t	real_bp;
	vnode_t	vp;
	struct	clios *iostate;
	boolean_t	transaction_complete = FALSE;

	__IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
		lck_mtx_lock_spin(cl_transaction_mtxp);

		bp->b_flags |= B_TDONE;

		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			/*
			 * all I/O requests that are part of this transaction
			 * have to complete before we can process it
			 */
			if ( !(cbp->b_flags & B_TDONE)) {

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
					     cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);

				return 0;
			}

			if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
					     cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);
				wakeup(cbp);

				return 0;
			}

			if (cbp->b_flags & B_EOT)
				transaction_complete = TRUE;
		}
		lck_mtx_unlock(cl_transaction_mtxp);

		if (transaction_complete == FALSE) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     cbp_head, 0, 0, 0, 0);
			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp        = cbp_head;
	vp         = cbp->b_vp;
	upl_offset = cbp->b_uploffset;
	upl        = cbp->b_upl;
	b_flags    = cbp->b_flags;
	real_bp    = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate    = (struct clios *)cbp->b_iostate;

	if (real_bp)
		real_bp->b_dev = cbp->b_dev;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		if (cbp_next == NULL)
			/*
			 * compute the overall size of the transaction
			 * in case we created one that has 'holes' in it
			 * 'total_size' represents the amount of I/O we
			 * did, not the span of the transaction w/r to the UPL
			 */
			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;

		if (cbp != cbp_head)
			free_io_buf(cbp);

		cbp = cbp_next;
	}

	if (ISSET(b_flags, B_COMMIT_UPL)) {
		cluster_handle_associated_upl(iostate,
					      upl,
					      upl_offset,
					      transaction_size);
	}

	if (error == 0 && total_resid)
		error = EIO;

	if (error == 0) {
		int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

		if (cliodone_func != NULL) {
			cbp_head->b_bcount = transaction_size;

			error = (*cliodone_func)(cbp_head, callback_arg);
		}
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	free_io_buf(cbp_head);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(&iostate->io_mtxp);

		if (need_wakeup)
			wakeup((caddr_t)&iostate->io_wanted);
	}

	if (b_flags & B_COMMIT_UPL) {

		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error)
			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
		else {
			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ))
				upl_flags |= UPL_COMMIT_SET_DIRTY;

			if (b_flags & B_AGE)
				upl_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
		}
	}
	if (real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
		     upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

	return (error);
}
uint32_t
cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
{
	if (cluster_is_throttled(vp)) {
		*limit = THROTTLE_MAX_IOSIZE;
		return 1;
	}
	return 0;
}
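/*
 * Illustrative usage (editor's sketch, not part of the original source): a
 * filesystem can clamp its own I/O sizes while low-priority throttling is in
 * effect, e.g.
 *
 *	uint32_t io_limit = 0;
 *
 *	if (cluster_throttle_io_limit(vp, &io_limit) && io_size > io_limit)
 *		io_size = io_limit;
 *
 * where io_limit comes back as THROTTLE_MAX_IOSIZE (throttle_max_iosize,
 * 128 KB by default).
 */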
void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {
		upl_page_info_t *pl;
		addr64_t	zero_addr;

		pl = ubc_upl_pageinfo(upl);

		if (upl_device_page(pl) == TRUE) {
			zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;

			bzero_phys_nc(zero_addr, size);
		} else {
			while (size) {
				int	page_offset;
				int	page_index;
				int	zero_cnt;

				page_index  = upl_offset / PAGE_SIZE;
				page_offset = upl_offset & PAGE_MASK;

				zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
				zero_cnt  = min(PAGE_SIZE - page_offset, size);

				bzero_phys(zero_addr, zero_cnt);

				size       -= zero_cnt;
				upl_offset += zero_cnt;
			}
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}
static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
	cbp_head->b_validend = zero_offset;
	cbp_tail->b_flags |= B_EOT;
}
static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	buf_t	cbp;

	if (async) {
		/*
		 * Async callback completion will not normally generate a
		 * wakeup upon I/O completion.  To get woken up, we set
		 * b_trans_next (which is safe for us to modify) on the last
		 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
		 * to wake us up when all buffers as part of this transaction
		 * are completed.  This is done under the umbrella of
		 * cl_transaction_mtxp which is also taken in cluster_iodone.
		 */
		bool done = true;
		buf_t last = NULL;

		lck_mtx_lock_spin(cl_transaction_mtxp);

		for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
			if (!ISSET(cbp->b_flags, B_TDONE))
				done = false;
		}

		if (!done) {
			last->b_trans_next = CLUSTER_IO_WAITING;

			DTRACE_IO1(wait__start, buf_t, last);
			do {
				msleep(last, cl_transaction_mtxp, PSPIN | (PRIBIO+1), "cluster_wait_IO", NULL);

				/*
				 * We should only have been woken up if all the
				 * buffers are completed, but just in case...
				 */
				done = true;
				for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
					if (!ISSET(cbp->b_flags, B_TDONE)) {
						done = false;
						break;
					}
				}
			} while (!done);
			DTRACE_IO1(wait__done, buf_t, last);

			last->b_trans_next = NULL;
		}

		lck_mtx_unlock(cl_transaction_mtxp);
	} else { // !async
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
			buf_biowait(cbp);
	}
}
static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
	buf_t	cbp;
	int	error;
	boolean_t isswapout = FALSE;

	/*
	 * cluster_complete_transaction will
	 * only be called if we've issued a complete chain in synchronous mode
	 * or, we've already done a cluster_wait_IO on an incomplete chain
	 */
	if (needwait) {
		for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
			buf_biowait(cbp);
	}
	/*
	 * we've already waited on all of the I/Os in this transaction,
	 * so mark all of the buf_t's in this transaction as B_TDONE
	 * so that cluster_iodone sees the transaction as completed
	 */
	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
		cbp->b_flags |= B_TDONE;
	cbp = *cbp_head;

	if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp))
		isswapout = TRUE;

	error = cluster_iodone(cbp, callback_arg);

	if ( !(flags & CL_ASYNC) && error && *retval == 0) {
		if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
			*retval = error;
		else if (isswapout == TRUE)
			*retval = error;
	}
	*cbp_head = (buf_t)NULL;
}
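/*
 * Editor's note (illustrative, not part of the original source): the three
 * helpers above are used together by the issue path in cluster_io(); the
 * typical shutdown sequence for a partially built chain is
 *
 *	cluster_wait_IO(cbp_head, (flags & CL_ASYNC));	// drain what was issued
 *	cluster_EOT(cbp_head, cbp_tail, 0);		// stamp the end of transaction
 *	cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
 *
 * after which cbp_head has been reset to NULL and the UPL pages covered by
 * the transaction have been committed or aborted by cluster_iodone().
 */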
static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	   int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	buf_t	cbp;
	u_int	size;
	u_int	io_size;
	int	io_flags;
	int	bmap_flags;
	int	error = 0;
	int	retval = 0;
	buf_t	cbp_head = NULL;
	buf_t	cbp_tail = NULL;
	int	trans_count = 0;
	int	max_trans_count;
	u_int	pg_count;
	int	pg_offset;
	u_int	max_iosize;
	u_int	max_vectors;
	int	priv;
	int	zero_offset = 0;
	int	async_throttle = 0;
	mount_t	mp;
	vm_offset_t upl_end_offset;
	boolean_t need_EOT = FALSE;

	/*
	 * we currently don't support buffers larger than a page
	 */
	if (real_bp && non_rounded_size > PAGE_SIZE)
		panic("%s(): Called with real buffer of size %d bytes which "
		      "is greater than the maximum allowed size of "
		      "%d bytes (the system PAGE_SIZE).\n",
		      __FUNCTION__, non_rounded_size, PAGE_SIZE);

	mp = vp->v_mount;

	/*
	 * we don't want to do any funny rounding of the size for IO requests
	 * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
	 * belong to us... we can't extend (nor do we need to) the I/O to fill
	 * out a page
	 */
	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
		/*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zero's that are sitting beyond the EOF
		 * so that in case the filesystem doesn't explicitly zero this area
		 * if a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
		pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	} else {
		/*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size
		 * AFP is one such filesystem/device
		 */
		size = non_rounded_size;
	}
	upl_end_offset = upl_offset + size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);

	/*
	 * Set the maximum transaction size to the maximum desired number of
	 * buffers.
	 */
	max_trans_count = 8;
	if (flags & CL_DEV_MEMORY)
		max_trans_count = 16;

	if (flags & CL_READ) {
		io_flags = B_READ;
		bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
		io_flags = B_WRITE;
		bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize  &= ~PAGE_MASK;

	/*
	 * Ensure the maximum iosize is sensible.
	 */
	if (!max_iosize)
		max_iosize = PAGE_SIZE;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
			if (max_iosize > THROTTLE_MAX_IOSIZE)
				max_iosize = THROTTLE_MAX_IOSIZE;
			async_throttle = THROTTLE_MAXCNT;
		} else {
			if ( (flags & CL_DEV_MEMORY) )
				async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
			else {
				u_int max_cluster;
				u_int max_cluster_size;
				u_int scale;

				if (vp->v_mount->mnt_minsaturationbytecount) {
					max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;

					scale = 1;
				} else {
					max_cluster_size = MAX_CLUSTER_SIZE(vp);

					if (disk_conditioner_mount_is_ssd(vp->v_mount))
						scale = WRITE_THROTTLE_SSD;
					else
						scale = WRITE_THROTTLE;
				}
				if (max_iosize > max_cluster_size)
					max_cluster = max_cluster_size;
				else
					max_cluster = max_iosize;

				if (size < max_cluster)
					max_cluster = size;

				if (flags & CL_CLOSE)
					scale += MAX_CLUSTERS;

				async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
			}
		}
	}
	if (flags & CL_AGE)
		io_flags |= B_AGE;
	if (flags & (CL_PAGEIN | CL_PAGEOUT))
		io_flags |= B_PAGEIO;
	if (flags & (CL_IOSTREAMING))
		io_flags |= B_IOSTREAMING;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_DIRECT_IO)
		io_flags |= B_PHYS;
	if (flags & (CL_PRESERVE | CL_KEEPCACHED))
		io_flags |= B_CACHE;
	if (flags & CL_PASSIVE)
		io_flags |= B_PASSIVE;
	if (flags & CL_ENCRYPTED)
		io_flags |= B_ENCRYPTED_IO;

	if (vp->v_flag & VSYSTEM)
		io_flags |= B_META;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	} else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
		assert(ISSET(flags, CL_COMMIT));

		// For a direct/uncached write, we need to lock pages...

		upl_t cached_upl;

		/*
		 * Create a UPL to lock the pages in the cache whilst the
		 * write is in progress.
		 */
		ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
				      NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);

		/*
		 * Attach this UPL to the other UPL so that we can find it
		 * later.
		 */
		upl_set_associated_upl(upl, cached_upl);

		if (upl_offset & PAGE_MASK) {
			/*
			 * The two UPLs are not aligned, so mark the first page in
			 * @upl so that cluster_handle_associated_upl can handle
			 * it accordingly.
			 */
			upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			upl_page_set_mark(pl, 0, true);
		}
	}

	while (size) {
		daddr64_t blkno;
		daddr64_t lblkno;
		u_int	io_size_wanted;
		size_t	io_size_tmp;

		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		io_size_wanted = io_size;
		io_size_tmp = (size_t)io_size;

		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
			break;

		if (io_size_tmp > io_size_wanted)
			io_size = io_size_wanted;
		else
			io_size = (u_int)io_size_tmp;

		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
			real_bp->b_blkno = blkno;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);

		if (io_size == 0) {
			/*
			 * vnop_blockmap didn't return an error... however, it did
			 * return an extent size of 0 which means we can't
			 * make forward progress on this I/O... a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if ( !(flags & CL_READ) && blkno == -1) {
			off_t	e_offset;
			int	pageout_flags;

			if (upl_get_internal_vectorupl(upl))
				panic("Vector UPLs should not take this code-path\n");
			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will ensure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 *
			 * go direct to vnode_pageout so that we don't have to
			 * unbusy the page from the UPL... we used to do this
			 * so that we could call ubc_msync, but that results
			 * in a potential deadlock if someone else races us to acquire
			 * that page and wins and in addition needs one of the pages
			 * we're continuing to hold in the UPL
			 */
			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

			if ( !(flags & CL_ASYNC))
				pageout_flags |= UPL_IOSYNC;
			if ( !(flags & CL_COMMIT))
				pageout_flags |= UPL_NOCOMMIT;

			if (cbp_head) {
				buf_t	prev_cbp;
				int	bytes_in_last_page;

				/*
				 * first we have to wait for the current outstanding I/Os
				 * to complete... EOT hasn't been set yet on this transaction
				 * so the pages won't be released
				 */
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
					bytes_in_last_page += cbp->b_bcount;
				bytes_in_last_page &= PAGE_MASK;

				while (bytes_in_last_page) {
					/*
					 * we've got a transaction that
					 * includes the page we're about to push out through vnode_pageout...
					 * find the bp's in the list which intersect this page and either
					 * remove them entirely from the transaction (there could be multiple bp's), or
					 * round its iosize down to the page boundary (there can only be one)...
					 *
					 * find the last bp in the list and act on it
					 */
					for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
						prev_cbp = cbp;

					if (bytes_in_last_page >= cbp->b_bcount) {
						/*
						 * this buf no longer has any I/O associated with it
						 */
						bytes_in_last_page -= cbp->b_bcount;

						free_io_buf(cbp);

						if (cbp == cbp_head) {
							assert(bytes_in_last_page == 0);
							/*
							 * the buf we just freed was the only buf in
							 * this transaction... so there's no I/O to do
							 */
							cbp_head = NULL;
							cbp_tail = NULL;
						} else {
							/*
							 * remove the buf we just freed from
							 * the transaction list
							 */
							prev_cbp->b_trans_next = NULL;
							cbp_tail = prev_cbp;
						}
					} else {
						/*
						 * this is the last bp that has I/O
						 * intersecting the page of interest
						 * only some of the I/O is in the intersection
						 * so clip the size but keep it in the transaction list
						 */
						cbp->b_bcount -= bytes_in_last_page;
						cbp_tail = cbp;
						bytes_in_last_page = 0;
					}
				}
				if (cbp_head) {
					/*
					 * there was more to the current transaction
					 * than just the page we are pushing out via vnode_pageout...
					 * mark it as finished and complete it... we've already
					 * waited for the I/Os to complete above in the call to cluster_wait_IO
					 */
					cluster_EOT(cbp_head, cbp_tail, 0);

					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

					trans_count = 0;
				}
			}
			if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
				error = EINVAL;
			}
			e_offset = round_page_64(f_offset + 1);
			io_size = e_offset - f_offset;

			f_offset   += io_size;
			upl_offset += io_size;

			if (size >= io_size)
				size -= io_size;
			else
				size = 0;
			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e.  size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (error) {
				size = 0;
				flags &= ~CL_COMMIT;
			}
			continue;
		}
		lblkno = (daddr64_t)(f_offset / 0x1000);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && blkno == -1) {
			vm_offset_t  commit_offset;
			int bytes_to_zero;
			int complete_transaction_now = 0;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (io_size >= (u_int)non_rounded_size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = non_rounded_size;
				if (!(flags & CL_NOZERO))
					bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			pg_count = 0;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head) {
				int	pg_resid;

				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

				pg_resid = commit_offset - upl_offset;

				if (bytes_to_zero >= pg_resid) {
					/*
					 * the last page of the current I/O
					 * has been completed...
					 * compute the number of fully zero'd
					 * pages that are beyond it
					 * plus the last page if it's partial
					 * and we have no more I/O to issue...
					 * otherwise a partial page is left
					 * to begin the next I/O
					 */
					if ((int)io_size >= non_rounded_size)
						pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
					else
						pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;

					complete_transaction_now = 1;
				}
			} else {
				/*
				 * no pending I/O to deal with
				 * so, commit all of the fully zero'd pages
				 * plus the last page if it's partial
				 * and we have no more I/O to issue...
				 * otherwise a partial page is left
				 * to begin the next I/O
				 */
				if ((int)io_size >= non_rounded_size)
					pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
				else
					pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;

				commit_offset = upl_offset & ~PAGE_MASK;
			}

			// Associated UPL is currently only used in the direct write path
			assert(!upl_associated_upl(upl));

			if ( (flags & CL_COMMIT) && pg_count) {
				ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
						     UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;

			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e.  size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (cbp_head && (complete_transaction_now || size == 0)) {
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

				trans_count = 0;
			}
			continue;
		}
		if (pg_count > max_vectors) {
			if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
				io_size = PAGE_SIZE - pg_offset;
				pg_count = 1;
			} else {
				io_size -= (pg_count - max_vectors) * PAGE_SIZE;
				pg_count = max_vectors;
			}
		}
		/*
		 * If the transaction is going to reach the maximum number of
		 * desired elements, truncate the i/o to the nearest page so
		 * that the actual i/o is initiated after this buffer is
		 * created and added to the i/o chain.
		 *
		 * I/O directed to physically contiguous memory
		 * doesn't have a requirement to make sure we 'fill' a page
		 */
		if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
		     ((upl_offset + io_size) & PAGE_MASK)) {
			vm_offset_t aligned_ofs;

			aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
			/*
			 * If the io_size does not actually finish off even a
			 * single page we have to keep adding buffers to the
			 * transaction despite having reached the desired limit.
			 *
			 * Eventually we get here with the page being finished
			 * off (and exceeded) and then we truncate the size of
			 * this i/o request so that it is page aligned so that
			 * we can finally issue the i/o on the transaction.
			 */
			if (aligned_ofs > upl_offset) {
				io_size = aligned_ofs - upl_offset;
				pg_count--;
			}
		}

		if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
			/*
			 * if we're not targeting a virtual device i.e. a disk image
			 * it's safe to dip into the reserve pool since real devices
			 * can complete this I/O request without requiring additional
			 * bufs from the alloc_io_buf pool
			 */
			priv = 1;
		else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
			/*
			 * Throttle the speculative IO
			 */
			priv = 0;
		else
			priv = 1;

		cbp = alloc_io_buf(vp, priv);

		if (flags & CL_PAGEOUT) {
			u_int i;

			/*
			 * since blocks are in offsets of 0x1000, scale
			 * iteration to (PAGE_SIZE * pg_count) of blks.
			 */
			for (i = 0; i < (PAGE_SIZE * pg_count)/0x1000; i++) {
				if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
					panic("BUSY bp found in cluster_io");
			}
		}
		if (flags & CL_ASYNC) {
			if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
				panic("buf_setcallback failed\n");
		}
		cbp->b_cliodone = (void *)callback;
		cbp->b_flags |= io_flags;
		if (flags & CL_NOCACHE)
			cbp->b_attr.ba_flags |= BA_NOCACHE;

		cbp->b_lblkno = lblkno;
		cbp->b_blkno  = blkno;
		cbp->b_bcount = io_size;

		if (buf_setupl(cbp, upl, upl_offset))
			panic("buf_setupl failed\n");

		upl_set_blkno(upl, upl_offset, io_size, blkno);

		cbp->b_trans_next = (buf_t)NULL;

		if ((cbp->b_iostate = (void *)iostate))
			/*
			 * caller wants to track the state of this
			 * io... bump the amount issued against this stream
			 */
			iostate->io_issued += io_size;

		if (flags & CL_READ) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}
		else {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}

		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;

			if ( (cbp_head->b_real_bp = real_bp) )
				real_bp = (buf_t)NULL;
		}
		*(buf_t *)(&cbp->b_trans_head) = cbp_head;

		trans_count++;

		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;
		/*
		 * keep track of how much of the original request
		 * that we've actually completed... non_rounded_size
		 * may go negative due to us rounding the request
		 * to a page size multiple (i.e.  size > non_rounded_size)
		 */
		non_rounded_size -= io_size;

		if (non_rounded_size <= 0) {
			/*
			 * we've transferred all of the data in the original
			 * request, but we were unable to complete the tail
			 * of the last page because the file didn't have
			 * an allocation to back that portion... this is ok.
			 */
			size = 0;
		}
		if (size == 0) {
			/*
			 * we have no more I/O to issue, so go
			 * finish the final transaction
			 */
			need_EOT = TRUE;
		} else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
			    ((flags & CL_ASYNC) || trans_count > max_trans_count) ) {
			/*
			 * I/O directed to physically contiguous memory...
			 * which doesn't have a requirement to make sure we 'fill' a page
			 * or...
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and ...
			 * it's either an ASYNC request or
			 * we've already accumulated more than 8 I/O's into
			 * this transaction so mark it as complete so that
			 * it can finish asynchronously or via the cluster_complete_transaction
			 * below if the request is synchronous
			 */
			need_EOT = TRUE;
		}
		if (need_EOT == TRUE)
			cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

		if (flags & CL_THROTTLE)
			(void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");

		if ( !(io_flags & B_READ))
			vnode_startwrite(vp);

		if (flags & CL_RAW_ENCRYPTED) {
			/*
			 * User requested raw encrypted bytes.
			 * Twiddle the bit in the ba_flags for the buffer
			 */
			cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
		}

		(void) VNOP_STRATEGY(cbp);

		if (need_EOT == TRUE) {
			if ( !(flags & CL_ASYNC))
				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);

			need_EOT = FALSE;
			trans_count = 0;
			cbp_head = NULL;
		}
	}
	if (error) {
		int abort_size;

		io_size = 0;

		if (cbp_head) {
			/*
			 * Wait until all of the outstanding I/O
			 * for this partial transaction has completed
			 */
			cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

			/*
			 * Rewind the upl offset to the beginning of the
			 * transaction.
			 */
			upl_offset = cbp_head->b_uploffset;
		}

		if (ISSET(flags, CL_COMMIT)) {
			cluster_handle_associated_upl(iostate, upl, upl_offset,
						      upl_end_offset - upl_offset);
		}

		// Free all the IO buffers in this transaction
		for (cbp = cbp_head; cbp;) {
			buf_t	cbp_next;

			size    += cbp->b_bcount;
			io_size += cbp->b_bcount;

			cbp_next = cbp->b_trans_next;
			free_io_buf(cbp);
			cbp = cbp_next;
		}

		if (iostate) {
			int need_wakeup = 0;

			/*
			 * update the error condition for this stream
			 * since we never really issued the io
			 * just go ahead and adjust it back
			 */
			lck_mtx_lock_spin(&iostate->io_mtxp);

			if (iostate->io_error == 0)
				iostate->io_error = error;
			iostate->io_issued -= io_size;

			if (iostate->io_wanted) {
				/*
				 * someone is waiting for the state of
				 * this io stream to change
				 */
				iostate->io_wanted = 0;
				need_wakeup = 1;
			}
			lck_mtx_unlock(&iostate->io_mtxp);

			if (need_wakeup)
				wakeup((caddr_t)&iostate->io_wanted);
		}

		if (flags & CL_COMMIT) {
			int	upl_flags;

			pg_offset  = upl_offset & PAGE_MASK;
			abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;

			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
				     upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
		}
		if (retval == 0)
			retval = error;
	} else if (cbp_head)
		panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);

	if (real_bp) {
		/*
		 * can get here if we either encountered an error
		 * or we completely zero-filled the request and
		 * no I/O was issued
		 */
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}
#define reset_vector_run_state()						\
	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;

static int
vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
		  int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	vector_upl_set_pagelist(vector_upl);

	if (io_flag & CL_READ) {
		if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0))
			io_flag &= ~CL_PRESERVE; /*don't zero fill*/
		else
			io_flag |= CL_PRESERVE; /*zero fill*/
	}

	return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg));
}
static int
cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	int	pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return (0);
	}
	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;
	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}
static void
cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
		   int bflag)
{
	daddr64_t	r_addr;
	off_t		f_offset;
	int		size_of_prefetch;
	u_int		max_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
		return;
	}
	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
		rap->cl_ralen = 0;
		rap->cl_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);

		return;
	}
	max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));

	if (max_prefetch > speculative_prefetch_max)
		max_prefetch = speculative_prefetch_max;

	if (max_prefetch <= PAGE_SIZE) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
		return;
	}
	if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
		if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
			return;
		}
	}
	r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
	f_offset = (off_t)(r_addr * PAGE_SIZE_64);

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		daddr64_t read_size;

		rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;

		read_size = (extent->e_addr + 1) - extent->b_addr;

		if (read_size > rap->cl_ralen) {
			if (read_size > max_prefetch / PAGE_SIZE)
				rap->cl_ralen = max_prefetch / PAGE_SIZE;
			else
				rap->cl_ralen = read_size;
		}
		size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);

		if (size_of_prefetch)
			rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
}
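/*
 * Illustrative worked example (editor's sketch, not part of the original
 * source): for a strictly sequential reader, cl_ralen starts at 1 page and
 * roughly doubles on each successive call (1, 2, 4, 8, ... pages) until it
 * reaches max_prefetch / PAGE_SIZE, while cl_maxra remembers the last page
 * already requested so the same range is never prefetched twice.  A read
 * that is neither at cl_lastr nor at cl_lastr + 1 is treated as
 * non-sequential and no read-ahead is issued for it.
 */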
cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags)
	return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);

cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
	local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;
	if ((flags & UPL_KEEPCACHED))
		local_flags |= CL_KEEPCACHED;
	if (flags & UPL_PAGING_ENCRYPTED)
		local_flags |= CL_ENCRYPTED;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		(int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */

	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);

	max_size = filesize - f_offset;

	if (size < max_size)

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
				UPL_ABORT_FREE_ON_EMPTY);

	return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
		local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
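/*
 * A worked example of the size handling above, assuming 4KB pages: the
 * requested size is first clipped to what remains before EOF and then rounded
 * up to a page boundary; any pages supplied beyond that rounded size are
 * aborted rather than written.  For instance, with (filesize - f_offset) ==
 * 6000 and size == 16384: io_size == 6000, rounded_size == 8192, and the
 * final 8192 bytes of the UPL are aborted before cluster_io is called.
 */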
cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags)
	return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);

cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
	int local_flags = 0;

	if (upl == NULL || size < 0)
		panic("cluster_pagein: NULL upl passed in");

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;
	if (flags & UPL_IOSTREAMING)
		local_flags |= CL_IOSTREAMING;
	if (flags & UPL_PAGING_ENCRYPTED)
		local_flags |= CL_ENCRYPTED;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
		(int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	max_size = filesize - f_offset;

	if (size < max_size)

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size && (local_flags & CL_COMMIT))
		ubc_upl_abort_range(upl, upl_offset + rounded_size,
			size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
		local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
cluster_bp(buf_t bp)
	return cluster_bp_ext(bp, NULL, NULL);

cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
		bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (bp->b_flags & B_READ)
		flags = CL_ASYNC | CL_READ;

	if (bp->b_flags & B_PASSIVE)
		flags |= CL_PASSIVE;

	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

	return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg));
cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
	return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);

cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
		int xflags, int (*callback)(buf_t, void *), void *callback_arg)
	user_ssize_t cur_resid;
	int write_type = IO_COPY;
	u_int32_t write_length;

	if (flags & IO_PASSIVE)

	if (vp->v_flag & VNOCACHE_DATA){
		flags |= IO_NOCACHE;
		bflag |= CL_NOCACHE;

		/*
		 * this call is being made to zero-fill some range in the file
		 */
		retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);

	/*
	 * do a write through the cache if one of the following is true....
	 *   NOCACHE is not true or NODIRECT is true
	 *   the uio request doesn't target USERSPACE
	 * otherwise, find out if we want the direct or contig variant for
	 * the first vector in the uio request
	 */
	if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
		retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);

	if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT)
		/*
		 * must go through the cached variant in this case
		 */
		write_type = IO_COPY;

	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {

		switch (write_type) {

			/*
			 * make sure the uio_resid isn't too big...
			 * internally, we want to handle all of the I/O in
			 * chunk sizes that fit in a 32 bit int
			 */
			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
				/*
				 * we're going to have to call cluster_write_copy
				 *
				 * only want the last call to cluster_write_copy to
				 * have the IO_TAILZEROFILL flag set and only the
				 * first call should have IO_HEADZEROFILL
				 */
				zflags = flags & ~IO_TAILZEROFILL;
				flags &= ~IO_HEADZEROFILL;

				write_length = MAX_IO_REQUEST_SIZE;

				/*
				 * last call to cluster_write_copy
				 */
				write_length = (u_int32_t)cur_resid;

			retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);

			zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);

			if (flags & IO_HEADZEROFILL) {
				/*
				 * only do this once per request
				 */
				flags &= ~IO_HEADZEROFILL;

				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
					headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);

			retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);

			if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
				/*
				 * we're done with the data from the user specified buffer(s)
				 * and we've been requested to zero fill at the tail
				 * treat this as an IO_HEADZEROFILL which doesn't require a uio
				 * by rearranging the args and passing in IO_HEADZEROFILL
				 */
				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
					(off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);

			/*
			 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
			 */
			retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);

			retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);

			/*
			 * in case we end up calling cluster_write_copy (from cluster_write_direct)
			 * multiple times to service a multi-vector request that is not aligned properly
			 * we need to update the oldEOF so that we
			 * don't zero-fill the head of a page if we've successfully written
			 * data to that area... 'cluster_write_copy' will zero-fill the head of a
			 * page that is beyond the oldEOF if the write is unaligned... we only
			 * want that to happen for the very first page of the cluster_write,
			 * NOT the first page of each vector making up a multi-vector write.
			 */
			if (uio->uio_offset > oldEOF)
				oldEOF = uio->uio_offset;
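/*
 * A minimal sketch of how a large cached write is chunked by the IO_COPY case
 * above: each pass covers at most MAX_IO_REQUEST_SIZE bytes, only the first
 * pass keeps IO_HEADZEROFILL and only the last keeps IO_TAILZEROFILL.  The
 * helper name is hypothetical; the block is illustrative only and not
 * compiled.
 */
#if 0
static int
example_chunk_flags(int *flags, user_ssize_t resid)
{
	int zflags = *flags;

	if (resid > (user_ssize_t)MAX_IO_REQUEST_SIZE)
		zflags &= ~IO_TAILZEROFILL;	/* more chunks follow, so no tail fill yet */

	/* head fill is only performed on the very first chunk of the request */
	*flags &= ~IO_HEADZEROFILL;

	return zflags;
}
#endif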
cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
		int flags, int (*callback)(buf_t, void *), void *callback_arg)
	upl_page_info_t *pl;
	vm_offset_t upl_offset;
	vm_offset_t vector_upl_offset = 0;
	u_int32_t io_req_size;
	u_int32_t offset_in_file;
	u_int32_t offset_in_iovbase;
	upl_size_t upl_size, vector_upl_size = 0;
	vm_size_t upl_needed_size;
	mach_msg_type_number_t pages_in_pl;
	upl_control_flags_t upl_flags;
	mach_msg_type_number_t i;
	int force_data_sync;
	struct clios iostate;
	user_addr_t iov_base;
	u_int32_t mem_alignment_mask;
	u_int32_t devblocksize;
	u_int32_t max_io_size;
	u_int32_t max_upl_size;
	u_int32_t max_vector_size;
	u_int32_t bytes_outstanding_limit;
	boolean_t io_throttled = FALSE;
	u_int32_t vector_upl_iosize = 0;
	int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
	off_t v_upl_uio_offset = 0;
	int vector_upl_index = 0;
	upl_t vector_upl = NULL;

	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 */
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
		(int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

	max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

	io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;

	if (flags & IO_PASSIVE)
		io_flag |= CL_PASSIVE;

	if (flags & IO_NOCACHE)
		io_flag |= CL_NOCACHE;

	if (flags & IO_SKIP_ENCRYPTION)
		io_flag |= CL_ENCRYPTED;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;

	if (devblocksize == 1) {
		/*
		 * the AFP client advertises a devblocksize of 1
		 * however, its BLOCKMAP routine maps to physical
		 * blocks that are PAGE_SIZE in size...
		 * therefore we can't ask for I/Os that aren't page aligned
		 * or aren't multiples of PAGE_SIZE in size
		 * by setting devblocksize to PAGE_SIZE, we re-instate
		 * the old behavior we had before the mem_alignment_mask
		 * changes went in...
		 */
		devblocksize = PAGE_SIZE;

	io_req_size = *write_length;
	iov_base = uio_curriovbase(uio);

	offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;

	if (offset_in_file || offset_in_iovbase) {
		/*
		 * one of the 2 important offsets is misaligned
		 * so fire an I/O through the cache for this entire vector
		 */
		goto wait_for_dwrites;

	if (iov_base & (devblocksize - 1)) {
		/*
		 * the offset in memory must be on a device block boundary
		 * so that we can guarantee that we can generate an
		 * I/O that ends on a page boundary in cluster_io
		 */
		goto wait_for_dwrites;

	task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);

	while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {

		if ( (throttle_type = cluster_is_throttled(vp)) ) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 */
			if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
				/*
				 * we're in the throttle window and at least 1 I/O
				 * has already been issued by a throttleable thread
				 * in this window, so return with EAGAIN to indicate
				 * to the FS issuing the cluster_write call that it
				 * should now throttle after dropping any locks
				 */
				throttle_info_update_by_mount(vp->v_mount);

				io_throttled = TRUE;
				goto wait_for_dwrites;

			max_vector_size = THROTTLE_MAX_IOSIZE;
			max_io_size = THROTTLE_MAX_IOSIZE;

			max_vector_size = MAX_VECTOR_UPL_SIZE;
			max_io_size = max_upl_size;

			cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);

		io_size = io_req_size & ~PAGE_MASK;
		iov_base = uio_curriovbase(uio);

		if (io_size > max_io_size)
			io_size = max_io_size;

		if(useVectorUPL && (iov_base & PAGE_MASK)) {
			/*
			 * We have an iov_base that's not page-aligned.
			 * Issue all I/O's that have been collected within
			 * this Vectored UPL.
			 */
			if(vector_upl_index) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();

			/*
			 * After this point, if we are using the Vector UPL path and the base is
			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
			 */

		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
			(int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {

			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
				UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			kret = vm_map_get_upl(map,
				(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
				VM_KERN_MEMORY_FILE,

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,

				/*
				 * failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_dwrites;

			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			pages_in_pl = upl_size / PAGE_SIZE;

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i))

			if (i == pages_in_pl)

			/*
			 * didn't get all the pages back that we
			 * needed... release this upl and try again
			 */
			ubc_upl_abort(upl, 0);

		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				i, pages_in_pl, upl_size, kret, 0);
			/*
			 * for some reason, we couldn't acquire a hold on all
			 * the pages needed in the user's address space
			 *
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dwrites;

		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size < upl_needed_size) {
			if (upl_size && upl_offset == 0)

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				(int)upl_offset, upl_size, (int)iov_base, io_size, 0);

			ubc_upl_abort(upl, 0);
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dwrites;

			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);

			/*
			 * After this point, if we are using a vector UPL, then
			 * either all the UPL elements end on a page boundary OR
			 * this UPL is the last element because it does not end
			 * on a page boundary.
			 */

		/*
		 * we want to push out these writes asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding writes
		 * wait until some complete before issuing the next
		 */
		if (vp->v_mount->mnt_minsaturationbytecount)
			bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;

			bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);

		cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");

		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			ubc_upl_abort(upl, 0);

			goto wait_for_dwrites;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
			(int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);

			if(!vector_upl_index) {
				vector_upl = vector_upl_create(upl_offset);
				v_upl_uio_offset = uio->uio_offset;
				vector_upl_offset = upl_offset;

			vector_upl_set_subupl(vector_upl, upl, upl_size);
			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);

			vector_upl_iosize += io_size;
			vector_upl_size += upl_size;

			if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();

		/*
		 * update the uio structure to
		 * reflect the I/O that we just issued
		 */
		uio_update(uio, (user_size_t)io_size);

		/*
		 * in case we end up calling through to cluster_write_copy to finish
		 * the tail of this request, we need to update the oldEOF so that we
		 * don't zero-fill the head of a page if we've successfully written
		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
		 * page that is beyond the oldEOF if the write is unaligned... we only
		 * want that to happen for the very first page of the cluster_write,
		 * NOT the first page of each vector making up a multi-vector write.
		 */
		if (uio->uio_offset > oldEOF)
			oldEOF = uio->uio_offset;

		io_req_size -= io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
			(int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);

	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {

		retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);

		if (retval == 0 && *write_type == IO_DIRECT) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
				(int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		reset_vector_run_state();

	/*
	 * make sure all async writes issued as part of this stream
	 * have completed before we return
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_write_direct");

	if (iostate.io_error)
		retval = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	if (io_throttled == TRUE && retval == 0)

	if (io_req_size && retval == 0) {
		/*
		 * we couldn't handle the tail of this request in DIRECT mode
		 * so fire it through the copy path
		 *
		 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
		 * so we can just pass 0 in for the headOff and tailOff
		 */
		if (uio->uio_offset > oldEOF)
			oldEOF = uio->uio_offset;

		retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);

		*write_type = IO_UNKNOWN;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
		(int)uio->uio_offset, io_req_size, retval, 4, 0);
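/*
 * A minimal sketch of the vector-UPL batching decision used above for
 * multi-iovec direct writes: sub-UPLs are accumulated until either the
 * element count or the accumulated byte size hits its limit, at which point
 * the whole vector is issued with a single vector_cluster_io call (the code
 * also forces an issue when the next buffer's base is not page aligned).
 * The helper name is hypothetical; the block is illustrative only and not
 * compiled.
 */
#if 0
static boolean_t
example_should_issue_vector(int upl_count, upl_size_t vector_bytes, upl_size_t max_vector_bytes)
{
	/* full element table or enough bytes accumulated -> flush the vector */
	return (upl_count == MAX_VECTOR_UPL_ELEMENTS || vector_bytes >= max_vector_bytes);
}
#endif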
cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
		int (*callback)(buf_t, void *), void *callback_arg, int bflag)
	upl_page_info_t *pl;
	addr64_t src_paddr = 0;
	upl_t upl[MAX_VECTS];
	vm_offset_t upl_offset;
	u_int32_t tail_size = 0;
	upl_size_t upl_size;
	vm_size_t upl_needed_size;
	mach_msg_type_number_t pages_in_pl;
	upl_control_flags_t upl_flags;
	struct clios iostate;
	user_addr_t iov_base;
	u_int32_t devblocksize;
	u_int32_t mem_alignment_mask;

	/*
	 * When we enter this routine, we know
	 *  -- the io_req_size will not exceed iov_len
	 *  -- the target address is physically contiguous
	 */
	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

	io_size = *write_length;

	iov_base = uio_curriovbase(uio);

	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	upl_needed_size = upl_offset + io_size;

	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
		UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	kret = vm_map_get_upl(map,
		(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
		&upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * failed to get pagelist
		 */
		goto wait_for_cwrites;

	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 */
	if (upl_size < upl_needed_size) {
		/*
		 * This is a failure in the physical memory case.
		 */
		goto wait_for_cwrites;

	pl = ubc_upl_pageinfo(upl[cur_upl]);

	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		u_int32_t head_size;

		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);

			goto wait_for_cwrites;

		upl_offset += head_size;
		src_paddr += head_size;
		io_size -= head_size;

		iov_base += head_size;

	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/*
		 * request doesn't set up on a memory boundary
		 * the underlying DMA engine can handle...
		 * return an error instead of going through
		 * the slow copy path since the intent of this
		 * path is direct I/O from device memory
		 */
		goto wait_for_cwrites;

	tail_size = io_size & (devblocksize - 1);
	io_size -= tail_size;

	while (io_size && error == 0) {

		if (io_size > MAX_IO_CONTIG_SIZE)
			xsize = MAX_IO_CONTIG_SIZE;

		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since it's all issued against the same UPL
		 * if there are already too many outstanding writes
		 * wait until some have completed before issuing the next
		 */
		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");

		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes...
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			goto wait_for_cwrites;

		/*
		 * issue an asynchronous write to cluster_io
		 */
		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
			xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);

		/*
		 * The cluster_io write completed successfully,
		 * update the uio structure
		 */
		uio_update(uio, (user_size_t)xsize);

		upl_offset += xsize;

	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {

		error = cluster_io_type(uio, write_type, write_length, 0);

		if (error == 0 && *write_type == IO_CONTIG) {

		*write_type = IO_UNKNOWN;

	/*
	 * make sure all async writes that are part of this stream
	 * have completed before we proceed
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_write_contig");

	if (iostate.io_error)
		error = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	if (error == 0 && tail_size)
		error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);

	for (n = 0; n < num_upl; n++)
		/*
		 * just release our hold on each physically contiguous
		 * region without changing any state
		 */
		ubc_upl_abort(upl[n], 0);
/*
 * need to avoid a race between an msync of a range of pages dirtied via mmap
 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
 *
 * we should never force-zero-fill pages that are already valid in the cache...
 * the entire page contains valid data (either from disk, zero-filled or dirtied
 * via an mmap) so we can only do damage by trying to zero-fill
 */
cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
	boolean_t need_cluster_zero = TRUE;

	if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {

		bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
		zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);

		if (upl_valid_page(pl, zero_pg_index)) {
			/*
			 * never force zero valid pages - dirty or clean
			 * we'll leave these in the UPL for cluster_write_copy to deal with
			 */
			need_cluster_zero = FALSE;

	if (need_cluster_zero == TRUE)
		cluster_zero(upl, io_offset, bytes_to_zero, NULL);

	return (bytes_to_zero);
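/*
 * A worked example of the clipping above, assuming 4KB pages and the
 * IO_NOZEROVALID/IO_NOZERODIRTY case: with zero_off == 0x1F00 the zero range
 * is clipped to PAGE_SIZE - 0xF00 == 0x100 bytes so the zeroing never crosses
 * into the next page, and if the page at zero_pg_index is already valid in
 * the UPL nothing is zeroed at all.
 */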
cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
		off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
	upl_page_info_t *pl;
	vm_offset_t upl_offset = 0;
	long long total_size;
	long long zero_cnt1;
	off_t write_off = 0;
	boolean_t first_pass = FALSE;
	struct cl_extent cl;
	struct cl_writebehind *wbp;
	u_int max_cluster_pgcount;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
		(int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);

	io_resid = io_req_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
		0, 0, (int)oldEOF, (int)newEOF, 0);

	if (flags & IO_PASSIVE)

	if (flags & IO_NOCACHE)
		bflag |= CL_NOCACHE;

	if (flags & IO_SKIP_ENCRYPTION)
		bflag |= CL_ENCRYPTED;

	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

	if (flags & IO_HEADZEROFILL) {
		/*
		 * some filesystems (HFS is one) don't support unallocated holes within a file...
		 * so we zero fill the intervening space between the old EOF and the offset
		 * where the next chunk of real data begins.... ftruncate will also use this
		 * routine to zero fill to the new EOF when growing a file... in this case, the
		 * uio structure will not be provided
		 */
		if (headOff < uio->uio_offset) {
			zero_cnt = uio->uio_offset - headOff;

		} else if (headOff < newEOF) {
			zero_cnt = newEOF - headOff;

	if (uio && uio->uio_offset > oldEOF) {
		zero_off = uio->uio_offset & ~PAGE_MASK_64;

		if (zero_off >= oldEOF) {
			zero_cnt = uio->uio_offset - zero_off;

			flags |= IO_HEADZEROFILL;

	if (flags & IO_TAILZEROFILL) {
		zero_off1 = uio->uio_offset + io_req_size;

		if (zero_off1 < tailOff)
			zero_cnt1 = tailOff - zero_off1;

	if (uio && newEOF > oldEOF) {
		zero_off1 = uio->uio_offset + io_req_size;

		if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
			zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);

			flags |= IO_TAILZEROFILL;

	if (zero_cnt == 0 && uio == (struct uio *) 0) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
			retval, 0, 0, 0, 0);

	write_off = uio->uio_offset;
	write_cnt = uio_resid(uio);
	/*
	 * delay updating the sequential write info
	 * in the control block until we've obtained
	 */

	while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
		/*
		 * for this iteration of the loop, figure out where our starting point is
		 */
			start_offset = (int)(zero_off & PAGE_MASK_64);
			upl_f_offset = zero_off - start_offset;
		} else if (io_resid) {
			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
			upl_f_offset = uio->uio_offset - start_offset;

			start_offset = (int)(zero_off1 & PAGE_MASK_64);
			upl_f_offset = zero_off1 - start_offset;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
			(int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);

		if (total_size > max_io_size)
			total_size = max_io_size;

		cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);

		if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
			/*
			 * assumption... total_size <= io_resid
			 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
			 */
			if ((start_offset + total_size) > max_io_size)
				total_size = max_io_size - start_offset;
			xfer_resid = total_size;

			retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);

			io_resid -= (total_size - xfer_resid);
			total_size = xfer_resid;
			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
			upl_f_offset = uio->uio_offset - start_offset;

			if (total_size == 0) {
				/*
				 * the write did not finish on a page boundary
				 * which will leave upl_f_offset pointing to the
				 * beginning of the last page written instead of
				 * the page beyond it... bump it in this case
				 * so that the cluster code records the last page
				 */
					upl_f_offset += PAGE_SIZE_64;

		/*
		 * compute the size of the upl needed to encompass
		 * the requested write... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (upl_size > max_io_size)
			upl_size = max_io_size;

		pages_in_upl = upl_size / PAGE_SIZE;
		io_size = upl_size - start_offset;

		if ((long long)io_size > total_size)
			io_size = total_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);

		/*
		 * Gather the pages from the buffer cache.
		 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
		 * that we intend to modify these pages.
		 */
		kret = ubc_create_upl_kernel(vp,
			UPL_SET_LITE | (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
			VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS)
			panic("cluster_write_copy: failed to get pagelist");

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
			upl, (int)upl_f_offset, start_offset, 0, 0);

		if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {

			/*
			 * we're starting in the middle of the first page of the upl
			 * and the page isn't currently valid, so we're going to have
			 * to read it in first... this is a synchronous operation
			 */
			read_size = PAGE_SIZE;

			if ((upl_f_offset + read_size) > oldEOF)
				read_size = oldEOF - upl_f_offset;

			retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
				CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

				/*
				 * we had an error during the read which causes us to abort
				 * the current cluster_write request... before we do, we need
				 * to release the rest of the pages in the upl without modifying
				 * their state and mark the failed page in error
				 */
				ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);

				if (upl_size > PAGE_SIZE)
					ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
					upl, 0, 0, retval, 0);

		if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
			/*
			 * the last offset we're writing to in this upl does not end on a page
			 * boundary... if it's not beyond the old EOF, then we'll also need to
			 * pre-read this page in if it isn't already valid
			 */
			upl_offset = upl_size - PAGE_SIZE;

			if ((upl_f_offset + start_offset + io_size) < oldEOF &&
			    !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {

				read_size = PAGE_SIZE;

				if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF)
					read_size = oldEOF - (upl_f_offset + upl_offset);

				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
					CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

					/*
					 * we had an error during the read which causes us to abort
					 * the current cluster_write request... before we do, we
					 * need to release the rest of the pages in the upl without
					 * modifying their state and mark the failed page in error
					 */
					ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);

					if (upl_size > PAGE_SIZE)
						ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
						upl, 0, 0, retval, 0);

		xfer_resid = io_size;
		io_offset = start_offset;

		while (zero_cnt && xfer_resid) {

			if (zero_cnt < (long long)xfer_resid)
				bytes_to_zero = zero_cnt;

				bytes_to_zero = xfer_resid;

			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);

			xfer_resid -= bytes_to_zero;
			zero_cnt -= bytes_to_zero;
			zero_off += bytes_to_zero;
			io_offset += bytes_to_zero;

		if (xfer_resid && io_resid) {
			u_int32_t io_requested;

			bytes_to_move = min(io_resid, xfer_resid);
			io_requested = bytes_to_move;

			retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);

				ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
					upl, 0, 0, retval, 0);

				io_resid -= bytes_to_move;
				xfer_resid -= bytes_to_move;
				io_offset += bytes_to_move;

		while (xfer_resid && zero_cnt1 && retval == 0) {

			if (zero_cnt1 < (long long)xfer_resid)
				bytes_to_zero = zero_cnt1;

				bytes_to_zero = xfer_resid;

			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);

			xfer_resid -= bytes_to_zero;
			zero_cnt1 -= bytes_to_zero;
			zero_off1 += bytes_to_zero;
			io_offset += bytes_to_zero;

			int ret_cluster_try_push;

			io_size += start_offset;

			/* Force more restrictive zeroing behavior only on APFS */
			if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {

			if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
				/*
				 * if we're extending the file with this write
				 * we'll zero fill the rest of the page so that
				 * if the file gets extended again in such a way as to leave a
				 * hole starting at this EOF, we'll have zero's in the correct spot
				 */
				cluster_zero(upl, io_size, upl_size - io_size, NULL);

			/*
			 * release the upl now if we hold one since...
			 * 1) pages in it may be present in the sparse cluster map
			 *    and may span 2 separate buckets there... if they do and
			 *    we happen to have to flush a bucket to make room and it intersects
			 *    this upl, a deadlock may result on page BUSY
			 * 2) we're delaying the I/O... from this point forward we're just updating
			 *    the cluster state... no need to hold the pages, so commit them
			 * 3) IO_SYNC is set...
			 *    because we had to ask for a UPL that provides currently non-present pages, the
			 *    UPL has been automatically set to clear the dirty flags (both software and hardware)
			 *    upon committing it... this is not the behavior we want since it's possible for
			 *    pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
			 *    we'll pick these pages back up later with the correct behavior specified.
			 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
			 *    of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
			 *    we hold since the flushing context is holding the cluster lock.
			 */
			ubc_upl_commit_range(upl, 0, upl_size,
				UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

			/*
			 * calculate the last logical block number
			 * that this delayed I/O encompassed
			 */
			cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);

			if (flags & IO_SYNC) {
				/*
				 * if the IO_SYNC flag is set then we need to
				 * bypass any clusters and immediately issue
				 */

			/*
			 * take the lock to protect our accesses
			 * of the writebehind and sparse cluster state
			 */
			wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

			if (wbp->cl_scmap) {

				if ( !(flags & IO_NOCACHE)) {
					/*
					 * we've fallen into the sparse
					 * cluster method of delaying dirty pages
					 */
					sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);

					lck_mtx_unlock(&wbp->cl_lockw);

				/*
				 * must have done cached writes that fell into
				 * the sparse cluster mechanism... we've switched
				 * to uncached writes on the file, so go ahead
				 * and push whatever's in the sparse map
				 * and switch back to normal clustering
				 */
				sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg);

				/*
				 * no clusters of either type present at this point
				 * so just go directly to start_new_cluster since
				 * we know we need to delay this I/O since we've
				 * already released the pages back into the cache
				 * to avoid the deadlock with sparse_cluster_push
				 */
				goto start_new_cluster;

				if (write_off == wbp->cl_last_write)
					wbp->cl_seq_written += write_cnt;
					wbp->cl_seq_written = write_cnt;

				wbp->cl_last_write = write_off + write_cnt;

			if (wbp->cl_number == 0)
				/*
				 * no clusters currently present
				 */
				goto start_new_cluster;

			for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
				/*
				 * check each cluster that we currently hold
				 * try to merge some or all of this write into
				 * one or more of the existing clusters... if
				 * any portion of the write remains, start a
				 */
				if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
					/*
					 * the current write starts at or after the current cluster
					 */
					if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
						/*
						 * we have a write that fits entirely
						 * within the existing cluster limits
						 */
						if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
							/*
							 * update our idea of where the cluster ends
							 */
							wbp->cl_clusters[cl_index].e_addr = cl.e_addr;

					if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
						/*
						 * we have a write that starts in the middle of the current cluster
						 * but extends beyond the cluster's limit... we know this because
						 * of the previous checks
						 * we'll extend the current cluster to the max
						 * and update the b_addr for the current write to reflect that
						 * the head of it was absorbed into this cluster...
						 * note that we'll always have a leftover tail in this case since
						 * full absorption would have occurred in the clause above
						 */
						wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;

						cl.b_addr = wbp->cl_clusters[cl_index].e_addr;

					/*
					 * we come here for the case where the current write starts
					 * beyond the limit of the existing cluster or we have a leftover
					 * tail after a partial absorption
					 *
					 * in either case, we'll check the remaining clusters before
					 * starting a new one
					 */

					/*
					 * the current write starts in front of the cluster we're currently considering
					 */
					if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) {
						/*
						 * we can just merge the new request into
						 * this cluster and leave it in the cache
						 * since the resulting cluster is still
						 * less than the maximum allowable size
						 */
						wbp->cl_clusters[cl_index].b_addr = cl.b_addr;

						if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
							/*
							 * the current write completely
							 * envelops the existing cluster and since
							 * each write is limited to at most max_cluster_pgcount pages
							 * we can just use the start and last blocknos of the write
							 * to generate the cluster limits
							 */
							wbp->cl_clusters[cl_index].e_addr = cl.e_addr;

					/*
					 * if we were to combine this write with the current cluster
					 * we would exceed the cluster size limit.... so,
					 * let's see if there's any overlap of the new I/O with
					 * the cluster we're currently considering... in fact, we'll
					 * stretch the cluster out to its full limit and see if we
					 * get an intersection with the current write
					 */
					if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
						/*
						 * the current write extends into the proposed cluster
						 * clip the length of the current write after first combining its
						 * tail with the newly shaped cluster
						 */
						wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;

						cl.e_addr = wbp->cl_clusters[cl_index].b_addr;

					/*
					 * if we get here, there was no way to merge
					 * any portion of this write with this cluster
					 * or we could only merge part of it which
					 * will leave a tail...
					 * we'll check the remaining clusters before starting a new one
					 */

			if (cl_index < wbp->cl_number)
				/*
				 * we found an existing cluster(s) that we
				 * could entirely merge this I/O into
				 */

			if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) &&
			    wbp->cl_number == MAX_CLUSTERS &&
			    wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {

				if (vp->v_mount->mnt_minsaturationbytecount) {
					n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);

					if (n > MAX_CLUSTERS)

					if (disk_conditioner_mount_is_ssd(vp->v_mount))
						n = WRITE_BEHIND_SSD;

					cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL);

			if (wbp->cl_number < MAX_CLUSTERS) {
				/*
				 * we didn't find an existing cluster to
				 * merge into, but there's room to start
				 */
				goto start_new_cluster;

			/*
			 * no existing cluster to merge with and no
			 * room to start a new one... we'll try
			 * pushing one of the existing ones... if none of
			 * them are able to be pushed, we'll switch
			 * to the sparse cluster mechanism
			 * cluster_try_push updates cl_number to the
			 * number of remaining clusters... and
			 * returns the number of currently unused clusters
			 */
			ret_cluster_try_push = 0;

			/*
			 * if writes are not deferred, call cluster push immediately
			 */
			if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {

				ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL);

			/*
			 * execute following regardless of writes being deferred or not
			 */
			if (ret_cluster_try_push == 0) {
				/*
				 * no more room in the normal cluster mechanism
				 * so let's switch to the more expansive but expensive
				 * sparse mechanism....
				 */
				sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg);
				sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);

				lck_mtx_unlock(&wbp->cl_lockw);

			wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
			wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;

			wbp->cl_clusters[wbp->cl_number].io_flags = 0;

			if (flags & IO_NOCACHE)
				wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;

			if (bflag & CL_PASSIVE)
				wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;

			lck_mtx_unlock(&wbp->cl_lockw);

			/*
			 * we don't hold the lock at this point
			 *
			 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
			 * so that we correctly deal with a change in state of the hardware modify bit...
			 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
			 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
			 * responsible for generating the correct sized I/O(s)
			 */
			retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
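/*
 * A minimal sketch of the merge test used in the cluster scan above: a
 * delayed write described by [cl.b_addr, cl.e_addr] can be absorbed whole
 * into an existing cluster only while the combined extent stays within
 * max_cluster_pgcount pages of that cluster's starting block; anything else
 * is absorbed partially or pushed to a new/sparse cluster.  The helper name
 * and its use of struct cl_extent for the existing cluster are illustrative
 * assumptions; the block is not compiled.
 */
#if 0
static boolean_t
example_fits_in_cluster(struct cl_extent *cl, struct cl_extent *cluster, u_int max_cluster_pgcount)
{
	return (cl->b_addr >= cluster->b_addr &&
	    cl->e_addr <= (cluster->b_addr + max_cluster_pgcount));
}
#endif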
cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
	return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);

cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
	user_ssize_t cur_resid;
	u_int32_t read_length = 0;
	int read_type = IO_COPY;

	if (vp->v_flag & VNOCACHE_DATA)
		flags |= IO_NOCACHE;
	if ((vp->v_flag & VRAOFF) || speculative_reads_disabled)

	if (flags & IO_SKIP_ENCRYPTION)
		flags |= IO_ENCRYPTED;

	/*
	 * do a read through the cache if one of the following is true....
	 *   NOCACHE is not true
	 *   the uio request doesn't target USERSPACE
	 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
	 * Reading encrypted data from a CP filesystem should never result in the data touching
	 *
	 * otherwise, find out if we want the direct or contig variant for
	 * the first vector in the uio request
	 */
	if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED) ) {

		retval = cluster_io_type(uio, &read_type, &read_length, 0);

	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {

		switch (read_type) {

			/*
			 * make sure the uio_resid isn't too big...
			 * internally, we want to handle all of the I/O in
			 * chunk sizes that fit in a 32 bit int
			 */
			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE))
				io_size = MAX_IO_REQUEST_SIZE;

				io_size = (u_int32_t)cur_resid;

			retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);

			retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);

			retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);

			retval = cluster_io_type(uio, &read_type, &read_length, 0);
cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
	int abort_flags = UPL_ABORT_FREE_ON_EMPTY;

	if ((range = last_pg - start_pg)) {

		abort_flags |= UPL_ABORT_REFERENCE;

		ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
3740 cluster_read_copy(vnode_t vp
, struct uio
*uio
, u_int32_t io_req_size
, off_t filesize
, int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
3742 upl_page_info_t
*pl
;
3744 vm_offset_t upl_offset
;
3753 off_t last_ioread_offset
;
3754 off_t last_request_offset
;
3758 u_int32_t size_of_prefetch
;
3761 u_int32_t max_rd_size
;
3762 u_int32_t max_io_size
;
3763 u_int32_t max_prefetch
;
3764 u_int rd_ahead_enabled
= 1;
3765 u_int prefetch_enabled
= 1;
3766 struct cl_readahead
* rap
;
3767 struct clios iostate
;
3768 struct cl_extent extent
;
3770 int take_reference
= 1;
3771 int policy
= IOPOL_DEFAULT
;
3772 boolean_t iolock_inited
= FALSE
;
3774 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_START
,
3775 (int)uio
->uio_offset
, io_req_size
, (int)filesize
, flags
, 0);
3777 if (flags
& IO_ENCRYPTED
) {
3778 panic ("encrypted blocks will hit UBC!");
3781 policy
= throttle_get_io_policy(NULL
);
3783 if (policy
== THROTTLE_LEVEL_TIER3
|| policy
== THROTTLE_LEVEL_TIER2
|| (flags
& IO_NOCACHE
))
3786 if (flags
& IO_PASSIVE
)
3791 if (flags
& IO_NOCACHE
)
3792 bflag
|= CL_NOCACHE
;
3794 if (flags
& IO_SKIP_ENCRYPTION
)
3795 bflag
|= CL_ENCRYPTED
;
3797 max_io_size
= cluster_max_io_size(vp
->v_mount
, CL_READ
);
3798 max_prefetch
= MAX_PREFETCH(vp
, max_io_size
, disk_conditioner_mount_is_ssd(vp
->v_mount
));
3799 max_rd_size
= max_prefetch
;
3801 last_request_offset
= uio
->uio_offset
+ io_req_size
;
3803 if (last_request_offset
> filesize
)
3804 last_request_offset
= filesize
;
3806 if ((flags
& (IO_RAOFF
|IO_NOCACHE
)) || ((last_request_offset
& ~PAGE_MASK_64
) == (uio
->uio_offset
& ~PAGE_MASK_64
))) {
3807 rd_ahead_enabled
= 0;
3810 if (cluster_is_throttled(vp
)) {
3812 * we're in the throttle window, at the very least
3813 * we want to limit the size of the I/O we're about
3816 rd_ahead_enabled
= 0;
3817 prefetch_enabled
= 0;
3819 max_rd_size
= THROTTLE_MAX_IOSIZE
;
3821 if ((rap
= cluster_get_rap(vp
)) == NULL
)
3822 rd_ahead_enabled
= 0;
3824 extent
.b_addr
= uio
->uio_offset
/ PAGE_SIZE_64
;
3825 extent
.e_addr
= (last_request_offset
- 1) / PAGE_SIZE_64
;
3828 if (rap
!= NULL
&& rap
->cl_ralen
&& (rap
->cl_lastr
== extent
.b_addr
|| (rap
->cl_lastr
+ 1) == extent
.b_addr
)) {
3830 * determine if we already have a read-ahead in the pipe courtesy of the
3831 * last read systemcall that was issued...
3832 * if so, pick up it's extent to determine where we should start
3833 * with respect to any read-ahead that might be necessary to
3834 * garner all the data needed to complete this read systemcall
3836 last_ioread_offset
= (rap
->cl_maxra
* PAGE_SIZE_64
) + PAGE_SIZE_64
;
3838 if (last_ioread_offset
< uio
->uio_offset
)
3839 last_ioread_offset
= (off_t
)0;
3840 else if (last_ioread_offset
> last_request_offset
)
3841 last_ioread_offset
= last_request_offset
;
3843 last_ioread_offset
= (off_t
)0;
3845 while (io_req_size
&& uio
->uio_offset
< filesize
&& retval
== 0) {
3847 max_size
= filesize
- uio
->uio_offset
;
3849 if ((off_t
)(io_req_size
) < max_size
)
3850 io_size
= io_req_size
;
3854 if (!(flags
& IO_NOCACHE
)) {
3858 u_int32_t io_requested
;
3861 * if we keep finding the pages we need already in the cache, then
3862 * don't bother to call cluster_read_prefetch since it costs CPU cycles
3863 * to determine that we have all the pages we need... once we miss in
3864 * the cache and have issued an I/O, than we'll assume that we're likely
3865 * to continue to miss in the cache and it's to our advantage to try and prefetch
3867 if (last_request_offset
&& last_ioread_offset
&& (size_of_prefetch
= (last_request_offset
- last_ioread_offset
))) {
3868 if ((last_ioread_offset
- uio
->uio_offset
) <= max_rd_size
&& prefetch_enabled
) {
3870 * we've already issued I/O for this request and
3871 * there's still work to do and
3872 * our prefetch stream is running dry, so issue a
3873 * pre-fetch I/O... the I/O latency will overlap
3874 * with the copying of the data
3876 if (size_of_prefetch
> max_rd_size
)
3877 size_of_prefetch
= max_rd_size
;
3879 size_of_prefetch
= cluster_read_prefetch(vp
, last_ioread_offset
, size_of_prefetch
, filesize
, callback
, callback_arg
, bflag
);
3881 last_ioread_offset
+= (off_t
)(size_of_prefetch
* PAGE_SIZE
);
3883 if (last_ioread_offset
> last_request_offset
)
3884 last_ioread_offset
= last_request_offset
;
3888 * limit the size of the copy we're about to do so that
3889 * we can notice that our I/O pipe is running dry and
3890 * get the next I/O issued before it does go dry
3892 if (last_ioread_offset
&& io_size
> (max_io_size
/ 4))
3893 io_resid
= (max_io_size
/ 4);
3897 io_requested
= io_resid
;
3899 retval
= cluster_copy_ubc_data_internal(vp
, uio
, (int *)&io_resid
, 0, take_reference
);
3901 xsize
= io_requested
- io_resid
;
3904 io_req_size
-= xsize
;
3906 if (retval
|| io_resid
)
3908 * if we run into a real error or
3909 * a page that is not in the cache
3910 * we need to leave streaming mode
3914 if (rd_ahead_enabled
&& (io_size
== 0 || last_ioread_offset
== last_request_offset
)) {
3916 * we're already finished the I/O for this read request
3917 * let's see if we should do a read-ahead
3919 cluster_read_ahead(vp
, &extent
, filesize
, rap
, callback
, callback_arg
, bflag
);
3926 if (extent
.e_addr
< rap
->cl_lastr
)
3928 rap
->cl_lastr
= extent
.e_addr
;
3933 * recompute max_size since cluster_copy_ubc_data_internal
3934 * may have advanced uio->uio_offset
3936 max_size
= filesize
- uio
->uio_offset
;
3939 iostate
.io_completed
= 0;
3940 iostate
.io_issued
= 0;
3941 iostate
.io_error
= 0;
3942 iostate
.io_wanted
= 0;
3944 if ( (flags
& IO_RETURN_ON_THROTTLE
) ) {
3945 if (cluster_is_throttled(vp
) == THROTTLE_NOW
) {
3946 if ( !cluster_io_present_in_BC(vp
, uio
->uio_offset
)) {
3948 * we're in the throttle window and at least 1 I/O
3949 * has already been issued by a throttleable thread
3950 * in this window, so return with EAGAIN to indicate
3951 * to the FS issuing the cluster_read call that it
3952 * should now throttle after dropping any locks
3954 throttle_info_update_by_mount(vp
->v_mount
);
3963 * compute the size of the upl needed to encompass
3964 * the requested read... limit each call to cluster_io
3965 * to the maximum UPL size... cluster_io will clip if
3966 * this exceeds the maximum io_size for the device,
3967 * make sure to account for
3968 * a starting offset that's not page aligned
3970 start_offset
= (int)(uio
->uio_offset
& PAGE_MASK_64
);
3971 upl_f_offset
= uio
->uio_offset
- (off_t
)start_offset
;
3973 if (io_size
> max_rd_size
)
3974 io_size
= max_rd_size
;
3976 upl_size
= (start_offset
+ io_size
+ (PAGE_SIZE
- 1)) & ~PAGE_MASK
;
3978 if (flags
& IO_NOCACHE
) {
3979 if (upl_size
> max_io_size
)
3980 upl_size
= max_io_size
;
3982 if (upl_size
> max_io_size
/ 4) {
3983 upl_size
= max_io_size
/ 4;
3984 upl_size
&= ~PAGE_MASK
;
3987 upl_size
= PAGE_SIZE
;
3990 pages_in_upl
= upl_size
/ PAGE_SIZE
;
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl_kernel(vp,
					     upl_f_offset,
					     upl_size,
					     &upl,
					     &pl,
					     UPL_FILE_IO | UPL_SET_LITE,
					     VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS)
			panic("cluster_read_copy: failed to get pagelist");

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);

		/*
		 * scan from the beginning of the upl looking for the first
		 * non-valid page.... this will become the first page in
		 * the request we're going to make to 'cluster_io'... if all
		 * of the pages are valid, we won't call through to 'cluster_io'
		 */
		for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
			if (!upl_valid_page(pl, start_pg))
				break;
		}

		/*
		 * scan from the starting invalid page looking for a valid
		 * page before the end of the upl is reached, if we
		 * find one, then it will be the last page of the request to
		 * 'cluster_io'
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (upl_valid_page(pl, last_pg))
				break;
		}

		if (start_pg < last_pg) {
			/*
			 * we found a range of 'invalid' pages that must be filled
			 * if the last page in this range is the last page of the file
			 * we may have to clip the size of it to keep from reading past
			 * the end of the last physical block associated with the file
			 */
			if (iolock_inited == FALSE) {
				lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

				iolock_inited = TRUE;
			}
			upl_offset = start_pg * PAGE_SIZE;
			io_size    = (last_pg - start_pg) * PAGE_SIZE;

			if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
				io_size = filesize - (upl_f_offset + upl_offset);

			/*
			 * issue an asynchronous read to cluster_io
			 */
			error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
					   io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);

			if (extent.e_addr < rap->cl_maxra) {
				/*
				 * we've just issued a read for a block that should have been
				 * in the cache courtesy of the read-ahead engine... something
				 * has gone wrong with the pipeline, so reset the read-ahead
				 * logic which will cause us to restart from scratch
				 */
				rap->cl_maxra = 0;
			}
		}
		if (error == 0) {
			/*
			 * if the read completed successfully, or there was no I/O request
			 * issued, then copy the data into user land via 'cluster_upl_copy_data'
			 * we'll first add on any 'valid'
			 * pages that were present in the upl when we acquired it.
			 */
			for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
				if (!upl_valid_page(pl, uio_last))
					break;
			}
			if (uio_last < pages_in_upl) {
				/*
				 * there were some invalid pages beyond the valid pages
				 * that we didn't issue an I/O for, just release them
				 * unchanged now, so that any prefetch/readahead can
				 * operate against them
				 */
				ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
						    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
			}
			/*
			 * compute size to transfer this round, if io_req_size is
			 * still non-zero after this attempt, we'll loop around and
			 * set up for another I/O.
			 */
			val_size = (uio_last * PAGE_SIZE) - start_offset;

			if (val_size > max_size)
				val_size = max_size;

			if (val_size > io_req_size)
				val_size = io_req_size;

			if ((uio->uio_offset + val_size) > last_ioread_offset)
				last_ioread_offset = uio->uio_offset + val_size;

			if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {

				if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
					/*
					 * if there's still I/O left to do for this request, and...
					 * we're not in hard throttle mode, and...
					 * we're close to using up the previous prefetch, then issue a
					 * new pre-fetch I/O... the I/O latency will overlap
					 * with the copying of the data
					 */
					if (size_of_prefetch > max_rd_size)
						size_of_prefetch = max_rd_size;

					size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

					last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

					if (last_ioread_offset > last_request_offset)
						last_ioread_offset = last_request_offset;
				}

			} else if ((uio->uio_offset + val_size) == last_request_offset) {
				/*
				 * this transfer will finish this request, so...
				 * let's try to read ahead if we're in
				 * a sequential access pattern and we haven't
				 * explicitly disabled it
				 */
				if (rd_ahead_enabled)
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);

				if (extent.e_addr < rap->cl_lastr)
					rap->cl_lastr = extent.e_addr;
			}
			if (iolock_inited == TRUE)
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");

			if (iostate.io_error)
				error = iostate.io_error;
			else {
				u_int32_t io_requested;

				io_requested = val_size;

				retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);

				io_req_size -= (val_size - io_requested);
			}
		} else {
			if (iolock_inited == TRUE)
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
		}
		if (start_pg < last_pg) {
			/*
			 * compute the range of pages that we actually issued an I/O for
			 * and either commit them as valid if the I/O succeeded
			 * or abort them if the I/O failed or we're not supposed to
			 * keep them in the cache
			 */
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);

			if (error || (flags & IO_NOCACHE))
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
						    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
			else {
				int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;

				if (take_reference)
					commit_flags |= UPL_COMMIT_INACTIVATE;
				else
					commit_flags |= UPL_COMMIT_SPECULATE;

				ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
		}
		if ((last_pg - start_pg) < pages_in_upl) {
			/*
			 * the set of pages that we issued an I/O for did not encompass
			 * the entire upl... so just release these without modifying
			 * their state
			 */
			if (error)
				ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
			else {

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
					     upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

				/*
				 * handle any valid pages at the beginning of
				 * the upl... release these appropriately
				 */
				cluster_read_upl_release(upl, 0, start_pg, take_reference);

				/*
				 * handle any valid pages immediately after the
				 * pages we issued I/O for... release these appropriately
				 */
				cluster_read_upl_release(upl, last_pg, uio_last, take_reference);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
			}
		}
		if (cluster_is_throttled(vp)) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			rd_ahead_enabled = 0;
			prefetch_enabled = 0;

			max_rd_size = THROTTLE_MAX_IOSIZE;
		} else {
			if (max_rd_size == THROTTLE_MAX_IOSIZE) {
				/*
				 * coming out of throttled state
				 */
				if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
					rd_ahead_enabled = 1;
					prefetch_enabled = 1;
				}
				max_rd_size = max_prefetch;
				last_ioread_offset = 0;
			}
		}
	}
	if (iolock_inited == TRUE) {
		/*
		 * cluster_io returned an error after it
		 * had already issued some I/O. we need
		 * to wait for that I/O to complete before
		 * we can destroy the iostate mutex...
		 * 'retval' already contains the early error
		 * so no need to pick it up from iostate.io_error
		 */
		cluster_iostate_wait(&iostate, 0, "cluster_read_copy");

		lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	}
	if (rap != NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
			     (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);

		lck_mtx_unlock(&rap->cl_lockr);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
			     (int)uio->uio_offset, io_req_size, 0, retval, 0);
	}

	return (retval);
}
/*
 * We don't want another read/write lock for every vnode in the system
 * so we keep a hash of them here.  There should never be very many of
 * these around at any point in time.
 */
cl_direct_read_lock_t *cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
{
	struct cl_direct_read_locks *head
		= &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
					% CL_DIRECT_READ_LOCK_BUCKETS];

	struct cl_direct_read_lock *lck, *new_lck = NULL;

	for (;;) {
		lck_spin_lock(&cl_direct_read_spin_lock);

		LIST_FOREACH(lck, head, chain) {
			if (lck->vp == vp) {
				++lck->ref_count;
				lck_spin_unlock(&cl_direct_read_spin_lock);
				if (new_lck) {
					// Someone beat us to it, ditch the allocation
					lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp);
					FREE(new_lck, M_TEMP);
				}
				lck_rw_lock(&lck->rw_lock, type);
				return lck;
			}
		}

		if (new_lck) {
			// Use the lock we allocated
			LIST_INSERT_HEAD(head, new_lck, chain);
			lck_spin_unlock(&cl_direct_read_spin_lock);
			lck_rw_lock(&new_lck->rw_lock, type);
			return new_lck;
		}

		lck_spin_unlock(&cl_direct_read_spin_lock);

		// Allocate a new lock
		MALLOC(new_lck, cl_direct_read_lock_t *, sizeof(*new_lck),
		       M_TEMP, M_WAITOK);
		lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr);
		new_lck->vp = vp;
		new_lck->ref_count = 1;

		// Got to go round again
	}
}
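/*
 * Added commentary: the bucket index above is only a cheap hash of the vnode
 * pointer; collisions are harmless because each bucket is a list that is
 * searched for an exact vp match.  The allocate-then-retry loop exists
 * because the allocation happens outside the spin lock, so another thread
 * may have inserted a lock for the same vnode in the meantime; if so, the
 * freshly allocated lock is simply destroyed and the existing one is used.
 */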
void cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
{
	lck_rw_done(&lck->rw_lock);

	lck_spin_lock(&cl_direct_read_spin_lock);
	if (lck->ref_count == 1) {
		LIST_REMOVE(lck, chain);
		lck_spin_unlock(&cl_direct_read_spin_lock);
		lck_rw_destroy(&lck->rw_lock, cl_mtx_grp);
		FREE(lck, M_TEMP);
	} else {
		--lck->ref_count;
		lck_spin_unlock(&cl_direct_read_spin_lock);
	}
}
static int
cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
		    int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t	*pl;
	vm_offset_t	upl_offset, vector_upl_offset = 0;
	upl_size_t	upl_size, vector_upl_size = 0;
	vm_size_t	upl_needed_size;
	unsigned int	pages_in_pl;
	upl_control_flags_t upl_flags;
	int		force_data_sync;
	int		no_zero_fill = 0;
	struct clios	iostate;
	user_addr_t	iov_base;
	u_int32_t	io_req_size;
	u_int32_t	offset_in_file;
	u_int32_t	offset_in_iovbase;
	u_int32_t	devblocksize;
	u_int32_t	mem_alignment_mask;
	u_int32_t	max_upl_size;
	u_int32_t	max_rd_size;
	u_int32_t	max_rd_ahead;
	u_int32_t	max_vector_size;
	boolean_t	strict_uncached_IO = FALSE;
	boolean_t	io_throttled = FALSE;

	u_int32_t	vector_upl_iosize = 0;
	int		issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
	off_t		v_upl_uio_offset = 0;
	int		vector_upl_index = 0;
	upl_t		vector_upl = NULL;
	cl_direct_read_lock_t *lock = NULL;

	user_addr_t	orig_iov_base = 0;
	user_addr_t	last_iov_base = 0;
	user_addr_t	next_iov_base = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
		     (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);

	max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);

	max_rd_size = max_upl_size;
	max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);

	io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;

	if (flags & IO_PASSIVE)
		io_flag |= CL_PASSIVE;

	if (flags & IO_ENCRYPTED) {
		io_flag |= CL_RAW_ENCRYPTED;
	}

	if (flags & IO_NOCACHE) {
		io_flag |= CL_NOCACHE;
	}

	if (flags & IO_SKIP_ENCRYPTION)
		io_flag |= CL_ENCRYPTED;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
		     (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);

	if (devblocksize == 1) {
		/*
		 * the AFP client advertises a devblocksize of 1
		 * however, its BLOCKMAP routine maps to physical
		 * blocks that are PAGE_SIZE in size...
		 * therefore we can't ask for I/Os that aren't page aligned
		 * or aren't multiples of PAGE_SIZE in size
		 * by setting devblocksize to PAGE_SIZE, we re-instate
		 * the old behavior we had before the mem_alignment_mask
		 * changes went in...
		 */
		devblocksize = PAGE_SIZE;
	}

	strict_uncached_IO = ubc_strict_uncached_IO(vp);

	orig_iov_base = uio_curriovbase(uio);
	last_iov_base = orig_iov_base;

next_dread:
	io_req_size = *read_length;
	iov_base = uio_curriovbase(uio);

	offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;

	if (offset_in_file || offset_in_iovbase) {
		/*
		 * one of the 2 important offsets is misaligned
		 * so fire an I/O through the cache for this entire vector
		 */
		misaligned = 1;
	}
	if (iov_base & (devblocksize - 1)) {
		/*
		 * the offset in memory must be on a device block boundary
		 * so that we can guarantee that we can generate an
		 * I/O that ends on a page boundary in cluster_io
		 */
		misaligned = 1;
	}

	max_io_size = filesize - uio->uio_offset;

	/*
	 * The user must request IO in aligned chunks.  If the
	 * offset into the file is bad, or the userland pointer
	 * is non-aligned, then we cannot service the encrypted IO request.
	 */
	if (flags & IO_ENCRYPTED) {
		if (misaligned || (io_req_size & (devblocksize - 1)))
			retval = EINVAL;

		max_io_size = roundup(max_io_size, devblocksize);
	}

	if ((off_t)io_req_size > max_io_size)
		io_req_size = max_io_size;

	/*
	 * When we get to this point, we know...
	 *  -- the offset into the file is on a devblocksize boundary
	 */
	while (io_req_size && retval == 0) {

		if (cluster_is_throttled(vp)) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			max_rd_size = THROTTLE_MAX_IOSIZE;
			max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
			max_vector_size = THROTTLE_MAX_IOSIZE;
		} else {
			max_rd_size = max_upl_size;
			max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
			max_vector_size = MAX_VECTOR_UPL_SIZE;
		}
		io_start = io_size = io_req_size;

		/*
		 * First look for pages already in the cache
		 * and move them to user space.  But only do this
		 * check if we are not retrieving encrypted data directly
		 * from the filesystem; those blocks should never
		 * be in the UBC.
		 *
		 * cluster_copy_ubc_data returns the resid
		 * in io_size
		 */
		if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
			retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
		}
		/*
		 * calculate the number of bytes actually copied
		 * starting size - residual
		 */
		xsize = io_start - io_size;

		io_req_size -= xsize;

		if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
			/*
			 * We found something in the cache or we have an iov_base that's not
			 * page-aligned.
			 *
			 * Issue all I/O's that have been collected within this Vectored UPL.
			 */
			if (vector_upl_index) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}
			/*
			 * After this point, if we are using the Vector UPL path and the base is
			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
			 */
		}

		/*
		 * check to see if we are finished with this request.
		 *
		 * If we satisfied this IO already, then io_req_size will be 0.
		 * Otherwise, see if the IO was mis-aligned and needs to go through
		 * the UBC to deal with the 'tail'.
		 */
		if (io_req_size == 0 || (misaligned)) {
			/*
			 * see if there's another uio vector to
			 * process that's of type IO_DIRECT
			 *
			 * break out of while loop to get there
			 */
			break;
		}
		/*
		 * assume the request ends on a device block boundary
		 */
		io_min = devblocksize;

		/*
		 * we can handle I/O's in multiples of the device block size
		 * however, if io_size isn't a multiple of devblocksize we
		 * want to clip it back to the nearest page boundary since
		 * we are going to have to go through cluster_read_copy to
		 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
		 * multiple, we avoid asking the drive for the same physical
		 * blocks twice.. once for the partial page at the end of the
		 * request and a 2nd time for the page we read into the cache
		 * (which overlaps the end of the direct read) in order to
		 * get at the overhang bytes
		 */
		if (io_size & (devblocksize - 1)) {
			assert(!(flags & IO_ENCRYPTED));
			/*
			 * Clip the request to the previous page size boundary
			 * since request does NOT end on a device block boundary
			 */
			io_size &= ~PAGE_MASK;
			io_min = PAGE_SIZE;
		}
		if (retval || io_size < io_min) {
			/*
			 * either an error or we only have the tail left to
			 * complete via the copy path...
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dreads;
		}

		/*
		 * Don't re-check the UBC data if we are looking for uncached IO
		 * or asking for encrypted blocks.
		 */
		if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {

			if ((xsize = io_size) > max_rd_size)
				xsize = max_rd_size;

			io_size = 0;

			if (!lock) {
				/*
				 * We hold a lock here between the time we check the
				 * cache and the time we issue I/O.  This saves us
				 * from having to lock the pages in the cache.  Not
				 * all clients will care about this lock but some
				 * clients may want to guarantee stability between
				 * here and when the I/O is issued in which case they
				 * will take the lock exclusively.
				 */
				lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
			}

			ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);

			if (io_size == 0) {
				/*
				 * a page must have just come into the cache
				 * since the first page in this range is no
				 * longer absent, go back and re-evaluate
				 */
				continue;
			}
		}
		if ( (flags & IO_RETURN_ON_THROTTLE) ) {
			if (cluster_is_throttled(vp) == THROTTLE_NOW) {
				if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
					/*
					 * we're in the throttle window and at least 1 I/O
					 * has already been issued by a throttleable thread
					 * in this window, so return with EAGAIN to indicate
					 * to the FS issuing the cluster_read call that it
					 * should now throttle after dropping any locks
					 */
					throttle_info_update_by_mount(vp->v_mount);

					io_throttled = TRUE;
					goto wait_for_dreads;
				}
			}
		}
		if (io_size > max_rd_size)
			io_size = max_rd_size;

		iov_base = uio_curriovbase(uio);

		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

		if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0))
			no_zero_fill = 1;
		else
			no_zero_fill = 0;

		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
			if (no_zero_fill)
				upl_flags |= UPL_NOZEROFILL;
			if (force_data_sync)
				upl_flags |= UPL_FORCE_DATA_SYNC;

			kret = vm_map_create_upl(map,
						 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
						 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
					     (int)upl_offset, upl_size, io_size, kret, 0);
				/*
				 * failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_dreads;
			}
			pages_in_pl = upl_size / PAGE_SIZE;
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_page_present(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			ubc_upl_abort(upl, 0);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
				     (int)upl_offset, upl_size, io_size, kret, 0);

			goto wait_for_dreads;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size < upl_needed_size) {
			if (upl_size && upl_offset == 0)
				io_size = upl_size;
			else
				io_size = 0;
		}
		if (io_size == 0) {
			ubc_upl_abort(upl, 0);
			goto wait_for_dreads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, io_size, kret, 0);

		if (useVectorUPL) {
			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
			if (end_off)
				issueVectorUPL = 1;
			/*
			 * After this point, if we are using a vector UPL, then
			 * either all the UPL elements end on a page boundary OR
			 * this UPL is the last element because it does not end
			 * on a page boundary.
			 */
		}

		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next read
		 */
		cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");

		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error
			 * don't issue any more reads, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			ubc_upl_abort(upl, 0);

			goto wait_for_dreads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
			     upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);

		if (no_zero_fill)
			io_flag &= ~CL_PRESERVE;
		else
			io_flag |= CL_PRESERVE;

		if (!useVectorUPL)
			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		else {
			if (!vector_upl_index) {
				vector_upl = vector_upl_create(upl_offset);
				v_upl_uio_offset = uio->uio_offset;
				vector_upl_offset = upl_offset;
			}

			vector_upl_set_subupl(vector_upl, upl, upl_size);
			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
			vector_upl_index++;
			vector_upl_size += upl_size;
			vector_upl_iosize += io_size;

			if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}
		}
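		/*
		 * Added commentary: with multiple iovecs (useVectorUPL), the
		 * sub-UPLs built above are batched into a single vector UPL
		 * and only submitted once the batch fills up
		 * (MAX_VECTOR_UPL_ELEMENTS entries or max_vector_size bytes)
		 * or once a sub-UPL ends on a non page-aligned address
		 * (issueVectorUPL), since any later sub-UPL could no longer
		 * be stitched to it contiguously.
		 */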
		last_iov_base = iov_base + io_size;

		if (lock) {
			// We don't need to wait for the I/O to complete
			cluster_unlock_direct_read(lock);
			lock = NULL;
		}
		/*
		 * update the uio structure
		 */
		if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
			uio_update(uio, (user_size_t)max_io_size);
		} else {
			uio_update(uio, (user_size_t)io_size);
		}

		io_req_size -= io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
			     upl, (int)uio->uio_offset, io_req_size, retval, 0);

	} /* end while */

	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {

		retval = cluster_io_type(uio, read_type, read_length, 0);

		if (retval == 0 && *read_type == IO_DIRECT) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
				     (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);

			goto next_dread;
		}
	}

wait_for_dreads:

	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		reset_vector_run_state();
	}

	// We don't need to wait for the I/O to complete
	if (lock)
		cluster_unlock_direct_read(lock);

	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we return
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_read_direct");

	if (iostate.io_error)
		retval = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	if (io_throttled == TRUE && retval == 0)
		retval = EAGAIN;

	for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) {
		/*
		 * This is specifically done for pmap accounting purposes.
		 * vm_pre_fault() will call vm_fault() to enter the page into
		 * the pmap if there isn't _a_ physical page for that VA already.
		 */
		vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK));
	}

	if (io_req_size && retval == 0) {
		/*
		 * we couldn't handle the tail of this request in DIRECT mode
		 * so fire it through the copy path
		 */
		retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);

		*read_type = IO_UNKNOWN;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);

	return (retval);
}
static int
cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
		    int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
	upl_page_info_t	*pl;
	upl_t		upl[MAX_VECTS];
	vm_offset_t	upl_offset;
	addr64_t	dst_paddr = 0;
	user_addr_t	iov_base;
	upl_size_t	upl_size;
	vm_size_t	upl_needed_size;
	mach_msg_type_number_t	pages_in_pl;
	upl_control_flags_t upl_flags;
	struct clios	iostate;
	u_int32_t	devblocksize;
	u_int32_t	mem_alignment_mask;
	u_int32_t	tail_size = 0;

	if (flags & IO_PASSIVE)
		bflag = CL_PASSIVE;
	else
		bflag = 0;

	if (flags & IO_NOCACHE)
		bflag |= CL_NOCACHE;

	/*
	 * When we enter this routine, we know
	 *  -- the read_length will not exceed the current iov_len
	 *  -- the target address is physically contiguous for read_length
	 */
	cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

next_cread:
	io_size = *read_length;

	max_size = filesize - uio->uio_offset;

	if (io_size > max_size)
		io_size = max_size;

	iov_base = uio_curriovbase(uio);

	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	upl_needed_size = upl_offset + io_size;

	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
		     (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);

	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	kret = vm_map_get_upl(map,
			      (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
			      &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
		     (int)upl_offset, upl_size, io_size, kret, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * failed to get pagelist
		 */
		error = EINVAL;
		goto wait_for_creads;
	}
	num_upl++;

	if (upl_size < upl_needed_size) {
		/*
		 * The upl_size wasn't satisfied.
		 */
		error = EINVAL;
		goto wait_for_creads;
	}
	pl = ubc_upl_pageinfo(upl[cur_upl]);

	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		u_int32_t head_size;

		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);

		if (error)
			goto wait_for_creads;

		upl_offset += head_size;
		dst_paddr  += head_size;
		io_size    -= head_size;

		iov_base   += head_size;
	}
	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/*
		 * request doesn't set up on a memory boundary
		 * the underlying DMA engine can handle...
		 * return an error instead of going through
		 * the slow copy path since the intent of this
		 * path is direct I/O to device memory
		 */
		error = EINVAL;
		goto wait_for_creads;
	}

	tail_size = io_size & (devblocksize - 1);

	io_size -= tail_size;

	while (io_size && error == 0) {

		if (io_size > MAX_IO_CONTIG_SIZE)
			xsize = MAX_IO_CONTIG_SIZE;
		else
			xsize = io_size;
		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since its all issued against the same UPL
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next
		 */
		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");

		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error
			 * don't issue any more reads...
			 * go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			goto wait_for_creads;
		}
		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
				   CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
				   (buf_t)NULL, &iostate, callback, callback_arg);
		/*
		 * The cluster_io read was issued successfully,
		 * update the uio structure
		 */
		if (error == 0) {
			uio_update(uio, (user_size_t)xsize);

			dst_paddr  += xsize;
			upl_offset += xsize;
			io_size    -= xsize;
		}
	}
	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {

		error = cluster_io_type(uio, read_type, read_length, 0);

		if (error == 0 && *read_type == IO_CONTIG) {
			cur_upl++;
			goto next_cread;
		}
	} else
		*read_type = IO_UNKNOWN;

wait_for_creads:
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we proceed
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_read_contig");

	if (iostate.io_error)
		error = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	if (error == 0 && tail_size)
		error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);

	for (n = 0; n < num_upl; n++)
		/*
		 * just release our hold on each physically contiguous
		 * region without changing any state
		 */
		ubc_upl_abort(upl[n], 0);

	return (error);
}
static int
cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
{
	user_size_t	iov_len;
	user_addr_t	iov_base = 0;
	upl_t		upl;
	upl_size_t	upl_size;
	upl_control_flags_t upl_flags;
	int		retval = 0;

	/*
	 * skip over any empty vectors
	 */
	uio_update(uio, (user_size_t)0);

	iov_len = uio_curriovlen(uio);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);

	if (iov_len) {
		iov_base = uio_curriovbase(uio);
		/*
		 * make sure the size of the vector isn't too big...
		 * internally, we want to handle all of the I/O in
		 * chunk sizes that fit in a 32 bit int
		 */
		if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE)
			upl_size = MAX_IO_REQUEST_SIZE;
		else
			upl_size = (u_int32_t)iov_len;

		upl_flags = UPL_QUERY_OBJECT_TYPE;

		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
		if ((vm_map_get_upl(map,
				    (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
				    &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
			/*
			 * the user app must have passed in an invalid address
			 */
			retval = EFAULT;
		}
		*io_length = upl_size;

		if (upl_flags & UPL_PHYS_CONTIG)
			*io_type = IO_CONTIG;
		else if (iov_len >= min_length)
			*io_type = IO_DIRECT;
		else
			*io_type = IO_COPY;
	} else {
		/*
		 * nothing left to do for this uio
		 */
		*io_length = 0;
		*io_type = IO_UNKNOWN;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);

	return (retval);
}
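/*
 * Added commentary: UPL_QUERY_OBJECT_TYPE only probes the memory backing the
 * current iovec; it does not wire or map anything.  The classification that
 * falls out of it drives which read path is used: IO_CONTIG for physically
 * contiguous buffers (the cluster_read_contig path), IO_DIRECT when the
 * vector is at least min_length bytes, and IO_COPY for everything else,
 * which is serviced through the buffer cache.
 */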
/*
 * generate advisory I/O's in the largest chunks possible
 * the completed pages will be released into the VM cache
 */
int
advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
{
	return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
}

int
advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	upl_page_info_t	*pl;
	vm_offset_t	upl_offset;
	uint32_t	max_io_size;

	if ( !UBCINFOEXISTS(vp))
		return (EINVAL);

	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);

#if CONFIG_EMBEDDED
	if (max_io_size > speculative_prefetch_max_iosize)
		max_io_size = speculative_prefetch_max_iosize;
#else
	if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
		if (max_io_size > speculative_prefetch_max_iosize)
			max_io_size = speculative_prefetch_max_iosize;
	}
#endif

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
		     (int)f_offset, resid, (int)filesize, 0, 0);

	while (resid && f_offset < filesize && retval == 0) {
		/*
		 * compute the size of the upl needed to encompass
		 * the requested read... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		start_offset = (int)(f_offset & PAGE_MASK_64);
		upl_f_offset = f_offset - (off_t)start_offset;
		max_size     = filesize - f_offset;

		if (resid < max_size)
			io_size = resid;
		else
			io_size = max_size;

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		if ((uint32_t)upl_size > max_io_size)
			upl_size = max_io_size;

		skip_range = 0;
		/*
		 * return the number of contiguously present pages in the cache
		 * starting at upl_f_offset within the file
		 */
		ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);

		if (skip_range) {
			/*
			 * skip over pages already present in the cache
			 */
			io_size = skip_range - start_offset;

			f_offset += io_size;
			resid    -= io_size;

			if (skip_range == upl_size)
				continue;
			/*
			 * have to issue some real I/O
			 * at this point, we know it's starting on a page boundary
			 * because we've skipped over at least the first page in the request
			 */
			start_offset = 0;
			upl_f_offset += skip_range;
			upl_size     -= skip_range;
		}
		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl_kernel(vp,
					     upl_f_offset,
					     upl_size,
					     &upl,
					     &pl,
					     UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
					     VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS)
			return (retval);
		issued_io = 0;

		/*
		 * before we start marching forward, we must make sure we end on
		 * a present page, otherwise we will be working with a freed
		 * upl
		 */
		for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
			if (upl_page_present(pl, last_pg))
				break;
		}
		pages_in_upl = last_pg + 1;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);

		for (last_pg = 0; last_pg < pages_in_upl; ) {
			/*
			 * scan from the beginning of the upl looking for the first
			 * page that is present.... this will become the first page in
			 * the request we're going to make to 'cluster_io'... if all
			 * of the pages are absent, we won't call through to 'cluster_io'
			 */
			for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
				if (upl_page_present(pl, start_pg))
					break;
			}
			/*
			 * scan from the starting present page looking for an absent
			 * page before the end of the upl is reached, if we
			 * find one, then it will terminate the range of pages being
			 * presented to 'cluster_io'
			 */
			for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
				if (!upl_page_present(pl, last_pg))
					break;
			}

			if (last_pg > start_pg) {
				/*
				 * we found a range of pages that must be filled
				 * if the last page in this range is the last page of the file
				 * we may have to clip the size of it to keep from reading past
				 * the end of the last physical block associated with the file
				 */
				upl_offset = start_pg * PAGE_SIZE;
				io_size    = (last_pg - start_pg) * PAGE_SIZE;

				if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
					io_size = filesize - (upl_f_offset + upl_offset);

				/*
				 * issue an asynchronous read to cluster_io
				 */
				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
						    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

				issued_io = 1;
			}
		}
		if (issued_io == 0)
			ubc_upl_abort(upl, 0);

		io_size = upl_size - start_offset;

		if (io_size > resid)
			io_size = resid;
		f_offset += io_size;
		resid    -= io_size;
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
		     (int)f_offset, resid, retval, 0, 0);

	return (retval);
}
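/*
 * Added commentary: because the UPL above is created with UPL_RET_ONLY_ABSENT,
 * it is only populated for ranges that were not already resident in the cache.
 * That is why the scan loops look for "present" pages inside the UPL: those
 * are exactly the pages that still need to be read, and ranges of them are
 * handed to cluster_io as speculative, asynchronous reads.
 */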
int
cluster_push(vnode_t vp, int flags)
{
	return cluster_push_ext(vp, flags, NULL, NULL);
}

int
cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	return cluster_push_err(vp, flags, callback, callback_arg, NULL);
}

/* write errors via err, but return the number of clusters written */
int
cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
{
	int	my_sparse_wait = 0;
	struct	cl_writebehind *wbp;

	if ( !UBCINFOEXISTS(vp)) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
		return (0);
	}
	/* return if deferred write is set */
	if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
		return (0);
	}
	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
		return (0);
	}
	if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
		lck_mtx_unlock(&wbp->cl_lockw);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
		return (0);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
		     wbp->cl_scmap, wbp->cl_number, flags, 0, 0);

	/*
	 * if we have an fsync in progress, we don't want to allow any additional
	 * sync/fsync/close(s) to occur until it finishes.
	 * note that its possible for writes to continue to occur to this file
	 * while we're waiting and also once the fsync starts to clean if we're
	 * in the sparse map case
	 */
	while (wbp->cl_sparse_wait) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);

		msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
	}
	if (flags & IO_SYNC) {
		my_sparse_wait = 1;
		wbp->cl_sparse_wait = 1;

		/*
		 * this is an fsync (or equivalent)... we must wait for any existing async
		 * cleaning operations to complete before we evaluate the current state
		 * and finish cleaning... this ensures that all writes issued before this
		 * fsync actually get cleaned to the disk before this fsync returns
		 */
		while (wbp->cl_sparse_pushes) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);

			msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
		}
	}
	if (wbp->cl_scmap) {
		void	*scmap;

		if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {

			scmap = wbp->cl_scmap;
			wbp->cl_scmap = NULL;

			wbp->cl_sparse_pushes++;

			lck_mtx_unlock(&wbp->cl_lockw);

			retval = sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);

			lck_mtx_lock(&wbp->cl_lockw);

			wbp->cl_sparse_pushes--;

			if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0)
				wakeup((caddr_t)&wbp->cl_sparse_pushes);
		} else {
			retval = sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);
		}
	} else {
		retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, err);
	}
	lck_mtx_unlock(&wbp->cl_lockw);

	if (flags & IO_SYNC)
		(void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");

	if (my_sparse_wait) {
		/*
		 * I'm the owner of the serialization token
		 * clear it and wakeup anyone that is waiting
		 * for me to finish
		 */
		lck_mtx_lock(&wbp->cl_lockw);

		wbp->cl_sparse_wait = 0;
		wakeup((caddr_t)&wbp->cl_sparse_wait);

		lck_mtx_unlock(&wbp->cl_lockw);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
		     wbp->cl_scmap, wbp->cl_number, retval, 0, 0);

	return (retval);
}
__private_extern__ void
cluster_release(struct ubc_info *ubc)
{
	struct cl_writebehind *wbp;
	struct cl_readahead   *rap;

	if ((wbp = ubc->cl_wbehind)) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);

		if (wbp->cl_scmap)
			vfs_drt_control(&(wbp->cl_scmap), 0);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
	}

	rap = ubc->cl_rahead;

	if (wbp != NULL) {
		lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
		FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
	}
	if ((rap = ubc->cl_rahead)) {
		lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
		FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
	}
	ubc->cl_rahead  = NULL;
	ubc->cl_wbehind = NULL;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
}
static int
cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
{
	struct cl_wextent l_clusters[MAX_CLUSTERS];
	u_int	max_cluster_pgcount;

	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	/*
	 * the write behind context exists and has
	 * already been locked...
	 */
	if (wbp->cl_number == 0)
		/*
		 * no clusters to push
		 * return number of empty slots
		 */
		return (MAX_CLUSTERS);

	/*
	 * make a local 'sorted' copy of the clusters
	 * and clear wbp->cl_number so that new clusters can
	 * be developed
	 */
	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
			if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
				continue;
			if (min_index == -1)
				min_index = cl_index1;
			else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
				min_index = cl_index1;
		}
		if (min_index == -1)
			break;

		l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
		l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
		l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;

		wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;

		cl_len++;
	}
	wbp->cl_number = 0;

	/* skip switching to the sparse cluster mechanism if on diskimage */
	if ( ((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
	     !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) ) {
		int	i;

		/*
		 * determine if we appear to be writing the file sequentially
		 * if not, by returning without having pushed any clusters
		 * we will cause this vnode to be pushed into the sparse cluster mechanism
		 * used for managing more random I/O patterns
		 *
		 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
		 * that's why we're in try_push with PUSH_DELAY...
		 *
		 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
		 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
		 * so we can just make a simple pass through, up to, but not including the last one...
		 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
		 * are sequential
		 *
		 * we let the last one be partial as long as it was adjacent to the previous one...
		 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
		 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
		 */
		for (i = 0; i < MAX_CLUSTERS - 1; i++) {
			if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount)
				goto dont_try;
			if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
				goto dont_try;
		}
	}
	for (cl_index = 0; cl_index < cl_len; cl_index++) {
		int	flags;
		struct	cl_extent cl;

		flags = io_flags & (IO_PASSIVE|IO_CLOSE);

		/*
		 * try to push each cluster in turn...
		 */
		if (l_clusters[cl_index].io_flags & CLW_IONOCACHE)
			flags |= IO_NOCACHE;

		if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE)
			flags |= IO_PASSIVE;

		if (push_flag & PUSH_SYNC)
			flags |= IO_SYNC;

		cl.b_addr = l_clusters[cl_index].b_addr;
		cl.e_addr = l_clusters[cl_index].e_addr;

		retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg);

		if (error == 0 && retval)
			error = retval;

		l_clusters[cl_index].b_addr = 0;
		l_clusters[cl_index].e_addr = 0;

		cl_pushed++;

		if ( !(push_flag & PUSH_ALL) )
			break;
	}
dont_try:
	if (cl_len > cl_pushed) {
		/*
		 * we didn't push all of the clusters, so
		 * lets try to merge them back in to the vnode
		 */
		if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
			/*
			 * we picked up some new clusters while we were trying to
			 * push the old ones... this can happen because I've dropped
			 * the vnode lock... the sum of the
			 * leftovers plus the new cluster count exceeds our ability
			 * to represent them, so switch to the sparse cluster mechanism
			 *
			 * collect the active public clusters...
			 */
			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);

			for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
					continue;
				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			wbp->cl_number = cl_index1;

			/*
			 * and collect the original clusters that were moved into the
			 * local storage for sorting purposes
			 */
			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
		} else {
			/*
			 * we've got room to merge the leftovers back in
			 * just append them starting at the next 'hole'
			 * represented by wbp->cl_number
			 */
			for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
					continue;

				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			wbp->cl_number = cl_index1;
		}
	}
	return (MAX_CLUSTERS - wbp->cl_number);
}
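/*
 * Added commentary: the sort above is a simple selection sort on b_addr; a
 * cluster slot whose b_addr equals its e_addr is treated as empty, which is
 * also how pushed entries are marked (both fields are zeroed) so the merge
 * loops can skip them when folding unpushed clusters back into wbp.
 */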
static int
cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t	*pl;
	vm_offset_t	upl_offset;

	if (flags & IO_PASSIVE)
		bflag = CL_PASSIVE;
	else
		bflag = 0;

	if (flags & IO_SKIP_ENCRYPTION)
		bflag |= CL_ENCRYPTED;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
		     (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);

	if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);

		return (0);
	}
	upl_size = pages_in_upl * PAGE_SIZE;
	upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);

	if (upl_f_offset + upl_size >= EOF) {

		if (upl_f_offset >= EOF) {
			/*
			 * must have truncated the file and missed
			 * clearing a dangling cluster (i.e. it's completely
			 * beyond the new EOF)
			 */
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);

			return (0);
		}
		size = EOF - upl_f_offset;

		upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		pages_in_upl = upl_size / PAGE_SIZE;
	} else
		size = upl_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);

	/*
	 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
	 *
	 * - only pages that are currently dirty are returned... these are the ones we need to clean
	 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
	 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
	 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
	 *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
	 *
	 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
	 */
	if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
	else
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;

	kret = ubc_create_upl_kernel(vp,
				     upl_f_offset,
				     upl_size,
				     &upl,
				     &pl,
				     upl_flags,
				     VM_KERN_MEMORY_FILE);
	if (kret != KERN_SUCCESS)
		panic("cluster_push: failed to get pagelist");

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);

	/*
	 * since we only asked for the dirty pages back
	 * it's possible that we may only get a few or even none, so...
	 * before we start marching forward, we must make sure we know
	 * where the last present page is in the UPL, otherwise we could
	 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
	 * employed by commit_range and abort_range.
	 */
	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
		if (upl_page_present(pl, last_pg))
			break;
	}
	pages_in_upl = last_pg + 1;

	if (pages_in_upl == 0) {
		ubc_upl_abort(upl, 0);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
		return (0);
	}

	for (last_pg = 0; last_pg < pages_in_upl; ) {
		/*
		 * find the next dirty page in the UPL
		 * this will become the first page in the
		 * next I/O to generate
		 */
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
			if (upl_dirty_page(pl, start_pg))
				break;
			if (upl_page_present(pl, start_pg))
				/*
				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
				 * just release these unchanged since we're not going
				 * to steal them or change their state
				 */
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
		}
		if (start_pg >= pages_in_upl)
			/*
			 * done... no more dirty pages to push
			 */
			break;
		if (start_pg > last_pg)
			/*
			 * skipped over some non-dirty pages
			 */
			size -= ((start_pg - last_pg) * PAGE_SIZE);

		/*
		 * find a range of dirty pages to write
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (!upl_dirty_page(pl, last_pg))
				break;
		}
		upl_offset = start_pg * PAGE_SIZE;

		io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

		io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;

		if ( !(flags & IO_SYNC))
			io_flags |= CL_ASYNC;

		if (flags & IO_CLOSE)
			io_flags |= CL_CLOSE;

		if (flags & IO_NOCACHE)
			io_flags |= CL_NOCACHE;

		retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
				    io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

		if (error == 0 && retval)
			error = retval;

		size -= io_size;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);

	return (error);
}
/*
 * sparse_cluster_switch is called with the write behind lock held
 */
static void
sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
{
	int	cl_index;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0);

	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		int	flags;
		struct cl_extent cl;

		for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {

			if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
				if (flags & UPL_POP_DIRTY) {
					cl.e_addr = cl.b_addr + 1;

					sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg);
				}
			}
		}
	}
	wbp->cl_number = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0);
}
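/*
 * Added commentary: this walks every page of every traditional cluster,
 * queries the VM object for its dirty bit, and records each dirty page as a
 * one-page extent in the sparse (dirty-region-tracking) map.  After the
 * switch, the traditional cluster slots are emptied and the sparse map
 * becomes the record of what still needs to be pushed.
 */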
/*
 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
 * still associated with the write-behind context... however, if the scmap has been disassociated
 * from the write-behind context (the cluster_push case), the wb lock is not held
 */
static int
sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	struct cl_extent cl;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);

	if (push_flag & PUSH_ALL)
		vfs_drt_control(scmap, 1);

	for (;;) {
		if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS)
			break;

		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);

		retval = cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg);
		if (error == 0 && retval)
			error = retval;

		if ( !(push_flag & PUSH_ALL) )
			break;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0);

	return (error);
}
/*
 * sparse_cluster_add is called with the write behind lock held
 */
static void
sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);

	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;

	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */
		sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg);

		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0);
}
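/*
 * Added commentary: vfs_drt_mark_pages() reports, via new_dirty, how many
 * pages it managed to record before running out of room.  The loop above
 * pushes some existing dirty regions to free map space and then retries with
 * only the remaining (offset, length) portion of the extent, so no dirty
 * pages are silently dropped.
 */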
static int
cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t	*pl;

	if (flags & IO_PASSIVE)
		bflag = CL_PASSIVE;
	else
		bflag = 0;

	if (flags & IO_NOCACHE)
		bflag |= CL_NOCACHE;

	upl_flags = UPL_SET_LITE;

	if ( !(flags & CL_READ) ) {
		/*
		 * "write" operation:  let the UPL subsystem know
		 * that we intend to modify the buffer cache pages
		 * we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	} else {
		/*
		 * indicate that there is no need to pull the
		 * mapping for this page... we're only going
		 * to read from it, not modify it.
		 */
		upl_flags |= UPL_FILE_IO;
	}
	kret = ubc_create_upl_kernel(vp,
				     uio->uio_offset & ~PAGE_MASK_64,
				     PAGE_SIZE,
				     &upl,
				     &pl,
				     upl_flags,
				     VM_KERN_MEMORY_FILE);

	if (kret != KERN_SUCCESS)
		return (EINVAL);

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
				   CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
		if (error) {
			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

			return (error);
		}
		did_read = 1;
	}
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

	/*
	 * NOTE:  There is no prototype for the following in BSD. It, and the definitions
	 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	 * osfmk/ppc/mappings.h.  They are not included here because there appears to be no
	 * way to do so without exporting them to kexts as well.
	 */
	if (flags & CL_READ)
		// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4);		/* Copy physical to physical and flush the destination */
	else
		// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8);		/* Copy physical to physical and flush the source */

	if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
				   bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	}
	if (error == 0)
		uio_update(uio, (user_size_t)xsize);

	if (did_read)
		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	else
		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);

	return (error);
}
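/*
 * Added commentary: the literal flag values above (2 | 1 | 4 and 2 | 1 | 8)
 * stand in for the symbolic cppvPsrc | cppvPsnk | cppvFsnk and
 * cppvPsrc | cppvPsnk | cppvFsrc combinations shown in the commented-out
 * calls; the 4 vs. 8 bit is the only difference, selecting whether the
 * destination or the source cache lines are flushed after the copy.
 */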
int
cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
{
	upl_page_info_t	*pl;

	xsize = *io_resid;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, upl_offset, xsize, 0, 0);

	segflg = uio->uio_segflg;

	switch(segflg) {

	  case UIO_USERSPACE32:
	  case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	  case UIO_USERSPACE:
	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	  case UIO_USERSPACE64:
	  case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	  default:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}
	pl = ubc_upl_pageinfo(upl);

	pg_index  = upl_offset / PAGE_SIZE;
	pg_offset = upl_offset & PAGE_MASK;
	csize     = min(PAGE_SIZE - pg_offset, xsize);

	dirty_count = 0;
	while (xsize && retval == 0) {
		addr64_t  paddr;

		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
		if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE))
			dirty_count++;

		retval = uiomove64(paddr, csize, uio);

		pg_index += 1;
		pg_offset = 0;
		xsize    -= csize;
		csize     = min(PAGE_SIZE, xsize);
	}
	*io_resid = xsize;

	uio->uio_segflg = segflg;

	task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, xsize, retval, segflg, 0);

	return (retval);
}
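/*
 * Added commentary: uio_segflg is temporarily switched to the UIO_PHYS_*
 * variants because the addresses handed to uiomove64() here are physical
 * page addresses obtained from upl_phys_page(), not the caller's virtual
 * addresses; the original segment flag is restored before returning.
 */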
int
cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
{
	return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1));
}
static int
cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
{
	int		segflg;
	int		io_size;
	int		xsize;
	int		start_offset;
	int		retval = 0;
	memory_object_control_t	control;

	io_size = *io_resid;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);

	control = ubc_getobject(vp, UBC_FLAGS_NONE);

	if (control == MEMORY_OBJECT_CONTROL_NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
			     (int)uio->uio_offset, io_size, retval, 3, 0);

		return(0);
	}
	segflg = uio->uio_segflg;

	switch(segflg) {

	  case UIO_USERSPACE32:
	  case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	  case UIO_USERSPACE64:
	  case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	  case UIO_USERSPACE:
	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	  case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}

	if ( (io_size = *io_resid) ) {
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		xsize = uio_resid(uio);

		retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
						       start_offset, io_size, mark_dirty, take_reference);
		xsize -= uio_resid(uio);
		io_size -= xsize;
	}
	uio->uio_segflg = segflg;
	*io_resid       = io_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);

	return(retval);
}
int
is_file_clean(vnode_t vp, off_t filesize)
{
	off_t	f_offset;
	int	flags;
	int	total_dirty = 0;

	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
		if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
			if (flags & UPL_POP_DIRTY) {
				total_dirty++;
			}
		}
	}
	if (total_dirty)
		return(EINVAL);

	return (0);
}
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */

/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES		((1024 * 1024) / PAGE_SIZE)

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK		(~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)

/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)											\
	do {														\
		(scm)->scm_hashtable[(i)].dhe_control =									\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
	} while (0)
#define DRT_HASH_CLEAR(scm, i)						\
	do {								\
		(scm)->scm_hashtable[(i)].dhe_control = 0;		\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i)										\
	do {													\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;		\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);								\
	} while (0)

/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401
/*
 * Physical memory required before the large hash modulus is permitted.
 *
 * On small memory systems, the large hash modulus can lead to physical
 * memory starvation, so we avoid using it there.
 */
#define DRT_HASH_LARGE_MEMORY_REQUIRED	(1024LL * 1024LL * 1024LL)	/* 1GiB */

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */

#define DRT_HASH_SET_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	      &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	      (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
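/*
 * Illustrative sketch (assuming a 4K PAGE_SIZE, so DRT_BITVECTOR_PAGES == 256
 * and each hashtable entry covers 1MB of file): a dirty page at file offset
 * 0x254000 has
 *
 *	DRT_ALIGN_ADDRESS(0x254000)        == 0x200000	(entry address)
 *	(0x254000 - 0x200000) / PAGE_SIZE  == 84	(bit index in the entry)
 *	DRT_HASH(scm, 0x200000)            == 12	(0x200000 % 23, small modulus)
 *
 * so, before any collision probing, DRT_HASH_SET_BIT(scm, 12, 84) sets
 * bit 84 % 32 == 20 of dhe_bitvector[84 / 32] == dhe_bitvector[2] in bucket 12.
 */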
/*
 * Hashtable entry.
 */
struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
/*
 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
 * DRT_BITVECTOR_PAGES is defined as ((1024 * 1024) / PAGE_SIZE).
 * Since PAGE_SIZE is only known at boot time,
 *	- define MAX_DRT_BITVECTOR_PAGES for the smallest supported page size (4k)
 *	- declare the dhe_bitvector array for the largest possible length
 */
#define MAX_DRT_BITVECTOR_PAGES	((1024 * 1024) / (4 * 1024))
	u_int32_t	dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
};
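/*
 * Size check for the entry layout above (with the smallest supported 4K page
 * size): MAX_DRT_BITVECTOR_PAGES == 256, so dhe_bitvector is 256 / 32 == 8
 * u_int32_t words (32 bytes) and each entry is 8 + 32 == 40 bytes.  That is
 * the "entry size is 40 bytes" figure used to pick the hashtable moduli
 * above: 23 * 40 == 920 fits in the 1024-byte small allocation and
 * 401 * 40 == 16040 fits in the 16384-byte large allocation.
 */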
/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};


#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)
/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
							    * dirty */
							   /* 0, setcount */
							   /* 1 (clean, no map) */
							   /* 2 (map alloc fail) */
							   /* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
							    * lastclean, iskips */
static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
	u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
	u_int64_t offset, int *indexp, int recursed);
static kern_return_t	vfs_drt_do_mark_pages(
	void		**cmapp,
	u_int64_t	offset,
	u_int		length,
	u_int		*setcountp,
	int		dirty);
static void		vfs_drt_trace(
	struct vfs_drt_clustermap *cmap,
	int code,
	int arg1,
	int arg2,
	int arg3,
	int arg4);
static void		vfs_drt_sanity(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_mark_pages(void **cmapp, off_t offset,
	u_int length, u_int *setcountp);
static kern_return_t	vfs_drt_unmark_pages(void **cmapp, off_t offset,
	u_int length);
static kern_return_t	vfs_drt_get_cluster(void **cmapp, off_t *offsetp,
	u_int *lengthp);
static kern_return_t	vfs_drt_control(void **cmapp, int op_type);
/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap, *ocmap;
	kern_return_t	kret;
	u_int64_t	offset;
	u_int32_t	i;
	int		nsize, active_buckets, index, copycount;

	ocmap = NULL;
	if (cmapp != NULL)
		ocmap = *cmapp;

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		nsize = DRT_HASH_SMALL_MODULUS;
	} else {
		/* count the number of active buckets in the old map */
		active_buckets = 0;
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0))
				active_buckets++;
		}
		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/*
			 * If the ring is nearly full and we are allowed to
			 * use the large modulus, upgrade.
			 */
			if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
				nsize = DRT_HASH_LARGE_MODULUS;
			} else {
				nsize = DRT_HASH_SMALL_MODULUS;
			}
		} else {
			/* already using the large modulus */
			nsize = DRT_HASH_LARGE_MODULUS;
			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * array and return.
			 */
			if (active_buckets >= DRT_HASH_LARGE_MODULUS)
				return(KERN_SUCCESS);
		}
	}

	/*
	 * Allocate and initialise the new map.
	 */
	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
			  (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION, VM_KERN_MEMORY_FILE);
	if (kret != KERN_SUCCESS)
		return(kret);
	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = nsize;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;
	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}

	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	copycount = 0;
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0))
				continue;
			/* get new index */
			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
				index = 0;
			}
			/* copy */
			DRT_HASH_COPY(ocmap, i, cmap, index);
			copycount++;
		}
	}

	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	*cmapp = cmap;
	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
			      ocmap->scm_modulus,
			      ocmap->scm_buckets,
			      ocmap->scm_lastclean,
			      ocmap->scm_iskips);

		vfs_drt_free_map(ocmap);
	}
	return(KERN_SUCCESS);
}
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
	kmem_free(kernel_map, (vm_offset_t)cmap,
		  (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	return(KERN_SUCCESS);
}
/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
	int		index;
	u_int32_t	i;

	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* traverse the hashtable */
	for (i = 0; i < cmap->scm_modulus; i++) {

		/*
		 * If the slot is vacant, we can stop.
		 */
		if (DRT_HASH_VACANT(cmap, index))
			break;

		/*
		 * If the address matches our offset, we have success.
		 */
		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
			*indexp = index;
			return(KERN_SUCCESS);
		}

		/*
		 * Move to the next slot, try again.
		 */
		index = DRT_HASH_NEXT(cmap, index);
	}
	/*
	 * It's not there.
	 */
	return(KERN_FAILURE);
}
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t	kret;
	u_int32_t	index;
	u_int32_t	i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS)
		return(kret);

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/* slot vacant? */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean)
				cmap->scm_lastclean = index;
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return(KERN_SUCCESS);
		}
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed)
		return(KERN_FAILURE);
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return(kret);
}
/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
	void		**private,
	u_int64_t	offset,
	u_int		length,
	u_int		*setcountp,
	int		dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t	kret;
	int		i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL)
		*setcountp = 0;

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return(KERN_SUCCESS);
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return(kret);
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;	/* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			if (setcountp != NULL)
				*setcountp = setcount;
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return(kret);
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL)
		*setcountp = setcount;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return(KERN_SUCCESS);
}
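/*
 * Worked example for the pgoff/pgcount computation above (assuming a 4K
 * PAGE_SIZE, so DRT_BITVECTOR_PAGES == 256): with offset == 0x2fc000 and
 * length == 0x20000 (32 pages),
 *
 *	DRT_ALIGN_ADDRESS(0x2fc000)                 == 0x200000
 *	pgoff   = (0x2fc000 - 0x200000) / PAGE_SIZE == 252
 *	pgcount = min(32, 256 - 252)                == 4
 *
 * so the first pass touches bits 252-255 of that entry, then offset advances
 * to 0x300000 and the remaining 28 pages fall into the next entry.
 */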
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
	/* XXX size unused, drop from interface */
	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}

static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
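#if 0
/*
 * Illustrative sketch only: one way a caller might drive the interfaces
 * above using the opaque map handle.  The names used here ("scmap",
 * "push_offset", "push_length", "new_dirty") are hypothetical and do not
 * appear elsewhere in this file.
 */
static void
vfs_drt_usage_sketch(void)
{
	void	*scmap = NULL;		/* opaque handle, managed by vfs_drt_mark_pages */
	off_t	 push_offset;
	u_int	 push_length;
	u_int	 new_dirty;

	/* note 16 pages as dirty, starting at file offset 0 */
	vfs_drt_mark_pages(&scmap, 0, 16 * PAGE_SIZE, &new_dirty);

	/* drain the map one cluster of contiguous dirty pages at a time */
	while (vfs_drt_get_cluster(&scmap, &push_offset, &push_length) == KERN_SUCCESS) {
		/* push [push_offset, push_offset + push_length) to disk here */
	}
	/* scmap is NULL again once the last cluster has been returned */
}
#endif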
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t	offset;
	u_int		length;
	u_int32_t	j;
	int		index, i, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
			continue;

		/* scan the bitfield for a string of bits */
		fs = -1;

		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/* didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map");
		}
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i))
				break;
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return(KERN_SUCCESS);
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * then free it
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
		      cmap->scm_modulus,
		      cmap->scm_buckets,
		      cmap->scm_lastclean,
		      cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return(KERN_FAILURE);
}
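/*
 * Continuing the earlier sketch (4K pages): if bits 84-87 of an entry whose
 * address is 0x200000 are set, the scan above finds fs == 84 and ls == 4,
 * so the cluster returned is offset == 0x200000 + 84 * PAGE_SIZE == 0x254000
 * and length == 4 * PAGE_SIZE == 0x4000, and those four bits are cleared
 * before returning.
 */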
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
	struct vfs_drt_clustermap *cmap;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL))
		return(KERN_FAILURE);
	cmap = *cmapp;

	switch (op_type) {
	case 0:
		/* emit stats into trace buffer */
		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
			      cmap->scm_modulus,
			      cmap->scm_buckets,
			      cmap->scm_lastclean,
			      cmap->scm_iskips);

		vfs_drt_free_map(cmap);
		*cmapp = NULL;

		break;

	case 1:
		cmap->scm_lastclean = 0;
		break;
	}
	return(KERN_SUCCESS);
}
/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
	      __unused int arg1, __unused int arg2, __unused int arg3,
	      __unused int arg4)
{
}
#endif
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index))
			continue;

		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i))
				bits_on++;
		}
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
			panic("bits_on = %d,  index = %d\n", bits_on, index);
	}
}