/*
 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>

#include <kern/task.h>
#include <kern/policy_internal.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_fault.h>

#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>

#include <vfs/vfs_disk_conditioner.h>
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT

#define CL_WRITE	0x02
#define CL_ASYNC	0x04
#define CL_COMMIT	0x08
#define CL_PAGEOUT	0x10
#define CL_NOZERO	0x40
#define CL_PAGEIN	0x80
#define CL_DEV_MEMORY	0x100
#define CL_PRESERVE	0x200
#define CL_THROTTLE	0x400
#define CL_KEEPCACHED	0x800
#define CL_DIRECT_IO	0x1000
#define CL_PASSIVE	0x2000
#define CL_IOSTREAMING	0x4000
#define CL_CLOSE	0x8000
#define CL_ENCRYPTED	0x10000
#define CL_RAW_ENCRYPTED	0x20000
#define CL_NOCACHE	0x40000

#define MAX_VECTOR_UPL_ELEMENTS	8
#define MAX_VECTOR_UPL_SIZE	(2 * MAX_UPL_SIZE_BYTES)

#define CLUSTER_IO_WAITING	((buf_t)1)
extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
extern void vector_upl_set_pagelist(upl_t);
extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
struct clios {
	lck_mtx_t io_mtxp;
	u_int	io_completed;	/* amount of io that has currently completed */
	u_int	io_issued;	/* amount of io that was successfully issued */
	int	io_error;	/* error code of first error encountered */
	int	io_wanted;	/* someone is sleeping waiting for a change in state */
};
struct cl_direct_read_lock {
	LIST_ENTRY(cl_direct_read_lock)	chain;
};

#define CL_DIRECT_READ_LOCK_BUCKETS 61

static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
	cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];

static lck_spin_t cl_direct_read_spin_lock;
static lck_grp_t	*cl_mtx_grp;
static lck_attr_t	*cl_mtx_attr;
static lck_grp_attr_t	*cl_mtx_grp_attr;
static lck_mtx_t	*cl_transaction_mtxp;

#define	PUSH_DELAY	0x01
#define PUSH_ALL	0x02
#define PUSH_SYNC	0x04
static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
		      int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
static int cluster_is_throttled(vnode_t vp);

static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);

static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
			     int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int (*)(buf_t, void *), void *callback_arg, int flags);

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
			      off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
				int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
				int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void	cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int	cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg, int *err);

static void	sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
static int	sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg);
static void	sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);
/*
 * For throttled IO to check whether
 * a block is cached by the boot cache
 * and thus it can avoid delaying the IO.
 *
 * bootcache_contains_block is initially
 * NULL. The BootCache will set it while
 * the cache is active and clear it when
 * the cache is jettisoned.
 *
 * Returns 0 if the block is not
 * contained in the cache, 1 if it is
 * contained.
 *
 * The function pointer remains valid
 * after the cache has been evicted even
 * if bootcache_contains_block has been
 * cleared.
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE	(1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE	MAX_UPL_SIZE_BYTES

/*
 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
 * allowing the caller to bypass the buffer cache.  For small I/Os (less than 16k),
 * we have not historically allowed the write to bypass the UBC.
 */
#define MIN_DIRECT_WRITE_SIZE	(16384)

#define WRITE_THROTTLE		6
#define WRITE_THROTTLE_SSD	2
#define WRITE_BEHIND		1
#define WRITE_BEHIND_SSD	1

#if CONFIG_EMBEDDED
#define PREFETCH_SSD		1
uint32_t speculative_prefetch_max = (2048 * 1024);		/* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use in a speculative read-ahead */
#else
#define PREFETCH_SSD		2
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);	/* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use in a speculative read-ahead on SSDs */
#endif
#define IO_SCALE(vp, base)		(vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)		(cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)	(size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))

int	speculative_reads_disabled = 0;

/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define THROTTLE_MAXCNT	0

uint32_t throttle_max_iosize = (128 * 1024);

#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");


void
cluster_init(void)
{
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();

	cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_transaction_mtxp == NULL)
		panic("cluster_init: failed to allocate cl_transaction_mtxp");

	lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr);

	for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i)
		LIST_INIT(&cl_direct_read_locks[i]);
}
uint32_t
cluster_max_io_size(mount_t mp, int type)
{
	uint32_t	max_io_size;
	uint32_t	segcnt;
	uint32_t	maxcnt;

	switch (type) {

	case CL_READ:
		segcnt = mp->mnt_segreadcnt;
		maxcnt = mp->mnt_maxreadcnt;
		break;
	case CL_WRITE:
		segcnt = mp->mnt_segwritecnt;
		maxcnt = mp->mnt_maxwritecnt;
		break;
	default:
		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
		break;
	}
	if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
		/*
		 * don't allow a size beyond the max UPL size we can create
		 */
		segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	}
	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

	if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
		/*
		 * don't allow a size smaller than the old fixed limit
		 */
		max_io_size = MAX_UPL_TRANSFER_BYTES;
	} else {
		/*
		 * make sure the size specified is a multiple of PAGE_SIZE
		 */
		max_io_size &= ~PAGE_MASK;
	}
	return (max_io_size);
}
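
/*
 * Illustrative sketch (not part of the build): the clamping above reduces to
 * plain arithmetic.  A user-space model of the same computation, assuming a
 * 4 KB page and stand-in limits for MAX_UPL_SIZE_BYTES / MAX_UPL_TRANSFER_BYTES
 * (all EX_* names and the sample values are invented for the example).
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SIZE	4096u			/* assumed page size for the example */
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)
#define EX_MAX_UPL	(8u * 1024 * 1024)	/* stand-in for MAX_UPL_SIZE_BYTES */
#define EX_OLD_LIMIT	(256u * 1024)		/* stand-in for MAX_UPL_TRANSFER_BYTES */

/* model of the segcnt/maxcnt clamp performed above */
static uint32_t
example_max_io_size(uint32_t segcnt, uint32_t maxcnt)
{
	uint32_t max_io_size;

	if (segcnt > (EX_MAX_UPL / EX_PAGE_SIZE))
		segcnt = EX_MAX_UPL / EX_PAGE_SIZE;	/* cap at the largest UPL we can build */

	max_io_size = (segcnt * EX_PAGE_SIZE) < maxcnt ? (segcnt * EX_PAGE_SIZE) : maxcnt;

	if (max_io_size < EX_OLD_LIMIT)
		max_io_size = EX_OLD_LIMIT;		/* never below the old fixed limit */
	else
		max_io_size &= ~EX_PAGE_MASK;		/* otherwise trim to a page multiple */

	return max_io_size;
}

int
main(void)
{
	/* a device advertising 2048 segments and a 1000000-byte transfer cap */
	printf("%u\n", example_max_io_size(2048, 1000000));	/* prints 999424 (page aligned) */
	return 0;
}
#endif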
#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02
#define CLW_IONOCACHE		0x04
#define CLW_IOPASSIVE		0x08
/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
			ubc->cl_rahead = rap;
		else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
		return (rap);

	return ((struct cl_readahead *)NULL);
}
/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */
static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
	struct ubc_info		*ubc;
	struct cl_writebehind	*wbp;

	ubc = vp->v_ubcinfo;

	if ((wbp = ubc->cl_wbehind) == NULL) {

		if ( !(flags & CLW_ALLOCATE))
			return ((struct cl_writebehind *)NULL);

		MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL)
			ubc->cl_wbehind = wbp;
		else {
			lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
			FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED)
		lck_mtx_lock(&wbp->cl_lockw);

	return (wbp);
}
static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
	struct cl_writebehind *wbp;

	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {

		if (wbp->cl_number) {
			lck_mtx_lock(&wbp->cl_lockw);

			cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL);

			lck_mtx_unlock(&wbp->cl_lockw);
		}
	}
}
static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
	daddr64_t	blkno;
	size_t		io_size;
	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

	if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
		if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL))
			return (0);

		if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
			return (1);
	}
	return (0);
}


static int
cluster_is_throttled(vnode_t vp)
{
	return (throttle_io_will_be_throttled(-1, vp->v_mount));
}
static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{
	lck_mtx_lock(&iostate->io_mtxp);

	while ((iostate->io_issued - iostate->io_completed) > target) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);

		iostate->io_wanted = 1;
		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);
	}
	lck_mtx_unlock(&iostate->io_mtxp);
}
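
/*
 * Illustrative sketch (not part of the build): a user-space analogue of the
 * io_issued/io_completed/io_wanted handshake used by cluster_iostate_wait()
 * above and by the completion path in cluster_iodone().  The struct, function
 * names and the pthread condition variable standing in for msleep()/wakeup()
 * are assumptions of the example, not kernel interfaces.
 */
#if 0
#include <pthread.h>

struct ex_iostate {
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;		/* stands in for msleep()/wakeup() on io_wanted */
	unsigned	io_issued;	/* bytes handed to the lower layer */
	unsigned	io_completed;	/* bytes the completion handler has retired */
	int		io_wanted;	/* a waiter is blocked on a state change */
};

/* issuer side: block until no more than 'target' bytes remain in flight */
static void
ex_iostate_wait(struct ex_iostate *st, unsigned target)
{
	pthread_mutex_lock(&st->mtx);
	while ((st->io_issued - st->io_completed) > target) {
		st->io_wanted = 1;
		pthread_cond_wait(&st->cv, &st->mtx);
	}
	pthread_mutex_unlock(&st->mtx);
}

/* completion side: retire 'size' bytes and wake any waiter */
static void
ex_iostate_complete(struct ex_iostate *st, unsigned size)
{
	int need_wakeup = 0;

	pthread_mutex_lock(&st->mtx);
	st->io_completed += size;
	if (st->io_wanted) {
		st->io_wanted = 0;
		need_wakeup = 1;
	}
	pthread_mutex_unlock(&st->mtx);

	if (need_wakeup)
		pthread_cond_broadcast(&st->cv);
}
#endif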
static void cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
					  upl_offset_t upl_offset, upl_size_t size)
{
	upl_t associated_upl = upl_associated_upl(upl);

	if (!associated_upl)
		return;

	printf("1: %d %d\n", upl_offset, upl_offset + size);

	/*
	 * The associated UPL is page aligned to file offsets whereas the
	 * UPL it's attached to has different alignment requirements.  The
	 * upl_offset that we have refers to @upl.  The code that follows
	 * has to deal with the first and last pages in this transaction
	 * which might straddle pages in the associated UPL.  To keep
	 * track of these pages, we use the mark bits: if the mark bit is
	 * set, we know another transaction has completed its part of that
	 * page and so we can unlock that page here.
	 *
	 * The following illustrates what we have to deal with:
	 *
	 *    MEM u <------------ 1 PAGE ------------> e
	 *        +-------------+----------------------+-----------------
	 *        |             |######################|#################
	 *        +-------------+----------------------+-----------------
	 *   FILE | <--- a ---> o <------------ 1 PAGE ------------>
	 *
	 * So here we show a write to offset @o.  The data that is to be
	 * written is in a buffer that is not page aligned; it has offset
	 * @a in the page.  The upl that carries the data starts in memory
	 * at @u.  The associated upl starts in the file at offset @o.  A
	 * transaction will always end on a page boundary (like @e above)
	 * except for the very last transaction in the group.  We cannot
	 * unlock the page at @o in the associated upl until both the
	 * transaction ending at @e and the following transaction (that
	 * starts at @e) has completed.
	 */

	/*
	 * We record whether or not the two UPLs are aligned as the mark
	 * bit in the first page of @upl.
	 */
	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	bool is_unaligned = upl_page_get_mark(pl, 0);

	if (is_unaligned) {
		upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);

		upl_offset_t upl_end = upl_offset + size;
		assert(upl_end >= PAGE_SIZE);

		upl_size_t assoc_upl_size = upl_get_size(associated_upl);

		/*
		 * In the very first transaction in the group, upl_offset will
		 * not be page aligned, but after that it will be and in that
		 * case we want the preceding page in the associated UPL hence
		 * the minus one.
		 */
		if (upl_offset)
			upl_offset = trunc_page_32(upl_offset - 1);

		lck_mtx_lock_spin(&iostate->io_mtxp);

		// Look at the first page...
		if (upl_offset
		    && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
			/*
			 * The first page isn't marked so let another transaction
			 * completion handle it.
			 */
			upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
			upl_offset += PAGE_SIZE;
		}

		// And now the last page...

		/*
		 * This needs to be > rather than >= because if it's equal, it
		 * means there's another transaction that is sharing the last
		 * page.
		 */
		if (upl_end > assoc_upl_size)
			upl_end = assoc_upl_size;
		else {
			upl_end = trunc_page_32(upl_end);
			const int last_pg = (upl_end >> PAGE_SHIFT) - 1;

			if (!upl_page_get_mark(assoc_pl, last_pg)) {
				/*
				 * The last page isn't marked so mark the page and let another
				 * transaction completion handle it.
				 */
				upl_page_set_mark(assoc_pl, last_pg, true);
				upl_end -= PAGE_SIZE;
			}
		}

		lck_mtx_unlock(&iostate->io_mtxp);

		printf("2: %d %d\n", upl_offset, upl_end);

		if (upl_end <= upl_offset)
			return;

		size = upl_end - upl_offset;
	} else {
		assert(!(upl_offset & PAGE_MASK));
		assert(!(size & PAGE_MASK));
	}

	boolean_t empty;

	/*
	 * We can unlock these pages now and as this is for a
	 * direct/uncached write, we want to dump the pages too.
	 */
	kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
					   UPL_ABORT_DUMP_PAGES, &empty);

	if (!kr && empty) {
		upl_set_associated_upl(upl, NULL);
		upl_deallocate(associated_upl);
	}
}
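
/*
 * Illustrative sketch (not part of the build): the first/last-page trimming
 * above is plain page arithmetic.  A simplified user-space model, assuming
 * 4 KB pages; the EX_* names, helper and the first/last "shared but unmarked"
 * flags standing in for the mark-bit checks are invented for the example.
 */
#if 0
#include <stdint.h>

#define EX_PAGE_SIZE	4096u
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)
#define ex_trunc_page(x)	((x) & ~(uint32_t)EX_PAGE_MASK)

/*
 * Given the transaction's [upl_offset, upl_offset + size) window and the size
 * of the associated UPL, compute the page-aligned range that this completion
 * may release; boundary pages shared with a neighbouring transaction that has
 * not completed yet are left out.
 */
static void
ex_associated_window(uint32_t upl_offset, uint32_t size, uint32_t assoc_upl_size,
    int first_shared_unmarked, int last_shared_unmarked,
    uint32_t *out_start, uint32_t *out_end)
{
	uint32_t upl_end = upl_offset + size;

	if (upl_offset != 0) {
		/* include the preceding page, which straddles the previous transaction */
		upl_offset = ex_trunc_page(upl_offset - 1);
		if (first_shared_unmarked)
			upl_offset += EX_PAGE_SIZE;	/* leave it for the other completion */
	}

	if (upl_end > assoc_upl_size) {
		upl_end = assoc_upl_size;
	} else {
		upl_end = ex_trunc_page(upl_end);
		if (last_shared_unmarked)
			upl_end -= EX_PAGE_SIZE;	/* same for the trailing shared page */
	}

	*out_start = upl_offset;
	*out_end = (upl_end > upl_offset) ? upl_end : upl_offset;	/* empty window if nothing to release */
}
#endif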
static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
{
	int upl_abort_code = 0;
	int page_in  = 0;
	int page_out = 0;

	if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE))
		/*
		 * direct write of any flavor, or a direct read that wasn't aligned
		 */
		ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	else {
		if (io_flags & B_PAGEIO) {
			if (io_flags & B_READ)
				page_in  = 1;
			else
				page_out = 1;
		}
		if (io_flags & B_CACHE)
			/*
			 * leave pages in the cache unchanged on error
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_out && ((error != ENXIO) || vnode_isswap(vp)))
			/*
			 * transient error... leave pages unchanged
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_in)
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
		else
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

		ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	}
	return (upl_abort_code);
}
static int
cluster_iodone(buf_t bp, void *callback_arg)
{
	int	b_flags;
	int	error;
	int	total_size;
	int	total_resid;
	int	upl_offset;
	int	zero_offset;
	int	pg_offset = 0;
	int	commit_size = 0;
	int	upl_flags = 0;
	int	transaction_size = 0;
	upl_t	upl;
	buf_t	cbp;
	buf_t	cbp_head;
	buf_t	cbp_next;
	buf_t	real_bp;
	vnode_t	vp;
	struct clios *iostate;
	boolean_t transaction_complete = FALSE;

	__IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
		lck_mtx_lock_spin(cl_transaction_mtxp);

		bp->b_flags |= B_TDONE;

		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			/*
			 * all I/O requests that are part of this transaction
			 * have to complete before we can process it
			 */
			if ( !(cbp->b_flags & B_TDONE)) {

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
					     cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);

				return 0;
			}

			if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
					     cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);
				wakeup(cbp);

				return 0;
			}

			if (cbp->b_flags & B_EOT)
				transaction_complete = TRUE;
		}
		lck_mtx_unlock(cl_transaction_mtxp);

		if (transaction_complete == FALSE) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     cbp_head, 0, 0, 0, 0);
			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp        = cbp_head;
	vp         = cbp->b_vp;
	upl_offset = cbp->b_uploffset;
	upl        = cbp->b_upl;
	b_flags    = cbp->b_flags;
	real_bp    = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate    = (struct clios *)cbp->b_iostate;

	if (real_bp)
		real_bp->b_dev = cbp->b_dev;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		if (cbp_next == NULL)
			/*
			 * compute the overall size of the transaction
			 * in case we created one that has 'holes' in it
			 * 'total_size' represents the amount of I/O we
			 * did, not the span of the transaction w/r to the UPL
			 */
			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;

		if (cbp != cbp_head)
			free_io_buf(cbp);

		cbp = cbp_next;
	}

	if (ISSET(b_flags, B_COMMIT_UPL)) {
		cluster_handle_associated_upl(iostate,
					      cbp_head->b_upl, upl_offset, transaction_size);
	}

	if (error == 0 && total_resid)
		error = EIO;

	if (error == 0) {
		int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

		if (cliodone_func != NULL) {
			cbp_head->b_bcount = transaction_size;

			error = (*cliodone_func)(cbp_head, callback_arg);
		}
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	free_io_buf(cbp_head);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(&iostate->io_mtxp);

		if (need_wakeup)
			wakeup((caddr_t)&iostate->io_wanted);
	}

	if (b_flags & B_COMMIT_UPL) {

		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error)
			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
		else {
			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ))
				upl_flags |= UPL_COMMIT_SET_DIRTY;

			if (b_flags & B_AGE)
				upl_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
		}
	}
	if (real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
		     upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

	return (error);
}
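
/*
 * Illustrative sketch (not part of the build): the commit range above is just
 * the transaction span rounded out to whole pages.  The same arithmetic in
 * isolation, assuming 4 KB pages; the EX_* names and sample values are
 * invented for the example.
 */
#if 0
#include <stdint.h>
#include <assert.h>

#define EX_PAGE_SIZE	4096u
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)

/*
 * Round an arbitrary [upl_offset, upl_offset + transaction_size) span out to
 * the enclosing page-aligned range, the way cluster_iodone() does before
 * calling ubc_upl_commit_range()/cluster_ioerror().
 */
static void
ex_commit_range(uint32_t upl_offset, uint32_t transaction_size,
    uint32_t *commit_offset, uint32_t *commit_size)
{
	uint32_t pg_offset = upl_offset & EX_PAGE_MASK;

	*commit_offset = upl_offset - pg_offset;	/* page-aligned start */
	*commit_size = (pg_offset + transaction_size + (EX_PAGE_SIZE - 1)) & ~EX_PAGE_MASK;
}

static void
ex_commit_range_check(void)
{
	uint32_t off, size;

	ex_commit_range(5000, 3000, &off, &size);	/* span [5000, 8000) */
	assert(off == 4096 && size == 4096);		/* covers exactly one page */
}
#endif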
uint32_t
cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
{
	if (cluster_is_throttled(vp)) {
		*limit = THROTTLE_MAX_IOSIZE;
		return 1;
	}
	return 0;
}
void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {
		upl_page_info_t	*pl;
		addr64_t	zero_addr;

		pl = ubc_upl_pageinfo(upl);

		if (upl_device_page(pl) == TRUE) {
			zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;

			bzero_phys_nc(zero_addr, size);
		} else {
			while (size) {
				int	page_offset;
				int	page_index;
				int	zero_cnt;

				page_index  = upl_offset / PAGE_SIZE;
				page_offset = upl_offset & PAGE_MASK;

				zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
				zero_cnt  = min(PAGE_SIZE - page_offset, size);

				bzero_phys(zero_addr, zero_cnt);

				size       -= zero_cnt;
				upl_offset += zero_cnt;
			}
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}
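
/*
 * Illustrative sketch (not part of the build): when the UPL is not a single
 * device range, cluster_zero() above walks it one page at a time.  A
 * user-space analogue of that loop over an array of page buffers; the types,
 * names and the 4 KB page size are assumptions of the example.
 */
#if 0
#include <string.h>
#include <stdint.h>

#define EX_PAGE_SIZE	4096u
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)

/*
 * Zero 'size' bytes starting at byte 'upl_offset' of a buffer that is only
 * reachable one page at a time (pages[] stands in for upl_phys_page()).
 */
static void
ex_zero_by_page(uint8_t *pages[], uint32_t upl_offset, uint32_t size)
{
	while (size) {
		uint32_t page_index  = upl_offset / EX_PAGE_SIZE;
		uint32_t page_offset = upl_offset & EX_PAGE_MASK;
		uint32_t zero_cnt    = EX_PAGE_SIZE - page_offset;

		if (zero_cnt > size)
			zero_cnt = size;	/* don't run past the request */

		memset(pages[page_index] + page_offset, 0, zero_cnt);

		size       -= zero_cnt;
		upl_offset += zero_cnt;
	}
}
#endif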
static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
	cbp_head->b_validend = zero_offset;
	cbp_tail->b_flags |= B_EOT;
}
static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	buf_t	cbp;

	if (async) {
		/*
		 * Async callback completion will not normally generate a
		 * wakeup upon I/O completion.  To get woken up, we set
		 * b_trans_next (which is safe for us to modify) on the last
		 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
		 * to wake us up when all buffers as part of this transaction
		 * are completed.  This is done under the umbrella of
		 * cl_transaction_mtxp which is also taken in cluster_iodone.
		 */
		bool done = true;
		buf_t last = NULL;

		lck_mtx_lock_spin(cl_transaction_mtxp);

		for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
			if (!ISSET(cbp->b_flags, B_TDONE))
				done = false;
		}

		if (!done) {
			last->b_trans_next = CLUSTER_IO_WAITING;

			DTRACE_IO1(wait__start, buf_t, last);
			do {
				msleep(last, cl_transaction_mtxp, PSPIN | (PRIBIO+1), "cluster_wait_IO", NULL);

				/*
				 * We should only have been woken up if all the
				 * buffers are completed, but just in case...
				 */
				done = true;
				for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
					if (!ISSET(cbp->b_flags, B_TDONE)) {
						done = false;
						break;
					}
				}
			} while (!done);
			DTRACE_IO1(wait__done, buf_t, last);

			last->b_trans_next = NULL;
		}

		lck_mtx_unlock(cl_transaction_mtxp);
	} else {
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
			buf_biowait(cbp);
	}
}
static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
	buf_t	cbp;
	int	error;
	boolean_t isswapout = FALSE;

	/*
	 * cluster_complete_transaction will
	 * only be called if we've issued a complete chain in synchronous mode
	 * or, we've already done a cluster_wait_IO on an incomplete chain
	 */
	if (needwait) {
		for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
			buf_biowait(cbp);
	}
	/*
	 * we've already waited on all of the I/Os in this transaction,
	 * so mark all of the buf_t's in this transaction as B_TDONE
	 * so that cluster_iodone sees the transaction as completed
	 */
	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
		cbp->b_flags |= B_TDONE;
	cbp = *cbp_head;

	if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp))
		isswapout = TRUE;

	error = cluster_iodone(cbp, callback_arg);

	if ( !(flags & CL_ASYNC) && error && *retval == 0) {
		if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
			*retval = error;
		else if (isswapout == TRUE)
			*retval = error;
	}
	*cbp_head = (buf_t)NULL;
}
static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	   int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	buf_t	cbp;
	u_int	size;
	u_int	io_size;
	int	io_flags;
	int	bmap_flags;
	int	error = 0;
	int	retval = 0;
	buf_t	cbp_head = NULL;
	buf_t	cbp_tail = NULL;
	int	trans_count = 0;
	int	max_trans_count;
	u_int	pg_count;
	int	pg_offset;
	u_int	max_iosize;
	u_int	max_vectors;
	int	priv;
	int	zero_offset = 0;
	int	async_throttle = 0;
	mount_t	mp;
	vm_offset_t upl_end_offset;
	boolean_t need_EOT = FALSE;

	/*
	 * we currently don't support buffers larger than a page
	 */
	if (real_bp && non_rounded_size > PAGE_SIZE)
		panic("%s(): Called with real buffer of size %d bytes which "
		      "is greater than the maximum allowed size of "
		      "%d bytes (the system PAGE_SIZE).\n",
		      __FUNCTION__, non_rounded_size, PAGE_SIZE);

	mp = vp->v_mount;

	/*
	 * we don't want to do any funny rounding of the size for IO requests
	 * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
	 * belong to us... we can't extend (nor do we need to) the I/O to fill
	 * out a page
	 */
	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
		/*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zero's that are sitting beyond the EOF
		 * so that in case the filesystem doesn't explicitly zero this area
		 * if a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
		pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	} else {
		/*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size
		 * AFP is one such filesystem/device
		 */
		size = non_rounded_size;
	}
	upl_end_offset = upl_offset + size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);

	/*
	 * Set the maximum transaction size to the maximum desired number of
	 * buffers.
	 */
	max_trans_count = 8;
	if (flags & CL_DEV_MEMORY)
		max_trans_count = 16;

	if (flags & CL_READ) {
		io_flags = B_READ;
		bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
		io_flags = B_WRITE;
		bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	/*
	 * Ensure the maximum iosize is sensible.
	 */
	if (!max_iosize)
		max_iosize = PAGE_SIZE;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
			if (max_iosize > THROTTLE_MAX_IOSIZE)
				max_iosize = THROTTLE_MAX_IOSIZE;
			async_throttle = THROTTLE_MAXCNT;
		} else {
			if ( (flags & CL_DEV_MEMORY) )
				async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
			else {
				u_int max_cluster;
				u_int max_cluster_size;
				u_int scale;

				if (vp->v_mount->mnt_minsaturationbytecount) {
					max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;

					scale = 1;
				} else {
					max_cluster_size = MAX_CLUSTER_SIZE(vp);

					if (disk_conditioner_mount_is_ssd(vp->v_mount))
						scale = WRITE_THROTTLE_SSD;
					else
						scale = WRITE_THROTTLE;
				}
				if (max_iosize > max_cluster_size)
					max_cluster = max_cluster_size;
				else
					max_cluster = max_iosize;

				if (size < max_cluster)
					max_cluster = size;

				if (flags & CL_CLOSE)
					scale += MAX_CLUSTERS;

				async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
			}
		}
	}
	if (flags & (CL_PAGEIN | CL_PAGEOUT))
		io_flags |= B_PAGEIO;
	if (flags & (CL_IOSTREAMING))
		io_flags |= B_IOSTREAMING;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_DIRECT_IO)
		io_flags |= B_PHYS;
	if (flags & (CL_PRESERVE | CL_KEEPCACHED))
		io_flags |= B_CACHE;
	if (flags & CL_PASSIVE)
		io_flags |= B_PASSIVE;
	if (flags & CL_ENCRYPTED)
		io_flags |= B_ENCRYPTED_IO;

	if (vp->v_flag & VSYSTEM)
		io_flags |= B_META;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	} else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
		assert(ISSET(flags, CL_COMMIT));

		// For a direct/uncached write, we need to lock pages...

		upl_t cached_upl;

		/*
		 * Create a UPL to lock the pages in the cache whilst the
		 * write is in progress.
		 */
		ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
				      NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);

		/*
		 * Attach this UPL to the other UPL so that we can find it
		 * later.
		 */
		upl_set_associated_upl(upl, cached_upl);

		if (upl_offset & PAGE_MASK) {
			/*
			 * The two UPLs are not aligned, so mark the first page in
			 * @upl so that cluster_handle_associated_upl can handle
			 * it accordingly.
			 */
			upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			upl_page_set_mark(pl, 0, true);
		}
	}

	while (size) {
		daddr64_t blkno;
		daddr64_t lblkno;
		u_int	io_size_wanted;
		size_t	io_size_tmp;

		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		io_size_wanted = io_size;
		io_size_tmp = (size_t)io_size;

		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
			break;

		if (io_size_tmp > io_size_wanted)
			io_size = io_size_wanted;
		else
			io_size = (u_int)io_size_tmp;

		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
			real_bp->b_blkno = blkno;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);

		if (io_size == 0) {
			/*
			 * vnop_blockmap didn't return an error... however, it did
			 * return an extent size of 0 which means we can't
			 * make forward progress on this I/O... a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if ( !(flags & CL_READ) && blkno == -1) {
			off_t	e_offset;
			int	pageout_flags;

			if (upl_get_internal_vectorupl(upl))
				panic("Vector UPLs should not take this code-path\n");
			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will insure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 *
			 * go direct to vnode_pageout so that we don't have to
			 * unbusy the page from the UPL... we used to do this
			 * so that we could call ubc_msync, but that results
			 * in a potential deadlock if someone else races us to acquire
			 * that page and wins and in addition needs one of the pages
			 * we're continuing to hold in the UPL
			 */
			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

			if ( !(flags & CL_ASYNC))
				pageout_flags |= UPL_IOSYNC;
			if ( !(flags & CL_COMMIT))
				pageout_flags |= UPL_NOCOMMIT;

			if (cbp_head) {
				buf_t	prev_cbp;
				int	bytes_in_last_page;

				/*
				 * first we have to wait for the current outstanding I/Os
				 * to complete... EOT hasn't been set yet on this transaction
				 * so the pages won't be released
				 */
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
					bytes_in_last_page += cbp->b_bcount;
				bytes_in_last_page &= PAGE_MASK;

				while (bytes_in_last_page) {
					/*
					 * we've got a transaction that
					 * includes the page we're about to push out through vnode_pageout...
					 * find the bp's in the list which intersect this page and either
					 * remove them entirely from the transaction (there could be multiple bp's), or
					 * round its iosize down to the page boundary (there can only be one)...
					 *
					 * find the last bp in the list and act on it
					 */
					for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
						prev_cbp = cbp;

					if (bytes_in_last_page >= cbp->b_bcount) {
						/*
						 * this buf no longer has any I/O associated with it
						 */
						bytes_in_last_page -= cbp->b_bcount;

						free_io_buf(cbp);

						if (cbp == cbp_head) {
							assert(bytes_in_last_page == 0);
							/*
							 * the buf we just freed was the only buf in
							 * this transaction... so there's no I/O to do
							 */
							cbp_head = NULL;
						} else {
							/*
							 * remove the buf we just freed from
							 * the transaction list
							 */
							prev_cbp->b_trans_next = NULL;
							cbp_tail = prev_cbp;
						}
					} else {
						/*
						 * this is the last bp that has I/O
						 * intersecting the page of interest
						 * only some of the I/O is in the intersection
						 * so clip the size but keep it in the transaction list
						 */
						cbp->b_bcount -= bytes_in_last_page;
						cbp_tail = cbp;
						bytes_in_last_page = 0;
					}
				}
				if (cbp_head) {
					/*
					 * there was more to the current transaction
					 * than just the page we are pushing out via vnode_pageout...
					 * mark it as finished and complete it... we've already
					 * waited for the I/Os to complete above in the call to cluster_wait_IO
					 */
					cluster_EOT(cbp_head, cbp_tail, 0);

					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

					trans_count = 0;
				}
			}
			if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
				error = EINVAL;
			}
			e_offset = round_page_64(f_offset + 1);
			io_size = e_offset - f_offset;

			f_offset   += io_size;
			upl_offset += io_size;

			if (size >= io_size)
				size -= io_size;
			else
				size = 0;
			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e.  size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (error) {
				size = 0;
				flags &= ~CL_COMMIT;
			}
			continue;
		}
		lblkno = (daddr64_t)(f_offset / 0x1000);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && blkno == -1) {
			vm_offset_t	commit_offset;
			int		bytes_to_zero;
			int		complete_transaction_now = 0;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (io_size >= (u_int)non_rounded_size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = non_rounded_size;
				if (!(flags & CL_NOZERO))
					bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			pg_count = 0;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head) {
				int pg_resid;

				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

				pg_resid = commit_offset - upl_offset;

				if (bytes_to_zero >= pg_resid) {
					/*
					 * the last page of the current I/O
					 * has been completed...
					 * compute the number of fully zero'd
					 * pages that are beyond it
					 * plus the last page if its partial
					 * and we have no more I/O to issue...
					 * otherwise a partial page is left
					 * to begin the next I/O
					 */
					if ((int)io_size >= non_rounded_size)
						pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
					else
						pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;

					complete_transaction_now = 1;
				}
			} else {
				/*
				 * no pending I/O to deal with
				 * so, commit all of the fully zero'd pages
				 * plus the last page if its partial
				 * and we have no more I/O to issue...
				 * otherwise a partial page is left
				 * to begin the next I/O
				 */
				if ((int)io_size >= non_rounded_size)
					pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
				else
					pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;

				commit_offset = upl_offset & ~PAGE_MASK;
			}

			// Associated UPL is currently only used in the direct write path
			assert(!upl_associated_upl(upl));

			if ( (flags & CL_COMMIT) && pg_count) {
				ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
						     UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;

			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e.  size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (cbp_head && (complete_transaction_now || size == 0)) {
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

				trans_count = 0;
			}
			continue;
		}
		if (pg_count > max_vectors) {
			if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
				io_size = PAGE_SIZE - pg_offset;
				pg_count = 1;
			} else {
				io_size -= (pg_count - max_vectors) * PAGE_SIZE;
				pg_count = max_vectors;
			}
		}
		/*
		 * If the transaction is going to reach the maximum number of
		 * desired elements, truncate the i/o to the nearest page so
		 * that the actual i/o is initiated after this buffer is
		 * created and added to the i/o chain.
		 *
		 * I/O directed to physically contiguous memory
		 * doesn't have a requirement to make sure we 'fill' a page
		 */
		if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
		    ((upl_offset + io_size) & PAGE_MASK)) {
			vm_offset_t aligned_ofs;

			aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
			/*
			 * If the io_size does not actually finish off even a
			 * single page we have to keep adding buffers to the
			 * transaction despite having reached the desired limit.
			 *
			 * Eventually we get here with the page being finished
			 * off (and exceeded) and then we truncate the size of
			 * this i/o request so that it is page aligned so that
			 * we can finally issue the i/o on the transaction.
			 */
			if (aligned_ofs > upl_offset) {
				io_size = aligned_ofs - upl_offset;
				pg_count--;
			}
		}

		if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
			/*
			 * if we're not targeting a virtual device i.e. a disk image
			 * it's safe to dip into the reserve pool since real devices
			 * can complete this I/O request without requiring additional
			 * bufs from the alloc_io_buf pool
			 */
			priv = 1;
		else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
			/*
			 * Throttle the speculative IO
			 */
			priv = 0;
		else
			priv = 1;

		cbp = alloc_io_buf(vp, priv);

		if (flags & CL_PAGEOUT) {
			u_int i;

			/*
			 * since blocks are in offsets of 0x1000, scale
			 * iteration to (PAGE_SIZE * pg_count) of blks.
			 */
			for (i = 0; i < (PAGE_SIZE * pg_count)/0x1000; i++) {
				if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
					panic("BUSY bp found in cluster_io");
			}
		}
		if (flags & CL_ASYNC) {
			if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
				panic("buf_setcallback failed\n");
		}
		cbp->b_cliodone = (void *)callback;
		cbp->b_flags |= io_flags;
		if (flags & CL_NOCACHE)
			cbp->b_attr.ba_flags |= BA_NOCACHE;

		cbp->b_lblkno = lblkno;
		cbp->b_blkno  = blkno;
		cbp->b_bcount = io_size;

		if (buf_setupl(cbp, upl, upl_offset))
			panic("buf_setupl failed\n");

		upl_set_blkno(upl, upl_offset, io_size, blkno);

		cbp->b_trans_next = (buf_t)NULL;

		if ((cbp->b_iostate = (void *)iostate))
			/*
			 * caller wants to track the state of this
			 * io... bump the amount issued against this stream
			 */
			iostate->io_issued += io_size;

		if (flags & CL_READ) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		} else {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}

		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;

			if ( (cbp_head->b_real_bp = real_bp) )
				real_bp = (buf_t)NULL;
		}
		*(buf_t *)(&cbp->b_trans_head) = cbp_head;

		trans_count++;

		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;
		/*
		 * keep track of how much of the original request
		 * that we've actually completed... non_rounded_size
		 * may go negative due to us rounding the request
		 * to a page size multiple (i.e.  size > non_rounded_size)
		 */
		non_rounded_size -= io_size;

		if (non_rounded_size <= 0) {
			/*
			 * we've transferred all of the data in the original
			 * request, but we were unable to complete the tail
			 * of the last page because the file didn't have
			 * an allocation to back that portion... this is ok.
			 */
			size = 0;
		}
		if (size == 0) {
			/*
			 * we have no more I/O to issue, so go
			 * finish the final transaction
			 */
			need_EOT = TRUE;
		} else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
			    ((flags & CL_ASYNC) || trans_count > max_trans_count) ) {
			/*
			 * I/O directed to physically contiguous memory...
			 * which doesn't have a requirement to make sure we 'fill' a page
			 * or...
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and...
			 * it's either an ASYNC request or
			 * we've already accumulated more than 8 I/O's into
			 * this transaction so mark it as complete so that
			 * it can finish asynchronously or via the cluster_complete_transaction
			 * below if the request is synchronous
			 */
			need_EOT = TRUE;
		}
		if (need_EOT == TRUE)
			cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

		if (flags & CL_THROTTLE)
			(void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");

		if ( !(io_flags & B_READ))
			vnode_startwrite(vp);

		if (flags & CL_RAW_ENCRYPTED) {
			/*
			 * User requested raw encrypted bytes.
			 * Twiddle the bit in the ba_flags for the buffer
			 */
			cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
		}

		(void) VNOP_STRATEGY(cbp);

		if (need_EOT == TRUE) {
			if ( !(flags & CL_ASYNC))
				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);

			need_EOT = FALSE;
			trans_count = 0;
			cbp_head = NULL;
		}
	}
	if (error) {
		int abort_size;

		io_size = 0;

		if (cbp_head) {
			/*
			 * Wait until all of the outstanding I/O
			 * for this partial transaction has completed
			 */
			cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

			/*
			 * Rewind the upl offset to the beginning of the
			 * transaction.
			 */
			upl_offset = cbp_head->b_uploffset;
		}

		if (ISSET(flags, CL_COMMIT)) {
			cluster_handle_associated_upl(iostate, upl, upl_offset,
						      upl_end_offset - upl_offset);
		}

		// Free all the IO buffers in this transaction
		for (cbp = cbp_head; cbp;) {
			buf_t	cbp_next;

			size += cbp->b_bcount;
			io_size += cbp->b_bcount;

			cbp_next = cbp->b_trans_next;
			free_io_buf(cbp);
			cbp = cbp_next;
		}

		if (iostate) {
			int need_wakeup = 0;

			/*
			 * update the error condition for this stream
			 * since we never really issued the io
			 * just go ahead and adjust it back
			 */
			lck_mtx_lock_spin(&iostate->io_mtxp);

			if (iostate->io_error == 0)
				iostate->io_error = error;
			iostate->io_issued -= io_size;

			if (iostate->io_wanted) {
				/*
				 * someone is waiting for the state of
				 * this io stream to change
				 */
				iostate->io_wanted = 0;
				need_wakeup = 1;
			}
			lck_mtx_unlock(&iostate->io_mtxp);

			if (need_wakeup)
				wakeup((caddr_t)&iostate->io_wanted);
		}

		if (flags & CL_COMMIT) {
			int upl_flags;

			pg_offset  = upl_offset & PAGE_MASK;
			abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;

			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
				     upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
		}
		if (retval == 0)
			retval = error;
	} else if (cbp_head)
		panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);

	if (real_bp) {
		/*
		 * can get here if we either encountered an error
		 * or we completely zero-filled the request and
		 * no I/O was issued
		 */
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}
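
/*
 * Illustrative sketch (not part of the build): cluster_io() strings the buf_t's
 * it allocates into a singly linked transaction via b_trans_next, with every
 * member pointing back at the head through b_trans_head.  A minimal user-space
 * model of that list discipline; the struct and field names are stand-ins
 * invented for the example.
 */
#if 0
#include <stddef.h>

struct ex_buf {
	struct ex_buf	*trans_next;	/* next buffer in this transaction */
	struct ex_buf	*trans_head;	/* first buffer in this transaction */
	unsigned	bcount;		/* bytes of I/O carried by this buffer */
};

/* append 'cbp' to the transaction rooted at *head / *tail (both may start NULL) */
static void
ex_transaction_append(struct ex_buf **head, struct ex_buf **tail, struct ex_buf *cbp)
{
	cbp->trans_next = NULL;

	if (*head == NULL)
		*head = cbp;
	else
		(*tail)->trans_next = cbp;

	*tail = cbp;
	cbp->trans_head = *head;
}

/* total bytes of I/O carried by the buffers of a transaction */
static unsigned
ex_transaction_bytes(struct ex_buf *head)
{
	unsigned total = 0;

	for (struct ex_buf *cbp = head; cbp != NULL; cbp = cbp->trans_next)
		total += cbp->bcount;

	return total;
}
#endif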
#define reset_vector_run_state()										\
	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;

static int
vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
		  int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	vector_upl_set_pagelist(vector_upl);

	if (io_flag & CL_READ) {
		if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0))
			io_flag &= ~CL_PRESERVE; /*don't zero fill*/
		else
			io_flag |= CL_PRESERVE; /*zero fill*/
	}
	return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg));
}
static int
cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	int pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return (0);
	}
	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;
	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}
static void
cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
		   int bflag)
{
	daddr64_t	r_addr;
	off_t		f_offset;
	int		size_of_prefetch;
	u_int		max_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
		return;
	}
	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
		rap->cl_ralen = 0;
		rap->cl_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
		return;
	}
	max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));

	if (max_prefetch > speculative_prefetch_max)
		max_prefetch = speculative_prefetch_max;

	if (max_prefetch <= PAGE_SIZE) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
		return;
	}
	if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
		if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
			return;
		}
	}
	r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
	f_offset = (off_t)(r_addr * PAGE_SIZE_64);

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		daddr64_t read_size;

		rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;

		read_size = (extent->e_addr + 1) - extent->b_addr;

		if (read_size > rap->cl_ralen) {
			if (read_size > max_prefetch / PAGE_SIZE)
				rap->cl_ralen = max_prefetch / PAGE_SIZE;
			else
				rap->cl_ralen = read_size;
		}
		size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);

		if (size_of_prefetch)
			rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
}
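
/*
 * Illustrative sketch (not part of the build): the window logic above roughly
 * doubles cl_ralen on each sequential hit, caps it at the prefetch maximum,
 * and records the furthest page issued in cl_maxra.  A compact user-space
 * model of that growth; the struct, helper name and simplifications are
 * assumptions of the example.
 */
#if 0
#include <stdint.h>

struct ex_readahead {
	int64_t		cl_lastr;	/* last logical page the reader touched */
	uint32_t	cl_ralen;	/* current read-ahead window, in pages */
	int64_t		cl_maxra;	/* furthest page already prefetched */
};

/*
 * Advance the window for a read covering pages [b_addr, e_addr].
 * Returns the number of pages that would be prefetched next.
 */
static uint32_t
ex_readahead_advance(struct ex_readahead *ra, int64_t b_addr, int64_t e_addr,
    uint32_t max_prefetch_pages)
{
	uint32_t read_size;

	if (ra->cl_lastr == -1 || (b_addr != ra->cl_lastr && b_addr != ra->cl_lastr + 1)) {
		ra->cl_ralen = 0;		/* not sequential: reset the window */
		ra->cl_maxra = 0;
		ra->cl_lastr = e_addr;
		return 0;
	}
	ra->cl_lastr = e_addr;

	/* double the window (starting from 1), capped at max_prefetch_pages */
	ra->cl_ralen = ra->cl_ralen ? (ra->cl_ralen << 1) : 1;
	if (ra->cl_ralen > max_prefetch_pages)
		ra->cl_ralen = max_prefetch_pages;

	/* never let the window lag behind the size of the read itself */
	read_size = (uint32_t)((e_addr + 1) - b_addr);
	if (read_size > ra->cl_ralen)
		ra->cl_ralen = (read_size > max_prefetch_pages) ? max_prefetch_pages : read_size;

	ra->cl_maxra = (ra->cl_maxra > e_addr ? ra->cl_maxra : e_addr) + ra->cl_ralen;
	return ra->cl_ralen;
}
#endif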
int
cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags)
{
	return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
}
cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		    int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)

	local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;
	if ((flags & UPL_KEEPCACHED))
		local_flags |= CL_KEEPCACHED;
	if (flags & UPL_PAGING_ENCRYPTED)
		local_flags |= CL_ENCRYPTED;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */

	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);

	/*
	 * can't page-out to a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);

	max_size = filesize - f_offset;

	if (size < max_size)

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
					    UPL_ABORT_FREE_ON_EMPTY);

	return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
			   local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
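/*
 * cluster_pagein / cluster_pagein_ext:
 * the page-in side of the same interface... map UPL_* flags to CL_* flags,
 * validate the request against EOF and page alignment, trim it to a
 * page-rounded size and issue it through cluster_io with CL_READ | CL_PAGEIN.
 */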
cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	       int size, off_t filesize, int flags)

	return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
		   int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)

	int	local_flags = 0;

	if (upl == NULL || size < 0)
		panic("cluster_pagein: NULL upl passed in");

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;
	if (flags & UPL_IOSTREAMING)
		local_flags |= CL_IOSTREAMING;
	if (flags & UPL_PAGING_ENCRYPTED)
		local_flags |= CL_ENCRYPTED;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	max_size = filesize - f_offset;

	if (size < max_size)

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size && (local_flags & CL_COMMIT))
		ubc_upl_abort_range(upl, upl_offset + rounded_size,
				    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
			    local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
cluster_bp(buf_t bp)

	return cluster_bp_ext(bp, NULL, NULL);

cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
		     bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (bp->b_flags & B_READ)
		flags = CL_ASYNC | CL_READ;

	if (bp->b_flags & B_PASSIVE)
		flags |= CL_PASSIVE;

	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

	return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg));
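/*
 * cluster_write / cluster_write_ext:
 * top-level write entry point... picks a strategy per uio vector
 * (IO_COPY through the cache, IO_DIRECT, or IO_CONTIG), splits requests so
 * each cluster_write_copy call stays within MAX_IO_REQUEST_SIZE, and applies
 * IO_HEADZEROFILL only on the first pass and IO_TAILZEROFILL only on the last.
 */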
cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)

	return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
		  int xflags, int (*callback)(buf_t, void *), void *callback_arg)

	user_ssize_t	cur_resid;
	int		write_type = IO_COPY;
	u_int32_t	write_length;

	if (flags & IO_PASSIVE)

	if (vp->v_flag & VNOCACHE_DATA) {
		flags |= IO_NOCACHE;
		bflag |= CL_NOCACHE;

		/*
		 * this call is being made to zero-fill some range in the file
		 */
		retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);

	/*
	 * do a write through the cache if one of the following is true....
	 *   NOCACHE is not true or NODIRECT is true
	 *   the uio request doesn't target USERSPACE
	 * otherwise, find out if we want the direct or contig variant for
	 * the first vector in the uio request
	 */
	if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
		retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);

	if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT)
		/*
		 * must go through the cached variant in this case
		 */
		write_type = IO_COPY;

	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {

		switch (write_type) {

			/*
			 * make sure the uio_resid isn't too big...
			 * internally, we want to handle all of the I/O in
			 * chunk sizes that fit in a 32 bit int
			 */
			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
				/*
				 * we're going to have to call cluster_write_copy
				 *
				 * only want the last call to cluster_write_copy to
				 * have the IO_TAILZEROFILL flag set and only the
				 * first call should have IO_HEADZEROFILL
				 */
				zflags = flags & ~IO_TAILZEROFILL;
				flags &= ~IO_HEADZEROFILL;

				write_length = MAX_IO_REQUEST_SIZE;
				/*
				 * last call to cluster_write_copy
				 */
				write_length = (u_int32_t)cur_resid;

			retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);

			zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);

			if (flags & IO_HEADZEROFILL) {
				/*
				 * only do this once per request
				 */
				flags &= ~IO_HEADZEROFILL;

				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
							    headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);

			retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);

			if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
				/*
				 * we're done with the data from the user specified buffer(s)
				 * and we've been requested to zero fill at the tail
				 * treat this as an IO_HEADZEROFILL which doesn't require a uio
				 * by rearranging the args and passing in IO_HEADZEROFILL
				 */
				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
							    (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);

			/*
			 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
			 */
			retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);

			retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);

		/*
		 * in case we end up calling cluster_write_copy (from cluster_write_direct)
		 * multiple times to service a multi-vector request that is not aligned properly
		 * we need to update the oldEOF so that we
		 * don't zero-fill the head of a page if we've successfully written
		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
		 * page that is beyond the oldEOF if the write is unaligned... we only
		 * want that to happen for the very first page of the cluster_write,
		 * NOT the first page of each vector making up a multi-vector write.
		 */
		if (uio->uio_offset > oldEOF)
			oldEOF = uio->uio_offset;
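/*
 * cluster_write_direct:
 * uncached write path... wires the user buffer with vm_map_get_upl and issues
 * page-aligned direct I/O through cluster_io, batching multiple iovecs into a
 * vectored UPL (vector_cluster_io) when the bases allow it; any misaligned
 * head or tail of the request falls back to cluster_write_copy.
 */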
cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
		     int flags, int (*callback)(buf_t, void *), void *callback_arg)

	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset;
	vm_offset_t	 vector_upl_offset = 0;
	u_int32_t	 io_req_size;
	u_int32_t	 offset_in_file;
	u_int32_t	 offset_in_iovbase;
	upl_size_t	 upl_size, vector_upl_size = 0;
	vm_size_t	 upl_needed_size;
	mach_msg_type_number_t	pages_in_pl;
	upl_control_flags_t	upl_flags;
	mach_msg_type_number_t	i;
	int		 force_data_sync;
	struct clios	 iostate;
	user_addr_t	 iov_base;
	u_int32_t	 mem_alignment_mask;
	u_int32_t	 devblocksize;
	u_int32_t	 max_io_size;
	u_int32_t	 max_upl_size;
	u_int32_t	 max_vector_size;
	u_int32_t	 bytes_outstanding_limit;
	boolean_t	 io_throttled = FALSE;
	u_int32_t	 vector_upl_iosize = 0;
	int		 issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
	off_t		 v_upl_uio_offset = 0;
	int		 vector_upl_index = 0;
	upl_t		 vector_upl = NULL;

	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 */
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
		     (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

	max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

	io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;

	if (flags & IO_PASSIVE)
		io_flag |= CL_PASSIVE;

	if (flags & IO_NOCACHE)
		io_flag |= CL_NOCACHE;

	if (flags & IO_SKIP_ENCRYPTION)
		io_flag |= CL_ENCRYPTED;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;

	if (devblocksize == 1) {
		/*
		 * the AFP client advertises a devblocksize of 1
		 * however, its BLOCKMAP routine maps to physical
		 * blocks that are PAGE_SIZE in size...
		 * therefore we can't ask for I/Os that aren't page aligned
		 * or aren't multiples of PAGE_SIZE in size
		 * by setting devblocksize to PAGE_SIZE, we re-instate
		 * the old behavior we had before the mem_alignment_mask
		 * changes went in...
		 */
		devblocksize = PAGE_SIZE;

	io_req_size = *write_length;
	iov_base = uio_curriovbase(uio);

	offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;

	if (offset_in_file || offset_in_iovbase) {
		/*
		 * one of the 2 important offsets is misaligned
		 * so fire an I/O through the cache for this entire vector
		 */
		goto wait_for_dwrites;

	if (iov_base & (devblocksize - 1)) {
		/*
		 * the offset in memory must be on a device block boundary
		 * so that we can guarantee that we can generate an
		 * I/O that ends on a page boundary in cluster_io
		 */
		goto wait_for_dwrites;

	task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);

	while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {

		if ( (throttle_type = cluster_is_throttled(vp)) ) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 */
			if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
				/*
				 * we're in the throttle window and at least 1 I/O
				 * has already been issued by a throttleable thread
				 * in this window, so return with EAGAIN to indicate
				 * to the FS issuing the cluster_write call that it
				 * should now throttle after dropping any locks
				 */
				throttle_info_update_by_mount(vp->v_mount);

				io_throttled = TRUE;
				goto wait_for_dwrites;

			max_vector_size = THROTTLE_MAX_IOSIZE;
			max_io_size = THROTTLE_MAX_IOSIZE;

			max_vector_size = MAX_VECTOR_UPL_SIZE;
			max_io_size = max_upl_size;

		cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);

		io_size  = io_req_size & ~PAGE_MASK;
		iov_base = uio_curriovbase(uio);

		if (io_size > max_io_size)
			io_size = max_io_size;

		if (useVectorUPL && (iov_base & PAGE_MASK)) {
			/*
			 * We have an iov_base that's not page-aligned.
			 * Issue all I/O's that have been collected within
			 * this Vectored UPL.
			 */
			if (vector_upl_index) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();

			/*
			 * After this point, if we are using the Vector UPL path and the base is
			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
			 */

		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {

			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
				    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			kret = vm_map_get_upl(map,
					      (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
					      VM_KERN_MEMORY_FILE,

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				/*
				 * failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_dwrites;

			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			pages_in_pl = upl_size / PAGE_SIZE;

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i))

			if (i == pages_in_pl)

			/*
			 * didn't get all the pages back that we
			 * needed... release this upl and try again
			 */
			ubc_upl_abort(upl, 0);

		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				     i, pages_in_pl, upl_size, kret, 0);
			/*
			 * for some reason, we couldn't acquire a hold on all
			 * the pages needed in the user's address space
			 *
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dwrites;

		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size < upl_needed_size) {
			if (upl_size && upl_offset == 0)

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				     (int)upl_offset, upl_size, (int)iov_base, io_size, 0);

			ubc_upl_abort(upl, 0);
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dwrites;

			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
			/*
			 * After this point, if we are using a vector UPL, then
			 * either all the UPL elements end on a page boundary OR
			 * this UPL is the last element because it does not end
			 * on a page boundary.
			 */

		/*
		 * we want to push out these writes asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding writes
		 * wait until some complete before issuing the next
		 */
		if (vp->v_mount->mnt_minsaturationbytecount)
			bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;

			bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);

		cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");

		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			ubc_upl_abort(upl, 0);

			goto wait_for_dwrites;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
			     (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
					    io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);

			if (!vector_upl_index) {
				vector_upl = vector_upl_create(upl_offset);
				v_upl_uio_offset = uio->uio_offset;
				vector_upl_offset = upl_offset;

			vector_upl_set_subupl(vector_upl, upl, upl_size);
			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);

			vector_upl_iosize += io_size;
			vector_upl_size += upl_size;

			if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();

		/*
		 * update the uio structure to
		 * reflect the I/O that we just issued
		 */
		uio_update(uio, (user_size_t)io_size);

		/*
		 * in case we end up calling through to cluster_write_copy to finish
		 * the tail of this request, we need to update the oldEOF so that we
		 * don't zero-fill the head of a page if we've successfully written
		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
		 * page that is beyond the oldEOF if the write is unaligned... we only
		 * want that to happen for the very first page of the cluster_write,
		 * NOT the first page of each vector making up a multi-vector write.
		 */
		if (uio->uio_offset > oldEOF)
			oldEOF = uio->uio_offset;

		io_req_size -= io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
			     (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);

	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {

		retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);

		if (retval == 0 && *write_type == IO_DIRECT) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
				     (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		reset_vector_run_state();

	/*
	 * make sure all async writes issued as part of this stream
	 * have completed before we return
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_write_direct");

	if (iostate.io_error)
		retval = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	if (io_throttled == TRUE && retval == 0)

	if (io_req_size && retval == 0) {
		/*
		 * we couldn't handle the tail of this request in DIRECT mode
		 * so fire it through the copy path
		 *
		 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
		 * so we can just pass 0 in for the headOff and tailOff
		 */
		if (uio->uio_offset > oldEOF)
			oldEOF = uio->uio_offset;

		retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);

		*write_type = IO_UNKNOWN;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
		     (int)uio->uio_offset, io_req_size, retval, 4, 0);
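/*
 * cluster_write_contig:
 * write path for a physically contiguous user buffer (device memory)...
 * any head or tail that isn't devblocksize-aligned goes through
 * cluster_align_phys_io, while the rest is issued asynchronously to cluster_io
 * in MAX_IO_CONTIG_SIZE chunks with CL_DEV_MEMORY set.
 */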
cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
		     int (*callback)(buf_t, void *), void *callback_arg, int bflag)

	upl_page_info_t	*pl;
	addr64_t	 src_paddr = 0;
	upl_t		 upl[MAX_VECTS];
	vm_offset_t	 upl_offset;
	u_int32_t	 tail_size = 0;
	upl_size_t	 upl_size;
	vm_size_t	 upl_needed_size;
	mach_msg_type_number_t	pages_in_pl;
	upl_control_flags_t	upl_flags;
	struct clios	 iostate;
	user_addr_t	 iov_base;
	u_int32_t	 devblocksize;
	u_int32_t	 mem_alignment_mask;

	/*
	 * When we enter this routine, we know
	 *  -- the io_req_size will not exceed iov_len
	 *  -- the target address is physically contiguous
	 */
	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

	io_size = *write_length;

	iov_base = uio_curriovbase(uio);

	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	upl_needed_size = upl_offset + io_size;

	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
		    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	kret = vm_map_get_upl(map,
			      (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
			      &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * failed to get pagelist
		 */
		goto wait_for_cwrites;

	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 */
	if (upl_size < upl_needed_size) {
		/*
		 * This is a failure in the physical memory case.
		 */
		goto wait_for_cwrites;

	pl = ubc_upl_pageinfo(upl[cur_upl]);

	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		u_int32_t head_size;

		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);

			goto wait_for_cwrites;

		upl_offset += head_size;
		src_paddr  += head_size;
		io_size    -= head_size;

		iov_base   += head_size;

	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/*
		 * request isn't set up on a memory boundary
		 * the underlying DMA engine can handle...
		 * return an error instead of going through
		 * the slow copy path since the intent of this
		 * path is direct I/O from device memory
		 */
		goto wait_for_cwrites;

	tail_size = io_size & (devblocksize - 1);
	io_size  -= tail_size;

	while (io_size && error == 0) {

		if (io_size > MAX_IO_CONTIG_SIZE)
			xsize = MAX_IO_CONTIG_SIZE;

		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since it's all issued against the same UPL
		 * if there are already too many outstanding writes
		 * wait until some have completed before issuing the next
		 */
		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");

		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes...
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			goto wait_for_cwrites;

		/*
		 * issue an asynchronous write to cluster_io
		 */
		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
				   xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);

			/*
			 * The cluster_io write completed successfully,
			 * update the uio structure
			 */
			uio_update(uio, (user_size_t)xsize);

			upl_offset += xsize;

	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {

		error = cluster_io_type(uio, write_type, write_length, 0);

		if (error == 0 && *write_type == IO_CONTIG) {

		*write_type = IO_UNKNOWN;

	/*
	 * make sure all async writes that are part of this stream
	 * have completed before we proceed
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_write_contig");

	if (iostate.io_error)
		error = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	if (error == 0 && tail_size)
		error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);

	for (n = 0; n < num_upl; n++)
		/*
		 * just release our hold on each physically contiguous
		 * region without changing any state
		 */
		ubc_upl_abort(upl[n], 0);
/*
 * need to avoid a race between an msync of a range of pages dirtied via mmap
 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
 *
 * we should never force-zero-fill pages that are already valid in the cache...
 * the entire page contains valid data (either from disk, zero-filled or dirtied
 * via an mmap) so we can only do damage by trying to zero-fill
 */
cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)

	boolean_t need_cluster_zero = TRUE;

	if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {

		bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
		zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);

		if (upl_valid_page(pl, zero_pg_index)) {
			/*
			 * never force zero valid pages - dirty or clean
			 * we'll leave these in the UPL for cluster_write_copy to deal with
			 */
			need_cluster_zero = FALSE;

	if (need_cluster_zero == TRUE)
		cluster_zero(upl, io_offset, bytes_to_zero, NULL);

	return (bytes_to_zero);
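/*
 * cluster_write_copy:
 * buffered write path... copies user data into UBC pages a UPL at a time,
 * zero-filling any head/tail regions requested, then either pushes the pages
 * immediately (IO_SYNC) or records the dirty range in the write-behind
 * clusters, falling over to the sparse cluster map when the fixed set of
 * MAX_CLUSTERS entries can't absorb the write.
 */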
cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
		   off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)

	upl_page_info_t	*pl;
	vm_offset_t	 upl_offset = 0;
	long long	 total_size;
	long long	 zero_cnt1;
	off_t		 write_off = 0;
	boolean_t	 first_pass = FALSE;
	struct cl_extent cl;
	struct cl_writebehind *wbp;
	u_int		 max_cluster_pgcount;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
		     (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);

	io_resid = io_req_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
		     0, 0, (int)oldEOF, (int)newEOF, 0);

	if (flags & IO_PASSIVE)

	if (flags & IO_NOCACHE)
		bflag |= CL_NOCACHE;

	if (flags & IO_SKIP_ENCRYPTION)
		bflag |= CL_ENCRYPTED;

	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

	if (flags & IO_HEADZEROFILL) {
		/*
		 * some filesystems (HFS is one) don't support unallocated holes within a file...
		 * so we zero fill the intervening space between the old EOF and the offset
		 * where the next chunk of real data begins.... ftruncate will also use this
		 * routine to zero fill to the new EOF when growing a file... in this case, the
		 * uio structure will not be provided
		 */
		if (headOff < uio->uio_offset) {
			zero_cnt = uio->uio_offset - headOff;

		} else if (headOff < newEOF) {
			zero_cnt = newEOF - headOff;

		if (uio && uio->uio_offset > oldEOF) {
			zero_off = uio->uio_offset & ~PAGE_MASK_64;

			if (zero_off >= oldEOF) {
				zero_cnt = uio->uio_offset - zero_off;

				flags |= IO_HEADZEROFILL;

	if (flags & IO_TAILZEROFILL) {
		zero_off1 = uio->uio_offset + io_req_size;

		if (zero_off1 < tailOff)
			zero_cnt1 = tailOff - zero_off1;

		if (uio && newEOF > oldEOF) {
			zero_off1 = uio->uio_offset + io_req_size;

			if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
				zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);

				flags |= IO_TAILZEROFILL;

	if (zero_cnt == 0 && uio == (struct uio *) 0) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
			     retval, 0, 0, 0, 0);

	write_off = uio->uio_offset;
	write_cnt = uio_resid(uio);
	/*
	 * delay updating the sequential write info
	 * in the control block until we've obtained
	 */

	while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
		/*
		 * for this iteration of the loop, figure out where our starting point is
		 */
			start_offset = (int)(zero_off & PAGE_MASK_64);
			upl_f_offset = zero_off - start_offset;
		} else if (io_resid) {
			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
			upl_f_offset = uio->uio_offset - start_offset;

			start_offset = (int)(zero_off1 & PAGE_MASK_64);
			upl_f_offset = zero_off1 - start_offset;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
			     (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);

		if (total_size > max_io_size)
			total_size = max_io_size;

		cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);

		if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
			/*
			 * assumption... total_size <= io_resid
			 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
			 */
			if ((start_offset + total_size) > max_io_size)
				total_size = max_io_size - start_offset;
			xfer_resid = total_size;

			retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);

			io_resid -= (total_size - xfer_resid);
			total_size = xfer_resid;
			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
			upl_f_offset = uio->uio_offset - start_offset;

			if (total_size == 0) {
					/*
					 * the write did not finish on a page boundary
					 * which will leave upl_f_offset pointing to the
					 * beginning of the last page written instead of
					 * the page beyond it... bump it in this case
					 * so that the cluster code records the last page
					 */
					upl_f_offset += PAGE_SIZE_64;

		/*
		 * compute the size of the upl needed to encompass
		 * the requested write... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (upl_size > max_io_size)
			upl_size = max_io_size;

		pages_in_upl = upl_size / PAGE_SIZE;
		io_size = upl_size - start_offset;

		if ((long long)io_size > total_size)
			io_size = total_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);

		/*
		 * Gather the pages from the buffer cache.
		 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
		 * that we intend to modify these pages.
		 */
		kret = ubc_create_upl_kernel(vp,
					     UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
					     VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS)
			panic("cluster_write_copy: failed to get pagelist");

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
			     upl, (int)upl_f_offset, start_offset, 0, 0);

		if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
			/*
			 * we're starting in the middle of the first page of the upl
			 * and the page isn't currently valid, so we're going to have
			 * to read it in first... this is a synchronous operation
			 */
			read_size = PAGE_SIZE;

			if ((upl_f_offset + read_size) > oldEOF)
				read_size = oldEOF - upl_f_offset;

			retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
					    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
				/*
				 * we had an error during the read which causes us to abort
				 * the current cluster_write request... before we do, we need
				 * to release the rest of the pages in the upl without modifying
				 * their state and mark the failed page in error
				 */
				ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

				if (upl_size > PAGE_SIZE)
					ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
					     upl, 0, 0, retval, 0);

		if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
			/*
			 * the last offset we're writing to in this upl does not end on a page
			 * boundary... if it's not beyond the old EOF, then we'll also need to
			 * pre-read this page in if it isn't already valid
			 */
			upl_offset = upl_size - PAGE_SIZE;

			if ((upl_f_offset + start_offset + io_size) < oldEOF &&
			    !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {

				read_size = PAGE_SIZE;

				if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF)
					read_size = oldEOF - (upl_f_offset + upl_offset);

				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
						    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
					/*
					 * we had an error during the read which causes us to abort
					 * the current cluster_write request... before we do, we
					 * need to release the rest of the pages in the upl without
					 * modifying their state and mark the failed page in error
					 */
					ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

					if (upl_size > PAGE_SIZE)
						ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
						     upl, 0, 0, retval, 0);

		xfer_resid = io_size;
		io_offset = start_offset;

		while (zero_cnt && xfer_resid) {

			if (zero_cnt < (long long)xfer_resid)
				bytes_to_zero = zero_cnt;

				bytes_to_zero = xfer_resid;

			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);

			xfer_resid -= bytes_to_zero;
			zero_cnt   -= bytes_to_zero;
			zero_off   += bytes_to_zero;
			io_offset  += bytes_to_zero;

		if (xfer_resid && io_resid) {
			u_int32_t io_requested;

			bytes_to_move = min(io_resid, xfer_resid);
			io_requested = bytes_to_move;

			retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);

				ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
					     upl, 0, 0, retval, 0);

				io_resid   -= bytes_to_move;
				xfer_resid -= bytes_to_move;
				io_offset  += bytes_to_move;

		while (xfer_resid && zero_cnt1 && retval == 0) {

			if (zero_cnt1 < (long long)xfer_resid)
				bytes_to_zero = zero_cnt1;

				bytes_to_zero = xfer_resid;

			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);

			xfer_resid -= bytes_to_zero;
			zero_cnt1  -= bytes_to_zero;
			zero_off1  += bytes_to_zero;
			io_offset  += bytes_to_zero;

			int ret_cluster_try_push;

			io_size += start_offset;

			if (newEOF >= oldEOF && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
				/*
				 * if we're extending the file with this write
				 * we'll zero fill the rest of the page so that
				 * if the file gets extended again in such a way as to leave a
				 * hole starting at this EOF, we'll have zero's in the correct spot
				 */
				cluster_zero(upl, io_size, upl_size - io_size, NULL);

			/*
			 * release the upl now if we hold one since...
			 * 1) pages in it may be present in the sparse cluster map
			 *    and may span 2 separate buckets there... if they do and
			 *    we happen to have to flush a bucket to make room and it intersects
			 *    this upl, a deadlock may result on page BUSY
			 * 2) we're delaying the I/O... from this point forward we're just updating
			 *    the cluster state... no need to hold the pages, so commit them
			 * 3) IO_SYNC is set...
			 *    because we had to ask for a UPL that provides currently non-present pages, the
			 *    UPL has been automatically set to clear the dirty flags (both software and hardware)
			 *    upon committing it... this is not the behavior we want since it's possible for
			 *    pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
			 *    we'll pick these pages back up later with the correct behavior specified.
			 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
			 *    of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
			 *    we hold since the flushing context is holding the cluster lock.
			 */
			ubc_upl_commit_range(upl, 0, upl_size,
					     UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);

			/*
			 * calculate the last logical block number
			 * that this delayed I/O encompassed
			 */
			cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);

			if (flags & IO_SYNC) {
				/*
				 * if the IO_SYNC flag is set then we need to
				 * bypass any clusters and immediately issue
				 */

			/*
			 * take the lock to protect our accesses
			 * of the writebehind and sparse cluster state
			 */
			wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

			if (wbp->cl_scmap) {

				if ( !(flags & IO_NOCACHE)) {
					/*
					 * we've fallen into the sparse
					 * cluster method of delaying dirty pages
					 */
					sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);

					lck_mtx_unlock(&wbp->cl_lockw);

				/*
				 * must have done cached writes that fell into
				 * the sparse cluster mechanism... we've switched
				 * to uncached writes on the file, so go ahead
				 * and push whatever's in the sparse map
				 * and switch back to normal clustering
				 */

				sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg);
				/*
				 * no clusters of either type present at this point
				 * so just go directly to start_new_cluster since
				 * we know we need to delay this I/O since we've
				 * already released the pages back into the cache
				 * to avoid the deadlock with sparse_cluster_push
				 */
				goto start_new_cluster;

				if (write_off == wbp->cl_last_write)
					wbp->cl_seq_written += write_cnt;

					wbp->cl_seq_written = write_cnt;

				wbp->cl_last_write = write_off + write_cnt;

			if (wbp->cl_number == 0)
				/*
				 * no clusters currently present
				 */
				goto start_new_cluster;

			for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
				/*
				 * check each cluster that we currently hold
				 * try to merge some or all of this write into
				 * one or more of the existing clusters... if
				 * any portion of the write remains, start a
				 */
				if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
					/*
					 * the current write starts at or after the current cluster
					 */
					if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
						/*
						 * we have a write that fits entirely
						 * within the existing cluster limits
						 */
						if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
							/*
							 * update our idea of where the cluster ends
							 */
							wbp->cl_clusters[cl_index].e_addr = cl.e_addr;

					if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
						/*
						 * we have a write that starts in the middle of the current cluster
						 * but extends beyond the cluster's limit... we know this because
						 * of the previous checks
						 * we'll extend the current cluster to the max
						 * and update the b_addr for the current write to reflect that
						 * the head of it was absorbed into this cluster...
						 * note that we'll always have a leftover tail in this case since
						 * full absorption would have occurred in the clause above
						 */
						wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;

						cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
					/*
					 * we come here for the case where the current write starts
					 * beyond the limit of the existing cluster or we have a leftover
					 * tail after a partial absorption
					 *
					 * in either case, we'll check the remaining clusters before
					 * starting a new one
					 */
					/*
					 * the current write starts in front of the cluster we're currently considering
					 */
					if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) {
						/*
						 * we can just merge the new request into
						 * this cluster and leave it in the cache
						 * since the resulting cluster is still
						 * less than the maximum allowable size
						 */
						wbp->cl_clusters[cl_index].b_addr = cl.b_addr;

						if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
							/*
							 * the current write completely
							 * envelops the existing cluster and since
							 * each write is limited to at most max_cluster_pgcount pages
							 * we can just use the start and last blocknos of the write
							 * to generate the cluster limits
							 */
							wbp->cl_clusters[cl_index].e_addr = cl.e_addr;

					/*
					 * if we were to combine this write with the current cluster
					 * we would exceed the cluster size limit.... so,
					 * let's see if there's any overlap of the new I/O with
					 * the cluster we're currently considering... in fact, we'll
					 * stretch the cluster out to its full limit and see if we
					 * get an intersection with the current write
					 */
					if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
						/*
						 * the current write extends into the proposed cluster
						 * clip the length of the current write after first combining its
						 * tail with the newly shaped cluster
						 */
						wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;

						cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
					/*
					 * if we get here, there was no way to merge
					 * any portion of this write with this cluster
					 * or we could only merge part of it which
					 * will leave a tail...
					 * we'll check the remaining clusters before starting a new one
					 */
			if (cl_index < wbp->cl_number)
				/*
				 * we found an existing cluster(s) that we
				 * could entirely merge this I/O into
				 */

			if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) &&
			    wbp->cl_number == MAX_CLUSTERS &&
			    wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {

				if (vp->v_mount->mnt_minsaturationbytecount) {
					n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);

					if (n > MAX_CLUSTERS)

					if (disk_conditioner_mount_is_ssd(vp->v_mount))
						n = WRITE_BEHIND_SSD;

					cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL);

			if (wbp->cl_number < MAX_CLUSTERS) {
				/*
				 * we didn't find an existing cluster to
				 * merge into, but there's room to start
				 */
				goto start_new_cluster;

			/*
			 * no existing cluster to merge with and no
			 * room to start a new one... we'll try
			 * pushing one of the existing ones... if none of
			 * them are able to be pushed, we'll switch
			 * to the sparse cluster mechanism
			 * cluster_try_push updates cl_number to the
			 * number of remaining clusters... and
			 * returns the number of currently unused clusters
			 */
			ret_cluster_try_push = 0;

			/*
			 * if writes are not deferred, call cluster push immediately
			 */
			if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {

				ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL);

			/*
			 * execute following regardless of writes being deferred or not
			 */
			if (ret_cluster_try_push == 0) {
				/*
				 * no more room in the normal cluster mechanism
				 * so let's switch to the more expansive but expensive
				 * sparse mechanism....
				 */
				sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg);
				sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);

				lck_mtx_unlock(&wbp->cl_lockw);

			wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
			wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;

			wbp->cl_clusters[wbp->cl_number].io_flags = 0;

			if (flags & IO_NOCACHE)
				wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;

			if (bflag & CL_PASSIVE)
				wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;

			lck_mtx_unlock(&wbp->cl_lockw);

			/*
			 * we don't hold the lock at this point
			 *
			 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
			 * so that we correctly deal with a change in state of the hardware modify bit...
			 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
			 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
			 * responsible for generating the correct sized I/O(s)
			 */
			retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
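/*
 * cluster_read / cluster_read_ext:
 * top-level read entry point... chooses the cached copy path, the direct
 * path, or the contiguous path per uio vector, keeping each cached transfer
 * within MAX_IO_REQUEST_SIZE.
 */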
cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)

	return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)

	user_ssize_t	cur_resid;
	u_int32_t	read_length = 0;
	int		read_type = IO_COPY;

	if (vp->v_flag & VNOCACHE_DATA)
		flags |= IO_NOCACHE;
	if ((vp->v_flag & VRAOFF) || speculative_reads_disabled)

	if (flags & IO_SKIP_ENCRYPTION)
		flags |= IO_ENCRYPTED;

	/*
	 * do a read through the cache if one of the following is true....
	 *   NOCACHE is not true
	 *   the uio request doesn't target USERSPACE
	 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
	 * Reading encrypted data from a CP filesystem should never result in the data touching
	 *
	 * otherwise, find out if we want the direct or contig variant for
	 * the first vector in the uio request
	 */
	if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED) ) {

		retval = cluster_io_type(uio, &read_type, &read_length, 0);

	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {

		switch (read_type) {

			/*
			 * make sure the uio_resid isn't too big...
			 * internally, we want to handle all of the I/O in
			 * chunk sizes that fit in a 32 bit int
			 */
			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE))
				io_size = MAX_IO_REQUEST_SIZE;

				io_size = (u_int32_t)cur_resid;

			retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);

			retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);

			retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);

			retval = cluster_io_type(uio, &read_type, &read_length, 0);
cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)

	int abort_flags = UPL_ABORT_FREE_ON_EMPTY;

	if ((range = last_pg - start_pg)) {

			abort_flags |= UPL_ABORT_REFERENCE;

		ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
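/*
 * cluster_read_copy:
 * cached read path... satisfies as much of the request as possible straight
 * from the UBC, issues asynchronous cluster_io reads for the invalid pages of
 * each UPL, and keeps the read-ahead pipeline primed via
 * cluster_read_prefetch/cluster_read_ahead; throttling disables both
 * read-ahead and prefetch and clamps the per-I/O size.
 */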
3730 cluster_read_copy(vnode_t vp
, struct uio
*uio
, u_int32_t io_req_size
, off_t filesize
, int flags
, int (*callback
)(buf_t
, void *), void *callback_arg
)
3732 upl_page_info_t
*pl
;
3734 vm_offset_t upl_offset
;
3743 off_t last_ioread_offset
;
3744 off_t last_request_offset
;
3748 u_int32_t size_of_prefetch
;
3751 u_int32_t max_rd_size
;
3752 u_int32_t max_io_size
;
3753 u_int32_t max_prefetch
;
3754 u_int rd_ahead_enabled
= 1;
3755 u_int prefetch_enabled
= 1;
3756 struct cl_readahead
* rap
;
3757 struct clios iostate
;
3758 struct cl_extent extent
;
3760 int take_reference
= 1;
3761 int policy
= IOPOL_DEFAULT
;
3762 boolean_t iolock_inited
= FALSE
;
3764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 32)) | DBG_FUNC_START
,
3765 (int)uio
->uio_offset
, io_req_size
, (int)filesize
, flags
, 0);
3767 if (flags
& IO_ENCRYPTED
) {
3768 panic ("encrypted blocks will hit UBC!");
3771 policy
= throttle_get_io_policy(NULL
);
3773 if (policy
== THROTTLE_LEVEL_TIER3
|| policy
== THROTTLE_LEVEL_TIER2
|| (flags
& IO_NOCACHE
))
3776 if (flags
& IO_PASSIVE
)
3781 if (flags
& IO_NOCACHE
)
3782 bflag
|= CL_NOCACHE
;
3784 if (flags
& IO_SKIP_ENCRYPTION
)
3785 bflag
|= CL_ENCRYPTED
;
3787 max_io_size
= cluster_max_io_size(vp
->v_mount
, CL_READ
);
3788 max_prefetch
= MAX_PREFETCH(vp
, max_io_size
, disk_conditioner_mount_is_ssd(vp
->v_mount
));
3789 max_rd_size
= max_prefetch
;
3791 last_request_offset
= uio
->uio_offset
+ io_req_size
;
3793 if (last_request_offset
> filesize
)
3794 last_request_offset
= filesize
;
3796 if ((flags
& (IO_RAOFF
|IO_NOCACHE
)) || ((last_request_offset
& ~PAGE_MASK_64
) == (uio
->uio_offset
& ~PAGE_MASK_64
))) {
3797 rd_ahead_enabled
= 0;
3800 if (cluster_is_throttled(vp
)) {
3802 * we're in the throttle window, at the very least
3803 * we want to limit the size of the I/O we're about
3806 rd_ahead_enabled
= 0;
3807 prefetch_enabled
= 0;
3809 max_rd_size
= THROTTLE_MAX_IOSIZE
;
3811 if ((rap
= cluster_get_rap(vp
)) == NULL
)
3812 rd_ahead_enabled
= 0;
3814 extent
.b_addr
= uio
->uio_offset
/ PAGE_SIZE_64
;
3815 extent
.e_addr
= (last_request_offset
- 1) / PAGE_SIZE_64
;
3818 if (rap
!= NULL
&& rap
->cl_ralen
&& (rap
->cl_lastr
== extent
.b_addr
|| (rap
->cl_lastr
+ 1) == extent
.b_addr
)) {
3820 * determine if we already have a read-ahead in the pipe courtesy of the
3821 * last read systemcall that was issued...
3822 * if so, pick up it's extent to determine where we should start
3823 * with respect to any read-ahead that might be necessary to
3824 * garner all the data needed to complete this read systemcall
3826 last_ioread_offset
= (rap
->cl_maxra
* PAGE_SIZE_64
) + PAGE_SIZE_64
;
3828 if (last_ioread_offset
< uio
->uio_offset
)
3829 last_ioread_offset
= (off_t
)0;
3830 else if (last_ioread_offset
> last_request_offset
)
3831 last_ioread_offset
= last_request_offset
;
3833 last_ioread_offset
= (off_t
)0;
3835 while (io_req_size
&& uio
->uio_offset
< filesize
&& retval
== 0) {
3837 max_size
= filesize
- uio
->uio_offset
;
3839 if ((off_t
)(io_req_size
) < max_size
)
3840 io_size
= io_req_size
;
3844 if (!(flags
& IO_NOCACHE
)) {
3848 u_int32_t io_requested
;
3851 * if we keep finding the pages we need already in the cache, then
3852 * don't bother to call cluster_read_prefetch since it costs CPU cycles
3853 * to determine that we have all the pages we need... once we miss in
3854 * the cache and have issued an I/O, than we'll assume that we're likely
3855 * to continue to miss in the cache and it's to our advantage to try and prefetch
3857 if (last_request_offset
&& last_ioread_offset
&& (size_of_prefetch
= (last_request_offset
- last_ioread_offset
))) {
3858 if ((last_ioread_offset
- uio
->uio_offset
) <= max_rd_size
&& prefetch_enabled
) {
3860 * we've already issued I/O for this request and
3861 * there's still work to do and
3862 * our prefetch stream is running dry, so issue a
3863 * pre-fetch I/O... the I/O latency will overlap
3864 * with the copying of the data
3866 if (size_of_prefetch
> max_rd_size
)
3867 size_of_prefetch
= max_rd_size
;
3869 size_of_prefetch
= cluster_read_prefetch(vp
, last_ioread_offset
, size_of_prefetch
, filesize
, callback
, callback_arg
, bflag
);
3871 last_ioread_offset
+= (off_t
)(size_of_prefetch
* PAGE_SIZE
);
3873 if (last_ioread_offset
> last_request_offset
)
3874 last_ioread_offset
= last_request_offset
;
3878 * limit the size of the copy we're about to do so that
3879 * we can notice that our I/O pipe is running dry and
3880 * get the next I/O issued before it does go dry
3882 if (last_ioread_offset
&& io_size
> (max_io_size
/ 4))
3883 io_resid
= (max_io_size
/ 4);
3887 io_requested
= io_resid
;
3889 retval
= cluster_copy_ubc_data_internal(vp
, uio
, (int *)&io_resid
, 0, take_reference
);
3891 xsize
= io_requested
- io_resid
;
3894 io_req_size
-= xsize
;
3896 if (retval
|| io_resid
)
3898 * if we run into a real error or
3899 * a page that is not in the cache
3900 * we need to leave streaming mode
3904 if (rd_ahead_enabled
&& (io_size
== 0 || last_ioread_offset
== last_request_offset
)) {
3906 * we're already finished the I/O for this read request
3907 * let's see if we should do a read-ahead
3909 cluster_read_ahead(vp
, &extent
, filesize
, rap
, callback
, callback_arg
, bflag
);
		if (extent.e_addr < rap->cl_lastr)
			rap->cl_lastr = extent.e_addr;
		/*
		 * recompute max_size since cluster_copy_ubc_data_internal
		 * may have advanced uio->uio_offset
		 */
		max_size = filesize - uio->uio_offset;

		iostate.io_completed = 0;
		iostate.io_issued = 0;
		iostate.io_error = 0;
		iostate.io_wanted = 0;

		if ( (flags & IO_RETURN_ON_THROTTLE) ) {
			if (cluster_is_throttled(vp) == THROTTLE_NOW) {
				if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
					/*
					 * we're in the throttle window and at least 1 I/O
					 * has already been issued by a throttleable thread
					 * in this window, so return with EAGAIN to indicate
					 * to the FS issuing the cluster_read call that it
					 * should now throttle after dropping any locks
					 */
					throttle_info_update_by_mount(vp->v_mount);
				}
			}
		}
		/*
		 * compute the size of the upl needed to encompass
		 * the requested read... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		upl_f_offset = uio->uio_offset - (off_t)start_offset;

		if (io_size > max_rd_size)
			io_size = max_rd_size;

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (flags & IO_NOCACHE) {
			if (upl_size > max_io_size)
				upl_size = max_io_size;
		} else {
			if (upl_size > max_io_size / 4) {
				upl_size = max_io_size / 4;
				upl_size &= ~PAGE_MASK;

				if (upl_size == 0)
					upl_size = PAGE_SIZE;
			}
		}
		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl_kernel(vp,
					     upl_f_offset,
					     upl_size,
					     &upl,
					     &pl,
					     UPL_FILE_IO | UPL_SET_LITE,
					     VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS)
			panic("cluster_read_copy: failed to get pagelist");

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);

		/*
		 * scan from the beginning of the upl looking for the first
		 * non-valid page.... this will become the first page in
		 * the request we're going to make to 'cluster_io'... if all
		 * of the pages are valid, we won't call through to 'cluster_io'
		 */
		for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
			if (!upl_valid_page(pl, start_pg))
				break;
		}

		/*
		 * scan from the starting invalid page looking for a valid
		 * page before the end of the upl is reached, if we
		 * find one, then it will be the last page of the request to
		 * 'cluster_io'
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (upl_valid_page(pl, last_pg))
				break;
		}

		if (start_pg < last_pg) {
			/*
			 * we found a range of 'invalid' pages that must be filled
			 * if the last page in this range is the last page of the file
			 * we may have to clip the size of it to keep from reading past
			 * the end of the last physical block associated with the file
			 */
			if (iolock_inited == FALSE) {
				lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

				iolock_inited = TRUE;
			}
			upl_offset = start_pg * PAGE_SIZE;
			io_size    = (last_pg - start_pg) * PAGE_SIZE;

			if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
				io_size = filesize - (upl_f_offset + upl_offset);

			/*
			 * issue an asynchronous read to cluster_io
			 */
			error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
					   io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
			if (extent.e_addr < rap->cl_maxra) {
				/*
				 * we've just issued a read for a block that should have been
				 * in the cache courtesy of the read-ahead engine... something
				 * has gone wrong with the pipeline, so reset the read-ahead
				 * logic which will cause us to restart from scratch
				 */
				rap->cl_maxra = 0;
			}
		}
		if (error == 0) {
			/*
			 * if the read completed successfully, or there was no I/O request
			 * issued, then copy the data into user land via 'cluster_upl_copy_data'
			 * we'll first add on any 'valid'
			 * pages that were present in the upl when we acquired it.
			 */
			for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
				if (!upl_valid_page(pl, uio_last))
					break;
			}
			if (uio_last < pages_in_upl) {
				/*
				 * there were some invalid pages beyond the valid pages
				 * that we didn't issue an I/O for, just release them
				 * unchanged now, so that any prefetch/read-ahead can
				 * fill them in
				 */
				ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
						    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
			}
			/*
			 * compute size to transfer this round, if io_req_size is
			 * still non-zero after this attempt, we'll loop around and
			 * set up for another I/O.
			 */
			val_size = (uio_last * PAGE_SIZE) - start_offset;

			if (val_size > max_size)
				val_size = max_size;

			if (val_size > io_req_size)
				val_size = io_req_size;

			if ((uio->uio_offset + val_size) > last_ioread_offset)
				last_ioread_offset = uio->uio_offset + val_size;

			if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {

				if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
					/*
					 * if there's still I/O left to do for this request, and...
					 * we're not in hard throttle mode, and...
					 * we're close to using up the previous prefetch, then issue a
					 * new pre-fetch I/O... the I/O latency will overlap
					 * with the copying of the data
					 */
					if (size_of_prefetch > max_rd_size)
						size_of_prefetch = max_rd_size;

					size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

					last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

					if (last_ioread_offset > last_request_offset)
						last_ioread_offset = last_request_offset;
				}
			} else if ((uio->uio_offset + val_size) == last_request_offset) {
				/*
				 * this transfer will finish this request, so...
				 * let's try to read ahead if we're in
				 * a sequential access pattern and we haven't
				 * explicitly disabled it
				 */
				if (rd_ahead_enabled)
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);

				if (extent.e_addr < rap->cl_lastr)
					rap->cl_lastr = extent.e_addr;
			}
			if (iolock_inited == TRUE)
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");

			if (iostate.io_error)
				error = iostate.io_error;
			else {
				u_int32_t io_requested;

				io_requested = val_size;

				retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);

				io_req_size -= (val_size - io_requested);
			}
		} else {
			if (iolock_inited == TRUE)
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
		}
		if (start_pg < last_pg) {
			/*
			 * compute the range of pages that we actually issued an I/O for
			 * and either commit them as valid if the I/O succeeded
			 * or abort them if the I/O failed or we're not supposed to
			 * keep them in the cache
			 */
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);

			if (error || (flags & IO_NOCACHE))
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
						    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
			else {
				int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;

				if (take_reference)
					commit_flags |= UPL_COMMIT_INACTIVATE;
				else
					commit_flags |= UPL_COMMIT_SPECULATE;

				ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
		}
		if ((last_pg - start_pg) < pages_in_upl) {
			/*
			 * the set of pages that we issued an I/O for did not encompass
			 * the entire upl... so just release these without modifying
			 * their state
			 */
			if (error)
				ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
			else {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
					     upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

				/*
				 * handle any valid pages at the beginning of
				 * the upl... release these appropriately
				 */
				cluster_read_upl_release(upl, 0, start_pg, take_reference);

				/*
				 * handle any valid pages immediately after the
				 * pages we issued I/O for... ... release these appropriately
				 */
				cluster_read_upl_release(upl, last_pg, uio_last, take_reference);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
			}
		}
		if (io_req_size) {
			if (cluster_is_throttled(vp)) {
				/*
				 * we're in the throttle window, at the very least
				 * we want to limit the size of the I/O we're about
				 * to issue
				 */
				rd_ahead_enabled = 0;
				prefetch_enabled = 0;

				max_rd_size = THROTTLE_MAX_IOSIZE;
			} else {
				if (max_rd_size == THROTTLE_MAX_IOSIZE) {
					/*
					 * coming out of throttled state
					 */
					if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
						rd_ahead_enabled = 1;
						prefetch_enabled = 1;
					}
					max_rd_size = max_prefetch;
					last_ioread_offset = 0;
				}
			}
		}
	}
	if (iolock_inited == TRUE) {
		/*
		 * cluster_io returned an error after it
		 * had already issued some I/O... we need
		 * to wait for that I/O to complete before
		 * we can destroy the iostate mutex...
		 * 'retval' already contains the early error
		 * so no need to pick it up from iostate.io_error
		 */
		cluster_iostate_wait(&iostate, 0, "cluster_read_copy");

		lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	}
	if (rap != NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
			     (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);

		lck_mtx_unlock(&rap->cl_lockr);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
			     (int)uio->uio_offset, io_req_size, 0, retval, 0);
	}

	return (retval);
}
/*
 * We don't want another read/write lock for every vnode in the system
 * so we keep a hash of them here.  There should never be very many of
 * these around at any point in time.
 */
cl_direct_read_lock_t *cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
{
	struct cl_direct_read_locks *head
		= &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
					% CL_DIRECT_READ_LOCK_BUCKETS];

	struct cl_direct_read_lock *lck, *new_lck = NULL;

	for (;;) {
		lck_spin_lock(&cl_direct_read_spin_lock);

		LIST_FOREACH(lck, head, chain) {
			if (lck->vp == vp) {
				++lck->ref_count;
				lck_spin_unlock(&cl_direct_read_spin_lock);
				if (new_lck) {
					// Someone beat us to it, ditch the allocation
					lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp);
					FREE(new_lck, M_TEMP);
				}
				lck_rw_lock(&lck->rw_lock, type);
				return lck;
			}
		}

		if (new_lck) {
			// Use the lock we allocated
			LIST_INSERT_HEAD(head, new_lck, chain);
			lck_spin_unlock(&cl_direct_read_spin_lock);
			lck_rw_lock(&new_lck->rw_lock, type);
			return new_lck;
		}

		lck_spin_unlock(&cl_direct_read_spin_lock);

		// Allocate a new lock
		MALLOC(new_lck, cl_direct_read_lock_t *, sizeof(*new_lck),
		       M_TEMP, M_WAITOK);
		lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr);
		new_lck->vp = vp;
		new_lck->ref_count = 1;

		// Got to go round again
	}
}

void cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
{
	lck_rw_done(&lck->rw_lock);

	lck_spin_lock(&cl_direct_read_spin_lock);
	if (lck->ref_count == 1) {
		LIST_REMOVE(lck, chain);
		lck_spin_unlock(&cl_direct_read_spin_lock);
		lck_rw_destroy(&lck->rw_lock, cl_mtx_grp);
		FREE(lck, M_TEMP);
	} else {
		--lck->ref_count;
		lck_spin_unlock(&cl_direct_read_spin_lock);
	}
}
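
/*
 * Illustrative sketch (not part of the original source): the intended usage
 * pattern for the two helpers above, as exercised by the direct read path
 * below.  A caller takes the per-vnode rw lock shared around the window
 * between checking the UBC and handing the I/O to cluster_io, then drops it:
 *
 *	cl_direct_read_lock_t *lock;
 *
 *	lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
 *	ubc_range_op(vp, offset, offset + size, UPL_ROP_ABSENT, &io_size);
 *	// ... create the UPL and call cluster_io() ...
 *	cluster_unlock_direct_read(lock);
 *
 * Clients that need the checked range to stay stable until the I/O is
 * actually issued would take the lock exclusive instead of shared.
 */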
static int
cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
		    int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t	 *pl;
	upl_t		 upl = NULL;
	off_t		 max_io_size;
	vm_offset_t	 upl_offset, vector_upl_offset = 0;
	upl_size_t	 upl_size, vector_upl_size = 0;
	vm_size_t	 upl_needed_size;
	unsigned int	 pages_in_pl;
	upl_control_flags_t upl_flags;
	kern_return_t	 kret = KERN_SUCCESS;
	unsigned int	 i;
	int		 force_data_sync;
	int		 retval = 0;
	int		 no_zero_fill = 0;
	int		 io_flag = 0;
	int		 misaligned = 0;
	struct clios	 iostate;
	user_addr_t	 iov_base;
	u_int32_t	 io_req_size;
	u_int32_t	 offset_in_file;
	u_int32_t	 offset_in_iovbase;
	u_int32_t	 io_size;
	u_int32_t	 io_start;
	u_int32_t	 io_min;
	u_int32_t	 xsize;
	u_int32_t	 devblocksize;
	u_int32_t	 mem_alignment_mask;
	u_int32_t	 max_upl_size;
	u_int32_t	 max_rd_size;
	u_int32_t	 max_rd_ahead;
	u_int32_t	 max_vector_size;
	boolean_t	 strict_uncached_IO = FALSE;
	boolean_t	 io_throttled = FALSE;

	u_int32_t	 vector_upl_iosize = 0;
	int		 issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
	off_t		 v_upl_uio_offset = 0;
	int		 vector_upl_index = 0;
	upl_t		 vector_upl = NULL;
	cl_direct_read_lock_t *lock = NULL;

	user_addr_t	 orig_iov_base = 0;
	user_addr_t	 last_iov_base = 0;
	user_addr_t	 next_iov_base = 0;
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
		     (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);

	max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);

	max_rd_size = max_upl_size;
	max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);

	io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;

	if (flags & IO_PASSIVE)
		io_flag |= CL_PASSIVE;

	if (flags & IO_ENCRYPTED) {
		io_flag |= CL_RAW_ENCRYPTED;
	}

	if (flags & IO_NOCACHE) {
		io_flag |= CL_NOCACHE;
	}

	if (flags & IO_SKIP_ENCRYPTION)
		io_flag |= CL_ENCRYPTED;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
		     (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
	if (devblocksize == 1) {
		/*
		 * the AFP client advertises a devblocksize of 1
		 * however, its BLOCKMAP routine maps to physical
		 * blocks that are PAGE_SIZE in size...
		 * therefore we can't ask for I/Os that aren't page aligned
		 * or aren't multiples of PAGE_SIZE in size
		 * by setting devblocksize to PAGE_SIZE, we re-instate
		 * the old behavior we had before the mem_alignment_mask
		 * changes went in...
		 */
		devblocksize = PAGE_SIZE;
	}

	strict_uncached_IO = ubc_strict_uncached_IO(vp);

	orig_iov_base = uio_curriovbase(uio);
	last_iov_base = orig_iov_base;

next_dread:
	io_req_size = *read_length;
	iov_base = uio_curriovbase(uio);

	offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;

	if (offset_in_file || offset_in_iovbase) {
		/*
		 * one of the 2 important offsets is misaligned
		 * so fire an I/O through the cache for this entire vector
		 */
		misaligned = 1;
	}
	if (iov_base & (devblocksize - 1)) {
		/*
		 * the offset in memory must be on a device block boundary
		 * so that we can guarantee that we can generate an
		 * I/O that ends on a page boundary in cluster_io
		 */
		misaligned = 1;
	}

	max_io_size = filesize - uio->uio_offset;

	/*
	 * The user must request IO in aligned chunks.  If the
	 * offset into the file is bad, or the userland pointer
	 * is non-aligned, then we cannot service the encrypted IO request.
	 */
	if (flags & IO_ENCRYPTED) {
		if (misaligned || (io_req_size & (devblocksize - 1)))
			retval = EINVAL;

		max_io_size = roundup(max_io_size, devblocksize);
	}

	if ((off_t)io_req_size > max_io_size)
		io_req_size = max_io_size;

	/*
	 * When we get to this point, we know...
	 * -- the offset into the file is on a devblocksize boundary
	 */

	while (io_req_size && retval == 0) {
		if (cluster_is_throttled(vp)) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			max_rd_size = THROTTLE_MAX_IOSIZE;
			max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
			max_vector_size = THROTTLE_MAX_IOSIZE;
		} else {
			max_rd_size = max_upl_size;
			max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
			max_vector_size = MAX_VECTOR_UPL_SIZE;
		}
		io_start = io_size = io_req_size;

		/*
		 * First look for pages already in the cache
		 * and move them to user space.  But only do this
		 * check if we are not retrieving encrypted data directly
		 * from the filesystem;  those blocks should never
		 * be in the UBC.
		 *
		 * cluster_copy_ubc_data returns the resid
		 * in io_size
		 */
		if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
			retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
		}
		/*
		 * calculate the number of bytes actually copied
		 * starting size - residual
		 */
		xsize = io_start - io_size;

		io_req_size -= xsize;

		if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
			/*
			 * We found something in the cache or we have an iov_base that's not
			 * page-aligned.
			 *
			 * Issue all I/O's that have been collected within this Vectored UPL.
			 */
			if (vector_upl_index) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}

			/*
			 * After this point, if we are using the Vector UPL path and the base is
			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
			 */
		}

		/*
		 * check to see if we are finished with this request.
		 *
		 * If we satisfied this IO already, then io_req_size will be 0.
		 * Otherwise, see if the IO was mis-aligned and needs to go through
		 * the UBC to deal with the 'tail'.
		 */
		if (io_req_size == 0 || (misaligned)) {
			/*
			 * see if there's another uio vector to
			 * process that's of type IO_DIRECT
			 *
			 * break out of while loop to get there
			 */
			break;
		}
		/*
		 * assume the request ends on a device block boundary
		 */
		io_min = devblocksize;

		/*
		 * we can handle I/O's in multiples of the device block size
		 * however, if io_size isn't a multiple of devblocksize we
		 * want to clip it back to the nearest page boundary since
		 * we are going to have to go through cluster_read_copy to
		 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
		 * multiple, we avoid asking the drive for the same physical
		 * blocks twice.. once for the partial page at the end of the
		 * request and a 2nd time for the page we read into the cache
		 * (which overlaps the end of the direct read) in order to
		 * get at the overhang bytes
		 */
		if (io_size & (devblocksize - 1)) {
			assert(!(flags & IO_ENCRYPTED));
			/*
			 * Clip the request to the previous page size boundary
			 * since request does NOT end on a device block boundary
			 */
			io_size &= ~PAGE_MASK;
			io_min = PAGE_SIZE;
		}
		if (retval || io_size < io_min) {
			/*
			 * either an error or we only have the tail left to
			 * complete via the copy path...
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dreads;
		}

		/*
		 * Don't re-check the UBC data if we are looking for uncached IO
		 * or asking for encrypted blocks.
		 */
		if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {

			if ((xsize = io_size) > max_rd_size)
				xsize = max_rd_size;

			io_size = 0;

			if (!lock) {
				/*
				 * We hold a lock here between the time we check the
				 * cache and the time we issue I/O.  This saves us
				 * from having to lock the pages in the cache.  Not
				 * all clients will care about this lock but some
				 * clients may want to guarantee stability between
				 * here and when the I/O is issued in which case they
				 * will take the lock exclusively.
				 */
				lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
			}

			ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);

			if (io_size == 0) {
				/*
				 * a page must have just come into the cache
				 * since the first page in this range is no
				 * longer absent, go back and re-evaluate
				 */
				continue;
			}
		}
		if ( (flags & IO_RETURN_ON_THROTTLE) ) {
			if (cluster_is_throttled(vp) == THROTTLE_NOW) {
				if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
					/*
					 * we're in the throttle window and at least 1 I/O
					 * has already been issued by a throttleable thread
					 * in this window, so return with EAGAIN to indicate
					 * to the FS issuing the cluster_read call that it
					 * should now throttle after dropping any locks
					 */
					throttle_info_update_by_mount(vp->v_mount);

					io_throttled = TRUE;
					goto wait_for_dreads;
				}
			}
		}
		if (io_size > max_rd_size)
			io_size = max_rd_size;

		iov_base = uio_curriovbase(uio);

		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
		if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0))
			no_zero_fill = 1;
		else
			no_zero_fill = 0;

		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			pages_in_pl = 0;
			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
			if (no_zero_fill)
				upl_flags |= UPL_NOZEROFILL;
			if (force_data_sync)
				upl_flags |= UPL_FORCE_DATA_SYNC;

			kret = vm_map_create_upl(map,
						 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
						 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
					     (int)upl_offset, upl_size, io_size, kret, 0);
				/*
				 * failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_dreads;
			}
			pages_in_pl = upl_size / PAGE_SIZE;
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_page_present(pl, i))
					break;
			}
			if (i == pages_in_pl)
				break;

			ubc_upl_abort(upl, 0);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
				     (int)upl_offset, upl_size, io_size, kret, 0);

			goto wait_for_dreads;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size < upl_needed_size) {
			if (upl_size && upl_offset == 0)
				io_size = upl_size;
			else
				io_size = 0;
		}
		if (io_size == 0) {
			ubc_upl_abort(upl, 0);
			goto wait_for_dreads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, io_size, kret, 0);
		if (useVectorUPL) {
			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
			if (end_off)
				issueVectorUPL = 1;
			/*
			 * After this point, if we are using a vector UPL, then
			 * either all the UPL elements end on a page boundary OR
			 * this UPL is the last element because it does not end
			 * on a page boundary.
			 */
		}

		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next read
		 */
		cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");

		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error
			 * don't issue any more reads, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			ubc_upl_abort(upl, 0);

			goto wait_for_dreads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
			     upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);

		if (!useVectorUPL) {
			if (no_zero_fill)
				io_flag &= ~CL_PRESERVE;
			else
				io_flag |= CL_PRESERVE;

			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);

		} else {

			if (!vector_upl_index) {
				vector_upl = vector_upl_create(upl_offset);
				v_upl_uio_offset = uio->uio_offset;
				vector_upl_offset = upl_offset;
			}

			vector_upl_set_subupl(vector_upl, upl, upl_size);
			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
			vector_upl_index++;

			vector_upl_size += upl_size;
			vector_upl_iosize += io_size;

			if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}
		}
		last_iov_base = iov_base + io_size;

		if (lock) {
			// We don't need to wait for the I/O to complete
			cluster_unlock_direct_read(lock);
			lock = NULL;
		}
		/*
		 * update the uio structure
		 */
		if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
			uio_update(uio, (user_size_t)max_io_size);
		} else {
			uio_update(uio, (user_size_t)io_size);
		}

		io_req_size -= io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
			     upl, (int)uio->uio_offset, io_req_size, retval, 0);

	} /* end while */

	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {

		retval = cluster_io_type(uio, read_type, read_length, 0);

		if (retval == 0 && *read_type == IO_DIRECT) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
				     (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);

			goto next_dread;
		}
	}

wait_for_dreads:

	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		reset_vector_run_state();
	}

	// We don't need to wait for the I/O to complete
	if (lock)
		cluster_unlock_direct_read(lock);

	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we return
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_read_direct");

	if (iostate.io_error)
		retval = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	if (io_throttled == TRUE && retval == 0)
		retval = EAGAIN;

	for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) {
		/*
		 * This is specifically done for pmap accounting purposes.
		 * vm_pre_fault() will call vm_fault() to enter the page into
		 * the pmap if there isn't _a_ physical page for that VA already.
		 */
		vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK));
	}

	if (io_req_size && retval == 0) {
		/*
		 * we couldn't handle the tail of this request in DIRECT mode
		 * so fire it through the copy path
		 */
		retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);

		*read_type = IO_UNKNOWN;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);

	return (retval);
}
static int
cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
		    int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
	upl_page_info_t *pl;
	upl_t            upl[MAX_VECTS];
	vm_offset_t      upl_offset;
	addr64_t	 dst_paddr = 0;
	user_addr_t	 iov_base;
	off_t		 max_size;
	upl_size_t	 upl_size;
	vm_size_t	 upl_needed_size;
	mach_msg_type_number_t	pages_in_pl;
	upl_control_flags_t upl_flags;
	kern_return_t    kret;
	struct clios     iostate;
	int              error = 0;
	int		 cur_upl = 0;
	int		 num_upl = 0;
	int		 n;
	u_int32_t	 xsize;
	u_int32_t	 io_size;
	u_int32_t	 devblocksize;
	u_int32_t	 mem_alignment_mask;
	u_int32_t	 tail_size = 0;
	int              bflag;

	if (flags & IO_PASSIVE)
		bflag = CL_PASSIVE;
	else
		bflag = 0;

	if (flags & IO_NOCACHE)
		bflag |= CL_NOCACHE;

	/*
	 * When we enter this routine, we know
	 *  -- the read_length will not exceed the current iov_len
	 *  -- the target address is physically contiguous for read_length
	 */
	cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
next_cread:
	io_size = *read_length;

	max_size = filesize - uio->uio_offset;

	if (io_size > max_size)
		io_size = max_size;

	iov_base = uio_curriovbase(uio);

	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	upl_needed_size = upl_offset + io_size;

	pages_in_pl = 0;
	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
		     (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);

	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	kret = vm_map_get_upl(map,
			      (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
			      &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
		     (int)upl_offset, upl_size, io_size, kret, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * failed to get pagelist
		 */
		error = EINVAL;
		goto wait_for_creads;
	}
	num_upl++;

	if (upl_size < upl_needed_size) {
		/*
		 * The upl_size wasn't satisfied.
		 */
		error = EINVAL;
		goto wait_for_creads;
	}
	pl = ubc_upl_pageinfo(upl[cur_upl]);

	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		u_int32_t   head_size;

		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size)
			head_size = io_size;

		error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);

		if (error)
			goto wait_for_creads;

		upl_offset += head_size;
		dst_paddr  += head_size;
		io_size    -= head_size;

		iov_base   += head_size;
	}
	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/*
		 * request doesn't set up on a memory boundary
		 * the underlying DMA engine can handle...
		 * return an error instead of going through
		 * the slow copy path since the intent of this
		 * path is direct I/O to device memory
		 */
		error = EINVAL;
		goto wait_for_creads;
	}

	tail_size = io_size & (devblocksize - 1);

	io_size -= tail_size;

	while (io_size && error == 0) {

		if (io_size > MAX_IO_CONTIG_SIZE)
			xsize = MAX_IO_CONTIG_SIZE;
		else
			xsize = io_size;
		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since its all issued against the same UPL
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next
		 */
		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");

		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error
			 * don't issue any more reads...
			 * go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			goto wait_for_creads;
		}
		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
				   CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
				   (buf_t)NULL, &iostate, callback, callback_arg);
		/*
		 * The cluster_io read was issued successfully,
		 * update the uio structure
		 */
		if (error == 0) {
			uio_update(uio, (user_size_t)xsize);

			dst_paddr  += xsize;
			upl_offset += xsize;
			io_size    -= xsize;
		}
	}
	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {

		error = cluster_io_type(uio, read_type, read_length, 0);

		if (error == 0 && *read_type == IO_CONTIG) {
			cur_upl++;
			goto next_cread;
		}
	} else
		*read_type = IO_UNKNOWN;

wait_for_creads:
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we proceed
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_read_contig");

	if (iostate.io_error)
		error = iostate.io_error;

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);

	if (error == 0 && tail_size)
		error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);

	for (n = 0; n < num_upl; n++)
		/*
		 * just release our hold on each physically contiguous
		 * region without changing any state
		 */
		ubc_upl_abort(upl[n], 0);

	return (error);
}
static int
cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
{
	user_size_t	 iov_len;
	user_addr_t	 iov_base = 0;
	upl_t		 upl;
	upl_size_t	 upl_size;
	upl_control_flags_t upl_flags;
	int		 retval = 0;

	/*
	 * skip over any empty vectors
	 */
	uio_update(uio, (user_size_t)0);

	iov_len = uio_curriovlen(uio);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);

	if (iov_len) {
		iov_base = uio_curriovbase(uio);
		/*
		 * make sure the size of the vector isn't too big...
		 * internally, we want to handle all of the I/O in
		 * chunk sizes that fit in a 32 bit int
		 */
		if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE)
			upl_size = MAX_IO_REQUEST_SIZE;
		else
			upl_size = (u_int32_t)iov_len;

		upl_flags = UPL_QUERY_OBJECT_TYPE;

		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
		if ((vm_map_get_upl(map,
				    (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
				    &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
			/*
			 * the user app must have passed in an invalid address
			 */
			retval = EFAULT;
		}

		*io_length = upl_size;

		if (upl_flags & UPL_PHYS_CONTIG)
			*io_type = IO_CONTIG;
		else if (iov_len >= min_length)
			*io_type = IO_DIRECT;
		else
			*io_type = IO_COPY;
	} else {
		/*
		 * nothing left to do for this uio
		 */
		*io_length = 0;
		*io_type   = IO_UNKNOWN;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);

	return (retval);
}
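
/*
 * Usage sketch (illustrative only, not from the original source): callers
 * such as cluster_read_direct() and cluster_read_contig() use
 * cluster_io_type() to classify the next uio vector before deciding how to
 * service it:
 *
 *	int		read_type = IO_UNKNOWN;
 *	u_int32_t	read_length = 0;
 *
 *	if (cluster_io_type(uio, &read_type, &read_length, 0) == 0) {
 *		switch (read_type) {
 *		case IO_CONTIG:		// physically contiguous target buffer
 *		case IO_DIRECT:		// at least min_length bytes, go direct
 *		case IO_UNKNOWN:	// nothing left in this uio
 *		default:		// fall back to the cached copy path
 *			break;
 *		}
 *	}
 *
 * IO_CONTIG is reported when the backing memory is physically contiguous
 * (UPL_PHYS_CONTIG); IO_DIRECT when the vector is at least min_length bytes.
 */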
/*
 * generate advisory I/O's in the largest chunks possible
 * the completed pages will be released into the VM cache
 */
int
advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
{
	return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
}

int
advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	upl_page_info_t *pl;
	upl_t            upl;
	vm_offset_t      upl_offset;
	int	         upl_size;
	off_t 	         upl_f_offset;
	int		 start_offset;
	int	         start_pg;
	int		 last_pg;
	int              pages_in_upl;
	off_t            max_size;
	int              io_size;
	kern_return_t    kret;
	int              retval = 0;
	int              issued_io;
	int              skip_range;
	uint32_t         max_io_size;

	if ( !UBCINFOEXISTS(vp))
		return (EINVAL);

	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);

#if CONFIG_EMBEDDED
	if (max_io_size > speculative_prefetch_max_iosize)
		max_io_size = speculative_prefetch_max_iosize;
#else
	if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
		if (max_io_size > speculative_prefetch_max_iosize)
			max_io_size = speculative_prefetch_max_iosize;
	}
#endif

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
		     (int)f_offset, resid, (int)filesize, 0, 0);

	while (resid && f_offset < filesize && retval == 0) {
		/*
		 * compute the size of the upl needed to encompass
		 * the requested read... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		start_offset = (int)(f_offset & PAGE_MASK_64);
		upl_f_offset = f_offset - (off_t)start_offset;
		max_size     = filesize - f_offset;

		if (resid < max_size)
			io_size = resid;
		else
			io_size = max_size;

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		if ((uint32_t)upl_size > max_io_size)
			upl_size = max_io_size;
		skip_range = 0;
		/*
		 * return the number of contiguously present pages in the cache
		 * starting at upl_f_offset within the file
		 */
		ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);

		if (skip_range) {
			/*
			 * skip over pages already present in the cache
			 */
			io_size = skip_range - start_offset;

			f_offset += io_size;
			resid    -= io_size;

			if (skip_range == upl_size)
				continue;
			/*
			 * have to issue some real I/O
			 * at this point, we know it's starting on a page boundary
			 * because we've skipped over at least the first page in the request
			 */
			start_offset = 0;
			upl_f_offset += skip_range;
			upl_size     -= skip_range;
		}
		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl_kernel(vp,
					     upl_f_offset,
					     upl_size,
					     &upl,
					     &pl,
					     UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
					     VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS)
			return (retval);
		issued_io = 0;

		/*
		 * before we start marching forward, we must make sure we end on
		 * a present page, otherwise we will be working with a freed
		 * upl
		 */
		for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
			if (upl_page_present(pl, last_pg))
				break;
		}
		pages_in_upl = last_pg + 1;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
			     upl, (int)upl_f_offset, upl_size, start_offset, 0);

		for (last_pg = 0; last_pg < pages_in_upl; ) {
			/*
			 * scan from the beginning of the upl looking for the first
			 * page that is present.... this will become the first page in
			 * the request we're going to make to 'cluster_io'... if all
			 * of the pages are absent, we won't call through to 'cluster_io'
			 */
			for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
				if (upl_page_present(pl, start_pg))
					break;
			}

			/*
			 * scan from the starting present page looking for an absent
			 * page before the end of the upl is reached, if we
			 * find one, then it will terminate the range of pages being
			 * presented to 'cluster_io'
			 */
			for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
				if (!upl_page_present(pl, last_pg))
					break;
			}

			if (last_pg > start_pg) {
				/*
				 * we found a range of pages that must be filled
				 * if the last page in this range is the last page of the file
				 * we may have to clip the size of it to keep from reading past
				 * the end of the last physical block associated with the file
				 */
				upl_offset = start_pg * PAGE_SIZE;
				io_size    = (last_pg - start_pg) * PAGE_SIZE;

				if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
					io_size = filesize - (upl_f_offset + upl_offset);

				/*
				 * issue an asynchronous read to cluster_io
				 */
				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
						    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

				issued_io = 1;
			}
		}
		if (issued_io == 0)
			ubc_upl_abort(upl, 0);

		io_size = upl_size - start_offset;

		if (io_size > resid)
			io_size = resid;
		f_offset += io_size;
		resid    -= io_size;
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
		     (int)f_offset, resid, retval, 0, 0);

	return (retval);
}
int
cluster_push(vnode_t vp, int flags)
{
	return cluster_push_ext(vp, flags, NULL, NULL);
}


int
cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	return cluster_push_err(vp, flags, callback, callback_arg, NULL);
}

/* write errors via err, but return the number of clusters written */
int
cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
{
	int	retval;
	int	my_sparse_wait = 0;
	struct	cl_writebehind *wbp;

	if ( !UBCINFOEXISTS(vp)) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
		return (0);
	}
	/* return if deferred write is set */
	if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
		return (0);
	}
	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
		return (0);
	}
	if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
		lck_mtx_unlock(&wbp->cl_lockw);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
		return (0);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
		     wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
	/*
	 * if we have an fsync in progress, we don't want to allow any additional
	 * sync/fsync/close(s) to occur until it finishes.
	 * note that its possible for writes to continue to occur to this file
	 * while we're waiting and also once the fsync starts to clean if we're
	 * in the sparse map case
	 */
	while (wbp->cl_sparse_wait) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);

		msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
	}
	if (flags & IO_SYNC) {
		my_sparse_wait = 1;
		wbp->cl_sparse_wait = 1;

		/*
		 * this is an fsync (or equivalent)... we must wait for any existing async
		 * cleaning operations to complete before we evaluate the current state
		 * and finish cleaning... this ensures that all writes issued before this
		 * fsync actually get cleaned to the disk before this fsync returns
		 */
		while (wbp->cl_sparse_pushes) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);

			msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
		}
	}
	if (wbp->cl_scmap) {
		void	*scmap;

		if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {

			scmap = wbp->cl_scmap;
			wbp->cl_scmap = NULL;

			wbp->cl_sparse_pushes++;

			lck_mtx_unlock(&wbp->cl_lockw);

			retval = sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);

			lck_mtx_lock(&wbp->cl_lockw);

			wbp->cl_sparse_pushes--;

			if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0)
				wakeup((caddr_t)&wbp->cl_sparse_pushes);
		} else {
			retval = sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);
		}
	} else {
		retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, err);
	}
	lck_mtx_unlock(&wbp->cl_lockw);

	if (flags & IO_SYNC)
		(void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");

	if (my_sparse_wait) {
		/*
		 * I'm the owner of the serialization token
		 * clear it and wakeup anyone that is waiting
		 * for me to finish
		 */
		lck_mtx_lock(&wbp->cl_lockw);

		wbp->cl_sparse_wait = 0;
		wakeup((caddr_t)&wbp->cl_sparse_wait);

		lck_mtx_unlock(&wbp->cl_lockw);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
		     wbp->cl_scmap, wbp->cl_number, retval, 0, 0);

	return (retval);
}
__private_extern__ void
cluster_release(struct ubc_info *ubc)
{
	struct cl_writebehind *wbp;
	struct cl_readahead   *rap;

	if ((wbp = ubc->cl_wbehind)) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);

		if (wbp->cl_scmap)
			vfs_drt_control(&(wbp->cl_scmap), 0);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
	}

	rap = ubc->cl_rahead;

	if (wbp != NULL) {
		lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
		FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
	}
	if ((rap = ubc->cl_rahead)) {
		lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
		FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
	}
	ubc->cl_rahead  = NULL;
	ubc->cl_wbehind = NULL;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
}
static int
cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
{
	int	cl_index;
	int	cl_index1;
	int	min_index;
	int	cl_len;
	int	cl_pushed = 0;
	struct cl_wextent l_clusters[MAX_CLUSTERS];
	u_int	max_cluster_pgcount;
	int	error = 0;

	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	/*
	 * the write behind context exists and has
	 * already been locked...
	 */
	if (wbp->cl_number == 0)
		/*
		 * no clusters to push
		 * return number of empty slots
		 */
		return (MAX_CLUSTERS);

	/*
	 * make a local 'sorted' copy of the clusters
	 * and clear wbp->cl_number so that new clusters can
	 * be developed
	 */
	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
			if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
				continue;
			if (min_index == -1)
				min_index = cl_index1;
			else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
				min_index = cl_index1;
		}
		if (min_index == -1)
			break;

		l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
		l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
		l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;

		wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
	}
	wbp->cl_number = 0;

	cl_len = cl_index;
	/* skip switching to the sparse cluster mechanism if on diskimage */
	if ( ((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
	     !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) ) {
		int	i;

		/*
		 * determine if we appear to be writing the file sequentially
		 * if not, by returning without having pushed any clusters
		 * we will cause this vnode to be pushed into the sparse cluster mechanism
		 * used for managing more random I/O patterns
		 *
		 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
		 * that's why we're in try_push with PUSH_DELAY...
		 *
		 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
		 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
		 * so we can just make a simple pass through, up to, but not including the last one...
		 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
		 * are sequential
		 *
		 * we let the last one be partial as long as it was adjacent to the previous one...
		 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
		 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
		 */
		for (i = 0; i < MAX_CLUSTERS - 1; i++) {
			if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount)
				goto dont_try;
			if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
				goto dont_try;
		}
	}
	for (cl_index = 0; cl_index < cl_len; cl_index++) {
		int	flags;
		struct	cl_extent cl;
		int	retval;

		flags = io_flags & (IO_PASSIVE|IO_CLOSE);

		/*
		 * try to push each cluster in turn...
		 */
		if (l_clusters[cl_index].io_flags & CLW_IONOCACHE)
			flags |= IO_NOCACHE;

		if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE)
			flags |= IO_PASSIVE;

		if (push_flag & PUSH_SYNC)
			flags |= IO_SYNC;

		cl.b_addr = l_clusters[cl_index].b_addr;
		cl.e_addr = l_clusters[cl_index].e_addr;

		retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg);

		if (error == 0 && retval)
			error = retval;

		l_clusters[cl_index].b_addr = 0;
		l_clusters[cl_index].e_addr = 0;

		cl_pushed++;

		if ( !(push_flag & PUSH_ALL) )
			break;
	}
	if (err)
		*err = error;

dont_try:
	if (cl_len > cl_pushed) {
		/*
		 * we didn't push all of the clusters, so
		 * lets try to merge them back in to the vnode
		 */
		if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
			/*
			 * we picked up some new clusters while we were trying to
			 * push the old ones... this can happen because I've dropped
			 * the vnode lock... the sum of the
			 * leftovers plus the new cluster count exceeds our ability
			 * to represent them, so switch to the sparse cluster mechanism
			 *
			 * collect the active public clusters...
			 */
			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);

			for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
					continue;
				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			wbp->cl_number = cl_index1;

			/*
			 * and collect the original clusters that were moved into the
			 * local storage for sorting purposes
			 */
			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
		} else {
			/*
			 * we've got room to merge the leftovers back in
			 * just append them starting at the next 'hole'
			 * represented by wbp->cl_number
			 */
			for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
					continue;

				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			wbp->cl_number = cl_index1;
		}
	}
	return (MAX_CLUSTERS - wbp->cl_number);
}
static int
cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t *pl;
	upl_t            upl;
	vm_offset_t      upl_offset;
	int              upl_size;
	off_t 	         upl_f_offset;
	int              pages_in_upl;
	int              start_pg;
	int              last_pg;
	int              io_size;
	int              io_flags;
	int              upl_flags;
	int              bflag;
	int              size;
	int              error = 0;
	int              retval;
	kern_return_t    kret;

	if (flags & IO_PASSIVE)
		bflag = CL_PASSIVE;
	else
		bflag = 0;

	if (flags & IO_SKIP_ENCRYPTION)
		bflag |= CL_ENCRYPTED;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
		     (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);

	if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);

		return (0);
	}
	upl_size = pages_in_upl * PAGE_SIZE;
	upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);

	if (upl_f_offset + upl_size >= EOF) {

		if (upl_f_offset >= EOF) {
			/*
			 * must have truncated the file and missed
			 * clearing a dangling cluster (i.e. it's completely
			 * beyond the new EOF
			 */
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);

			return (0);
		}
		size = EOF - upl_f_offset;

		upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		pages_in_upl = upl_size / PAGE_SIZE;
	} else
		size = upl_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);

	/*
	 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
	 *
	 * - only pages that are currently dirty are returned... these are the ones we need to clean
	 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
	 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
	 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
	 *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
	 *
	 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
	 */
	if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
	else
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;

	kret = ubc_create_upl_kernel(vp,
				     upl_f_offset,
				     upl_size,
				     &upl,
				     &pl,
				     upl_flags,
				     VM_KERN_MEMORY_FILE);
	if (kret != KERN_SUCCESS)
		panic("cluster_push: failed to get pagelist");

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
	/*
	 * since we only asked for the dirty pages back
	 * it's possible that we may only get a few or even none, so...
	 * before we start marching forward, we must make sure we know
	 * where the last present page is in the UPL, otherwise we could
	 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
	 * employed by commit_range and abort_range.
	 */
	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
		if (upl_page_present(pl, last_pg))
			break;
	}
	pages_in_upl = last_pg + 1;

	if (pages_in_upl == 0) {
		ubc_upl_abort(upl, 0);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
		return (0);
	}

	for (last_pg = 0; last_pg < pages_in_upl; ) {
		/*
		 * find the next dirty page in the UPL
		 * this will become the first page in the
		 * next I/O to generate
		 */
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
			if (upl_dirty_page(pl, start_pg))
				break;
			if (upl_page_present(pl, start_pg))
				/*
				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
				 * just release these unchanged since we're not going
				 * to steal them or change their state
				 */
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
		}
		if (start_pg >= pages_in_upl)
			/*
			 * done... no more dirty pages to push
			 */
			break;
		if (start_pg > last_pg)
			/*
			 * skipped over some non-dirty pages
			 */
			size -= ((start_pg - last_pg) * PAGE_SIZE);

		/*
		 * find a range of dirty pages to write
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (!upl_dirty_page(pl, last_pg))
				break;
		}
		upl_offset = start_pg * PAGE_SIZE;

		io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

		io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;

		if ( !(flags & IO_SYNC))
			io_flags |= CL_ASYNC;

		if (flags & IO_CLOSE)
			io_flags |= CL_CLOSE;

		if (flags & IO_NOCACHE)
			io_flags |= CL_NOCACHE;

		retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
				    io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

		if (error == 0 && retval)
			error = retval;

		size -= io_size;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);

	return (error);
}
/*
 * sparse_cluster_switch is called with the write behind lock held
 */
static void
sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
{
	int	cl_index;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0);

	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		int	flags;
		struct cl_extent cl;

		for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {

			if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
				if (flags & UPL_POP_DIRTY) {
					cl.e_addr = cl.b_addr + 1;

					sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg);
				}
			}
		}
	}
	wbp->cl_number = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0);
}
/*
 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
 * still associated with the write-behind context... however, if the scmap has been disassociated
 * from the write-behind context (the cluster_push case), the wb lock is not held
 */
static int
sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	struct cl_extent cl;
	off_t		offset;
	u_int		length;
	int		error = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);

	if (push_flag & PUSH_ALL)
		vfs_drt_control(scmap, 1);

	for (;;) {
		int	retval;

		if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS)
			break;

		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);

		retval = cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg);
		if (error == 0 && retval)
			error = retval;

		if ( !(push_flag & PUSH_ALL) )
			break;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0);

	return error;
}
/*
 * sparse_cluster_add is called with the write behind lock held
 */
static void
sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
{
	u_int	new_dirty;
	u_int	length;
	off_t	offset;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);

	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;

	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */
		sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg);

		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0);
}
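
/*
 * Worked example (illustrative, not from the original source): if
 * vfs_drt_mark_pages() can only record the first 'new_dirty' pages of the
 * requested range before running out of room, the loop above pushes some
 * clusters and retries with the remainder.  E.g. for a 16 page range of
 * which only 10 pages were recorded:
 *
 *	offset += 10 * PAGE_SIZE_64;	// skip the pages already recorded
 *	length -= 10 * PAGE_SIZE;	// 6 pages left to mark
 *
 * so the next vfs_drt_mark_pages() call starts exactly where the previous
 * one stopped.
 */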
static int
cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t  *pl;
	upl_t            upl;
	addr64_t	 ubc_paddr;
	kern_return_t    kret;
	int              error = 0;
	int		 did_read = 0;
	int		 abort_flags;
	int		 upl_flags;
	int              bflag;

	if (flags & IO_PASSIVE)
		bflag = CL_PASSIVE;
	else
		bflag = 0;

	if (flags & IO_NOCACHE)
		bflag |= CL_NOCACHE;

	upl_flags = UPL_SET_LITE;

	if ( !(flags & CL_READ) ) {
		/*
		 * "write" operation:  let the UPL subsystem know
		 * that we intend to modify the buffer cache pages
		 * we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	} else {
		/*
		 * indicate that there is no need to pull the
		 * mapping for this page... we're only going
		 * to read from it, not modify it.
		 */
		upl_flags |= UPL_FILE_IO;
	}
	kret = ubc_create_upl_kernel(vp,
				     uio->uio_offset & ~PAGE_MASK_64,
				     PAGE_SIZE,
				     &upl,
				     &pl,
				     upl_flags,
				     VM_KERN_MEMORY_FILE);

	if (kret != KERN_SUCCESS)
		return (EINVAL);

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
				   CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
		if (error) {
			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

			return (error);
		}
		did_read = 1;
	}
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

/*
 *	NOTE:  There is no prototype for the following in BSD. It, and the definitions
 *	of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
 *	osfmk/ppc/mappings.h.  They are not included here because there appears to be no
 *	way to do so without exporting them to kexts as well.
 */
	if (flags & CL_READ)
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize,        2 |        1 |        4);	/* Copy physical to physical and flush the destination */
	else
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize,        2 |        1 |        8);	/* Copy physical to physical and flush the source */

	if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
				   bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	}
	if (error == 0)
		uio_update(uio, (user_size_t)xsize);

	if (did_read)
		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	else
		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);

	return (error);
}
int
cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
{
	int		pg_offset;
	int		pg_index;
	int		csize;
	int		segflg;
	int		retval = 0;
	int		xsize;
	upl_page_info_t *pl;
	int		dirty_count;

	xsize = *io_resid;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
		     (int)uio->uio_offset, upl_offset, xsize, 0, 0);

	segflg = uio->uio_segflg;

	switch(segflg) {

	  case UIO_USERSPACE32:
	  case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	  case UIO_USERSPACE:
	  case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	  case UIO_USERSPACE64:
	  case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	  case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}
	pl = ubc_upl_pageinfo(upl);

	pg_index  = upl_offset / PAGE_SIZE;
	pg_offset = upl_offset & PAGE_MASK;
	csize     = min(PAGE_SIZE - pg_offset, xsize);

	dirty_count = 0;
	while (xsize && retval == 0) {
		addr64_t  paddr;

		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
		if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE))
			dirty_count++;

		retval = uiomove64(paddr, csize, uio);

		pg_index += 1;
		pg_offset = 0;
		xsize    -= csize;
		csize     = min(PAGE_SIZE, xsize);
	}
	*io_resid = xsize;

	uio->uio_segflg = segflg;

	task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		     (int)uio->uio_offset, xsize, retval, segflg, 0);

	return (retval);
}
int
cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
{
    return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1));
}
static int
cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
{
    int       segflg;
    int       io_size;
    int       xsize;
    int       start_offset;
    int       retval = 0;
    memory_object_control_t control;

    io_size = *io_resid;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
                 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);

    control = ubc_getobject(vp, UBC_FLAGS_NONE);

    if (control == MEMORY_OBJECT_CONTROL_NULL) {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
                     (int)uio->uio_offset, io_size, retval, 3, 0);

        return (0);
    }
    segflg = uio->uio_segflg;

    switch (segflg) {

    case UIO_USERSPACE32:
    case UIO_USERISPACE32:
        uio->uio_segflg = UIO_PHYS_USERSPACE32;
        break;

    case UIO_USERSPACE64:
    case UIO_USERISPACE64:
        uio->uio_segflg = UIO_PHYS_USERSPACE64;
        break;

    case UIO_USERSPACE:
    case UIO_USERISPACE:
        uio->uio_segflg = UIO_PHYS_USERSPACE;
        break;

    case UIO_SYSSPACE:
        uio->uio_segflg = UIO_PHYS_SYSSPACE;
        break;
    }

    if ( (io_size = *io_resid) ) {
        start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
        xsize = uio_resid(uio);

        retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
                                               start_offset, io_size, mark_dirty, take_reference);
        xsize   -= uio_resid(uio);
        io_size -= xsize;
    }
    uio->uio_segflg = segflg;
    *io_resid       = io_size;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
                 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);

    return (retval);
}
int
is_file_clean(vnode_t vp, off_t filesize)
{
    off_t f_offset;
    int   flags;
    int   total_dirty = 0;

    for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
        if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
            if (flags & UPL_POP_DIRTY) {
                total_dirty++;
            }
        }
    }
    if (total_dirty)
        return (EINVAL);

    return (0);
}
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */
/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES	((1024 * 1024) / PAGE_SIZE)
/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK	(~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
#define DRT_ALIGN_ADDRESS(addr)	((addr) & DRT_ADDRESS_MASK)
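
/*
 * Worked example of the two definitions above (assuming a 4KB page size,
 * which is not guaranteed; PAGE_SIZE is only known at boot): DRT_BITVECTOR_PAGES
 * is (1024 * 1024) / 4096 = 256, so each hashtable entry covers a 1MB-aligned
 * window of the file and DRT_ADDRESS_MASK is ~0xFFFFF.  A dirty page at file
 * offset 0x2345000 therefore belongs to the entry whose aligned address is
 * DRT_ALIGN_ADDRESS(0x2345000) = 0x2300000.
 */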
/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space and improve density of the mapping.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a) \
	do { \
		(scm)->scm_hashtable[(i)].dhe_control = \
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c) \
	do { \
		(scm)->scm_hashtable[(i)].dhe_control = \
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
	} while (0)
#define DRT_HASH_CLEAR(scm, i) \
	do { \
		(scm)->scm_hashtable[(i)].dhe_control = 0; \
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i) \
	do { \
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
		DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
	} while (0)
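
/*
 * Example of how dhe_control is packed by the macros above: the aligned file
 * address lives in the high-order bits selected by DRT_ADDRESS_MASK, and the
 * page count lives in the low-order 9 bits selected by DRT_HASH_COUNT_MASK
 * (0x1ff).  With 4KB pages the count ranges from 0 to DRT_BITVECTOR_PAGES
 * (256), and the value 0x1ff is reserved as the "vacant" sentinel used by
 * DRT_HASH_VACATE / DRT_HASH_VACANT.  Since the aligned address has its low
 * 20 bits clear in that configuration, the two fields never overlap.
 */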
/*
 * Hash table moduli.
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes.
 *
 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

/*
 * Physical memory required before the large hash modulus is permitted.
 *
 * On small memory systems, the large hash modulus can lead to physical
 * memory starvation, so we avoid using it there.
 */
#define DRT_HASH_LARGE_MEMORY_REQUIRED	(1024LL * 1024LL * 1024LL)	/* 1GiB */

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
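
/*
 * Size check for the moduli above (4KB pages assumed): each hashtable entry
 * is 8 bytes of dhe_control plus a 256-bit bitvector (32 bytes), i.e. 40
 * bytes.  23 entries consume 23 * 40 = 920 bytes of the 1024-byte small
 * allocation and 401 entries consume 401 * 40 = 16040 bytes of the
 * 16384-byte large allocation, which is where the 104- and 344-byte spare
 * figures come from; the fixed clustermap header fits within that spare.
 */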
/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */

/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 */

#define DRT_HASH_SET_BIT(scm, i, bit) \
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit) \
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))

#define DRT_BITVECTOR_CLEAR(scm, i) \
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
	      &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
	      (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
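
/*
 * Indexing example for the bit macros above: page 40 within an entry's
 * window maps to word 40 / 32 = 1 of dhe_bitvector and bit 40 % 32 = 8
 * within that word, so DRT_HASH_SET_BIT(scm, i, 40) ORs dhe_bitvector[1]
 * with (1 << 8).
 */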
struct vfs_drt_hashentry {
	u_int64_t	dhe_control;
/*
 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
 * DRT_BITVECTOR_PAGES is defined as ((1024 * 1024) / PAGE_SIZE).
 * Since PAGE_SIZE is only known at boot time,
 *	- define MAX_DRT_BITVECTOR_PAGES for the smallest supported page size (4k)
 *	- declare the dhe_bitvector array for the largest possible length
 */
#define MAX_DRT_BITVECTOR_PAGES	((1024 * 1024) / (4 * 1024))
	u_int32_t	dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
};
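
/*
 * With the smallest supported page size (4KB), MAX_DRT_BITVECTOR_PAGES is
 * 256, so dhe_bitvector is 256 / 32 = 8 words (32 bytes) and every entry is
 * sized for the worst case.  On larger page sizes only the first
 * DRT_BITVECTOR_PAGES / 32 words are actually touched by the bitvector
 * macros (e.g. 2 words for 16KB pages), since fewer pages fit in the fixed
 * 1MB window.
 */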
/*
 * Dirty Region Tracking structure.
 *
 * The hashtable is allocated entirely inside the DRT structure.
 *
 * The hash is a simple circular prime modulus arrangement, the structure
 * is resized from small to large if it overflows.
 */
struct vfs_drt_clustermap {
	u_int32_t		scm_magic;	/* sanity/detection */
#define DRT_SCM_MAGIC		0x12020003
	u_int32_t		scm_modulus;	/* current ring size */
	u_int32_t		scm_buckets;	/* number of occupied buckets */
	u_int32_t		scm_lastclean;	/* last entry we cleaned */
	u_int32_t		scm_iskips;	/* number of slot skips */

	struct vfs_drt_hashentry scm_hashtable[0];
};


#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)
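
/*
 * Probe example: with the small modulus (23), an entry for the aligned
 * window at 0x2300000 hashes to bucket DRT_HASH(scm, 0x2300000) =
 * 0x2300000 % 23 = 3.  On a collision, the lookup code below walks
 * DRT_HASH_NEXT until it finds the matching address or a vacant slot,
 * wrapping circularly at scm_modulus.
 */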
/*
 * Debugging codes and arguments.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
							    * dirty */
							   /* 1 (clean, no map) */
							   /* 2 (map alloc fail) */
							   /* 3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
							    * lastclean, iskips */
static kern_return_t	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
static kern_return_t	vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
static kern_return_t	vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
	u_int64_t offset, int *indexp);
static kern_return_t	vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
	u_int64_t offset, int *indexp, int recursed);
static kern_return_t	vfs_drt_do_mark_pages(
	void		**cmapp,
	u_int64_t	offset,
	u_int		length,
	u_int		*setcountp,
	int		dirty);
static void		vfs_drt_trace(
	struct vfs_drt_clustermap *cmap,
	int code,
	int arg1,
	int arg2,
	int arg3,
	int arg4);


/*
 * Allocate and initialise a sparse cluster map.
 *
 * Will allocate a new map, resize or compact an existing map.
 *
 * XXX we should probably have at least one intermediate map size,
 * as the 1:16 ratio seems a bit drastic.
 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
    struct vfs_drt_clustermap *cmap, *ocmap;
    kern_return_t  kret;
    u_int64_t      offset;
    u_int32_t      i;
    int            nsize, active_buckets, index, copycount;

    ocmap = NULL;
    if (cmapp != NULL)
        ocmap = *cmapp;

    /*
     * Decide on the size of the new map.
     */
    if (ocmap == NULL) {
        nsize = DRT_HASH_SMALL_MODULUS;
    } else {
        /* count the number of active buckets in the old map */
        active_buckets = 0;
        for (i = 0; i < ocmap->scm_modulus; i++) {
            if (!DRT_HASH_VACANT(ocmap, i) &&
                (DRT_HASH_GET_COUNT(ocmap, i) != 0))
                active_buckets++;
        }
        /*
         * If we're currently using the small allocation, check to
         * see whether we should grow to the large one.
         */
        if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
            /*
             * If the ring is nearly full and we are allowed to
             * use the large modulus, upgrade.
             */
            if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
                (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
                nsize = DRT_HASH_LARGE_MODULUS;
            } else {
                nsize = DRT_HASH_SMALL_MODULUS;
            }
        } else {
            /* already using the large modulus */
            nsize = DRT_HASH_LARGE_MODULUS;
            /*
             * If the ring is completely full, there's
             * nothing useful for us to do.  Behave as
             * though we had compacted into the new
             * array and return.
             */
            if (active_buckets >= DRT_HASH_LARGE_MODULUS)
                return (KERN_SUCCESS);
        }
    }

    /*
     * Allocate and initialise the new map.
     */

    kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
        (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION, VM_KERN_MEMORY_FILE);
    if (kret != KERN_SUCCESS)
        return (kret);
    cmap->scm_magic = DRT_SCM_MAGIC;
    cmap->scm_modulus = nsize;
    cmap->scm_buckets = 0;
    cmap->scm_lastclean = 0;
    cmap->scm_iskips = 0;
    for (i = 0; i < cmap->scm_modulus; i++) {
        DRT_HASH_CLEAR(cmap, i);
        DRT_HASH_VACATE(cmap, i);
        DRT_BITVECTOR_CLEAR(cmap, i);
    }

    /*
     * If there's an old map, re-hash entries from it into the new map.
     */
    copycount = 0;
    if (ocmap != NULL) {
        for (i = 0; i < ocmap->scm_modulus; i++) {
            /* skip empty buckets */
            if (DRT_HASH_VACANT(ocmap, i) ||
                (DRT_HASH_GET_COUNT(ocmap, i) == 0))
                continue;
            /* get the slot for this entry in the new map */
            offset = DRT_HASH_GET_ADDRESS(ocmap, i);
            kret = vfs_drt_get_index(&cmap, offset, &index, 1);
            if (kret != KERN_SUCCESS) {
                /* XXX need to bail out gracefully here */
                panic("vfs_drt: new cluster map mysteriously too small");
            }
            /* copy the bucket across */
            DRT_HASH_COPY(ocmap, i, cmap, index);
            copycount++;
        }
    }

    /* log what we've done */
    vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

    /*
     * It's important to ensure that *cmapp always points to
     * a valid map, so we must overwrite it before freeing
     * the old map.
     */
    *cmapp = cmap;
    if (ocmap != NULL) {
        /* emit stats into trace buffer */
        vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
                      ocmap->scm_modulus,
                      ocmap->scm_buckets,
                      ocmap->scm_lastclean,
                      ocmap->scm_iskips);

        vfs_drt_free_map(ocmap);
    }
    return (KERN_SUCCESS);
}
/*
 * Free a sparse cluster map.
 */
static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
{
    kmem_free(kernel_map, (vm_offset_t)cmap,
              (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
    return (KERN_SUCCESS);
}
/*
 * Find the hashtable slot currently occupied by an entry for the supplied offset.
 */
static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
{
    int        index;
    u_int32_t  i;

    offset = DRT_ALIGN_ADDRESS(offset);
    index = DRT_HASH(cmap, offset);

    /* traverse the hashtable */
    for (i = 0; i < cmap->scm_modulus; i++) {

        /*
         * If the slot is vacant, we can stop.
         */
        if (DRT_HASH_VACANT(cmap, index))
            break;

        /*
         * If the address matches our offset, we have success.
         */
        if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
            *indexp = index;
            return (KERN_SUCCESS);
        }

        /*
         * Move to the next slot, try again.
         */
        index = DRT_HASH_NEXT(cmap, index);
    }
    /*
     * It's not there.
     */
    return (KERN_FAILURE);
}
/*
 * Find the hashtable slot for the supplied offset.  If we haven't allocated
 * one yet, allocate one and populate the address field.  Note that it will
 * not have a nonzero page count and thus will still technically be free, so
 * in the case where we are called to clean pages, the slot will remain free.
 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
    struct vfs_drt_clustermap *cmap;
    kern_return_t  kret;
    u_int32_t      index;
    u_int32_t      i;

    cmap = *cmapp;

    /* look for an existing entry */
    kret = vfs_drt_search_index(cmap, offset, indexp);
    if (kret == KERN_SUCCESS)
        return (kret);

    /* need to allocate an entry */
    offset = DRT_ALIGN_ADDRESS(offset);
    index = DRT_HASH(cmap, offset);

    /* scan from the index forwards looking for a vacant slot */
    for (i = 0; i < cmap->scm_modulus; i++) {
        /* slot vacant? */
        if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
            cmap->scm_buckets++;
            if (index < cmap->scm_lastclean)
                cmap->scm_lastclean = index;
            DRT_HASH_SET_ADDRESS(cmap, index, offset);
            DRT_HASH_SET_COUNT(cmap, index, 0);
            DRT_BITVECTOR_CLEAR(cmap, index);
            *indexp = index;
            vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
            return (KERN_SUCCESS);
        }
        cmap->scm_iskips += i;
        index = DRT_HASH_NEXT(cmap, index);
    }

    /*
     * We haven't found a vacant slot, so the map is full.  If we're not
     * already recursed, try reallocating/compacting it.
     */
    if (recursed)
        return (KERN_FAILURE);
    kret = vfs_drt_alloc_map(cmapp);
    if (kret == KERN_SUCCESS) {
        /* now try to insert again */
        kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
    }
    return (kret);
}
/*
 * Implementation of set dirty/clean.
 *
 * In the 'clean' case, not finding a map is OK.
 */
static kern_return_t
vfs_drt_do_mark_pages(
    void       **private,
    u_int64_t  offset,
    u_int      length,
    u_int      *setcountp,
    int        dirty)
{
    struct vfs_drt_clustermap *cmap, **cmapp;
    kern_return_t  kret;
    int            i, index, pgoff, pgcount, setcount, ecount;

    cmapp = (struct vfs_drt_clustermap **)private;
    cmap = *cmapp;

    vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

    if (setcountp != NULL)
        *setcountp = 0;

    /* allocate a cluster map if we don't already have one */
    if (cmap == NULL) {
        /* no cluster map, nothing to clean */
        if (!dirty) {
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
            return (KERN_SUCCESS);
        }
        kret = vfs_drt_alloc_map(cmapp);
        if (kret != KERN_SUCCESS) {
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
            return (kret);
        }
    }
    setcount = 0;

    /*
     * Iterate over the length of the region.
     */
    while (length > 0) {
        /*
         * Get the hashtable index for this offset.
         *
         * XXX this will add blank entries if we are clearing a range
         * that hasn't been dirtied.
         */
        kret = vfs_drt_get_index(cmapp, offset, &index, 0);
        cmap = *cmapp;	/* may have changed! */
        /* this may be a partial-success return */
        if (kret != KERN_SUCCESS) {
            if (setcountp != NULL)
                *setcountp = setcount;
            vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

            return (kret);
        }

        /*
         * Work out how many pages we're modifying in this
         * hashtable entry.
         */
        pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
        pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

        /*
         * Iterate over pages, dirty/clearing as we go.
         */
        ecount = DRT_HASH_GET_COUNT(cmap, index);
        for (i = 0; i < pgcount; i++) {
            if (dirty) {
                if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
                    DRT_HASH_SET_BIT(cmap, index, pgoff + i);
                    ecount++;
                    setcount++;
                }
            } else {
                if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
                    DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
                    ecount--;
                    setcount++;
                }
            }
        }
        DRT_HASH_SET_COUNT(cmap, index, ecount);

        offset += pgcount * PAGE_SIZE;
        length -= pgcount * PAGE_SIZE;
    }
    if (setcountp != NULL)
        *setcountp = setcount;

    vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

    return (KERN_SUCCESS);
}
/*
 * Mark a set of pages as dirty/clean.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage suitable for holding a pointer.  Note that
 *	this must either be NULL or a value set by this function.
 *
 * size
 *	Current file size in bytes.
 *
 * offset
 *	Offset of the first page to be marked as dirty, in bytes.  Must be
 *	page-aligned.
 *
 * length
 *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
 *
 * setcountp
 *	Number of pages newly marked dirty by this call (optional).
 *
 * Returns KERN_SUCCESS if all the pages were successfully marked.
 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
    /* XXX size unused, drop from interface */
    return (vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
}

static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
    return (vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
}
/*
 * Get a cluster of dirty pages.
 *
 * This is a public interface.
 *
 * cmapp
 *	Pointer to storage managed by drt_mark_pages.  Note that this must
 *	be NULL or a value set by drt_mark_pages.
 *
 * offsetp
 *	Returns the byte offset into the file of the first page in the cluster.
 *
 * lengthp
 *	Returns the length in bytes of the cluster of dirty pages.
 *
 * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria.  Private storage will
 * be released if there are no more dirty pages left in the map.
 *
 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
    struct vfs_drt_clustermap *cmap;
    u_int64_t  offset;
    u_int      length;
    u_int32_t  j;
    int        index, i, fs, ls;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL))
        return (KERN_FAILURE);
    cmap = *cmapp;

    /* walk the hashtable */
    for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
        index = DRT_HASH(cmap, offset);

        if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
            continue;

        /* scan the bitfield for a string of bits */
        fs = -1;

        for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
            if (DRT_HASH_TEST_BIT(cmap, index, i)) {
                fs = i;
                break;
            }
        }
        if (fs == -1) {
            /* didn't find any bits set */
            panic("vfs_drt: entry summary count > 0 but no bits set in map");
        }
        for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
            if (!DRT_HASH_TEST_BIT(cmap, index, i))
                break;
        }

        /* compute offset and length, mark pages clean */
        offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
        length = ls * PAGE_SIZE;
        vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
        cmap->scm_lastclean = index;

        /* return successful */
        *offsetp = (off_t)offset;
        *lengthp = length;

        vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
        return (KERN_SUCCESS);
    }
    /*
     * We didn't find anything... hashtable is empty
     * emit stats into trace buffer and
     * then free it
     */
    vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
                  cmap->scm_modulus,
                  cmap->scm_buckets,
                  cmap->scm_lastclean,
                  cmap->scm_iskips);

    vfs_drt_free_map(cmap);
    *cmapp = NULL;

    return (KERN_FAILURE);
}
static kern_return_t
vfs_drt_control(void **cmapp, int op_type)
{
    struct vfs_drt_clustermap *cmap;

    /* sanity */
    if ((cmapp == NULL) || (*cmapp == NULL))
        return (KERN_FAILURE);
    cmap = *cmapp;

    switch (op_type) {
    case 0:
        /* emit stats into trace buffer */
        vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
                      cmap->scm_modulus,
                      cmap->scm_buckets,
                      cmap->scm_lastclean,
                      cmap->scm_iskips);

        vfs_drt_free_map(cmap);
        *cmapp = NULL;
        break;

    case 1:
        cmap->scm_lastclean = 0;
        break;
    }
    return (KERN_SUCCESS);
}
/*
 * Emit a summary of the state of the clustermap into the trace buffer
 * along with some caller-provided data.
 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
    KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
              __unused int arg1, __unused int arg2, __unused int arg3,
              __unused int arg4)
{
}
#endif

/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
    int index, i;
    int bits_on;

    for (index = 0; index < cmap->scm_modulus; index++) {
        if (DRT_HASH_VACANT(cmap, index))
            continue;

        for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
            if (DRT_HASH_TEST_BIT(cmap, index, i))
                bits_on++;
        }
        if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
            panic("bits_on = %d, index = %d\n", bits_on, index);