/*
 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <mach/upl.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_fault.h>

#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>

#include <sys/sdt.h>

#include <stdbool.h>

#include <vfs/vfs_disk_conditioner.h>

#if 0
#undef KERNEL_DEBUG
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif

#define CL_READ         0x01
#define CL_WRITE        0x02
#define CL_ASYNC        0x04
#define CL_COMMIT       0x08
#define CL_PAGEOUT      0x10
#define CL_AGE          0x20
#define CL_NOZERO       0x40
#define CL_PAGEIN       0x80
#define CL_DEV_MEMORY   0x100
#define CL_PRESERVE     0x200
#define CL_THROTTLE     0x400
#define CL_KEEPCACHED   0x800
#define CL_DIRECT_IO    0x1000
#define CL_PASSIVE      0x2000
#define CL_IOSTREAMING  0x4000
#define CL_CLOSE        0x8000
#define CL_ENCRYPTED    0x10000
#define CL_RAW_ENCRYPTED        0x20000
#define CL_NOCACHE      0x40000

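/*
 * Illustrative note (editor's sketch, not from the original source): callers
 * compose these flags per request.  For example, a page-in request might
 * arrive with (CL_READ | CL_PAGEIN | CL_COMMIT) set, while an asynchronous
 * cached write might add CL_ASYNC and CL_THROTTLE; the exact combinations
 * are chosen by the cluster entry points defined later in this file.
 */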
#define MAX_VECTOR_UPL_ELEMENTS 8
#define MAX_VECTOR_UPL_SIZE     (2 * MAX_UPL_SIZE_BYTES)

#define CLUSTER_IO_WAITING      ((buf_t)1)

extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
extern void vector_upl_set_pagelist(upl_t);
extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);

struct clios {
	lck_mtx_t io_mtxp;
	u_int  io_completed;       /* amount of io that has currently completed */
	u_int  io_issued;          /* amount of io that was successfully issued */
	int    io_error;           /* error code of first error encountered */
	int    io_wanted;          /* someone is sleeping waiting for a change in state */
};

struct cl_direct_read_lock {
	LIST_ENTRY(cl_direct_read_lock)  chain;
	int32_t                          ref_count;
	vnode_t                          vp;
	lck_rw_t                         rw_lock;
};

#define CL_DIRECT_READ_LOCK_BUCKETS 61

static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
    cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];

static lck_spin_t cl_direct_read_spin_lock;

static lck_grp_t        *cl_mtx_grp;
static lck_attr_t       *cl_mtx_attr;
static lck_grp_attr_t   *cl_mtx_grp_attr;
static lck_mtx_t        *cl_transaction_mtxp;

#define IO_UNKNOWN      0
#define IO_DIRECT       1
#define IO_CONTIG       2
#define IO_COPY         3

#define PUSH_DELAY      0x01
#define PUSH_ALL        0x02
#define PUSH_SYNC       0x04


static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
    int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
static int cluster_is_throttled(vnode_t vp);

static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);

static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
    int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
    int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
    int (*)(buf_t, void *), void *callback_arg, int flags);

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
    off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
    int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
    int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);

static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
    off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
    int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);

static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
    void *callback_arg, int *err, boolean_t vm_initiated);

static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
    int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
    int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);
static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);

/*
 * For throttled IO to check whether
 * a block is cached by the boot cache
 * and thus it can avoid delaying the IO.
 *
 * bootcache_contains_block is initially
 * NULL. The BootCache will set it while
 * the cache is active and clear it when
 * the cache is jettisoned.
 *
 * Returns 0 if the block is not
 * contained in the cache, 1 if it is
 * contained.
 *
 * The function pointer remains valid
 * after the cache has been evicted even
 * if bootcache_contains_block has been
 * cleared.
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;

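/*
 * Editor's sketch (not part of the original source): how an external
 * boot-cache provider is expected to publish and retire the hook above.
 * The provider functions below are hypothetical; only the
 * bootcache_contains_block pointer itself is real.
 */
#if 0
static int
my_bootcache_contains_block(dev_t device, u_int64_t blkno)
{
	/* return 1 if (device, blkno) is resident in the cache, else 0 */
	return 0;
}

static void
my_bootcache_activate(void)
{
	bootcache_contains_block = my_bootcache_contains_block;
}

static void
my_bootcache_jettison(void)
{
	bootcache_contains_block = NULL;
}
#endif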

/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE     (1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE      MAX_UPL_SIZE_BYTES
#define MAX_VECTS               16
/*
 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
 * allowing the caller to bypass the buffer cache.  For small I/Os (less than 16k),
 * we have not historically allowed the write to bypass the UBC.
 */
#define MIN_DIRECT_WRITE_SIZE   (16384)

#define WRITE_THROTTLE          6
#define WRITE_THROTTLE_SSD      2
#define WRITE_BEHIND            1
#define WRITE_BEHIND_SSD        1

#if CONFIG_EMBEDDED
#define PREFETCH                1
#define PREFETCH_SSD            1
uint32_t speculative_prefetch_max = (2048 * 1024);              /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use in a speculative read-ahead */
#else
#define PREFETCH                3
#define PREFETCH_SSD            2
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);   /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use in a speculative read-ahead on SSDs*/
#endif


#define IO_SCALE(vp, base)              (vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)            (cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)  (size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))

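/*
 * Worked example (editor's note, using illustrative values): on a non-SSD
 * mount with mnt_ioscale == 1, IO_SCALE(vp, PREFETCH) evaluates to 3, so
 * MAX_PREFETCH(vp, MAX_CLUSTER_SIZE(vp), FALSE) permits read-ahead of up to
 * three maximal clusters; on an SSD the factor drops to PREFETCH_SSD (2),
 * and speculative_prefetch_max still caps the total number of bytes.
 */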
0a7de745 296int speculative_reads_disabled = 0;
2d21ac55 297
1c79356b
A
298/*
299 * throttle the number of async writes that
300 * can be outstanding on a single vnode
0a7de745 301 * before we issue a synchronous write
1c79356b 302 */
0a7de745 303#define THROTTLE_MAXCNT 0
316670eb 304
39236c6e 305uint32_t throttle_max_iosize = (128 * 1024);
316670eb 306
39236c6e
A
307#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
308
309SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
316670eb 310
55e303ae 311
void
cluster_init(void)
{
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();

	cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_transaction_mtxp == NULL) {
		panic("cluster_init: failed to allocate cl_transaction_mtxp");
	}

	lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr);

	for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
		LIST_INIT(&cl_direct_read_locks[i]);
	}
}


uint32_t
cluster_max_io_size(mount_t mp, int type)
{
	uint32_t max_io_size;
	uint32_t segcnt;
	uint32_t maxcnt;

	switch (type) {
	case CL_READ:
		segcnt = mp->mnt_segreadcnt;
		maxcnt = mp->mnt_maxreadcnt;
		break;
	case CL_WRITE:
		segcnt = mp->mnt_segwritecnt;
		maxcnt = mp->mnt_maxwritecnt;
		break;
	default:
		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
		break;
	}
	if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
		/*
		 * don't allow a size beyond the max UPL size we can create
		 */
		segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	}
	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

	if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
		/*
		 * don't allow a size smaller than the old fixed limit
		 */
		max_io_size = MAX_UPL_TRANSFER_BYTES;
	} else {
		/*
		 * make sure the size specified is a multiple of PAGE_SIZE
		 */
		max_io_size &= ~PAGE_MASK;
	}
	return max_io_size;
}
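/*
 * Editor's sketch (not part of the original source): how a filesystem-side
 * caller might clamp a request to the per-mount limit computed above.  The
 * wrapper function and variable names are hypothetical.
 */
#if 0
static uint32_t
example_clamp_read(vnode_t vp, uint32_t requested)
{
	uint32_t max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);

	return (requested > max_io_size) ? max_io_size : requested;
}
#endif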




#define CLW_ALLOCATE            0x01
#define CLW_RETURNLOCKED        0x02
#define CLW_IONOCACHE           0x04
#define CLW_IOPASSIVE           0x08

/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info         *ubc;
	struct cl_readahead     *rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL) {
			ubc->cl_rahead = rap;
		} else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE(rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
		return rap;
	}

	return (struct cl_readahead *)NULL;
}


/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */

static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
	struct ubc_info *ubc;
	struct cl_writebehind *wbp;

	ubc = vp->v_ubcinfo;

	if ((wbp = ubc->cl_wbehind) == NULL) {
		if (!(flags & CLW_ALLOCATE)) {
			return (struct cl_writebehind *)NULL;
		}

		MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL) {
			ubc->cl_wbehind = wbp;
		} else {
			lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
			FREE_ZONE(wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED) {
		lck_mtx_lock(&wbp->cl_lockw);
	}

	return wbp;
}
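/*
 * Editor's sketch (not part of the original source): the typical pattern a
 * caller follows when it needs the write-behind context held locked.  The
 * work in the middle is elided; only the lock discipline is illustrated.
 */
#if 0
static void
example_with_wbp_locked(vnode_t vp)
{
	struct cl_writebehind *wbp;

	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

	/* ... inspect or update wbp->cl_number, wbp->cl_clusters[], etc ... */

	lck_mtx_unlock(&wbp->cl_lockw);
}
#endif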


static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
	struct cl_writebehind *wbp;

	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
		if (wbp->cl_number) {
			lck_mtx_lock(&wbp->cl_lockw);

			cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);

			lck_mtx_unlock(&wbp->cl_lockw);
		}
	}
}

static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
	daddr64_t blkno;
	size_t    io_size;
	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

	if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
		if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
			return 0;
		}

		if (io_size == 0) {
			return 0;
		}

		if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
			return 1;
		}
	}
	return 0;
}

static int
cluster_is_throttled(vnode_t vp)
{
	return throttle_io_will_be_throttled(-1, vp->v_mount);
}

static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{
	lck_mtx_lock(&iostate->io_mtxp);

	while ((iostate->io_issued - iostate->io_completed) > target) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
		    iostate->io_issued, iostate->io_completed, target, 0, 0);

		iostate->io_wanted = 1;
		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
		    iostate->io_issued, iostate->io_completed, target, 0, 0);
	}
	lck_mtx_unlock(&iostate->io_mtxp);
}
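/*
 * Editor's sketch (not part of the original source): the shape of the
 * producer/consumer pattern the direct I/O paths use with struct clios.
 * The issuer accounts issued bytes in io_issued, cluster_iodone() accounts
 * completions, and the issuer throttles itself with cluster_iostate_wait().
 * The function name and flag combination below are illustrative only.
 */
#if 0
static void
example_issue_direct_io(vnode_t vp, upl_t upl, off_t f_offset, u_int io_size)
{
	struct clios iostate;

	bzero(&iostate, sizeof(iostate));
	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);

	iostate.io_issued += io_size;

	cluster_io(vp, upl, 0, f_offset, io_size,
	    CL_READ | CL_ASYNC | CL_COMMIT, NULL, &iostate, NULL, NULL);

	/* wait for all issued I/O to drain before tearing down */
	cluster_iostate_wait(&iostate, 0, "example_issue_direct_io");

	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
}
#endif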

static void
cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
    upl_offset_t upl_offset, upl_size_t size)
{
	if (!size) {
		return;
	}

	upl_t associated_upl = upl_associated_upl(upl);

	if (!associated_upl) {
		return;
	}

#if 0
	printf("1: %d %d\n", upl_offset, upl_offset + size);
#endif

	/*
	 * The associated UPL is page aligned to file offsets whereas the
	 * UPL it's attached to has different alignment requirements.  The
	 * upl_offset that we have refers to @upl.  The code that follows
	 * has to deal with the first and last pages in this transaction
	 * which might straddle pages in the associated UPL.  To keep
	 * track of these pages, we use the mark bits: if the mark bit is
	 * set, we know another transaction has completed its part of that
	 * page and so we can unlock that page here.
	 *
	 * The following illustrates what we have to deal with:
	 *
	 *    MEM u <------------ 1 PAGE ------------> e
	 *        +-------------+----------------------+-----------------
	 *        |             |######################|#################
	 *        +-------------+----------------------+-----------------
	 *   FILE | <--- a ---> o <------------ 1 PAGE ------------>
	 *
	 * So here we show a write to offset @o.  The data that is to be
	 * written is in a buffer that is not page aligned; it has offset
	 * @a in the page.  The upl that carries the data starts in memory
	 * at @u.  The associated upl starts in the file at offset @o.  A
	 * transaction will always end on a page boundary (like @e above)
	 * except for the very last transaction in the group.  We cannot
	 * unlock the page at @o in the associated upl until both the
	 * transaction ending at @e and the following transaction (that
	 * starts at @e) have completed.
	 */

	/*
	 * We record whether or not the two UPLs are aligned as the mark
	 * bit in the first page of @upl.
	 */
	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	bool is_unaligned = upl_page_get_mark(pl, 0);

	if (is_unaligned) {
		upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);

		upl_offset_t upl_end = upl_offset + size;
		assert(upl_end >= PAGE_SIZE);

		upl_size_t assoc_upl_size = upl_get_size(associated_upl);

		/*
		 * In the very first transaction in the group, upl_offset will
		 * not be page aligned, but after that it will be and in that
		 * case we want the preceding page in the associated UPL hence
		 * the minus one.
		 */
		assert(upl_offset);
		if (upl_offset) {
			upl_offset = trunc_page_32(upl_offset - 1);
		}

		lck_mtx_lock_spin(&iostate->io_mtxp);

		// Look at the first page...
		if (upl_offset
		    && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
			/*
			 * The first page isn't marked so let another transaction
			 * completion handle it.
			 */
			upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
			upl_offset += PAGE_SIZE;
		}

		// And now the last page...

		/*
		 * This needs to be > rather than >= because if it's equal, it
		 * means there's another transaction that is sharing the last
		 * page.
		 */
		if (upl_end > assoc_upl_size) {
			upl_end = assoc_upl_size;
		} else {
			upl_end = trunc_page_32(upl_end);
			const int last_pg = (upl_end >> PAGE_SHIFT) - 1;

			if (!upl_page_get_mark(assoc_pl, last_pg)) {
				/*
				 * The last page isn't marked so mark the page and let another
				 * transaction completion handle it.
				 */
				upl_page_set_mark(assoc_pl, last_pg, true);
				upl_end -= PAGE_SIZE;
			}
		}

		lck_mtx_unlock(&iostate->io_mtxp);

#if 0
		printf("2: %d %d\n", upl_offset, upl_end);
#endif

		if (upl_end <= upl_offset) {
			return;
		}

		size = upl_end - upl_offset;
	} else {
		assert(!(upl_offset & PAGE_MASK));
		assert(!(size & PAGE_MASK));
	}

	boolean_t empty;

	/*
	 * We can unlock these pages now and as this is for a
	 * direct/uncached write, we want to dump the pages too.
	 */
	kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
	    UPL_ABORT_DUMP_PAGES, &empty);

	assert(!kr);

	if (!kr && empty) {
		upl_set_associated_upl(upl, NULL);
		upl_deallocate(associated_upl);
	}
}
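/*
 * Worked example (editor's note, with illustrative numbers and 4K pages):
 * for a middle transaction covering @upl bytes [0x1800, 0x3800), upl_offset
 * is first rewound to trunc_page_32(0x1800 - 1) == 0x1000 and upl_end is
 * truncated to 0x3000; whichever of those boundary pages is not yet marked
 * is left for the neighbouring transaction to handle, so only pages that
 * both sides have finished with are aborted in the associated UPL.
 */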

static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
{
	int upl_abort_code = 0;
	int page_in  = 0;
	int page_out = 0;

	if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
		/*
		 * direct write of any flavor, or a direct read that wasn't aligned
		 */
		ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	} else {
		if (io_flags & B_PAGEIO) {
			if (io_flags & B_READ) {
				page_in = 1;
			} else {
				page_out = 1;
			}
		}
		if (io_flags & B_CACHE) {
			/*
			 * leave pages in the cache unchanged on error
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		} else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
			/*
			 * transient error on pageout/write path... leave pages unchanged
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		} else if (page_in) {
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
		} else {
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
		}

		ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	}
	return upl_abort_code;
}

static int
cluster_iodone(buf_t bp, void *callback_arg)
{
	int     b_flags;
	int     error;
	int     total_size;
	int     total_resid;
	int     upl_offset;
	int     zero_offset;
	int     pg_offset = 0;
	int     commit_size = 0;
	int     upl_flags = 0;
	int     transaction_size = 0;
	upl_t   upl;
	buf_t   cbp;
	buf_t   cbp_head;
	buf_t   cbp_next;
	buf_t   real_bp;
	vnode_t vp;
	struct clios *iostate;
	boolean_t transaction_complete = FALSE;

	__IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
	    cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
		lck_mtx_lock_spin(cl_transaction_mtxp);

		bp->b_flags |= B_TDONE;

		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			/*
			 * all I/O requests that are part of this transaction
			 * have to complete before we can process it
			 */
			if (!(cbp->b_flags & B_TDONE)) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);

				return 0;
			}

			if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);
				wakeup(cbp);

				return 0;
			}

			if (cbp->b_flags & B_EOT) {
				transaction_complete = TRUE;
			}
		}
		lck_mtx_unlock(cl_transaction_mtxp);

		if (transaction_complete == FALSE) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			    cbp_head, 0, 0, 0, 0);
			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp         = cbp_head;
	vp          = cbp->b_vp;
	upl_offset  = cbp->b_uploffset;
	upl         = cbp->b_upl;
	b_flags     = cbp->b_flags;
	real_bp     = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate     = (struct clios *)cbp->b_iostate;

	if (real_bp) {
		real_bp->b_dev = cbp->b_dev;
	}

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0) {
			error = cbp->b_error;
		}

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		if (cbp_next == NULL) {
			/*
			 * compute the overall size of the transaction
			 * in case we created one that has 'holes' in it
			 * 'total_size' represents the amount of I/O we
			 * did, not the span of the transaction w/r to the UPL
			 */
			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
		}

		if (cbp != cbp_head) {
			free_io_buf(cbp);
		}

		cbp = cbp_next;
	}

	if (ISSET(b_flags, B_COMMIT_UPL)) {
		cluster_handle_associated_upl(iostate,
		    cbp_head->b_upl,
		    upl_offset,
		    transaction_size);
	}

	if (error == 0 && total_resid) {
		error = EIO;
	}

	if (error == 0) {
		int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

		if (cliodone_func != NULL) {
			cbp_head->b_bcount = transaction_size;

			error = (*cliodone_func)(cbp_head, callback_arg);
		}
	}
	if (zero_offset) {
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
	}

	free_io_buf(cbp_head);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		if (error && iostate->io_error == 0) {
			iostate->io_error = error;
		}

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(&iostate->io_mtxp);

		if (need_wakeup) {
			wakeup((caddr_t)&iostate->io_wanted);
		}
	}

	if (b_flags & B_COMMIT_UPL) {
		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error) {
			upl_set_iodone_error(upl, error);

			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
		} else {
			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
				upl_flags |= UPL_COMMIT_SET_DIRTY;
			}

			if (b_flags & B_AGE) {
				upl_flags |= UPL_COMMIT_INACTIVATE;
			}

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
		}
	}
	if (real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
	    upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

	return error;
}

uint32_t
cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
{
	if (cluster_is_throttled(vp)) {
		*limit = THROTTLE_MAX_IOSIZE;
		return 1;
	}
	return 0;
}
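/*
 * Editor's sketch (not part of the original source): a filesystem could use
 * this exported helper to trim its own I/O sizes while the vnode's mount is
 * being throttled.  The wrapper name below is hypothetical.
 */
#if 0
static uint32_t
example_io_quantum(vnode_t vp, uint32_t desired)
{
	uint32_t limit;

	if (cluster_throttle_io_limit(vp, &limit) && desired > limit) {
		return limit;
	}
	return desired;
}
#endif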


void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
	    upl_offset, size, bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {
		upl_page_info_t *pl;
		addr64_t zero_addr;

		pl = ubc_upl_pageinfo(upl);

		if (upl_device_page(pl) == TRUE) {
			zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;

			bzero_phys_nc(zero_addr, size);
		} else {
			while (size) {
				int page_offset;
				int page_index;
				int zero_cnt;

				page_index  = upl_offset / PAGE_SIZE;
				page_offset = upl_offset & PAGE_MASK;

				zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
				zero_cnt  = min(PAGE_SIZE - page_offset, size);

				bzero_phys(zero_addr, zero_cnt);

				size       -= zero_cnt;
				upl_offset += zero_cnt;
			}
		}
	} else {
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
	    upl_offset, size, 0, 0, 0);
}

static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
	cbp_head->b_validend = zero_offset;
	cbp_tail->b_flags |= B_EOT;
}

static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	buf_t cbp;

	if (async) {
		/*
		 * Async callback completion will not normally generate a
		 * wakeup upon I/O completion.  To get woken up, we set
		 * b_trans_next (which is safe for us to modify) on the last
		 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
		 * to wake us up when all buffers as part of this transaction
		 * are completed.  This is done under the umbrella of
		 * cl_transaction_mtxp which is also taken in cluster_iodone.
		 */
		bool done = true;
		buf_t last = NULL;

		lck_mtx_lock_spin(cl_transaction_mtxp);

		for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
			if (!ISSET(cbp->b_flags, B_TDONE)) {
				done = false;
			}
		}

		if (!done) {
			last->b_trans_next = CLUSTER_IO_WAITING;

			DTRACE_IO1(wait__start, buf_t, last);
			do {
				msleep(last, cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);

				/*
				 * We should only have been woken up if all the
				 * buffers are completed, but just in case...
				 */
				done = true;
				for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
					if (!ISSET(cbp->b_flags, B_TDONE)) {
						done = false;
						break;
					}
				}
			} while (!done);
			DTRACE_IO1(wait__done, buf_t, last);

			last->b_trans_next = NULL;
		}

		lck_mtx_unlock(cl_transaction_mtxp);
	} else { // !async
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			buf_biowait(cbp);
		}
	}
}

static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
	buf_t cbp;
	int   error;
	boolean_t isswapout = FALSE;

	/*
	 * cluster_complete_transaction will
	 * only be called if we've issued a complete chain in synchronous mode
	 * or, we've already done a cluster_wait_IO on an incomplete chain
	 */
	if (needwait) {
		for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
			buf_biowait(cbp);
		}
	}
	/*
	 * we've already waited on all of the I/Os in this transaction,
	 * so mark all of the buf_t's in this transaction as B_TDONE
	 * so that cluster_iodone sees the transaction as completed
	 */
	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
		cbp->b_flags |= B_TDONE;
	}
	cbp = *cbp_head;

	if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
		isswapout = TRUE;
	}

	error = cluster_iodone(cbp, callback_arg);

	if (!(flags & CL_ASYNC) && error && *retval == 0) {
		if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
			*retval = error;
		} else if (isswapout == TRUE) {
			*retval = error;
		}
	}
	*cbp_head = (buf_t)NULL;
}

static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
    int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	buf_t   cbp;
	u_int   size;
	u_int   io_size;
	int     io_flags;
	int     bmap_flags;
	int     error = 0;
	int     retval = 0;
	buf_t   cbp_head = NULL;
	buf_t   cbp_tail = NULL;
	int     trans_count = 0;
	int     max_trans_count;
	u_int   pg_count;
	int     pg_offset;
	u_int   max_iosize;
	u_int   max_vectors;
	int     priv;
	int     zero_offset = 0;
	int     async_throttle = 0;
	mount_t mp;
	vm_offset_t upl_end_offset;
	boolean_t   need_EOT = FALSE;

	/*
	 * we currently don't support buffers larger than a page
	 */
	if (real_bp && non_rounded_size > PAGE_SIZE) {
		panic("%s(): Called with real buffer of size %d bytes which "
		    "is greater than the maximum allowed size of "
		    "%d bytes (the system PAGE_SIZE).\n",
		    __FUNCTION__, non_rounded_size, PAGE_SIZE);
	}

	mp = vp->v_mount;

	/*
	 * we don't want to do any funny rounding of the size for IO requests
	 * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
	 * belong to us... we can't extend (nor do we need to) the I/O to fill
	 * out a page
	 */
	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
		/*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zero's that are sitting beyond the EOF
		 * so that in case the filesystem doesn't explicitly zero this area
		 * if a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
		pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	} else {
		/*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size
		 * AFP is one such filesystem/device
		 */
		size = non_rounded_size;
	}
	upl_end_offset = upl_offset + size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);

	/*
	 * Set the maximum transaction size to the maximum desired number of
	 * buffers.
	 */
	max_trans_count = 8;
	if (flags & CL_DEV_MEMORY) {
		max_trans_count = 16;
	}

	if (flags & CL_READ) {
		io_flags = B_READ;
		bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
		io_flags = B_WRITE;
		bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	/*
	 * Ensure the maximum iosize is sensible.
	 */
	if (!max_iosize) {
		max_iosize = PAGE_SIZE;
	}

	if (flags & CL_THROTTLE) {
		if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
			if (max_iosize > THROTTLE_MAX_IOSIZE) {
				max_iosize = THROTTLE_MAX_IOSIZE;
			}
			async_throttle = THROTTLE_MAXCNT;
		} else {
			if ((flags & CL_DEV_MEMORY)) {
				async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
			} else {
				u_int max_cluster;
				u_int max_cluster_size;
				u_int scale;

				if (vp->v_mount->mnt_minsaturationbytecount) {
					max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;

					scale = 1;
				} else {
					max_cluster_size = MAX_CLUSTER_SIZE(vp);

					if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
						scale = WRITE_THROTTLE_SSD;
					} else {
						scale = WRITE_THROTTLE;
					}
				}
				if (max_iosize > max_cluster_size) {
					max_cluster = max_cluster_size;
				} else {
					max_cluster = max_iosize;
				}

				if (size < max_cluster) {
					max_cluster = size;
				}

				if (flags & CL_CLOSE) {
					scale += MAX_CLUSTERS;
				}

				async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
			}
		}
	}
	if (flags & CL_AGE) {
		io_flags |= B_AGE;
	}
	if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
		io_flags |= B_PAGEIO;
	}
	if (flags & (CL_IOSTREAMING)) {
		io_flags |= B_IOSTREAMING;
	}
	if (flags & CL_COMMIT) {
		io_flags |= B_COMMIT_UPL;
	}
	if (flags & CL_DIRECT_IO) {
		io_flags |= B_PHYS;
	}
	if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
		io_flags |= B_CACHE;
	}
	if (flags & CL_PASSIVE) {
		io_flags |= B_PASSIVE;
	}
	if (flags & CL_ENCRYPTED) {
		io_flags |= B_ENCRYPTED_IO;
	}

	if (vp->v_flag & VSYSTEM) {
		io_flags |= B_META;
	}

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	} else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
		assert(ISSET(flags, CL_COMMIT));

		// For a direct/uncached write, we need to lock pages...

		upl_t cached_upl;

		/*
		 * Create a UPL to lock the pages in the cache whilst the
		 * write is in progress.
		 */
		ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
		    NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);

		/*
		 * Attach this UPL to the other UPL so that we can find it
		 * later.
		 */
		upl_set_associated_upl(upl, cached_upl);

		if (upl_offset & PAGE_MASK) {
			/*
			 * The two UPLs are not aligned, so mark the first page in
			 * @upl so that cluster_handle_associated_upl can handle
			 * it accordingly.
			 */
			upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			upl_page_set_mark(pl, 0, true);
		}
	}

	while (size) {
		daddr64_t blkno;
		daddr64_t lblkno;
		u_int   io_size_wanted;
		size_t  io_size_tmp;

		if (size > max_iosize) {
			io_size = max_iosize;
		} else {
			io_size = size;
		}

		io_size_wanted = io_size;
		io_size_tmp = (size_t)io_size;

		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
			break;
		}

		if (io_size_tmp > io_size_wanted) {
			io_size = io_size_wanted;
		} else {
			io_size = (u_int)io_size_tmp;
		}

		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
			real_bp->b_blkno = blkno;
		}

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
		    (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);

		if (io_size == 0) {
			/*
			 * vnop_blockmap didn't return an error... however, it did
			 * return an extent size of 0 which means we can't
			 * make forward progress on this I/O... a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if (!(flags & CL_READ) && blkno == -1) {
			off_t   e_offset;
			int     pageout_flags;

			if (upl_get_internal_vectorupl(upl)) {
				panic("Vector UPLs should not take this code-path\n");
			}
			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will ensure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 *
			 * go direct to vnode_pageout so that we don't have to
			 * unbusy the page from the UPL... we used to do this
			 * so that we could call ubc_msync, but that results
			 * in a potential deadlock if someone else races us to acquire
			 * that page and wins and in addition needs one of the pages
			 * we're continuing to hold in the UPL
			 */
			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

			if (!(flags & CL_ASYNC)) {
				pageout_flags |= UPL_IOSYNC;
			}
			if (!(flags & CL_COMMIT)) {
				pageout_flags |= UPL_NOCOMMIT;
			}

			if (cbp_head) {
				buf_t prev_cbp;
				int   bytes_in_last_page;

				/*
				 * first we have to wait for the current outstanding I/Os
				 * to complete... EOT hasn't been set yet on this transaction
				 * so the pages won't be released
				 */
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
					bytes_in_last_page += cbp->b_bcount;
				}
				bytes_in_last_page &= PAGE_MASK;

				while (bytes_in_last_page) {
					/*
					 * we've got a transaction that
					 * includes the page we're about to push out through vnode_pageout...
					 * find the bp's in the list which intersect this page and either
					 * remove them entirely from the transaction (there could be multiple bp's), or
					 * round its iosize down to the page boundary (there can only be one)...
					 *
					 * find the last bp in the list and act on it
					 */
					for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
						prev_cbp = cbp;
					}

					if (bytes_in_last_page >= cbp->b_bcount) {
						/*
						 * this buf no longer has any I/O associated with it
						 */
						bytes_in_last_page -= cbp->b_bcount;
						cbp->b_bcount = 0;

						free_io_buf(cbp);

						if (cbp == cbp_head) {
							assert(bytes_in_last_page == 0);
							/*
							 * the buf we just freed was the only buf in
							 * this transaction... so there's no I/O to do
							 */
							cbp_head = NULL;
							cbp_tail = NULL;
						} else {
							/*
							 * remove the buf we just freed from
							 * the transaction list
							 */
							prev_cbp->b_trans_next = NULL;
							cbp_tail = prev_cbp;
						}
					} else {
						/*
						 * this is the last bp that has I/O
						 * intersecting the page of interest
						 * only some of the I/O is in the intersection
						 * so clip the size but keep it in the transaction list
						 */
						cbp->b_bcount -= bytes_in_last_page;
						cbp_tail = cbp;
						bytes_in_last_page = 0;
					}
				}
				if (cbp_head) {
					/*
					 * there was more to the current transaction
					 * than just the page we are pushing out via vnode_pageout...
					 * mark it as finished and complete it... we've already
					 * waited for the I/Os to complete above in the call to cluster_wait_IO
					 */
					cluster_EOT(cbp_head, cbp_tail, 0);

					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

					trans_count = 0;
				}
			}
			if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
				error = EINVAL;
			}
			e_offset = round_page_64(f_offset + 1);
			io_size = e_offset - f_offset;

			f_offset   += io_size;
			upl_offset += io_size;

			if (size >= io_size) {
				size -= io_size;
			} else {
				size = 0;
			}
			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (error) {
				if (size == 0) {
					flags &= ~CL_COMMIT;
				}
				break;
			}
			continue;
		}
fe8ab488 1552 lblkno = (daddr64_t)(f_offset / 0x1000);
1c79356b
A
1553 /*
1554 * we have now figured out how much I/O we can do - this is in 'io_size'
1c79356b
A
1555 * pg_offset is the starting point in the first page for the I/O
1556 * pg_count is the number of full and partial pages that 'io_size' encompasses
1557 */
1c79356b 1558 pg_offset = upl_offset & PAGE_MASK;
1c79356b 1559
0b4e3aa0 1560 if (flags & CL_DEV_MEMORY) {
0b4e3aa0
A
1561 /*
1562 * treat physical requests as one 'giant' page
1563 */
1564 pg_count = 1;
0a7de745
A
1565 } else {
1566 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1567 }
55e303ae 1568
91447636 1569 if ((flags & CL_READ) && blkno == -1) {
2d21ac55 1570 vm_offset_t commit_offset;
0a7de745 1571 int bytes_to_zero;
2d21ac55 1572 int complete_transaction_now = 0;
9bccf70c 1573
0a7de745 1574 /*
1c79356b
A
1575 * if we're reading and blkno == -1, then we've got a
1576 * 'hole' in the file that we need to deal with by zeroing
1577 * out the affected area in the upl
1578 */
2d21ac55 1579 if (io_size >= (u_int)non_rounded_size) {
0a7de745 1580 /*
9bccf70c
A
1581 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1582 * than 'zero_offset' will be non-zero
91447636 1583 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
9bccf70c
A
1584 * (indicated by the io_size finishing off the I/O request for this UPL)
1585 * than we're not going to issue an I/O for the
1586 * last page in this upl... we need to zero both the hole and the tail
0a7de745 1587 * of the page beyond the EOF, since the delayed zero-fill won't kick in
9bccf70c 1588 */
2d21ac55 1589 bytes_to_zero = non_rounded_size;
0a7de745 1590 if (!(flags & CL_NOZERO)) {
2d21ac55 1591 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
0a7de745 1592 }
1c79356b 1593
9bccf70c 1594 zero_offset = 0;
0a7de745
A
1595 } else {
1596 bytes_to_zero = io_size;
1597 }
1c79356b 1598
2d21ac55
A
1599 pg_count = 0;
1600
1601 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
0a7de745 1602
2d21ac55 1603 if (cbp_head) {
0a7de745 1604 int pg_resid;
2d21ac55 1605
0a7de745 1606 /*
9bccf70c 1607 * if there is a current I/O chain pending
1608 * then the first page of the group we just zero'd
1609 * will be handled by the I/O completion if the zero
1610 * fill started in the middle of the page
1611 */
0a7de745 1612 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2d21ac55 1613
1614 pg_resid = commit_offset - upl_offset;
0a7de745 1615
2d21ac55 1616 if (bytes_to_zero >= pg_resid) {
0a7de745 1617 /*
1618 * the last page of the current I/O
2d21ac55 1619 * has been completed...
0a7de745 1620 * compute the number of fully zero'd
2d21ac55 1621 * pages that are beyond it
1622 * plus the last page if it's partial
1623 * and we have no more I/O to issue...
1624 * otherwise a partial page is left
1625 * to begin the next I/O
1626 */
0a7de745 1627 if ((int)io_size >= non_rounded_size) {
1628 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1629 } else {
1630 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1631 }
1632
2d21ac55
A
1633 complete_transaction_now = 1;
1634 }
1635 } else {
0a7de745 1636 /*
2d21ac55 1637 * no pending I/O to deal with
1638 * so, commit all of the fully zero'd pages
1639 * plus the last page if it's partial
1640 * and we have no more I/O to issue...
1641 * otherwise a partial page is left
1642 * to begin the next I/O
9bccf70c 1643 */
0a7de745 1644 if ((int)io_size >= non_rounded_size) {
1645 pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1646 } else {
1647 pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1648 }
9bccf70c 1649
2d21ac55
A
1650 commit_offset = upl_offset & ~PAGE_MASK;
1651 }
3e170ce0
A
1652
1653 // Associated UPL is currently only used in the direct write path
1654 assert(!upl_associated_upl(upl));
1655
0a7de745
A
1656 if ((flags & CL_COMMIT) && pg_count) {
1657 ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
1658 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1c79356b
A
1659 }
1660 upl_offset += io_size;
1661 f_offset += io_size;
1662 size -= io_size;
2d21ac55 1663
91447636 1664 /*
1665 * keep track of how much of the original request
1666 * we've actually completed... non_rounded_size
1667 * may go negative due to us rounding the request
1668 * to a page size multiple (i.e. size > non_rounded_size)
1669 */
1670 non_rounded_size -= io_size;
1c79356b 1671
91447636 1672 if (non_rounded_size <= 0) {
0a7de745 1673 /*
91447636 1674 * we've transferred all of the data in the original
1675 * request, but we were unable to complete the tail
1676 * of the last page because the file didn't have
1677 * an allocation to back that portion... this is ok.
1678 */
0a7de745 1679 size = 0;
91447636 1680 }
0a7de745
A
1681 if (cbp_head && (complete_transaction_now || size == 0)) {
1682 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
9bccf70c 1683
2d21ac55
A
1684 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1685
1686 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1687
1688 trans_count = 0;
1689 }
1690 continue;
1c79356b 1691 }
55e303ae 1692 if (pg_count > max_vectors) {
0a7de745
A
1693 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1694 io_size = PAGE_SIZE - pg_offset;
55e303ae 1695 pg_count = 1;
91447636 1696 } else {
0a7de745
A
1697 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1698 pg_count = max_vectors;
91447636 1699 }
1c79356b 1700 }
2d21ac55
A
1701 /*
1702 * If the transaction is going to reach the maximum number of
1703 * desired elements, truncate the i/o to the nearest page so
1704 * that the actual i/o is initiated after this buffer is
1705 * created and added to the i/o chain.
1706 *
0a7de745 1707 * I/O directed to physically contiguous memory
2d21ac55
A
1708 * doesn't have a requirement to make sure we 'fill' a page
1709 */
0a7de745
A
1710 if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1711 ((upl_offset + io_size) & PAGE_MASK)) {
2d21ac55
A
1712 vm_offset_t aligned_ofs;
1713
1714 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1715 /*
1716 * If the io_size does not actually finish off even a
1717 * single page we have to keep adding buffers to the
1718 * transaction despite having reached the desired limit.
1719 *
1720 * Eventually we get here with the page being finished
1721 * off (and exceeded) and then we truncate the size of
1722 * this i/o request so that it is page aligned so that
1723 * we can finally issue the i/o on the transaction.
1724 */
1725 if (aligned_ofs > upl_offset) {
1726 io_size = aligned_ofs - upl_offset;
1727 pg_count--;
1728 }
1729 }
1c79356b 1730
0a7de745
A
1731 if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
1732 /*
55e303ae
A
1733 * if we're not targeting a virtual device i.e. a disk image
1734 * it's safe to dip into the reserve pool since real devices
1735 * can complete this I/O request without requiring additional
1736 * bufs from the alloc_io_buf pool
1737 */
1738 priv = 1;
0a7de745
A
1739 } else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT)) {
1740 /*
55e303ae
A
1741 * Throttle the speculative IO
1742 */
0b4e3aa0 1743 priv = 0;
0a7de745 1744 } else {
0b4e3aa0 1745 priv = 1;
0a7de745 1746 }
0b4e3aa0
A
1747
1748 cbp = alloc_io_buf(vp, priv);
1c79356b 1749
55e303ae 1750 if (flags & CL_PAGEOUT) {
0a7de745 1751 u_int i;
91447636 1752
3e170ce0
A
1753 /*
1754 * since blocks are in units of 0x1000 bytes, scale
1755 * the iteration to cover (PAGE_SIZE * pg_count) bytes worth of blocks.
1756 */
0a7de745
A
1757 for (i = 0; i < (PAGE_SIZE * pg_count) / 0x1000; i++) {
1758 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) {
3e170ce0 1759 panic("BUSY bp found in cluster_io");
0a7de745 1760 }
1c79356b 1761 }
1c79356b 1762 }
b4c24cb9 1763 if (flags & CL_ASYNC) {
0a7de745
A
1764 if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) {
1765 panic("buf_setcallback failed\n");
1766 }
b4c24cb9 1767 }
2d21ac55 1768 cbp->b_cliodone = (void *)callback;
1c79356b 1769 cbp->b_flags |= io_flags;
0a7de745 1770 if (flags & CL_NOCACHE) {
316670eb 1771 cbp->b_attr.ba_flags |= BA_NOCACHE;
0a7de745 1772 }
1c79356b
A
1773
1774 cbp->b_lblkno = lblkno;
1775 cbp->b_blkno = blkno;
1776 cbp->b_bcount = io_size;
1c79356b 1777
0a7de745
A
1778 if (buf_setupl(cbp, upl, upl_offset)) {
1779 panic("buf_setupl failed\n");
1780 }
fe8ab488
A
1781#if CONFIG_IOSCHED
1782 upl_set_blkno(upl, upl_offset, io_size, blkno);
1783#endif
91447636
A
1784 cbp->b_trans_next = (buf_t)NULL;
1785
0a7de745
A
1786 if ((cbp->b_iostate = (void *)iostate)) {
1787 /*
d7e50217
A
1788 * caller wants to track the state of this
1789 * io... bump the amount issued against this stream
1790 */
0a7de745
A
1791 iostate->io_issued += io_size;
1792 }
b4c24cb9 1793
91447636 1794 if (flags & CL_READ) {
1c79356b 1795 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
0a7de745
A
1796 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1797 } else {
1c79356b 1798 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
0a7de745 1799 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
91447636 1800 }
1c79356b
A
1801
1802 if (cbp_head) {
0a7de745 1803 cbp_tail->b_trans_next = cbp;
1c79356b
A
1804 cbp_tail = cbp;
1805 } else {
0a7de745 1806 cbp_head = cbp;
1c79356b 1807 cbp_tail = cbp;
2d21ac55 1808
0a7de745 1809 if ((cbp_head->b_real_bp = real_bp)) {
2d21ac55 1810 real_bp = (buf_t)NULL;
0a7de745 1811 }
1c79356b 1812 }
2d21ac55
A
1813 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1814
91447636 1815 trans_count++;
1c79356b
A
1816
1817 upl_offset += io_size;
1818 f_offset += io_size;
1819 size -= io_size;
91447636 1820 /*
1821 * keep track of how much of the original request
1822 * we've actually completed... non_rounded_size
1823 * may go negative due to us rounding the request
1824 * to a page size multiple (i.e. size > non_rounded_size)
1825 */
1826 non_rounded_size -= io_size;
1c79356b 1827
91447636 1828 if (non_rounded_size <= 0) {
0a7de745 1829 /*
91447636 1830 * we've transferred all of the data in the original
1831 * request, but we were unable to complete the tail
1832 * of the last page because the file didn't have
1833 * an allocation to back that portion... this is ok.
1834 */
0a7de745 1835 size = 0;
91447636 1836 }
2d21ac55 1837 if (size == 0) {
0a7de745 1838 /*
2d21ac55
A
1839 * we have no more I/O to issue, so go
1840 * finish the final transaction
1841 */
0a7de745
A
1842 need_EOT = TRUE;
1843 } else if (((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1844 ((flags & CL_ASYNC) || trans_count > max_trans_count)) {
1845 /*
2d21ac55
A
1846 * I/O directed to physically contiguous memory...
1847 * which doesn't have a requirement to make sure we 'fill' a page
0a7de745 1848 * or...
1c79356b
A
1849 * the current I/O we've prepared fully
1850 * completes the last page in this request
2d21ac55 1851 * and ...
0a7de745 1852 * it's either an ASYNC request or
9bccf70c 1853 * we've already accumulated more than 8 I/O's into
2d21ac55
A
1854 * this transaction so mark it as complete so that
1855 * it can finish asynchronously or via the cluster_complete_transaction
1856 * below if the request is synchronous
1c79356b 1857 */
0a7de745
A
1858 need_EOT = TRUE;
1859 }
1860 if (need_EOT == TRUE) {
1861 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1862 }
1863
1864 if (flags & CL_THROTTLE) {
1865 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
2d21ac55 1866 }
1c79356b 1867
0a7de745
A
1868 if (!(io_flags & B_READ)) {
1869 vnode_startwrite(vp);
1870 }
1c79356b 1871
316670eb 1872 if (flags & CL_RAW_ENCRYPTED) {
0a7de745 1873 /*
316670eb
A
1874 * User requested raw encrypted bytes.
1875 * Twiddle the bit in the ba_flags for the buffer
1876 */
1877 cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
1878 }
0a7de745 1879
2d21ac55
A
1880 (void) VNOP_STRATEGY(cbp);
1881
1882 if (need_EOT == TRUE) {
0a7de745
A
1883 if (!(flags & CL_ASYNC)) {
1884 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
1885 }
9bccf70c 1886
2d21ac55 1887 need_EOT = FALSE;
91447636 1888 trans_count = 0;
2d21ac55 1889 cbp_head = NULL;
1c79356b 1890 }
0a7de745 1891 }
1c79356b 1892 if (error) {
3e170ce0 1893 int abort_size;
0b4e3aa0 1894
b4c24cb9 1895 io_size = 0;
3e170ce0 1896
2d21ac55 1897 if (cbp_head) {
3e170ce0
A
1898 /*
1899 * Wait until all of the outstanding I/O
1900 * for this partial transaction has completed
1901 */
1902 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
0b4e3aa0 1903
2d21ac55
A
1904 /*
1905 * Rewind the upl offset to the beginning of the
1906 * transaction.
1907 */
1908 upl_offset = cbp_head->b_uploffset;
3e170ce0 1909 }
2d21ac55 1910
3e170ce0
A
1911 if (ISSET(flags, CL_COMMIT)) {
1912 cluster_handle_associated_upl(iostate, upl, upl_offset,
0a7de745 1913 upl_end_offset - upl_offset);
3e170ce0 1914 }
2d21ac55 1915
3e170ce0
A
1916 // Free all the IO buffers in this transaction
1917 for (cbp = cbp_head; cbp;) {
0a7de745
A
1918 buf_t cbp_next;
1919
3e170ce0
A
1920 size += cbp->b_bcount;
1921 io_size += cbp->b_bcount;
1922
1923 cbp_next = cbp->b_trans_next;
1924 free_io_buf(cbp);
1925 cbp = cbp_next;
1c79356b 1926 }
3e170ce0 1927
b4c24cb9 1928 if (iostate) {
0a7de745 1929 int need_wakeup = 0;
91447636 1930
0a7de745 1931 /*
d7e50217
A
1932 * update the error condition for this stream
1933 * since we never really issued the io
1934 * just go ahead and adjust it back
1935 */
0a7de745 1936 lck_mtx_lock_spin(&iostate->io_mtxp);
91447636 1937
0a7de745
A
1938 if (iostate->io_error == 0) {
1939 iostate->io_error = error;
1940 }
b4c24cb9
A
1941 iostate->io_issued -= io_size;
1942
1943 if (iostate->io_wanted) {
0a7de745 1944 /*
d7e50217
A
1945 * someone is waiting for the state of
1946 * this io stream to change
1947 */
0a7de745 1948 iostate->io_wanted = 0;
2d21ac55 1949 need_wakeup = 1;
b4c24cb9 1950 }
0a7de745 1951 lck_mtx_unlock(&iostate->io_mtxp);
91447636 1952
0a7de745
A
1953 if (need_wakeup) {
1954 wakeup((caddr_t)&iostate->io_wanted);
1955 }
b4c24cb9 1956 }
3e170ce0 1957
1c79356b 1958 if (flags & CL_COMMIT) {
0a7de745 1959 int upl_flags;
1c79356b 1960
3e170ce0 1961 pg_offset = upl_offset & PAGE_MASK;
2d21ac55 1962 abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;
3e170ce0 1963
39236c6e 1964 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp);
0a7de745 1965
1c79356b 1966 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
0a7de745
A
1967 upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
1968 }
1969 if (retval == 0) {
1970 retval = error;
1c79356b 1971 }
0a7de745
A
1972 } else if (cbp_head) {
1973 panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);
1974 }
2d21ac55
A
1975
1976 if (real_bp) {
0a7de745 1977 /*
2d21ac55
A
1978 * can get here if we either encountered an error
1979 * or we completely zero-filled the request and
1980 * no I/O was issued
1981 */
1982 if (error) {
1983 real_bp->b_flags |= B_ERROR;
1984 real_bp->b_error = error;
1985 }
1986 buf_biodone(real_bp);
1c79356b 1987 }
2d21ac55 1988 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
1c79356b 1989
0a7de745 1990 return retval;
1c79356b
A
1991}
1992
0a7de745
A
1993#define reset_vector_run_state() \
1994 issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
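/*
 * reset_vector_run_state() clears all of the accumulation state used to
 * build up a vectored UPL (offset, element index, aggregate iosize and
 * aggregate upl size) so that a fresh run can be started... it is invoked
 * each time the accumulated vector is handed off via vector_cluster_io(),
 * e.g. when a non-page-aligned iov_base forces an early flush or when the
 * vector reaches MAX_VECTOR_UPL_ELEMENTS / max_vector_size
 */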
b0d623f7
A
1995
1996static int
1997vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
0a7de745 1998 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
b0d623f7
A
1999{
2000 vector_upl_set_pagelist(vector_upl);
2001
0a7de745
A
2002 if (io_flag & CL_READ) {
2003 if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
2004 io_flag &= ~CL_PRESERVE; /*don't zero fill*/
2005 } else {
2006 io_flag |= CL_PRESERVE; /*zero fill*/
2007 }
2008 }
2009 return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
b0d623f7 2010}
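/*
 * vector_cluster_io() builds the aggregate pagelist for the vectored UPL
 * and then issues it through cluster_io() as a single transaction... for
 * reads, CL_PRESERVE (zero-fill of partial pages) is only set when the
 * vector does not both start and end on a page boundary; a fully
 * page-aligned vector skips the zero-fill
 */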
1c79356b
A
2011
2012static int
2d21ac55 2013cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
1c79356b 2014{
55e303ae 2015 int pages_in_prefetch;
1c79356b
A
2016
2017 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
0a7de745 2018 (int)f_offset, size, (int)filesize, 0, 0);
1c79356b
A
2019
2020 if (f_offset >= filesize) {
0a7de745
A
2021 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2022 (int)f_offset, 0, 0, 0, 0);
2023 return 0;
2024 }
2025 if ((off_t)size > (filesize - f_offset)) {
2026 size = filesize - f_offset;
1c79356b 2027 }
55e303ae 2028 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1c79356b 2029
2d21ac55 2030 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
1c79356b
A
2031
2032 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
0a7de745 2033 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1c79356b 2034
0a7de745 2035 return pages_in_prefetch;
1c79356b
A
2036}
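/*
 * cluster_read_prefetch() clamps the requested size to the bytes remaining
 * before EOF, issues the prefetch via advisory_read_ext(), and reports the
 * request size back in pages (rounded up)... a return of 0 means f_offset
 * was already at or beyond EOF, so the caller won't advance cl_maxra
 */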
2037
2038
2039
2040static void
2d21ac55 2041cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
0a7de745 2042 int bflag)
1c79356b 2043{
0a7de745
A
2044 daddr64_t r_addr;
2045 off_t f_offset;
2046 int size_of_prefetch;
2047 u_int max_prefetch;
91447636 2048
1c79356b
A
2049
2050 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
0a7de745 2051 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1c79356b 2052
91447636 2053 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1c79356b 2054 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
0a7de745 2055 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1c79356b
A
2056 return;
2057 }
2d21ac55 2058 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
0a7de745 2059 rap->cl_ralen = 0;
91447636 2060 rap->cl_maxra = 0;
1c79356b
A
2061
2062 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
0a7de745 2063 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1c79356b
A
2064
2065 return;
2066 }
5ba3f43e 2067 max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));
cf7d32b8 2068
0a7de745 2069 if (max_prefetch > speculative_prefetch_max) {
fe8ab488 2070 max_prefetch = speculative_prefetch_max;
0a7de745 2071 }
6d2010ae
A
2072
2073 if (max_prefetch <= PAGE_SIZE) {
2074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
0a7de745 2075 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
6d2010ae
A
2076 return;
2077 }
fe8ab488 2078 if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
0a7de745
A
2079 if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
2080 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2081 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1c79356b
A
2082 return;
2083 }
2084 }
91447636
A
2085 r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
2086 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1c79356b 2087
0a7de745 2088 size_of_prefetch = 0;
55e303ae
A
2089
2090 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
2091
2092 if (size_of_prefetch) {
0a7de745
A
2093 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2094 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
55e303ae
A
2095 return;
2096 }
9bccf70c 2097 if (f_offset < filesize) {
0a7de745 2098 daddr64_t read_size;
55e303ae 2099
0a7de745 2100 rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
55e303ae 2101
91447636
A
2102 read_size = (extent->e_addr + 1) - extent->b_addr;
2103
2104 if (read_size > rap->cl_ralen) {
0a7de745
A
2105 if (read_size > max_prefetch / PAGE_SIZE) {
2106 rap->cl_ralen = max_prefetch / PAGE_SIZE;
2107 } else {
2108 rap->cl_ralen = read_size;
2109 }
91447636 2110 }
2d21ac55 2111 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
1c79356b 2112
0a7de745
A
2113 if (size_of_prefetch) {
2114 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
2115 }
9bccf70c 2116 }
1c79356b 2117 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
0a7de745 2118 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1c79356b
A
2119}
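/*
 * for a strictly sequential reader, rap->cl_ralen roughly doubles on each
 * successful prefetch (1, 2, 4, 8, ... pages) until it is capped by
 * max_prefetch / PAGE_SIZE (itself bounded by speculative_prefetch_max),
 * while rap->cl_maxra tracks the last page actually prefetched... a new
 * prefetch is skipped while more than cl_ralen/4 pages of previously
 * prefetched data still sit ahead of the reader
 */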
2120
2d21ac55 2121
9bccf70c 2122int
b0d623f7 2123cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
0a7de745 2124 int size, off_t filesize, int flags)
2d21ac55 2125{
0a7de745 2126 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2d21ac55
A
2127}
2128
2129
2130int
b0d623f7 2131cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
0a7de745 2132 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
1c79356b
A
2133{
2134 int io_size;
55e303ae 2135 int rounded_size;
0a7de745 2136 off_t max_size;
55e303ae
A
2137 int local_flags;
2138
6d2010ae 2139 local_flags = CL_PAGEOUT | CL_THROTTLE;
1c79356b 2140
0a7de745 2141 if ((flags & UPL_IOSYNC) == 0) {
1c79356b 2142 local_flags |= CL_ASYNC;
0a7de745
A
2143 }
2144 if ((flags & UPL_NOCOMMIT) == 0) {
1c79356b 2145 local_flags |= CL_COMMIT;
0a7de745
A
2146 }
2147 if ((flags & UPL_KEEPCACHED)) {
2148 local_flags |= CL_KEEPCACHED;
2149 }
2150 if (flags & UPL_PAGING_ENCRYPTED) {
6d2010ae 2151 local_flags |= CL_ENCRYPTED;
0a7de745 2152 }
1c79356b 2153
1c79356b
A
2154
2155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
0a7de745 2156 (int)f_offset, size, (int)filesize, local_flags, 0);
1c79356b
A
2157
2158 /*
2159 * If they didn't specify any I/O, then we are done...
2160 * we can't issue an abort because we don't know how
2161 * big the upl really is
2162 */
0a7de745
A
2163 if (size <= 0) {
2164 return EINVAL;
2165 }
1c79356b 2166
0a7de745
A
2167 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2168 if (local_flags & CL_COMMIT) {
2169 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2170 }
2171 return EROFS;
1c79356b
A
2172 }
2173 /*
2174 * can't page out from a negative offset
2175 * or if we're starting beyond the EOF
2176 * or if the file offset isn't page aligned
2177 * or the size requested isn't a multiple of PAGE_SIZE
2178 */
2179 if (f_offset < 0 || f_offset >= filesize ||
0a7de745
A
2180 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2181 if (local_flags & CL_COMMIT) {
0b4e3aa0 2182 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
0a7de745
A
2183 }
2184 return EINVAL;
1c79356b
A
2185 }
2186 max_size = filesize - f_offset;
2187
0a7de745
A
2188 if (size < max_size) {
2189 io_size = size;
2190 } else {
2191 io_size = max_size;
2192 }
1c79356b 2193
55e303ae 2194 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 2195
55e303ae 2196 if (size > rounded_size) {
0a7de745 2197 if (local_flags & CL_COMMIT) {
55e303ae 2198 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
0a7de745
A
2199 UPL_ABORT_FREE_ON_EMPTY);
2200 }
1c79356b 2201 }
0a7de745
A
2202 return cluster_io(vp, upl, upl_offset, f_offset, io_size,
2203 local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
1c79356b
A
2204}
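/*
 * example of the trimming above, assuming 4KB pages: with size = 16384 and
 * only 4608 bytes left before EOF, io_size becomes 4608 and rounded_size
 * becomes 8192, so the last two pages of the upl are aborted (when
 * CL_COMMIT is set) and the I/O handed to cluster_io() covers just the
 * 4608 bytes backed by the file
 */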
2205
2d21ac55 2206
9bccf70c 2207int
b0d623f7 2208cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
0a7de745 2209 int size, off_t filesize, int flags)
2d21ac55 2210{
0a7de745 2211 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2d21ac55
A
2212}
2213
2214
2215int
b0d623f7 2216cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
0a7de745 2217 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
1c79356b
A
2218{
2219 u_int io_size;
9bccf70c 2220 int rounded_size;
0a7de745 2221 off_t max_size;
1c79356b
A
2222 int retval;
2223 int local_flags = 0;
1c79356b 2224
0a7de745
A
2225 if (upl == NULL || size < 0) {
2226 panic("cluster_pagein: NULL upl passed in");
2227 }
1c79356b 2228
0a7de745
A
2229 if ((flags & UPL_IOSYNC) == 0) {
2230 local_flags |= CL_ASYNC;
2231 }
2232 if ((flags & UPL_NOCOMMIT) == 0) {
9bccf70c 2233 local_flags |= CL_COMMIT;
0a7de745
A
2234 }
2235 if (flags & UPL_IOSTREAMING) {
b0d623f7 2236 local_flags |= CL_IOSTREAMING;
0a7de745
A
2237 }
2238 if (flags & UPL_PAGING_ENCRYPTED) {
6d2010ae 2239 local_flags |= CL_ENCRYPTED;
0a7de745 2240 }
9bccf70c 2241
1c79356b
A
2242
2243 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
0a7de745 2244 (int)f_offset, size, (int)filesize, local_flags, 0);
1c79356b
A
2245
2246 /*
2247 * can't page-in from a negative offset
2248 * or if we're starting beyond the EOF
2249 * or if the file offset isn't page aligned
2250 * or the size requested isn't a multiple of PAGE_SIZE
2251 */
2252 if (f_offset < 0 || f_offset >= filesize ||
0a7de745
A
2253 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2254 if (local_flags & CL_COMMIT) {
2255 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2256 }
2257 return EINVAL;
1c79356b
A
2258 }
2259 max_size = filesize - f_offset;
2260
0a7de745
A
2261 if (size < max_size) {
2262 io_size = size;
2263 } else {
2264 io_size = max_size;
2265 }
1c79356b 2266
9bccf70c 2267 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 2268
0a7de745 2269 if (size > rounded_size && (local_flags & CL_COMMIT)) {
9bccf70c 2270 ubc_upl_abort_range(upl, upl_offset + rounded_size,
0a7de745
A
2271 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2272 }
2273
91447636 2274 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
0a7de745 2275 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
1c79356b 2276
0a7de745 2277 return retval;
1c79356b
A
2278}
2279
2d21ac55 2280
9bccf70c 2281int
91447636 2282cluster_bp(buf_t bp)
2d21ac55 2283{
0a7de745 2284 return cluster_bp_ext(bp, NULL, NULL);
2d21ac55
A
2285}
2286
2287
2288int
2289cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
1c79356b 2290{
0a7de745 2291 off_t f_offset;
1c79356b
A
2292 int flags;
2293
9bccf70c 2294 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
0a7de745 2295 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
9bccf70c 2296
0a7de745
A
2297 if (bp->b_flags & B_READ) {
2298 flags = CL_ASYNC | CL_READ;
2299 } else {
2300 flags = CL_ASYNC;
2301 }
2302 if (bp->b_flags & B_PASSIVE) {
2d21ac55 2303 flags |= CL_PASSIVE;
0a7de745 2304 }
1c79356b
A
2305
2306 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2307
0a7de745 2308 return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
1c79356b
A
2309}
2310
2d21ac55
A
2311
2312
9bccf70c 2313int
91447636 2314cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1c79356b 2315{
0a7de745 2316 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2d21ac55
A
2317}
2318
2319
2320int
2321cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
0a7de745 2322 int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2d21ac55 2323{
0a7de745
A
2324 user_ssize_t cur_resid;
2325 int retval = 0;
2326 int flags;
2327 int zflags;
2d21ac55 2328 int bflag;
0a7de745
A
2329 int write_type = IO_COPY;
2330 u_int32_t write_length;
1c79356b 2331
91447636
A
2332 flags = xflags;
2333
0a7de745 2334 if (flags & IO_PASSIVE) {
b0d623f7 2335 bflag = CL_PASSIVE;
0a7de745 2336 } else {
b0d623f7 2337 bflag = 0;
0a7de745 2338 }
2d21ac55 2339
0a7de745
A
2340 if (vp->v_flag & VNOCACHE_DATA) {
2341 flags |= IO_NOCACHE;
316670eb
A
2342 bflag |= CL_NOCACHE;
2343 }
0a7de745
A
2344 if (uio == NULL) {
2345 /*
2d21ac55
A
2346 * no user data...
2347 * this call is being made to zero-fill some range in the file
91447636 2348 */
0a7de745
A
2349 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2350
2351 return retval;
2352 }
2353 /*
2354 * do a write through the cache if one of the following is true:
2355 * NOCACHE is not true or NODIRECT is true, or
2356 * the uio request doesn't target USERSPACE
2357 * otherwise, find out if we want the direct or contig variant for
2358 * the first vector in the uio request
2359 */
2360 if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
2361 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2362 }
2363
2364 if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
2365 /*
2d21ac55 2366 * must go through the cached variant in this case
0b4e3aa0 2367 */
0a7de745
A
2368 write_type = IO_COPY;
2369 }
0b4e3aa0 2370
2d21ac55 2371 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
0a7de745 2372 switch (write_type) {
2d21ac55 2373 case IO_COPY:
0a7de745 2374 /*
2d21ac55
A
2375 * make sure the uio_resid isn't too big...
2376 * internally, we want to handle all of the I/O in
2377 * chunk sizes that fit in a 32 bit int
91447636 2378 */
0a7de745
A
2379 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2380 /*
2d21ac55
A
2381 * we're going to have to call cluster_write_copy
2382 * more than once...
2383 *
2384 * only want the last call to cluster_write_copy to
2385 * have the IO_TAILZEROFILL flag set and only the
2386 * first call should have IO_HEADZEROFILL
91447636 2387 */
0a7de745 2388 zflags = flags & ~IO_TAILZEROFILL;
2d21ac55 2389 flags &= ~IO_HEADZEROFILL;
91447636 2390
2d21ac55
A
2391 write_length = MAX_IO_REQUEST_SIZE;
2392 } else {
0a7de745 2393 /*
2d21ac55 2394 * last call to cluster_write_copy
91447636 2395 */
0a7de745
A
2396 zflags = flags;
2397
2d21ac55
A
2398 write_length = (u_int32_t)cur_resid;
2399 }
2400 retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
2401 break;
91447636 2402
2d21ac55 2403 case IO_CONTIG:
0a7de745 2404 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
91447636 2405
2d21ac55 2406 if (flags & IO_HEADZEROFILL) {
0a7de745 2407 /*
2d21ac55 2408 * only do this once per request
91447636 2409 */
0a7de745 2410 flags &= ~IO_HEADZEROFILL;
91447636 2411
2d21ac55 2412 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
0a7de745
A
2413 headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2414 if (retval) {
2415 break;
2416 }
91447636 2417 }
2d21ac55
A
2418 retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
2419
2420 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
0a7de745 2421 /*
2d21ac55
A
2422 * we're done with the data from the user specified buffer(s)
2423 * and we've been requested to zero fill at the tail
2424 * treat this as an IO_HEADZEROFILL which doesn't require a uio
2425 * by rearranging the args and passing in IO_HEADZEROFILL
91447636 2426 */
0a7de745
A
2427 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
2428 (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2d21ac55
A
2429 }
2430 break;
91447636 2431
2d21ac55
A
2432 case IO_DIRECT:
2433 /*
2434 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
2435 */
2436 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
2437 break;
91447636 2438
2d21ac55 2439 case IO_UNKNOWN:
0a7de745 2440 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2d21ac55
A
2441 break;
2442 }
b0d623f7
A
2443 /*
2444 * in case we end up calling cluster_write_copy (from cluster_write_direct)
2445 * multiple times to service a multi-vector request that is not aligned properly
2446 * we need to update the oldEOF so that we
2447 * don't zero-fill the head of a page if we've successfully written
2448 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2449 * page that is beyond the oldEOF if the write is unaligned... we only
0a7de745 2450 * want that to happen for the very first page of the cluster_write,
b0d623f7
A
2451 * NOT the first page of each vector making up a multi-vector write.
2452 */
0a7de745 2453 if (uio->uio_offset > oldEOF) {
b0d623f7 2454 oldEOF = uio->uio_offset;
0a7de745 2455 }
2d21ac55 2456 }
0a7de745 2457 return retval;
1c79356b
A
2458}
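/*
 * cluster_write_ext() is essentially a dispatcher: a NULL uio means a pure
 * zero-fill serviced by cluster_write_copy(), otherwise each uio vector is
 * classified by cluster_io_type() as IO_COPY, IO_CONTIG or IO_DIRECT and
 * routed to the matching variant... the head/tail zero-fill work is always
 * performed by cluster_write_copy(), which is why a direct write is demoted
 * to IO_COPY when those flags are set and why the contig path brackets its
 * transfer with cluster_write_copy() calls
 */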
2459
b4c24cb9 2460
9bccf70c 2461static int
2d21ac55 2462cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
0a7de745 2463 int flags, int (*callback)(buf_t, void *), void *callback_arg)
1c79356b
A
2464{
2465 upl_t upl;
2466 upl_page_info_t *pl;
1c79356b 2467 vm_offset_t upl_offset;
0a7de745
A
2468 vm_offset_t vector_upl_offset = 0;
2469 u_int32_t io_req_size;
2470 u_int32_t offset_in_file;
2471 u_int32_t offset_in_iovbase;
b0d623f7
A
2472 u_int32_t io_size;
2473 int io_flag = 0;
0a7de745
A
2474 upl_size_t upl_size, vector_upl_size = 0;
2475 vm_size_t upl_needed_size;
2476 mach_msg_type_number_t pages_in_pl;
3e170ce0 2477 upl_control_flags_t upl_flags;
1c79356b 2478 kern_return_t kret;
0a7de745 2479 mach_msg_type_number_t i;
1c79356b 2480 int force_data_sync;
2d21ac55 2481 int retval = 0;
0a7de745 2482 int first_IO = 1;
d7e50217 2483 struct clios iostate;
0a7de745
A
2484 user_addr_t iov_base;
2485 u_int32_t mem_alignment_mask;
2486 u_int32_t devblocksize;
2487 u_int32_t max_io_size;
2488 u_int32_t max_upl_size;
316670eb 2489 u_int32_t max_vector_size;
0a7de745
A
2490 u_int32_t bytes_outstanding_limit;
2491 boolean_t io_throttled = FALSE;
cf7d32b8 2492
0a7de745
A
2493 u_int32_t vector_upl_iosize = 0;
2494 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
2495 off_t v_upl_uio_offset = 0;
2496 int vector_upl_index = 0;
2497 upl_t vector_upl = NULL;
cf7d32b8 2498
1c79356b
A
2499
2500 /*
2501 * When we enter this routine, we know
1c79356b
A
2502 * -- the resid will not exceed iov_len
2503 */
2d21ac55 2504 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
0a7de745 2505 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
91447636 2506
b0d623f7
A
2507 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2508
2509 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2510
0a7de745 2511 if (flags & IO_PASSIVE) {
b0d623f7 2512 io_flag |= CL_PASSIVE;
0a7de745
A
2513 }
2514
2515 if (flags & IO_NOCACHE) {
2516 io_flag |= CL_NOCACHE;
2517 }
2518
2519 if (flags & IO_SKIP_ENCRYPTION) {
fe8ab488 2520 io_flag |= CL_ENCRYPTED;
0a7de745 2521 }
fe8ab488 2522
d7e50217
A
2523 iostate.io_completed = 0;
2524 iostate.io_issued = 0;
2525 iostate.io_error = 0;
2526 iostate.io_wanted = 0;
2527
6d2010ae
A
2528 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2529
2d21ac55
A
2530 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2531 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2532
2533 if (devblocksize == 1) {
0a7de745
A
2534 /*
2535 * the AFP client advertises a devblocksize of 1
2536 * however, its BLOCKMAP routine maps to physical
2537 * blocks that are PAGE_SIZE in size...
2538 * therefore we can't ask for I/Os that aren't page aligned
2539 * or aren't multiples of PAGE_SIZE in size
2540 * by setting devblocksize to PAGE_SIZE, we reinstate
2541 * the old behavior we had before the mem_alignment_mask
2542 * changes went in...
2543 */
2544 devblocksize = PAGE_SIZE;
2d21ac55
A
2545 }
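/*
 * with devblocksize pinned to PAGE_SIZE for such a filesystem, the
 * alignment checks below (offset_in_file, offset_in_iovbase and the
 * iov_base & (devblocksize - 1) test) effectively require both the file
 * offset and the user buffer to be page aligned before the direct path is
 * taken; anything less falls through to wait_for_dwrites and is finished
 * off by cluster_write_copy()
 */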
2546
2547next_dwrite:
2548 io_req_size = *write_length;
2549 iov_base = uio_curriovbase(uio);
cc9f6e38 2550
2d21ac55
A
2551 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2552 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
1c79356b 2553
2d21ac55 2554 if (offset_in_file || offset_in_iovbase) {
0a7de745 2555 /*
2d21ac55
A
2556 * one of the 2 important offsets is misaligned
2557 * so fire an I/O through the cache for this entire vector
2558 */
0a7de745 2559 goto wait_for_dwrites;
2d21ac55
A
2560 }
2561 if (iov_base & (devblocksize - 1)) {
0a7de745 2562 /*
2d21ac55
A
2563 * the offset in memory must be on a device block boundary
2564 * so that we can guarantee that we can generate an
2565 * I/O that ends on a page boundary in cluster_io
2566 */
0a7de745
A
2567 goto wait_for_dwrites;
2568 }
1c79356b 2569
39037602 2570 task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
2d21ac55 2571 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
0a7de745 2572 int throttle_type;
316670eb 2573
0a7de745 2574 if ((throttle_type = cluster_is_throttled(vp))) {
316670eb
A
2575 /*
2576 * we're in the throttle window, at the very least
2577 * we want to limit the size of the I/O we're about
2578 * to issue
2579 */
0a7de745 2580 if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
316670eb
A
2581 /*
2582 * we're in the throttle window and at least 1 I/O
2583 * has already been issued by a throttleable thread
2584 * in this window, so return with EAGAIN to indicate
2585 * to the FS issuing the cluster_write call that it
2586 * should now throttle after dropping any locks
2587 */
2588 throttle_info_update_by_mount(vp->v_mount);
2589
2590 io_throttled = TRUE;
2591 goto wait_for_dwrites;
2592 }
2593 max_vector_size = THROTTLE_MAX_IOSIZE;
2594 max_io_size = THROTTLE_MAX_IOSIZE;
2595 } else {
2596 max_vector_size = MAX_VECTOR_UPL_SIZE;
2597 max_io_size = max_upl_size;
2598 }
2d21ac55 2599
0a7de745
A
2600 if (first_IO) {
2601 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2d21ac55
A
2602 first_IO = 0;
2603 }
0a7de745 2604 io_size = io_req_size & ~PAGE_MASK;
cc9f6e38
A
2605 iov_base = uio_curriovbase(uio);
2606
0a7de745
A
2607 if (io_size > max_io_size) {
2608 io_size = max_io_size;
2609 }
2d21ac55 2610
0a7de745 2611 if (useVectorUPL && (iov_base & PAGE_MASK)) {
b0d623f7
A
2612 /*
2613 * We have an iov_base that's not page-aligned.
0a7de745 2614 * Issue all I/O's that have been collected within
b0d623f7
A
2615 * this Vectored UPL.
2616 */
0a7de745 2617 if (vector_upl_index) {
b0d623f7
A
2618 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2619 reset_vector_run_state();
2620 }
0a7de745
A
2621
2622 /*
2623 * After this point, if we are using the Vector UPL path and the base is
2624 * not page-aligned then the UPL with that base will be the first in the vector UPL.
2625 */
b0d623f7
A
2626 }
2627
2d21ac55 2628 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
0a7de745 2629 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
d7e50217
A
2630
2631 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
0a7de745 2632 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
d7e50217 2633
3e170ce0 2634 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
d7e50217 2635 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
0a7de745 2636 pages_in_pl = 0;
d7e50217
A
2637 upl_size = upl_needed_size;
2638 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
0a7de745 2639 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
d7e50217 2640
3e170ce0 2641 kret = vm_map_get_upl(map,
0a7de745
A
2642 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2643 &upl_size,
2644 &upl,
2645 NULL,
2646 &pages_in_pl,
2647 &upl_flags,
2648 VM_KERN_MEMORY_FILE,
2649 force_data_sync);
d7e50217
A
2650
2651 if (kret != KERN_SUCCESS) {
0a7de745
A
2652 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2653 0, 0, 0, kret, 0);
d7e50217 2654 /*
2d21ac55 2655 * failed to get pagelist
d7e50217
A
2656 *
2657 * we may have already spun some portion of this request
2658 * off as async requests... we need to wait for the I/O
2659 * to complete before returning
2660 */
2d21ac55 2661 goto wait_for_dwrites;
d7e50217
A
2662 }
2663 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2664 pages_in_pl = upl_size / PAGE_SIZE;
1c79356b 2665
d7e50217 2666 for (i = 0; i < pages_in_pl; i++) {
0a7de745
A
2667 if (!upl_valid_page(pl, i)) {
2668 break;
2669 }
2670 }
2671 if (i == pages_in_pl) {
2672 break;
d7e50217 2673 }
1c79356b 2674
d7e50217
A
2675 /*
2676 * didn't get all the pages back that we
2677 * needed... release this upl and try again
2678 */
2d21ac55 2679 ubc_upl_abort(upl, 0);
1c79356b 2680 }
d7e50217 2681 if (force_data_sync >= 3) {
0a7de745
A
2682 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2683 i, pages_in_pl, upl_size, kret, 0);
d7e50217
A
2684 /*
2685 * for some reason, we couldn't acquire a hold on all
2686 * the pages needed in the user's address space
2687 *
2688 * we may have already spun some portion of this request
2689 * off as async requests... we need to wait for the I/O
2690 * to complete before returning
2691 */
2d21ac55 2692 goto wait_for_dwrites;
1c79356b 2693 }
0b4e3aa0 2694
d7e50217
A
2695 /*
2696 * Consider the possibility that upl_size wasn't satisfied.
2697 */
2d21ac55 2698 if (upl_size < upl_needed_size) {
0a7de745
A
2699 if (upl_size && upl_offset == 0) {
2700 io_size = upl_size;
2701 } else {
2702 io_size = 0;
2703 }
2d21ac55 2704 }
d7e50217 2705 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
0a7de745 2706 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
1c79356b 2707
d7e50217 2708 if (io_size == 0) {
0a7de745 2709 ubc_upl_abort(upl, 0);
d7e50217
A
2710 /*
2711 * we may have already spun some portion of this request
2712 * off as async requests... we need to wait for the I/O
2713 * to complete before returning
2714 */
2d21ac55 2715 goto wait_for_dwrites;
d7e50217 2716 }
0a7de745
A
2717
2718 if (useVectorUPL) {
b0d623f7 2719 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
0a7de745 2720 if (end_off) {
b0d623f7 2721 issueVectorUPL = 1;
0a7de745 2722 }
b0d623f7
A
2723 /*
2724 * After this point, if we are using a vector UPL, then
2725 * either all the UPL elements end on a page boundary OR
2726 * this UPL is the last element because it does not end
2727 * on a page boundary.
2728 */
2729 }
2d21ac55 2730
d7e50217
A
2731 /*
2732 * we want push out these writes asynchronously so that we can overlap
2733 * the preparation of the next I/O
2734 * if there are already too many outstanding writes
2735 * wait until some complete before issuing the next
2736 */
0a7de745 2737 if (vp->v_mount->mnt_minsaturationbytecount) {
39037602 2738 bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
0a7de745 2739 } else {
39037602 2740 bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
0a7de745 2741 }
39037602
A
2742
2743 cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
cf7d32b8 2744
d7e50217 2745 if (iostate.io_error) {
0a7de745 2746 /*
d7e50217
A
2747 * one of the earlier writes we issued ran into a hard error
2748 * don't issue any more writes, cleanup the UPL
2749 * that was just created but not used, then
2750 * go wait for all writes that are part of this stream
2751 * to complete before returning the error to the caller
2752 */
0a7de745 2753 ubc_upl_abort(upl, 0);
1c79356b 2754
0a7de745
A
2755 goto wait_for_dwrites;
2756 }
1c79356b 2757
d7e50217 2758 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
0a7de745 2759 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1c79356b 2760
0a7de745 2761 if (!useVectorUPL) {
b0d623f7 2762 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
0a7de745
A
2763 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2764 } else {
2765 if (!vector_upl_index) {
b0d623f7
A
2766 vector_upl = vector_upl_create(upl_offset);
2767 v_upl_uio_offset = uio->uio_offset;
2768 vector_upl_offset = upl_offset;
2769 }
2770
0a7de745 2771 vector_upl_set_subupl(vector_upl, upl, upl_size);
b0d623f7
A
2772 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2773 vector_upl_index++;
2774 vector_upl_iosize += io_size;
2775 vector_upl_size += upl_size;
2776
0a7de745 2777 if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
b0d623f7
A
2778 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2779 reset_vector_run_state();
2780 }
0a7de745 2781 }
b0d623f7 2782
2d21ac55
A
2783 /*
2784 * update the uio structure to
2785 * reflect the I/O that we just issued
2786 */
cc9f6e38 2787 uio_update(uio, (user_size_t)io_size);
1c79356b 2788
b0d623f7
A
2789 /*
2790 * in case we end up calling through to cluster_write_copy to finish
2791 * the tail of this request, we need to update the oldEOF so that we
2792 * don't zero-fill the head of a page if we've successfully written
2793 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2794 * page that is beyond the oldEOF if the write is unaligned... we only
0a7de745 2795 * want that to happen for the very first page of the cluster_write,
b0d623f7
A
2796 * NOT the first page of each vector making up a multi-vector write.
2797 */
0a7de745 2798 if (uio->uio_offset > oldEOF) {
b0d623f7 2799 oldEOF = uio->uio_offset;
0a7de745 2800 }
b0d623f7 2801
2d21ac55
A
2802 io_req_size -= io_size;
2803
d7e50217 2804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
0a7de745 2805 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
1c79356b
A
2806 } /* end while */
2807
0a7de745
A
2808 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2809 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2d21ac55
A
2810
2811 if (retval == 0 && *write_type == IO_DIRECT) {
0a7de745
A
2812 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2813 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2d21ac55 2814
0a7de745 2815 goto next_dwrite;
2d21ac55 2816 }
0a7de745 2817 }
2d21ac55
A
2818
2819wait_for_dwrites:
b0d623f7 2820
6d2010ae 2821 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
b0d623f7 2822 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
0a7de745 2823 reset_vector_run_state();
b0d623f7 2824 }
fe8ab488
A
2825 /*
2826 * make sure all async writes issued as part of this stream
2827 * have completed before we return
2828 */
2829 cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
b0d623f7 2830
0a7de745
A
2831 if (iostate.io_error) {
2832 retval = iostate.io_error;
2833 }
2d21ac55 2834
6d2010ae
A
2835 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
2836
0a7de745 2837 if (io_throttled == TRUE && retval == 0) {
316670eb 2838 retval = EAGAIN;
0a7de745 2839 }
316670eb 2840
2d21ac55 2841 if (io_req_size && retval == 0) {
0a7de745 2842 /*
2d21ac55
A
2843 * we couldn't handle the tail of this request in DIRECT mode
2844 * so fire it through the copy path
2845 *
2846 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2847 * so we can just pass 0 in for the headOff and tailOff
2848 */
0a7de745 2849 if (uio->uio_offset > oldEOF) {
b0d623f7 2850 oldEOF = uio->uio_offset;
0a7de745 2851 }
b0d623f7 2852
0a7de745 2853 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
1c79356b 2854
2d21ac55
A
2855 *write_type = IO_UNKNOWN;
2856 }
1c79356b 2857 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
0a7de745 2858 (int)uio->uio_offset, io_req_size, retval, 4, 0);
1c79356b 2859
0a7de745 2860 return retval;
1c79356b
A
2861}
2862
b4c24cb9 2863
9bccf70c 2864static int
2d21ac55 2865cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
0a7de745 2866 int (*callback)(buf_t, void *), void *callback_arg, int bflag)
0b4e3aa0 2867{
b4c24cb9 2868 upl_page_info_t *pl;
0a7de745
A
2869 addr64_t src_paddr = 0;
2870 upl_t upl[MAX_VECTS];
0b4e3aa0 2871 vm_offset_t upl_offset;
2d21ac55 2872 u_int32_t tail_size = 0;
0a7de745
A
2873 u_int32_t io_size;
2874 u_int32_t xsize;
2875 upl_size_t upl_size;
2876 vm_size_t upl_needed_size;
2877 mach_msg_type_number_t pages_in_pl;
3e170ce0 2878 upl_control_flags_t upl_flags;
0b4e3aa0 2879 kern_return_t kret;
0a7de745 2880 struct clios iostate;
0b4e3aa0 2881 int error = 0;
0a7de745
A
2882 int cur_upl = 0;
2883 int num_upl = 0;
2884 int n;
2885 user_addr_t iov_base;
2886 u_int32_t devblocksize;
2887 u_int32_t mem_alignment_mask;
0b4e3aa0
A
2888
2889 /*
2890 * When we enter this routine, we know
2d21ac55
A
2891 * -- the io_req_size will not exceed iov_len
2892 * -- the target address is physically contiguous
0b4e3aa0 2893 */
fe8ab488 2894 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
0b4e3aa0 2895
2d21ac55
A
2896 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2897 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
91447636 2898
0a7de745
A
2899 iostate.io_completed = 0;
2900 iostate.io_issued = 0;
2901 iostate.io_error = 0;
2902 iostate.io_wanted = 0;
2d21ac55 2903
6d2010ae
A
2904 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2905
2d21ac55
A
2906next_cwrite:
2907 io_size = *write_length;
91447636 2908
cc9f6e38
A
2909 iov_base = uio_curriovbase(uio);
2910
2d21ac55 2911 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
0b4e3aa0
A
2912 upl_needed_size = upl_offset + io_size;
2913
2914 pages_in_pl = 0;
2915 upl_size = upl_needed_size;
0a7de745
A
2916 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2917 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
0b4e3aa0 2918
3e170ce0
A
2919 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2920 kret = vm_map_get_upl(map,
0a7de745
A
2921 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2922 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
0b4e3aa0 2923
b4c24cb9 2924 if (kret != KERN_SUCCESS) {
0a7de745 2925 /*
2d21ac55 2926 * failed to get pagelist
b4c24cb9 2927 */
0a7de745 2928 error = EINVAL;
2d21ac55 2929 goto wait_for_cwrites;
b4c24cb9 2930 }
2d21ac55
A
2931 num_upl++;
2932
0b4e3aa0
A
2933 /*
2934 * Consider the possibility that upl_size wasn't satisfied.
0b4e3aa0 2935 */
b4c24cb9 2936 if (upl_size < upl_needed_size) {
2d21ac55
A
2937 /*
2938 * This is a failure in the physical memory case.
2939 */
2940 error = EINVAL;
2941 goto wait_for_cwrites;
b4c24cb9 2942 }
2d21ac55 2943 pl = ubc_upl_pageinfo(upl[cur_upl]);
0b4e3aa0 2944
fe8ab488 2945 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
0b4e3aa0 2946
b4c24cb9 2947 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
0a7de745 2948 u_int32_t head_size;
0b4e3aa0 2949
2d21ac55 2950 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
0b4e3aa0 2951
0a7de745
A
2952 if (head_size > io_size) {
2953 head_size = io_size;
2954 }
b4c24cb9 2955
2d21ac55 2956 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
b4c24cb9 2957
0a7de745
A
2958 if (error) {
2959 goto wait_for_cwrites;
2960 }
b4c24cb9 2961
b4c24cb9
A
2962 upl_offset += head_size;
2963 src_paddr += head_size;
2964 io_size -= head_size;
2d21ac55
A
2965
2966 iov_base += head_size;
2967 }
2968 if ((u_int32_t)iov_base & mem_alignment_mask) {
0a7de745 2969 /*
2d21ac55
A
2970 * request doesn't set up on a memory boundary
2971 * the underlying DMA engine can handle...
2972 * return an error instead of going through
2973 * the slow copy path since the intent of this
2974 * path is direct I/O from device memory
2975 */
0a7de745 2976 error = EINVAL;
2d21ac55 2977 goto wait_for_cwrites;
0b4e3aa0 2978 }
2d21ac55 2979
b4c24cb9
A
2980 tail_size = io_size & (devblocksize - 1);
2981 io_size -= tail_size;
2982
2d21ac55 2983 while (io_size && error == 0) {
0a7de745
A
2984 if (io_size > MAX_IO_CONTIG_SIZE) {
2985 xsize = MAX_IO_CONTIG_SIZE;
2986 } else {
2987 xsize = io_size;
2988 }
2d21ac55
A
2989 /*
2990 * request asynchronously so that we can overlap
2991 * the preparation of the next I/O... we'll do
2992 * the commit after all the I/O has completed
2993 * since its all issued against the same UPL
2994 * if there are already too many outstanding writes
2995 * wait until some have completed before issuing the next
b4c24cb9 2996 */
fe8ab488 2997 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
2d21ac55 2998
0a7de745
A
2999 if (iostate.io_error) {
3000 /*
3001 * one of the earlier writes we issued ran into a hard error
3002 * don't issue any more writes...
3003 * go wait for all writes that are part of this stream
3004 * to complete before returning the error to the caller
3005 */
3006 goto wait_for_cwrites;
2d21ac55 3007 }
0a7de745 3008 /*
2d21ac55 3009 * issue an asynchronous write to cluster_io
b4c24cb9 3010 */
0a7de745
A
3011 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
3012 xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
cc9f6e38 3013
2d21ac55 3014 if (error == 0) {
0a7de745 3015 /*
2d21ac55
A
3016 * The cluster_io write completed successfully,
3017 * update the uio structure
3018 */
0a7de745 3019 uio_update(uio, (user_size_t)xsize);
b4c24cb9 3020
2d21ac55
A
3021 upl_offset += xsize;
3022 src_paddr += xsize;
3023 io_size -= xsize;
3024 }
b4c24cb9 3025 }
0a7de745
A
3026 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
3027 error = cluster_io_type(uio, write_type, write_length, 0);
2d21ac55
A
3028
3029 if (error == 0 && *write_type == IO_CONTIG) {
0a7de745
A
3030 cur_upl++;
3031 goto next_cwrite;
2d21ac55 3032 }
0a7de745
A
3033 } else {
3034 *write_type = IO_UNKNOWN;
3035 }
2d21ac55
A
3036
3037wait_for_cwrites:
b4c24cb9 3038 /*
0a7de745
A
3039 * make sure all async writes that are part of this stream
3040 * have completed before we proceed
3041 */
fe8ab488 3042 cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
cf7d32b8 3043
0a7de745
A
3044 if (iostate.io_error) {
3045 error = iostate.io_error;
3046 }
2d21ac55 3047
6d2010ae
A
3048 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
3049
0a7de745
A
3050 if (error == 0 && tail_size) {
3051 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
3052 }
2d21ac55 3053
0a7de745
A
3054 for (n = 0; n < num_upl; n++) {
3055 /*
2d21ac55
A
3056 * just release our hold on each physically contiguous
3057 * region without changing any state
3058 */
0a7de745
A
3059 ubc_upl_abort(upl[n], 0);
3060 }
0b4e3aa0 3061
0a7de745 3062 return error;
0b4e3aa0
A
3063}
3064
b4c24cb9 3065
b0d623f7
A
3066/*
3067 * need to avoid a race between an msync of a range of pages dirtied via mmap
3068 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
3069 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
3070 *
3071 * we should never force-zero-fill pages that are already valid in the cache...
3072 * the entire page contains valid data (either from disk, zero-filled or dirtied
3073 * via an mmap) so we can only do damage by trying to zero-fill
3074 *
3075 */
3076static int
3077cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
3078{
3079 int zero_pg_index;
3080 boolean_t need_cluster_zero = TRUE;
3081
0a7de745
A
3082 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
3083 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
b0d623f7
A
3084 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
3085
3086 if (upl_valid_page(pl, zero_pg_index)) {
3087 /*
3088 * never force zero valid pages - dirty or clean
3089 * we'll leave these in the UPL for cluster_write_copy to deal with
3090 */
3091 need_cluster_zero = FALSE;
0a7de745 3092 }
b0d623f7 3093 }
0a7de745 3094 if (need_cluster_zero == TRUE) {
b0d623f7 3095 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
0a7de745 3096 }
b0d623f7 3097
0a7de745 3098 return bytes_to_zero;
b0d623f7
A
3099}
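/*
 * the return value is the number of bytes the caller should account as
 * zeroed at io_offset... when IO_NOZEROVALID or IO_NOZERODIRTY is set, the
 * zeroing is clipped to the remainder of the current page, and a page that
 * is already valid in the UPL is left untouched so that cluster_write_copy
 * can deal with its existing contents
 */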
3100
3101
d9a64523
A
3102void
3103cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
3104{
3105 struct cl_extent cl;
3106 boolean_t first_pass = TRUE;
3107
3108 assert(s_offset < e_offset);
3109 assert((s_offset & PAGE_MASK_64) == 0);
3110 assert((e_offset & PAGE_MASK_64) == 0);
3111
3112 cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3113 cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3114
3115 cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
0a7de745 3116 vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
d9a64523
A
3117}
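/*
 * cluster_update_state() converts the page-aligned byte range
 * [s_offset, e_offset) into a cl_extent whose b_addr/e_addr are page
 * indices, then hands it to cluster_update_state_internal() (flags == 0,
 * defer_writes == TRUE), which merges the range into the vnode's
 * write-behind or sparse cluster state under the cl_lockw lock
 */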
3118
3119
3120static void
3121cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
0a7de745
A
3122 boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
3123 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
d9a64523
A
3124{
3125 struct cl_writebehind *wbp;
0a7de745
A
3126 int cl_index;
3127 int ret_cluster_try_push;
3128 u_int max_cluster_pgcount;
d9a64523
A
3129
3130
3131 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
3132
3133 /*
3134 * take the lock to protect our accesses
3135 * of the writebehind and sparse cluster state
3136 */
3137 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3138
3139 if (wbp->cl_scmap) {
0a7de745
A
3140 if (!(flags & IO_NOCACHE)) {
3141 /*
d9a64523
A
3142 * we've fallen into the sparse
3143 * cluster method of delaying dirty pages
3144 */
0a7de745 3145 sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
d9a64523
A
3146
3147 lck_mtx_unlock(&wbp->cl_lockw);
3148 return;
3149 }
3150 /*
3151 * must have done cached writes that fell into
3152 * the sparse cluster mechanism... we've switched
3153 * to uncached writes on the file, so go ahead
3154 * and push whatever's in the sparse map
3155 * and switch back to normal clustering
3156 */
3157 wbp->cl_number = 0;
3158
3159 sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
3160 /*
3161 * no clusters of either type present at this point
3162 * so just go directly to start_new_cluster since
3163 * we know we need to delay this I/O since we've
3164 * already released the pages back into the cache
3165 * to avoid the deadlock with sparse_cluster_push
3166 */
3167 goto start_new_cluster;
3168 }
3169 if (*first_pass == TRUE) {
0a7de745 3170 if (write_off == wbp->cl_last_write) {
d9a64523 3171 wbp->cl_seq_written += write_cnt;
0a7de745 3172 } else {
d9a64523 3173 wbp->cl_seq_written = write_cnt;
0a7de745 3174 }
d9a64523
A
3175
3176 wbp->cl_last_write = write_off + write_cnt;
3177
3178 *first_pass = FALSE;
3179 }
0a7de745 3180 if (wbp->cl_number == 0) {
d9a64523
A
3181 /*
3182 * no clusters currently present
3183 */
3184 goto start_new_cluster;
0a7de745 3185 }
d9a64523
A
3186
3187 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3188 /*
3189 * check each cluster that we currently hold
3190 * try to merge some or all of this write into
3191 * one or more of the existing clusters... if
3192 * any portion of the write remains, start a
3193 * new cluster
3194 */
3195 if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
3196 /*
3197 * the current write starts at or after the current cluster
3198 */
3199 if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3200 /*
3201 * we have a write that fits entirely
3202 * within the existing cluster limits
3203 */
0a7de745 3204 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
d9a64523
A
3205 /*
3206 * update our idea of where the cluster ends
3207 */
3208 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
0a7de745 3209 }
d9a64523
A
3210 break;
3211 }
3212 if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3213 /*
3214 * we have a write that starts in the middle of the current cluster
3215 * but extends beyond the cluster's limit... we know this because
3216 * of the previous checks
3217 * we'll extend the current cluster to the max
3218 * and update the b_addr for the current write to reflect that
3219 * the head of it was absorbed into this cluster...
3220 * note that we'll always have a leftover tail in this case since
3221 * full absorption would have occurred in the clause above
3222 */
3223 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
3224
3225 cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
3226 }
3227 /*
3228 * we come here for the case where the current write starts
3229 * beyond the limit of the existing cluster or we have a leftover
3230 * tail after a partial absorption
3231 *
0a7de745 3232 * in either case, we'll check the remaining clusters before
d9a64523
A
3233 * starting a new one
3234 */
3235 } else {
3236 /*
3237 * the current write starts in front of the cluster we're currently considering
3238 */
3239 if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
3240 /*
3241 * we can just merge the new request into
3242 * this cluster and leave it in the cache
0a7de745 3243 * since the resulting cluster is still
d9a64523
A
3244 * less than the maximum allowable size
3245 */
3246 wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
3247
3248 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3249 /*
3250 * the current write completely
3251 * envelops the existing cluster and since
3252 * each write is limited to at most max_cluster_pgcount pages
3253 * we can just use the start and last blocknos of the write
3254 * to generate the cluster limits
3255 */
3256 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3257 }
3258 break;
3259 }
3260 /*
3261 * if we were to combine this write with the current cluster
3262 * we would exceed the cluster size limit.... so,
3263 * let's see if there's any overlap of the new I/O with
3264 * the cluster we're currently considering... in fact, we'll
3265 * stretch the cluster out to its full limit and see if we
3266 * get an intersection with the current write
0a7de745 3267 *
d9a64523
A
3268 */
3269 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
3270 /*
3271 * the current write extends into the proposed cluster
3272 * clip the length of the current write after first combining its
3273 * tail with the newly shaped cluster
3274 */
3275 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
3276
3277 cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
3278 }
3279 /*
3280 * if we get here, there was no way to merge
0a7de745
A
3281 * any portion of this write with this cluster
3282 * or we could only merge part of it which
d9a64523
A
3283 * will leave a tail...
3284 * we'll check the remaining clusters before starting a new one
3285 */
3286 }
3287 }
0a7de745 3288 if (cl_index < wbp->cl_number) {
d9a64523
A
3289 /*
3290 * we found an existing cluster(s) that we
3291 * could entirely merge this I/O into
3292 */
3293 goto delay_io;
0a7de745 3294 }
d9a64523
A
3295
3296 if (defer_writes == FALSE &&
3297 wbp->cl_number == MAX_CLUSTERS &&
3298 wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
0a7de745 3299 uint32_t n;
d9a64523
A
3300
3301 if (vp->v_mount->mnt_minsaturationbytecount) {
3302 n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
0a7de745
A
3303
3304 if (n > MAX_CLUSTERS) {
d9a64523 3305 n = MAX_CLUSTERS;
0a7de745
A
3306 }
3307 } else {
d9a64523 3308 n = 0;
0a7de745 3309 }
d9a64523
A
3310
3311 if (n == 0) {
0a7de745 3312 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
d9a64523 3313 n = WRITE_BEHIND_SSD;
0a7de745 3314 } else {
d9a64523 3315 n = WRITE_BEHIND;
0a7de745
A
3316 }
3317 }
3318 while (n--) {
3319 cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
d9a64523 3320 }
d9a64523
A
3321 }
3322 if (wbp->cl_number < MAX_CLUSTERS) {
3323 /*
3324 * we didn't find an existing cluster to
3325 * merge into, but there's room to start
3326 * a new one
3327 */
3328 goto start_new_cluster;
3329 }
3330 /*
3331 * no existing cluster to merge with and no
0a7de745 3332 * room to start a new one... we'll try
d9a64523
A
3333 * pushing one of the existing ones... if none of
3334 * them are able to be pushed, we'll switch
3335 * to the sparse cluster mechanism
3336 * cluster_try_push updates cl_number to the
3337 * number of remaining clusters... and
3338 * returns the number of currently unused clusters
3339 */
3340 ret_cluster_try_push = 0;
3341
3342 /*
3343 * if writes are not deferred, call cluster push immediately
3344 */
3345 if (defer_writes == FALSE) {
0a7de745 3346 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
d9a64523
A
3347 }
3348 /*
3349 * execute following regardless of writes being deferred or not
3350 */
3351 if (ret_cluster_try_push == 0) {
3352 /*
3353 * no more room in the normal cluster mechanism
3354 * so let's switch to the more expansive but expensive
3355 * sparse mechanism....
3356 */
0a7de745 3357 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
d9a64523 3358 sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
0a7de745 3359
d9a64523
A
3360 lck_mtx_unlock(&wbp->cl_lockw);
3361 return;
3362 }
3363start_new_cluster:
3364 wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
3365 wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
3366
3367 wbp->cl_clusters[wbp->cl_number].io_flags = 0;
3368
0a7de745 3369 if (flags & IO_NOCACHE) {
d9a64523 3370 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
0a7de745 3371 }
d9a64523 3372
0a7de745 3373 if (flags & IO_PASSIVE) {
d9a64523 3374 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
0a7de745 3375 }
d9a64523
A
3376
3377 wbp->cl_number++;
3378delay_io:
3379 lck_mtx_unlock(&wbp->cl_lockw);
3380 return;
3381}
3382
3383
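/*
 * Illustrative sketch, not from vfs_cluster.c itself: a stripped-down,
 * user-space model of the per-cluster merge test applied above.  Only the two
 * full-merge cases are shown; the partial-absorption paths, the MAX_CLUSTERS
 * limit and the sparse-cluster fallback are omitted, and MAX_PGS stands in
 * for max_cluster_pgcount.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_PGS 32      /* stand-in for max_cluster_pgcount */

struct extent { int64_t b_addr, e_addr; };      /* page numbers */

/* returns 1 if the write 'w' merged entirely into cluster 'c' */
static int
try_merge(struct extent *c, const struct extent *w)
{
	if (w->b_addr >= c->b_addr && w->e_addr <= c->b_addr + MAX_PGS) {
		/* write fits within the cluster's maximum span: grow the end */
		if (w->e_addr > c->e_addr)
			c->e_addr = w->e_addr;
		return 1;
	}
	if (w->b_addr < c->b_addr && c->e_addr - w->b_addr <= MAX_PGS) {
		/* write starts in front but the union still fits: grow both ends */
		c->b_addr = w->b_addr;
		if (w->e_addr > c->e_addr)
			c->e_addr = w->e_addr;
		return 1;
	}
	return 0;       /* caller would try the next cluster or start a new one */
}

int
main(void)
{
	struct extent cluster      = { 10, 20 };
	struct extent tail_overlap = { 18, 24 };    /* merges, extends the end */
	struct extent far_away     = { 50, 60 };    /* cannot merge */

	printf("tail_overlap merged=%d cluster=[%lld,%lld)\n",
	    try_merge(&cluster, &tail_overlap),
	    (long long)cluster.b_addr, (long long)cluster.e_addr);
	printf("far_away merged=%d\n", try_merge(&cluster, &far_away));
	return 0;
}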
9bccf70c 3384static int
2d21ac55 3385cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
0a7de745 3386 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
1c79356b
A
3387{
3388 upl_page_info_t *pl;
3389 upl_t upl;
91447636 3390 vm_offset_t upl_offset = 0;
0a7de745
A
3391 vm_size_t upl_size;
3392 off_t upl_f_offset;
1c79356b 3393 int pages_in_upl;
0a7de745 3394 int start_offset;
1c79356b
A
3395 int xfer_resid;
3396 int io_size;
1c79356b
A
3397 int io_offset;
3398 int bytes_to_zero;
3399 int bytes_to_move;
3400 kern_return_t kret;
3401 int retval = 0;
91447636 3402 int io_resid;
1c79356b
A
3403 long long total_size;
3404 long long zero_cnt;
3405 off_t zero_off;
3406 long long zero_cnt1;
3407 off_t zero_off1;
0a7de745
A
3408 off_t write_off = 0;
3409 int write_cnt = 0;
3410 boolean_t first_pass = FALSE;
91447636 3411 struct cl_extent cl;
2d21ac55 3412 int bflag;
0a7de745 3413 u_int max_io_size;
1c79356b
A
3414
3415 if (uio) {
0a7de745
A
3416 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3417 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
1c79356b 3418
0a7de745 3419 io_resid = io_req_size;
1c79356b 3420 } else {
0a7de745
A
3421 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3422 0, 0, (int)oldEOF, (int)newEOF, 0);
1c79356b 3423
0a7de745 3424 io_resid = 0;
1c79356b 3425 }
0a7de745 3426 if (flags & IO_PASSIVE) {
b0d623f7 3427 bflag = CL_PASSIVE;
0a7de745 3428 } else {
b0d623f7 3429 bflag = 0;
0a7de745
A
3430 }
3431 if (flags & IO_NOCACHE) {
316670eb 3432 bflag |= CL_NOCACHE;
0a7de745
A
3433 }
3434
3435 if (flags & IO_SKIP_ENCRYPTION) {
fe8ab488 3436 bflag |= CL_ENCRYPTED;
0a7de745 3437 }
fe8ab488 3438
1c79356b
A
3439 zero_cnt = 0;
3440 zero_cnt1 = 0;
91447636
A
3441 zero_off = 0;
3442 zero_off1 = 0;
1c79356b 3443
cf7d32b8
A
3444 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3445
1c79356b 3446 if (flags & IO_HEADZEROFILL) {
0a7de745 3447 /*
1c79356b
A
3448 * some filesystems (HFS is one) don't support unallocated holes within a file...
3449 * so we zero fill the intervening space between the old EOF and the offset
3450 * where the next chunk of real data begins.... ftruncate will also use this
3451 * routine to zero fill to the new EOF when growing a file... in this case, the
3452 * uio structure will not be provided
3453 */
0a7de745
A
3454 if (uio) {
3455 if (headOff < uio->uio_offset) {
3456 zero_cnt = uio->uio_offset - headOff;
1c79356b
A
3457 zero_off = headOff;
3458 }
0a7de745
A
3459 } else if (headOff < newEOF) {
3460 zero_cnt = newEOF - headOff;
1c79356b
A
3461 zero_off = headOff;
3462 }
b0d623f7
A
3463 } else {
3464 if (uio && uio->uio_offset > oldEOF) {
3465 zero_off = uio->uio_offset & ~PAGE_MASK_64;
3466
3467 if (zero_off >= oldEOF) {
3468 zero_cnt = uio->uio_offset - zero_off;
3469
3470 flags |= IO_HEADZEROFILL;
3471 }
3472 }
1c79356b
A
3473 }
3474 if (flags & IO_TAILZEROFILL) {
0a7de745
A
3475 if (uio) {
3476 zero_off1 = uio->uio_offset + io_req_size;
1c79356b 3477
0a7de745
A
3478 if (zero_off1 < tailOff) {
3479 zero_cnt1 = tailOff - zero_off1;
3480 }
3481 }
b0d623f7
A
3482 } else {
3483 if (uio && newEOF > oldEOF) {
0a7de745 3484 zero_off1 = uio->uio_offset + io_req_size;
b0d623f7
A
3485
3486 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3487 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3488
3489 flags |= IO_TAILZEROFILL;
3490 }
3491 }
1c79356b 3492 }
55e303ae 3493 if (zero_cnt == 0 && uio == (struct uio *) 0) {
0a7de745
A
3494 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3495 retval, 0, 0, 0, 0);
3496 return 0;
55e303ae 3497 }
6d2010ae
A
3498 if (uio) {
3499 write_off = uio->uio_offset;
3500 write_cnt = uio_resid(uio);
3501 /*
3502 * delay updating the sequential write info
3503 * in the control block until we've obtained
3504 * the lock for it
3505 */
3506 first_pass = TRUE;
3507 }
91447636 3508 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
0a7de745 3509 /*
1c79356b
A
3510 * for this iteration of the loop, figure out where our starting point is
3511 */
0a7de745
A
3512 if (zero_cnt) {
3513 start_offset = (int)(zero_off & PAGE_MASK_64);
1c79356b 3514 upl_f_offset = zero_off - start_offset;
91447636 3515 } else if (io_resid) {
0a7de745 3516 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1c79356b
A
3517 upl_f_offset = uio->uio_offset - start_offset;
3518 } else {
0a7de745 3519 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1c79356b
A
3520 upl_f_offset = zero_off1 - start_offset;
3521 }
0a7de745
A
3522 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3523 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1c79356b 3524
0a7de745
A
3525 if (total_size > max_io_size) {
3526 total_size = max_io_size;
3527 }
1c79356b 3528
91447636 3529 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
0a7de745 3530
2d21ac55 3531 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
0a7de745 3532 /*
91447636 3533 * assumption... total_size <= io_resid
55e303ae
A
3534 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
3535 */
0a7de745
A
3536 if ((start_offset + total_size) > max_io_size) {
3537 total_size = max_io_size - start_offset;
3538 }
3539 xfer_resid = total_size;
55e303ae 3540
0a7de745 3541 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
b0d623f7 3542
0a7de745
A
3543 if (retval) {
3544 break;
3545 }
55e303ae 3546
2d21ac55 3547 io_resid -= (total_size - xfer_resid);
55e303ae
A
3548 total_size = xfer_resid;
3549 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3550 upl_f_offset = uio->uio_offset - start_offset;
3551
3552 if (total_size == 0) {
0a7de745
A
3553 if (start_offset) {
3554 /*
55e303ae
A
3555 * the write did not finish on a page boundary
3556 * which will leave upl_f_offset pointing to the
3557 * beginning of the last page written instead of
3558 * the page beyond it... bump it in this case
3559 * so that the cluster code records the last page
3560 * written as dirty
3561 */
0a7de745 3562 upl_f_offset += PAGE_SIZE_64;
55e303ae 3563 }
0a7de745
A
3564 upl_size = 0;
3565
3566 goto check_cluster;
55e303ae
A
3567 }
3568 }
1c79356b
A
3569 /*
3570 * compute the size of the upl needed to encompass
3571 * the requested write... limit each call to cluster_io
0b4e3aa0
A
3572 * to the maximum UPL size... cluster_io will clip if
3573 * this exceeds the maximum io_size for the device,
0a7de745 3574 * make sure to account for
1c79356b
A
3575 * a starting offset that's not page aligned
3576 */
3577 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3578
0a7de745
A
3579 if (upl_size > max_io_size) {
3580 upl_size = max_io_size;
3581 }
1c79356b
A
3582
3583 pages_in_upl = upl_size / PAGE_SIZE;
3584 io_size = upl_size - start_offset;
0a7de745
A
3585
3586 if ((long long)io_size > total_size) {
3587 io_size = total_size;
3588 }
1c79356b 3589
55e303ae 3590 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
0a7de745 3591
1c79356b 3592
91447636
A
3593 /*
3594 * Gather the pages from the buffer cache.
3595 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3596 * that we intend to modify these pages.
3597 */
5ba3f43e 3598 kret = ubc_create_upl_kernel(vp,
0a7de745
A
3599 upl_f_offset,
3600 upl_size,
3601 &upl,
3602 &pl,
3603 UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
3604 VM_KERN_MEMORY_FILE);
3605 if (kret != KERN_SUCCESS) {
2d21ac55 3606 panic("cluster_write_copy: failed to get pagelist");
0a7de745 3607 }
1c79356b 3608
55e303ae 3609 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
0a7de745 3610 upl, (int)upl_f_offset, start_offset, 0, 0);
1c79356b 3611
b0d623f7 3612 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
0b4e3aa0 3613 int read_size;
1c79356b 3614
0b4e3aa0 3615 /*
1c79356b
A
3616 * we're starting in the middle of the first page of the upl
3617 * and the page isn't currently valid, so we're going to have
3618 * to read it in first... this is a synchronous operation
3619 */
3620 read_size = PAGE_SIZE;
3621
0a7de745
A
3622 if ((upl_f_offset + read_size) > oldEOF) {
3623 read_size = oldEOF - upl_f_offset;
3624 }
9bccf70c 3625
0a7de745
A
3626 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3627 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
1c79356b 3628 if (retval) {
0b4e3aa0 3629 /*
1c79356b
A
3630 * we had an error during the read which causes us to abort
3631 * the current cluster_write request... before we do, we need
3632 * to release the rest of the pages in the upl without modifying
3633 * their state and mark the failed page in error
3634 */
0a7de745 3635 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
91447636 3636
0a7de745
A
3637 if (upl_size > PAGE_SIZE) {
3638 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3639 }
1c79356b
A
3640
3641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
0a7de745 3642 upl, 0, 0, retval, 0);
1c79356b
A
3643 break;
3644 }
3645 }
3646 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
0a7de745 3647 /*
1c79356b
A
3648 * the last offset we're writing to in this upl does not end on a page
3649 * boundary... if it's not beyond the old EOF, then we'll also need to
3650 * pre-read this page in if it isn't already valid
3651 */
0a7de745 3652 upl_offset = upl_size - PAGE_SIZE;
1c79356b 3653
0a7de745 3654 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1c79356b 3655 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
0a7de745 3656 int read_size;
1c79356b
A
3657
3658 read_size = PAGE_SIZE;
3659
0a7de745
A
3660 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
3661 read_size = oldEOF - (upl_f_offset + upl_offset);
3662 }
9bccf70c 3663
0a7de745
A
3664 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3665 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
1c79356b 3666 if (retval) {
0b4e3aa0 3667 /*
1c79356b 3668 * we had an error during the read which causes us to abort
0b4e3aa0
A
3669 * the current cluster_write request... before we do, we
3670 * need to release the rest of the pages in the upl without
3671 * modifying their state and mark the failed page in error
1c79356b 3672 */
0a7de745 3673 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
91447636 3674
0a7de745
A
3675 if (upl_size > PAGE_SIZE) {
3676 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3677 }
1c79356b
A
3678
3679 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
0a7de745 3680 upl, 0, 0, retval, 0);
1c79356b
A
3681 break;
3682 }
3683 }
3684 }
1c79356b
A
3685 xfer_resid = io_size;
3686 io_offset = start_offset;
3687
3688 while (zero_cnt && xfer_resid) {
0a7de745
A
3689 if (zero_cnt < (long long)xfer_resid) {
3690 bytes_to_zero = zero_cnt;
3691 } else {
3692 bytes_to_zero = xfer_resid;
3693 }
1c79356b 3694
b0d623f7 3695 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
9bccf70c 3696
1c79356b
A
3697 xfer_resid -= bytes_to_zero;
3698 zero_cnt -= bytes_to_zero;
3699 zero_off += bytes_to_zero;
3700 io_offset += bytes_to_zero;
3701 }
91447636 3702 if (xfer_resid && io_resid) {
0a7de745 3703 u_int32_t io_requested;
2d21ac55 3704
91447636 3705 bytes_to_move = min(io_resid, xfer_resid);
2d21ac55 3706 io_requested = bytes_to_move;
1c79356b 3707
2d21ac55 3708 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
9bccf70c 3709
1c79356b 3710 if (retval) {
d9a64523 3711 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
3712
3713 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
0a7de745 3714 upl, 0, 0, retval, 0);
1c79356b 3715 } else {
0a7de745 3716 io_resid -= bytes_to_move;
1c79356b
A
3717 xfer_resid -= bytes_to_move;
3718 io_offset += bytes_to_move;
3719 }
3720 }
3721 while (xfer_resid && zero_cnt1 && retval == 0) {
0a7de745
A
3722 if (zero_cnt1 < (long long)xfer_resid) {
3723 bytes_to_zero = zero_cnt1;
3724 } else {
3725 bytes_to_zero = xfer_resid;
3726 }
1c79356b 3727
b0d623f7
A
3728 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3729
1c79356b
A
3730 xfer_resid -= bytes_to_zero;
3731 zero_cnt1 -= bytes_to_zero;
3732 zero_off1 += bytes_to_zero;
3733 io_offset += bytes_to_zero;
3734 }
1c79356b 3735 if (retval == 0) {
a39ff7e2 3736 int do_zeroing = 1;
0a7de745 3737
a39ff7e2 3738 io_size += start_offset;
1c79356b 3739
a39ff7e2
A
3740 /* Force more restrictive zeroing behavior only on APFS */
3741 if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
3742 do_zeroing = 0;
3743 }
3744
a39ff7e2 3745 if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
a39ff7e2 3746 /*
1c79356b
A
3747 * if we're extending the file with this write
3748 * we'll zero fill the rest of the page so that
3749 * if the file gets extended again in such a way as to leave a
3750 * hole starting at this EOF, we'll have zeros in the correct spot
3751 */
0a7de745 3752 cluster_zero(upl, io_size, upl_size - io_size, NULL);
1c79356b 3753 }
935ed37a
A
3754 /*
3755 * release the upl now if we hold one since...
3756 * 1) pages in it may be present in the sparse cluster map
0a7de745 3757 * and may span 2 separate buckets there... if they do and
935ed37a
A
3758 * we happen to have to flush a bucket to make room and it intersects
3759 * this upl, a deadlock may result on page BUSY
3760 * 2) we're delaying the I/O... from this point forward we're just updating
3761 * the cluster state... no need to hold the pages, so commit them
3762 * 3) IO_SYNC is set...
3763 * because we had to ask for a UPL that provides currently non-present pages, the
3764 * UPL has been automatically set to clear the dirty flags (both software and hardware)
3765 * upon committing it... this is not the behavior we want since it's possible for
3766 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3767 * we'll pick these pages back up later with the correct behavior specified.
3768 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3769 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3770 * we hold since the flushing context is holding the cluster lock.
3771 */
3772 ubc_upl_commit_range(upl, 0, upl_size,
0a7de745 3773 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
935ed37a
A
3774check_cluster:
3775 /*
0a7de745 3776 * calculate the last logical block number
935ed37a
A
3777 * that this delayed I/O encompassed
3778 */
3779 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
3780
b0d623f7 3781 if (flags & IO_SYNC) {
55e303ae 3782 /*
d9a64523
A
3783 * if the IO_SYNC flag is set then we need to bypass
3784 * any clustering and immediately issue the I/O
3785 *
3786 * we don't hold the lock at this point
3787 *
3788 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3789 * so that we correctly deal with a change in state of the hardware modify bit...
3790 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3791 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3792 * responsible for generating the correct sized I/O(s)
55e303ae 3793 */
0a7de745 3794 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
d9a64523
A
3795 } else {
3796 boolean_t defer_writes = FALSE;
91447636 3797
0a7de745 3798 if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
d9a64523 3799 defer_writes = TRUE;
0a7de745 3800 }
55e303ae 3801
d9a64523 3802 cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
0a7de745 3803 write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
9bccf70c 3804 }
1c79356b
A
3805 }
3806 }
2d21ac55 3807 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
1c79356b 3808
0a7de745 3809 return retval;
1c79356b
A
3810}
3811
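/*
 * Illustrative sketch, not from vfs_cluster.c itself: the UPL sizing step
 * inside cluster_write_copy() above -- split the write's file offset into a
 * page-aligned UPL base plus an in-page start_offset, then round the span up
 * to whole pages.  PAGE_SZ and the sample request are stand-ins; the real
 * code additionally clips upl_size to max_io_size.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SZ 4096LL

int
main(void)
{
	int64_t uio_offset = 10000;     /* where this pass of the write begins */
	int64_t total_size = 6000;      /* bytes to transfer this pass */

	int     start_offset = (int)(uio_offset & (PAGE_SZ - 1));
	int64_t upl_f_offset = uio_offset - start_offset;       /* page aligned */
	int64_t upl_size = (start_offset + total_size + (PAGE_SZ - 1))
	    & ~(PAGE_SZ - 1);

	printf("upl_f_offset=%lld start_offset=%d upl_size=%lld pages=%lld\n",
	    (long long)upl_f_offset, start_offset,
	    (long long)upl_size, (long long)(upl_size / PAGE_SZ));
	return 0;
}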
2d21ac55
A
3812
3813
9bccf70c 3814int
91447636 3815cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
1c79356b 3816{
0a7de745 3817 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
2d21ac55
A
3818}
3819
3820
3821int
3822cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3823{
0a7de745
A
3824 int retval = 0;
3825 int flags;
3826 user_ssize_t cur_resid;
3827 u_int32_t io_size;
3828 u_int32_t read_length = 0;
3829 int read_type = IO_COPY;
1c79356b 3830
91447636 3831 flags = xflags;
1c79356b 3832
0a7de745
A
3833 if (vp->v_flag & VNOCACHE_DATA) {
3834 flags |= IO_NOCACHE;
3835 }
3836 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
3837 flags |= IO_RAOFF;
3838 }
3e170ce0 3839
0a7de745 3840 if (flags & IO_SKIP_ENCRYPTION) {
fe8ab488 3841 flags |= IO_ENCRYPTED;
0a7de745 3842 }
91447636 3843
316670eb 3844 /*
2d21ac55
A
3845 * do a read through the cache if one of the following is true....
3846 * NOCACHE is not true
3847 * the uio request doesn't target USERSPACE
316670eb
A
3848 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3849 * Reading encrypted data from a CP filesystem should never result in the data touching
3850 * the UBC.
3851 *
2d21ac55
A
3852 * otherwise, find out if we want the direct or contig variant for
3853 * the first vector in the uio request
3854 */
0a7de745 3855 if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
fe8ab488 3856 retval = cluster_io_type(uio, &read_type, &read_length, 0);
316670eb 3857 }
39037602 3858
2d21ac55 3859 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
2d21ac55 3860 switch (read_type) {
2d21ac55 3861 case IO_COPY:
0a7de745 3862 /*
2d21ac55
A
3863 * make sure the uio_resid isn't too big...
3864 * internally, we want to handle all of the I/O in
3865 * chunk sizes that fit in a 32 bit int
91447636 3866 */
0a7de745
A
3867 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
3868 io_size = MAX_IO_REQUEST_SIZE;
3869 } else {
3870 io_size = (u_int32_t)cur_resid;
3871 }
91447636 3872
2d21ac55
A
3873 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
3874 break;
1c79356b 3875
2d21ac55 3876 case IO_DIRECT:
0a7de745 3877 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
2d21ac55 3878 break;
91447636 3879
2d21ac55 3880 case IO_CONTIG:
0a7de745 3881 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
2d21ac55 3882 break;
0a7de745 3883
2d21ac55 3884 case IO_UNKNOWN:
0a7de745 3885 retval = cluster_io_type(uio, &read_type, &read_length, 0);
2d21ac55
A
3886 break;
3887 }
3888 }
0a7de745 3889 return retval;
2d21ac55 3890}
91447636 3891
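/*
 * Illustrative sketch, not from vfs_cluster.c itself: the path selection at
 * the top of cluster_read_ext() above.  Only an uncached request whose uio
 * targets user space, or an IO_ENCRYPTED request, probes the first uio vector
 * (which may then be handled as IO_DIRECT or IO_CONTIG); everything else
 * starts on the cached IO_COPY path.  The flag values and the is_user_space
 * parameter are stand-ins for the kernel's.
 */
#include <stdio.h>

#define F_NOCACHE    0x01
#define F_ENCRYPTED  0x02

enum read_path { PATH_COPY, PATH_PROBE_VECTOR };

static enum read_path
pick_read_path(int flags, int is_user_space)
{
	if (((flags & F_NOCACHE) && is_user_space) || (flags & F_ENCRYPTED))
		return PATH_PROBE_VECTOR;
	return PATH_COPY;
}

int
main(void)
{
	printf("cached read:              %d\n", pick_read_path(0, 1));
	printf("uncached user-space read: %d\n", pick_read_path(F_NOCACHE, 1));
	printf("encrypted read:           %d\n", pick_read_path(F_ENCRYPTED, 0));
	return 0;
}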
91447636 3892
91447636 3893
2d21ac55 3894static void
b0d623f7 3895cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
2d21ac55
A
3896{
3897 int range;
3898 int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
1c79356b 3899
2d21ac55 3900 if ((range = last_pg - start_pg)) {
0a7de745 3901 if (take_reference) {
2d21ac55 3902 abort_flags |= UPL_ABORT_REFERENCE;
0a7de745 3903 }
2d21ac55
A
3904
3905 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
3906 }
1c79356b
A
3907}
3908
2d21ac55 3909
9bccf70c 3910static int
2d21ac55 3911cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
1c79356b
A
3912{
3913 upl_page_info_t *pl;
3914 upl_t upl;
3915 vm_offset_t upl_offset;
0a7de745
A
3916 u_int32_t upl_size;
3917 off_t upl_f_offset;
3918 int start_offset;
3919 int start_pg;
3920 int last_pg;
91447636 3921 int uio_last = 0;
1c79356b
A
3922 int pages_in_upl;
3923 off_t max_size;
55e303ae
A
3924 off_t last_ioread_offset;
3925 off_t last_request_offset;
1c79356b 3926 kern_return_t kret;
1c79356b
A
3927 int error = 0;
3928 int retval = 0;
2d21ac55
A
3929 u_int32_t size_of_prefetch;
3930 u_int32_t xsize;
3931 u_int32_t io_size;
cf7d32b8 3932 u_int32_t max_rd_size;
b0d623f7
A
3933 u_int32_t max_io_size;
3934 u_int32_t max_prefetch;
55e303ae
A
3935 u_int rd_ahead_enabled = 1;
3936 u_int prefetch_enabled = 1;
0a7de745
A
3937 struct cl_readahead * rap;
3938 struct clios iostate;
3939 struct cl_extent extent;
2d21ac55 3940 int bflag;
0a7de745
A
3941 int take_reference = 1;
3942 int policy = IOPOL_DEFAULT;
3943 boolean_t iolock_inited = FALSE;
b0d623f7
A
3944
3945 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
0a7de745
A
3946 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
3947
316670eb 3948 if (flags & IO_ENCRYPTED) {
0a7de745 3949 panic("encrypted blocks will hit UBC!");
316670eb 3950 }
0a7de745 3951
39236c6e 3952 policy = throttle_get_io_policy(NULL);
2d21ac55 3953
0a7de745 3954 if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) {
2d21ac55 3955 take_reference = 0;
0a7de745 3956 }
2d21ac55 3957
0a7de745 3958 if (flags & IO_PASSIVE) {
cf7d32b8 3959 bflag = CL_PASSIVE;
0a7de745 3960 } else {
b0d623f7 3961 bflag = 0;
0a7de745 3962 }
cf7d32b8 3963
0a7de745 3964 if (flags & IO_NOCACHE) {
316670eb 3965 bflag |= CL_NOCACHE;
0a7de745 3966 }
316670eb 3967
0a7de745 3968 if (flags & IO_SKIP_ENCRYPTION) {
fe8ab488 3969 bflag |= CL_ENCRYPTED;
0a7de745 3970 }
fe8ab488 3971
b0d623f7 3972 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
5ba3f43e 3973 max_prefetch = MAX_PREFETCH(vp, max_io_size, disk_conditioner_mount_is_ssd(vp->v_mount));
b0d623f7 3974 max_rd_size = max_prefetch;
55e303ae 3975
2d21ac55 3976 last_request_offset = uio->uio_offset + io_req_size;
55e303ae 3977
0a7de745
A
3978 if (last_request_offset > filesize) {
3979 last_request_offset = filesize;
3980 }
b0d623f7 3981
0a7de745
A
3982 if ((flags & (IO_RAOFF | IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
3983 rd_ahead_enabled = 0;
91447636
A
3984 rap = NULL;
3985 } else {
0a7de745 3986 if (cluster_is_throttled(vp)) {
316670eb
A
3987 /*
3988 * we're in the throttle window, at the very least
3989 * we want to limit the size of the I/O we're about
3990 * to issue
3991 */
0a7de745 3992 rd_ahead_enabled = 0;
91447636 3993 prefetch_enabled = 0;
55e303ae 3994
316670eb 3995 max_rd_size = THROTTLE_MAX_IOSIZE;
91447636 3996 }
0a7de745
A
3997 if ((rap = cluster_get_rap(vp)) == NULL) {
3998 rd_ahead_enabled = 0;
3999 } else {
b0d623f7
A
4000 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
4001 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
4002 }
55e303ae 4003 }
91447636 4004 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
0a7de745 4005 /*
55e303ae
A
4006 * determine if we already have a read-ahead in the pipe courtesy of the
4007 * last read system call that was issued...
4008 * if so, pick up its extent to determine where we should start
0a7de745 4009 * with respect to any read-ahead that might be necessary to
55e303ae
A
4010 * garner all the data needed to complete this read system call
4011 */
0a7de745 4012 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
1c79356b 4013
0a7de745
A
4014 if (last_ioread_offset < uio->uio_offset) {
4015 last_ioread_offset = (off_t)0;
4016 } else if (last_ioread_offset > last_request_offset) {
4017 last_ioread_offset = last_request_offset;
4018 }
4019 } else {
4020 last_ioread_offset = (off_t)0;
4021 }
1c79356b 4022
2d21ac55 4023 while (io_req_size && uio->uio_offset < filesize && retval == 0) {
b0d623f7 4024 max_size = filesize - uio->uio_offset;
1c79356b 4025
0a7de745
A
4026 if ((off_t)(io_req_size) < max_size) {
4027 io_size = io_req_size;
4028 } else {
4029 io_size = max_size;
4030 }
9bccf70c 4031
91447636 4032 if (!(flags & IO_NOCACHE)) {
0a7de745
A
4033 while (io_size) {
4034 u_int32_t io_resid;
2d21ac55 4035 u_int32_t io_requested;
1c79356b 4036
55e303ae
A
4037 /*
4038 * if we keep finding the pages we need already in the cache, then
2d21ac55 4039 * don't bother to call cluster_read_prefetch since it costs CPU cycles
55e303ae
A
4040 * to determine that we have all the pages we need... once we miss in
4041 * the cache and have issued an I/O, then we'll assume that we're likely
4042 * to continue to miss in the cache and it's to our advantage to try and prefetch
4043 */
4044 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
0a7de745
A
4045 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
4046 /*
55e303ae
A
4047 * we've already issued I/O for this request and
4048 * there's still work to do and
4049 * our prefetch stream is running dry, so issue a
4050 * pre-fetch I/O... the I/O latency will overlap
4051 * with the copying of the data
4052 */
0a7de745
A
4053 if (size_of_prefetch > max_rd_size) {
4054 size_of_prefetch = max_rd_size;
4055 }
1c79356b 4056
0a7de745 4057 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
1c79356b 4058
55e303ae 4059 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
0a7de745
A
4060
4061 if (last_ioread_offset > last_request_offset) {
4062 last_ioread_offset = last_request_offset;
4063 }
55e303ae
A
4064 }
4065 }
4066 /*
0a7de745
A
4067 * limit the size of the copy we're about to do so that
4068 * we can notice that our I/O pipe is running dry and
55e303ae
A
4069 * get the next I/O issued before it does go dry
4070 */
0a7de745
A
4071 if (last_ioread_offset && io_size > (max_io_size / 4)) {
4072 io_resid = (max_io_size / 4);
4073 } else {
4074 io_resid = io_size;
4075 }
1c79356b 4076
55e303ae 4077 io_requested = io_resid;
1c79356b 4078
0a7de745 4079 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
2d21ac55
A
4080
4081 xsize = io_requested - io_resid;
1c79356b 4082
2d21ac55
A
4083 io_size -= xsize;
4084 io_req_size -= xsize;
1c79356b 4085
0a7de745
A
4086 if (retval || io_resid) {
4087 /*
55e303ae
A
4088 * if we run into a real error or
4089 * a page that is not in the cache
4090 * we need to leave streaming mode
4091 */
0a7de745
A
4092 break;
4093 }
4094
b0d623f7 4095 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
0a7de745 4096 /*
55e303ae
A
4097 * we're already finished the I/O for this read request
4098 * let's see if we should do a read-ahead
4099 */
0a7de745 4100 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
55e303ae 4101 }
1c79356b 4102 }
0a7de745
A
4103 if (retval) {
4104 break;
4105 }
1c79356b 4106 if (io_size == 0) {
91447636 4107 if (rap != NULL) {
0a7de745
A
4108 if (extent.e_addr < rap->cl_lastr) {
4109 rap->cl_maxra = 0;
4110 }
91447636
A
4111 rap->cl_lastr = extent.e_addr;
4112 }
0a7de745 4113 break;
1c79356b 4114 }
b0d623f7
A
4115 /*
4116 * recompute max_size since cluster_copy_ubc_data_internal
4117 * may have advanced uio->uio_offset
4118 */
4119 max_size = filesize - uio->uio_offset;
1c79356b 4120 }
316670eb
A
4121
4122 iostate.io_completed = 0;
4123 iostate.io_issued = 0;
4124 iostate.io_error = 0;
4125 iostate.io_wanted = 0;
4126
0a7de745 4127 if ((flags & IO_RETURN_ON_THROTTLE)) {
39236c6e 4128 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
0a7de745 4129 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
316670eb
A
4130 /*
4131 * we're in the throttle window and at least 1 I/O
4132 * has already been issued by a throttleable thread
4133 * in this window, so return with EAGAIN to indicate
4134 * to the FS issuing the cluster_read call that it
4135 * should now throttle after dropping any locks
4136 */
4137 throttle_info_update_by_mount(vp->v_mount);
4138
4139 retval = EAGAIN;
4140 break;
4141 }
4142 }
4143 }
4144
b0d623f7
A
4145 /*
4146 * compute the size of the upl needed to encompass
4147 * the requested read... limit each call to cluster_io
4148 * to the maximum UPL size... cluster_io will clip if
4149 * this exceeds the maximum io_size for the device,
0a7de745 4150 * make sure to account for
b0d623f7
A
4151 * a starting offset that's not page aligned
4152 */
4153 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4154 upl_f_offset = uio->uio_offset - (off_t)start_offset;
4155
0a7de745
A
4156 if (io_size > max_rd_size) {
4157 io_size = max_rd_size;
4158 }
55e303ae 4159
1c79356b 4160 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
55e303ae 4161
2d21ac55 4162 if (flags & IO_NOCACHE) {
0a7de745
A
4163 if (upl_size > max_io_size) {
4164 upl_size = max_io_size;
4165 }
2d21ac55 4166 } else {
0a7de745
A
4167 if (upl_size > max_io_size / 4) {
4168 upl_size = max_io_size / 4;
fe8ab488 4169 upl_size &= ~PAGE_MASK;
0a7de745
A
4170
4171 if (upl_size == 0) {
fe8ab488 4172 upl_size = PAGE_SIZE;
0a7de745 4173 }
fe8ab488 4174 }
2d21ac55 4175 }
1c79356b
A
4176 pages_in_upl = upl_size / PAGE_SIZE;
4177
4178 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
0a7de745 4179 upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b 4180
5ba3f43e 4181 kret = ubc_create_upl_kernel(vp,
0a7de745
A
4182 upl_f_offset,
4183 upl_size,
4184 &upl,
4185 &pl,
4186 UPL_FILE_IO | UPL_SET_LITE,
4187 VM_KERN_MEMORY_FILE);
4188 if (kret != KERN_SUCCESS) {
2d21ac55 4189 panic("cluster_read_copy: failed to get pagelist");
0a7de745 4190 }
1c79356b 4191
1c79356b 4192 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
0a7de745 4193 upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b
A
4194
4195 /*
4196 * scan from the beginning of the upl looking for the first
4197 * non-valid page.... this will become the first page in
4198 * the request we're going to make to 'cluster_io'... if all
4199 * of the pages are valid, we won't call through to 'cluster_io'
4200 */
4201 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
0a7de745 4202 if (!upl_valid_page(pl, start_pg)) {
1c79356b 4203 break;
0a7de745 4204 }
1c79356b
A
4205 }
4206
4207 /*
4208 * scan from the starting invalid page looking for a valid
0a7de745 4209 * page before the end of the upl is reached, if we
1c79356b
A
4210 * find one, then it will be the last page of the request to
4211 * 'cluster_io'
4212 */
4213 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
0a7de745 4214 if (upl_valid_page(pl, last_pg)) {
1c79356b 4215 break;
0a7de745 4216 }
1c79356b
A
4217 }
4218
0a7de745
A
4219 if (start_pg < last_pg) {
4220 /*
1c79356b
A
4221 * we found a range of 'invalid' pages that must be filled
4222 * if the last page in this range is the last page of the file
4223 * we may have to clip the size of it to keep from reading past
4224 * the end of the last physical block associated with the file
4225 */
6d2010ae
A
4226 if (iolock_inited == FALSE) {
4227 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4228
4229 iolock_inited = TRUE;
4230 }
1c79356b
A
4231 upl_offset = start_pg * PAGE_SIZE;
4232 io_size = (last_pg - start_pg) * PAGE_SIZE;
4233
0a7de745
A
4234 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
4235 io_size = filesize - (upl_f_offset + upl_offset);
4236 }
9bccf70c 4237
1c79356b 4238 /*
55e303ae 4239 * issue an asynchronous read to cluster_io
1c79356b
A
4240 */
4241
4242 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
0a7de745 4243 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
6d2010ae
A
4244
4245 if (rap) {
0a7de745
A
4246 if (extent.e_addr < rap->cl_maxra) {
4247 /*
4248 * we've just issued a read for a block that should have been
4249 * in the cache courtesy of the read-ahead engine... something
4250 * has gone wrong with the pipeline, so reset the read-ahead
4251 * logic which will cause us to restart from scratch
4252 */
4253 rap->cl_maxra = 0;
4254 }
4255 }
1c79356b
A
4256 }
4257 if (error == 0) {
0a7de745 4258 /*
1c79356b 4259 * if the read completed successfully, or there was no I/O request
55e303ae
A
4260 * issued, then copy the data into user land via 'cluster_copy_upl_data'...
4261 * we'll first add on any 'valid'
1c79356b
A
4262 * pages that were present in the upl when we acquired it.
4263 */
4264 u_int val_size;
1c79356b 4265
0a7de745
A
4266 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4267 if (!upl_valid_page(pl, uio_last)) {
4268 break;
4269 }
1c79356b 4270 }
2d21ac55 4271 if (uio_last < pages_in_upl) {
0a7de745 4272 /*
2d21ac55
A
4273 * there were some invalid pages beyond the valid pages
4274 * that we didn't issue an I/O for, just release them
4275 * unchanged now, so that any prefetch/readahead can
4276 * include them
4277 */
0a7de745
A
4278 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4279 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2d21ac55
A
4280 }
4281
1c79356b 4282 /*
2d21ac55 4283 * compute size to transfer this round, if io_req_size is
55e303ae 4284 * still non-zero after this attempt, we'll loop around and
1c79356b
A
4285 * set up for another I/O.
4286 */
4287 val_size = (uio_last * PAGE_SIZE) - start_offset;
1c79356b 4288
0a7de745
A
4289 if (val_size > max_size) {
4290 val_size = max_size;
4291 }
4292
4293 if (val_size > io_req_size) {
4294 val_size = io_req_size;
4295 }
1c79356b 4296
0a7de745
A
4297 if ((uio->uio_offset + val_size) > last_ioread_offset) {
4298 last_ioread_offset = uio->uio_offset + val_size;
4299 }
1c79356b 4300
55e303ae 4301 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
0a7de745
A
4302 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4303 /*
2d21ac55
A
4304 * if there's still I/O left to do for this request, and...
4305 * we're not in hard throttle mode, and...
4306 * we're close to using up the previous prefetch, then issue a
4307 * new pre-fetch I/O... the I/O latency will overlap
4308 * with the copying of the data
4309 */
0a7de745
A
4310 if (size_of_prefetch > max_rd_size) {
4311 size_of_prefetch = max_rd_size;
4312 }
2d21ac55
A
4313
4314 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4315
4316 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
1c79356b 4317
0a7de745
A
4318 if (last_ioread_offset > last_request_offset) {
4319 last_ioread_offset = last_request_offset;
4320 }
4321 }
55e303ae 4322 } else if ((uio->uio_offset + val_size) == last_request_offset) {
0a7de745 4323 /*
55e303ae 4324 * this transfer will finish this request, so...
0a7de745 4325 * let's try to read ahead if we're in
55e303ae
A
4326 * a sequential access pattern and we haven't
4327 * explicitly disabled it
4328 */
0a7de745 4329 if (rd_ahead_enabled) {
2d21ac55 4330 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
0a7de745
A
4331 }
4332
91447636 4333 if (rap != NULL) {
0a7de745
A
4334 if (extent.e_addr < rap->cl_lastr) {
4335 rap->cl_maxra = 0;
4336 }
91447636
A
4337 rap->cl_lastr = extent.e_addr;
4338 }
9bccf70c 4339 }
0a7de745 4340 if (iolock_inited == TRUE) {
6d2010ae 4341 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
0a7de745
A
4342 }
4343
4344 if (iostate.io_error) {
4345 error = iostate.io_error;
4346 } else {
4347 u_int32_t io_requested;
cf7d32b8 4348
0a7de745 4349 io_requested = val_size;
2d21ac55 4350
0a7de745 4351 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
2d21ac55 4352
2d21ac55
A
4353 io_req_size -= (val_size - io_requested);
4354 }
6d2010ae 4355 } else {
0a7de745 4356 if (iolock_inited == TRUE) {
6d2010ae 4357 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
0a7de745 4358 }
1c79356b
A
4359 }
4360 if (start_pg < last_pg) {
0a7de745 4361 /*
1c79356b
A
4362 * compute the range of pages that we actually issued an I/O for
4363 * and either commit them as valid if the I/O succeeded
0a7de745 4364 * or abort them if the I/O failed or we're not supposed to
2d21ac55 4365 * keep them in the cache
1c79356b 4366 */
0a7de745 4367 io_size = (last_pg - start_pg) * PAGE_SIZE;
1c79356b 4368
b0d623f7 4369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b 4370
0a7de745
A
4371 if (error || (flags & IO_NOCACHE)) {
4372 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4373 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4374 } else {
4375 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
b0d623f7 4376
0a7de745 4377 if (take_reference) {
b0d623f7 4378 commit_flags |= UPL_COMMIT_INACTIVATE;
0a7de745 4379 } else {
b0d623f7 4380 commit_flags |= UPL_COMMIT_SPECULATE;
0a7de745 4381 }
1c79356b 4382
0a7de745 4383 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
b0d623f7
A
4384 }
4385 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b
A
4386 }
4387 if ((last_pg - start_pg) < pages_in_upl) {
0a7de745 4388 /*
1c79356b
A
4389 * the set of pages that we issued an I/O for did not encompass
4390 * the entire upl... so just release these without modifying
55e303ae 4391 * their state
1c79356b 4392 */
0a7de745 4393 if (error) {
9bccf70c 4394 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
0a7de745 4395 } else {
2d21ac55 4396 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
0a7de745 4397 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2d21ac55
A
4398
4399 /*
4400 * handle any valid pages at the beginning of
4401 * the upl... release these appropriately
4402 */
b0d623f7 4403 cluster_read_upl_release(upl, 0, start_pg, take_reference);
2d21ac55
A
4404
4405 /*
4406 * handle any valid pages immediately after the
4407 * pages we issued I/O for... release these appropriately
4408 */
b0d623f7 4409 cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
2d21ac55 4410
b0d623f7 4411 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
1c79356b
A
4412 }
4413 }
0a7de745
A
4414 if (retval == 0) {
4415 retval = error;
4416 }
91447636 4417
2d21ac55 4418 if (io_req_size) {
0a7de745 4419 if (cluster_is_throttled(vp)) {
316670eb
A
4420 /*
4421 * we're in the throttle window, at the very least
4422 * we want to limit the size of the I/O we're about
4423 * to issue
4424 */
0a7de745 4425 rd_ahead_enabled = 0;
91447636 4426 prefetch_enabled = 0;
316670eb 4427 max_rd_size = THROTTLE_MAX_IOSIZE;
91447636 4428 } else {
0a7de745
A
4429 if (max_rd_size == THROTTLE_MAX_IOSIZE) {
4430 /*
2d21ac55
A
4431 * coming out of throttled state
4432 */
39236c6e 4433 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
0a7de745 4434 if (rap != NULL) {
b0d623f7 4435 rd_ahead_enabled = 1;
0a7de745 4436 }
b0d623f7
A
4437 prefetch_enabled = 1;
4438 }
cf7d32b8 4439 max_rd_size = max_prefetch;
2d21ac55
A
4440 last_ioread_offset = 0;
4441 }
91447636
A
4442 }
4443 }
4444 }
6d2010ae 4445 if (iolock_inited == TRUE) {
fe8ab488
A
4446 /*
4447 * cluster_io returned an error after it
4448 * had already issued some I/O. we need
4449 * to wait for that I/O to complete before
4450 * we can destroy the iostate mutex...
4451 * 'retval' already contains the early error
4452 * so no need to pick it up from iostate.io_error
4453 */
4454 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4455
6d2010ae
A
4456 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
4457 }
91447636 4458 if (rap != NULL) {
0a7de745
A
4459 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4460 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
91447636 4461
0a7de745 4462 lck_mtx_unlock(&rap->cl_lockr);
91447636 4463 } else {
0a7de745
A
4464 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4465 (int)uio->uio_offset, io_req_size, 0, retval, 0);
1c79356b
A
4466 }
4467
0a7de745 4468 return retval;
1c79356b
A
4469}
4470
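/*
 * Illustrative sketch, not from vfs_cluster.c itself: the prefetch trigger
 * used inside cluster_read_copy()'s cached-copy loop above.  Once the data
 * already read ahead (last_ioread_offset) is within max_rd_size of the
 * reader's position, another read-ahead of at most max_rd_size bytes is
 * issued and the window advances.  The sizes are stand-ins, and the real
 * code advances the window in whole pages returned by cluster_read_prefetch().
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t uio_offset          = 1LL << 20;                  /* reader at 1 MB */
	int64_t last_ioread_offset  = (1LL << 20) + (256 << 10);  /* I/O issued to 1.25 MB */
	int64_t last_request_offset = 4LL << 20;                  /* request ends at 4 MB */
	int64_t max_rd_size         = 1LL << 20;                  /* prefetch ceiling */

	int64_t size_of_prefetch = last_request_offset - last_ioread_offset;

	if (size_of_prefetch > 0 &&
	    (last_ioread_offset - uio_offset) <= max_rd_size) {
		if (size_of_prefetch > max_rd_size)
			size_of_prefetch = max_rd_size;

		last_ioread_offset += size_of_prefetch;
		if (last_ioread_offset > last_request_offset)
			last_ioread_offset = last_request_offset;

		printf("prefetch %lld bytes, window now ends at %lld\n",
		    (long long)size_of_prefetch, (long long)last_ioread_offset);
	} else {
		printf("prefetch pipe still full enough, no new I/O\n");
	}
	return 0;
}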
3e170ce0
A
4471/*
4472 * We don't want another read/write lock for every vnode in the system
4473 * so we keep a hash of them here. There should never be very many of
4474 * these around at any point in time.
4475 */
0a7de745
A
4476cl_direct_read_lock_t *
4477cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
3e170ce0
A
4478{
4479 struct cl_direct_read_locks *head
0a7de745
A
4480 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4481 % CL_DIRECT_READ_LOCK_BUCKETS];
3e170ce0
A
4482
4483 struct cl_direct_read_lock *lck, *new_lck = NULL;
4484
4485 for (;;) {
4486 lck_spin_lock(&cl_direct_read_spin_lock);
4487
4488 LIST_FOREACH(lck, head, chain) {
4489 if (lck->vp == vp) {
4490 ++lck->ref_count;
4491 lck_spin_unlock(&cl_direct_read_spin_lock);
4492 if (new_lck) {
4493 // Someone beat us to it, ditch the allocation
4494 lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp);
4495 FREE(new_lck, M_TEMP);
4496 }
4497 lck_rw_lock(&lck->rw_lock, type);
4498 return lck;
4499 }
4500 }
4501
4502 if (new_lck) {
4503 // Use the lock we allocated
4504 LIST_INSERT_HEAD(head, new_lck, chain);
4505 lck_spin_unlock(&cl_direct_read_spin_lock);
4506 lck_rw_lock(&new_lck->rw_lock, type);
4507 return new_lck;
4508 }
4509
4510 lck_spin_unlock(&cl_direct_read_spin_lock);
4511
4512 // Allocate a new lock
4513 MALLOC(new_lck, cl_direct_read_lock_t *, sizeof(*new_lck),
0a7de745 4514 M_TEMP, M_WAITOK);
3e170ce0
A
4515 lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr);
4516 new_lck->vp = vp;
4517 new_lck->ref_count = 1;
4518
4519 // Got to go round again
4520 }
4521}
4522
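/*
 * Illustrative sketch, not from vfs_cluster.c itself: the bucket hash used by
 * cluster_lock_direct_read() above -- divide the vnode pointer by the size of
 * a vnode (so neighbouring allocations spread out) and take the result modulo
 * the bucket count.  VNODE_SZ and NBUCKETS are stand-ins for sizeof(*vp) and
 * CL_DIRECT_READ_LOCK_BUCKETS.
 */
#include <stdio.h>
#include <stdint.h>

#define VNODE_SZ 256u   /* stand-in for sizeof(struct vnode) */
#define NBUCKETS 61u    /* stand-in for CL_DIRECT_READ_LOCK_BUCKETS */

static unsigned
bucket_for(uint64_t vp)
{
	return (unsigned)((vp / VNODE_SZ) % NBUCKETS);
}

int
main(void)
{
	uint64_t vp1 = 0xffffff8012345600ULL;   /* a made-up kernel address */
	uint64_t vp2 = vp1 + VNODE_SZ;          /* the next vnode in a zone */

	printf("vp1 -> bucket %u, vp2 -> bucket %u\n",
	    bucket_for(vp1), bucket_for(vp2));
	return 0;
}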
0a7de745
A
4523void
4524cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
3e170ce0
A
4525{
4526 lck_rw_done(&lck->rw_lock);
4527
4528 lck_spin_lock(&cl_direct_read_spin_lock);
4529 if (lck->ref_count == 1) {
4530 LIST_REMOVE(lck, chain);
4531 lck_spin_unlock(&cl_direct_read_spin_lock);
4532 lck_rw_destroy(&lck->rw_lock, cl_mtx_grp);
4533 FREE(lck, M_TEMP);
4534 } else {
4535 --lck->ref_count;
4536 lck_spin_unlock(&cl_direct_read_spin_lock);
4537 }
4538}
4539
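/*
 * Illustrative sketch, not from vfs_cluster.c itself: the alignment test near
 * the top of cluster_read_direct() below.  A direct read is treated as
 * misaligned -- and the vector is fired through the cache instead -- when the
 * file offset is not on a device-block boundary, the user buffer does not
 * satisfy the mount's alignment mask, or the buffer is not device-block
 * aligned.  The block size and mask values below are stand-ins.
 */
#include <stdio.h>
#include <stdint.h>

static int
is_misaligned(uint64_t uio_offset, uint64_t iov_base,
    uint32_t devblocksize, uint32_t mem_alignment_mask)
{
	uint32_t offset_in_file    = (uint32_t)uio_offset & (devblocksize - 1);
	uint32_t offset_in_iovbase = (uint32_t)iov_base & mem_alignment_mask;

	if (offset_in_file || offset_in_iovbase)
		return 1;       /* one of the two important offsets is off */
	if (iov_base & (devblocksize - 1))
		return 1;       /* buffer must sit on a device-block boundary */
	return 0;
}

int
main(void)
{
	printf("aligned request:    %d\n", is_misaligned(8192, 0x10000, 4096, 3));
	printf("misaligned request: %d\n", is_misaligned(8195, 0x10000, 4096, 3));
	return 0;
}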
9bccf70c 4540static int
2d21ac55 4541cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
0a7de745 4542 int flags, int (*callback)(buf_t, void *), void *callback_arg)
1c79356b
A
4543{
4544 upl_t upl;
4545 upl_page_info_t *pl;
0a7de745 4546 off_t max_io_size;
b0d623f7 4547 vm_offset_t upl_offset, vector_upl_offset = 0;
0a7de745
A
4548 upl_size_t upl_size, vector_upl_size = 0;
4549 vm_size_t upl_needed_size;
4550 unsigned int pages_in_pl;
3e170ce0 4551 upl_control_flags_t upl_flags;
1c79356b 4552 kern_return_t kret;
2d21ac55 4553 unsigned int i;
1c79356b 4554 int force_data_sync;
1c79356b 4555 int retval = 0;
0a7de745 4556 int no_zero_fill = 0;
2d21ac55 4557 int io_flag = 0;
0a7de745 4558 int misaligned = 0;
d7e50217 4559 struct clios iostate;
0a7de745
A
4560 user_addr_t iov_base;
4561 u_int32_t io_req_size;
4562 u_int32_t offset_in_file;
4563 u_int32_t offset_in_iovbase;
4564 u_int32_t io_size;
4565 u_int32_t io_min;
4566 u_int32_t xsize;
4567 u_int32_t devblocksize;
4568 u_int32_t mem_alignment_mask;
4569 u_int32_t max_upl_size;
b0d623f7
A
4570 u_int32_t max_rd_size;
4571 u_int32_t max_rd_ahead;
316670eb 4572 u_int32_t max_vector_size;
0a7de745 4573 boolean_t io_throttled = FALSE;
cf7d32b8 4574
0a7de745
A
4575 u_int32_t vector_upl_iosize = 0;
4576 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
4577 off_t v_upl_uio_offset = 0;
4578 int vector_upl_index = 0;
4579 upl_t vector_upl = NULL;
3e170ce0 4580 cl_direct_read_lock_t *lock = NULL;
cf7d32b8 4581
0a7de745
A
4582 user_addr_t orig_iov_base = 0;
4583 user_addr_t last_iov_base = 0;
4584 user_addr_t next_iov_base = 0;
fe8ab488 4585
b0d623f7 4586 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
0a7de745 4587 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
cf7d32b8 4588
b0d623f7 4589 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
2d21ac55 4590
b0d623f7
A
4591 max_rd_size = max_upl_size;
4592 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
1c79356b 4593
b0d623f7 4594 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
6d2010ae 4595
0a7de745 4596 if (flags & IO_PASSIVE) {
b0d623f7 4597 io_flag |= CL_PASSIVE;
0a7de745 4598 }
1c79356b 4599
316670eb
A
4600 if (flags & IO_ENCRYPTED) {
4601 io_flag |= CL_RAW_ENCRYPTED;
4602 }
4603
4604 if (flags & IO_NOCACHE) {
4605 io_flag |= CL_NOCACHE;
4606 }
4607
0a7de745 4608 if (flags & IO_SKIP_ENCRYPTION) {
fe8ab488 4609 io_flag |= CL_ENCRYPTED;
0a7de745 4610 }
fe8ab488 4611
d7e50217
A
4612 iostate.io_completed = 0;
4613 iostate.io_issued = 0;
4614 iostate.io_error = 0;
4615 iostate.io_wanted = 0;
4616
6d2010ae
A
4617 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4618
2d21ac55
A
4619 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4620 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4621
4622 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
0a7de745 4623 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
2d21ac55
A
4624
4625 if (devblocksize == 1) {
0a7de745
A
4626 /*
4627 * the AFP client advertises a devblocksize of 1
4628 * however, its BLOCKMAP routine maps to physical
4629 * blocks that are PAGE_SIZE in size...
4630 * therefore we can't ask for I/Os that aren't page aligned
4631 * or aren't multiples of PAGE_SIZE in size
4632 * by setting devblocksize to PAGE_SIZE, we re-instate
4633 * the old behavior we had before the mem_alignment_mask
4634 * changes went in...
4635 */
4636 devblocksize = PAGE_SIZE;
2d21ac55 4637 }
6d2010ae 4638
fe8ab488
A
4639 orig_iov_base = uio_curriovbase(uio);
4640 last_iov_base = orig_iov_base;
4641
2d21ac55
A
4642next_dread:
4643 io_req_size = *read_length;
4644 iov_base = uio_curriovbase(uio);
4645
2d21ac55
A
4646 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4647 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4648
4649 if (offset_in_file || offset_in_iovbase) {
0a7de745 4650 /*
2d21ac55
A
4651 * one of the 2 important offsets is misaligned
4652 * so fire an I/O through the cache for this entire vector
4653 */
4654 misaligned = 1;
4655 }
4656 if (iov_base & (devblocksize - 1)) {
0a7de745 4657 /*
2d21ac55
A
4658 * the offset in memory must be on a device block boundary
4659 * so that we can guarantee that we can generate an
4660 * I/O that ends on a page boundary in cluster_io
4661 */
4662 misaligned = 1;
0a7de745 4663 }
316670eb 4664
39037602
A
4665 max_io_size = filesize - uio->uio_offset;
4666
0a7de745
A
4667 /*
4668 * The user must request IO in aligned chunks. If the
4669 * offset into the file is bad, or the userland pointer
316670eb
A
4670 * is non-aligned, then we cannot service the encrypted IO request.
4671 */
39037602 4672 if (flags & IO_ENCRYPTED) {
0a7de745 4673 if (misaligned || (io_req_size & (devblocksize - 1))) {
39037602 4674 retval = EINVAL;
0a7de745 4675 }
39037602
A
4676
4677 max_io_size = roundup(max_io_size, devblocksize);
316670eb
A
4678 }
4679
0a7de745
A
4680 if ((off_t)io_req_size > max_io_size) {
4681 io_req_size = max_io_size;
4682 }
39037602 4683
2d21ac55
A
4684 /*
4685 * When we get to this point, we know...
4686 * -- the offset into the file is on a devblocksize boundary
4687 */
4688
4689 while (io_req_size && retval == 0) {
0a7de745 4690 u_int32_t io_start;
1c79356b 4691
0a7de745 4692 if (cluster_is_throttled(vp)) {
316670eb
A
4693 /*
4694 * we're in the throttle window, at the very least
4695 * we want to limit the size of the I/O we're about
4696 * to issue
4697 */
0a7de745 4698 max_rd_size = THROTTLE_MAX_IOSIZE;
316670eb
A
4699 max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
4700 max_vector_size = THROTTLE_MAX_IOSIZE;
91447636 4701 } else {
0a7de745 4702 max_rd_size = max_upl_size;
b0d623f7 4703 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
316670eb 4704 max_vector_size = MAX_VECTOR_UPL_SIZE;
91447636 4705 }
2d21ac55 4706 io_start = io_size = io_req_size;
1c79356b 4707
d7e50217
A
4708 /*
4709 * First look for pages already in the cache
316670eb
A
4710 * and move them to user space. But only do this
4711 * check if we are not retrieving encrypted data directly
4712 * from the filesystem; those blocks should never
0a7de745 4713 * be in the UBC.
2d21ac55
A
4714 *
4715 * cluster_copy_ubc_data returns the resid
4716 * in io_size
d7e50217 4717 */
d9a64523 4718 if ((flags & IO_ENCRYPTED) == 0) {
6d2010ae
A
4719 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4720 }
2d21ac55
A
4721 /*
4722 * calculate the number of bytes actually copied
4723 * starting size - residual
4724 */
4725 xsize = io_start - io_size;
4726
4727 io_req_size -= xsize;
4728
0a7de745 4729 if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
b0d623f7
A
4730 /*
4731 * We found something in the cache or we have an iov_base that's not
4732 * page-aligned.
0a7de745 4733 *
b0d623f7
A
4734 * Issue all I/O's that have been collected within this Vectored UPL.
4735 */
0a7de745 4736 if (vector_upl_index) {
b0d623f7
A
4737 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4738 reset_vector_run_state();
4739 }
0a7de745
A
4740
4741 if (xsize) {
b0d623f7 4742 useVectorUPL = 0;
0a7de745 4743 }
b0d623f7 4744
0a7de745
A
4745 /*
4746 * After this point, if we are using the Vector UPL path and the base is
4747 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4748 */
b0d623f7
A
4749 }
4750
2d21ac55 4751 /*
316670eb
A
4752 * check to see if we are finished with this request.
4753 *
4754 * If we satisfied this IO already, then io_req_size will be 0.
0a7de745 4755 * Otherwise, see if the IO was mis-aligned and needs to go through
316670eb
A
4756 * the UBC to deal with the 'tail'.
4757 *
2d21ac55 4758 */
316670eb 4759 if (io_req_size == 0 || (misaligned)) {
0a7de745 4760 /*
2d21ac55
A
4761 * see if there's another uio vector to
4762 * process that's of type IO_DIRECT
4763 *
4764 * break out of while loop to get there
d7e50217 4765 */
0a7de745 4766 break;
0b4e3aa0 4767 }
d7e50217 4768 /*
2d21ac55 4769 * assume the request ends on a device block boundary
d7e50217 4770 */
2d21ac55
A
4771 io_min = devblocksize;
4772
4773 /*
4774 * we can handle I/O's in multiples of the device block size
4775 * however, if io_size isn't a multiple of devblocksize we
4776 * want to clip it back to the nearest page boundary since
4777 * we are going to have to go through cluster_read_copy to
4778 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4779 * multiple, we avoid asking the drive for the same physical
4780 * blocks twice.. once for the partial page at the end of the
4781 * request and a 2nd time for the page we read into the cache
0a7de745 4782 * (which overlaps the end of the direct read) in order to
2d21ac55
A
4783 * get at the overhang bytes
4784 */
39037602
A
4785 if (io_size & (devblocksize - 1)) {
4786 assert(!(flags & IO_ENCRYPTED));
4787 /*
4788 * Clip the request to the previous page size boundary
4789 * since request does NOT end on a device block boundary
4790 */
4791 io_size &= ~PAGE_MASK;
4792 io_min = PAGE_SIZE;
2d21ac55
A
4793 }
4794 if (retval || io_size < io_min) {
0a7de745 4795 /*
2d21ac55
A
4796 * either an error or we only have the tail left to
4797 * complete via the copy path...
d7e50217
A
4798 * we may have already spun some portion of this request
4799 * off as async requests... we need to wait for the I/O
4800 * to complete before returning
4801 */
0a7de745 4802 goto wait_for_dreads;
d7e50217 4803 }
55e303ae 4804
3e170ce0 4805 /*
316670eb
A
4806 * Don't re-check the UBC data if we are looking for uncached IO
4807 * or asking for encrypted blocks.
4808 */
d9a64523 4809 if ((flags & IO_ENCRYPTED) == 0) {
0a7de745 4810 if ((xsize = io_size) > max_rd_size) {
316670eb 4811 xsize = max_rd_size;
0a7de745 4812 }
55e303ae 4813
6d2010ae
A
4814 io_size = 0;
4815
3e170ce0
A
4816 if (!lock) {
4817 /*
4818 * We hold a lock here between the time we check the
4819 * cache and the time we issue I/O. This saves us
4820 * from having to lock the pages in the cache. Not
4821 * all clients will care about this lock but some
4822 * clients may want to guarantee stability between
4823 * here and when the I/O is issued in which case they
4824 * will take the lock exclusively.
4825 */
4826 lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
4827 }
4828
6d2010ae
A
4829 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
4830
4831 if (io_size == 0) {
4832 /*
4833 * a page must have just come into the cache
4834 * since the first page in this range is no
4835 * longer absent, go back and re-evaluate
4836 */
4837 continue;
4838 }
2d21ac55 4839 }
0a7de745 4840 if ((flags & IO_RETURN_ON_THROTTLE)) {
39236c6e 4841 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
0a7de745 4842 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
316670eb
A
4843 /*
4844 * we're in the throttle window and at least 1 I/O
4845 * has already been issued by a throttleable thread
4846 * in this window, so return with EAGAIN to indicate
4847 * to the FS issuing the cluster_read call that it
4848 * should now throttle after dropping any locks
4849 */
4850 throttle_info_update_by_mount(vp->v_mount);
4851
4852 io_throttled = TRUE;
4853 goto wait_for_dreads;
4854 }
4855 }
4856 }
0a7de745 4857 if (io_size > max_rd_size) {
316670eb 4858 io_size = max_rd_size;
0a7de745 4859 }
6d2010ae 4860
cc9f6e38 4861 iov_base = uio_curriovbase(uio);
1c79356b 4862
2d21ac55 4863 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
0a7de745 4864 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 4865
d7e50217 4866 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
0a7de745 4867 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
1c79356b 4868
0a7de745
A
4869 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
4870 no_zero_fill = 1;
4871 } else {
4872 no_zero_fill = 0;
4873 }
0b4c1975 4874
3e170ce0 4875 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
d7e50217 4876 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
0a7de745 4877 pages_in_pl = 0;
d7e50217 4878 upl_size = upl_needed_size;
5ba3f43e 4879 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
0a7de745
A
4880 if (no_zero_fill) {
4881 upl_flags |= UPL_NOZEROFILL;
4882 }
4883 if (force_data_sync) {
4884 upl_flags |= UPL_FORCE_DATA_SYNC;
4885 }
91447636 4886
3e170ce0 4887 kret = vm_map_create_upl(map,
0a7de745
A
4888 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4889 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
1c79356b 4890
d7e50217 4891 if (kret != KERN_SUCCESS) {
0a7de745
A
4892 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4893 (int)upl_offset, upl_size, io_size, kret, 0);
d7e50217 4894 /*
2d21ac55 4895 * failed to get pagelist
d7e50217
A
4896 *
4897 * we may have already spun some portion of this request
4898 * off as async requests... we need to wait for the I/O
4899 * to complete before returning
4900 */
2d21ac55 4901 goto wait_for_dreads;
d7e50217
A
4902 }
4903 pages_in_pl = upl_size / PAGE_SIZE;
4904 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1c79356b 4905
d7e50217 4906 for (i = 0; i < pages_in_pl; i++) {
0a7de745
A
4907 if (!upl_page_present(pl, i)) {
4908 break;
4909 }
4910 }
4911 if (i == pages_in_pl) {
4912 break;
d7e50217 4913 }
0b4e3aa0 4914
0b4c1975 4915 ubc_upl_abort(upl, 0);
1c79356b 4916 }
d7e50217 4917 if (force_data_sync >= 3) {
0a7de745
A
4918 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4919 (int)upl_offset, upl_size, io_size, kret, 0);
4920
2d21ac55 4921 goto wait_for_dreads;
d7e50217
A
4922 }
4923 /*
4924 * Consider the possibility that upl_size wasn't satisfied.
4925 */
2d21ac55 4926 if (upl_size < upl_needed_size) {
0a7de745
A
4927 if (upl_size && upl_offset == 0) {
4928 io_size = upl_size;
4929 } else {
4930 io_size = 0;
4931 }
2d21ac55 4932 }
d7e50217 4933 if (io_size == 0) {
0b4c1975 4934 ubc_upl_abort(upl, 0);
2d21ac55 4935 goto wait_for_dreads;
d7e50217
A
4936 }
4937 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
0a7de745 4938 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 4939
0a7de745 4940 if (useVectorUPL) {
b0d623f7 4941 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
0a7de745 4942 if (end_off) {
b0d623f7 4943 issueVectorUPL = 1;
0a7de745 4944 }
b0d623f7
A
4945 /*
4946 * After this point, if we are using a vector UPL, then
4947 * either all the UPL elements end on a page boundary OR
4948 * this UPL is the last element because it does not end
4949 * on a page boundary.
4950 */
4951 }
4952
d7e50217
A
4953 /*
4954 * request asynchronously so that we can overlap
4955 * the preparation of the next I/O
4956 * if there are already too many outstanding reads
4957 * wait until some have completed before issuing the next read
4958 */
fe8ab488 4959 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
91447636 4960
d7e50217 4961 if (iostate.io_error) {
0a7de745 4962 /*
d7e50217
A
4963 * one of the earlier reads we issued ran into a hard error
4964 * don't issue any more reads, cleanup the UPL
4965 * that was just created but not used, then
4966 * go wait for any other reads to complete before
4967 * returning the error to the caller
4968 */
0b4c1975 4969 ubc_upl_abort(upl, 0);
1c79356b 4970
0a7de745
A
4971 goto wait_for_dreads;
4972 }
d7e50217 4973 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
0a7de745 4974 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
b0d623f7 4975
0a7de745
A
4976 if (!useVectorUPL) {
4977 if (no_zero_fill) {
4978 io_flag &= ~CL_PRESERVE;
4979 } else {
4980 io_flag |= CL_PRESERVE;
4981 }
1c79356b 4982
0a7de745
A
4983 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4984 } else {
4985 if (!vector_upl_index) {
b0d623f7
A
4986 vector_upl = vector_upl_create(upl_offset);
4987 v_upl_uio_offset = uio->uio_offset;
4988 vector_upl_offset = upl_offset;
4989 }
4990
0a7de745 4991 vector_upl_set_subupl(vector_upl, upl, upl_size);
b0d623f7
A
4992 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
4993 vector_upl_index++;
4994 vector_upl_size += upl_size;
4995 vector_upl_iosize += io_size;
0a7de745
A
4996
4997 if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
4998 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4999 reset_vector_run_state();
b0d623f7 5000 }
fe8ab488
A
5001 }
5002 last_iov_base = iov_base + io_size;
5003
3e170ce0
A
5004 if (lock) {
5005 // We don't need to wait for the I/O to complete
5006 cluster_unlock_direct_read(lock);
5007 lock = NULL;
5008 }
5009
d7e50217
A
5010 /*
5011 * update the uio structure
5012 */
316670eb
A
5013 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
5014 uio_update(uio, (user_size_t)max_io_size);
0a7de745 5015 } else {
316670eb
A
5016 uio_update(uio, (user_size_t)io_size);
5017 }
39037602
A
5018
5019 io_req_size -= io_size;
2d21ac55 5020
d7e50217 5021 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
0a7de745 5022 upl, (int)uio->uio_offset, io_req_size, retval, 0);
1c79356b
A
5023 } /* end while */
5024
2d21ac55 5025 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
0a7de745 5026 retval = cluster_io_type(uio, read_type, read_length, 0);
91447636 5027
2d21ac55 5028 if (retval == 0 && *read_type == IO_DIRECT) {
0a7de745
A
5029 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
5030 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
2d21ac55
A
5031
5032 goto next_dread;
5033 }
5034 }
5035
5036wait_for_dreads:
b0d623f7 5037
0a7de745
A
5038 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
5039 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
b0d623f7
A
5040 reset_vector_run_state();
5041 }
3e170ce0
A
5042
5043 // We don't need to wait for the I/O to complete
0a7de745 5044 if (lock) {
3e170ce0 5045 cluster_unlock_direct_read(lock);
0a7de745 5046 }
3e170ce0 5047
b0d623f7
A
5048 /*
5049 * make sure all async reads that are part of this stream
5050 * have completed before we return
5051 */
fe8ab488 5052 cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
b0d623f7 5053
0a7de745
A
5054 if (iostate.io_error) {
5055 retval = iostate.io_error;
5056 }
2d21ac55 5057
6d2010ae
A
5058 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
5059
0a7de745 5060 if (io_throttled == TRUE && retval == 0) {
316670eb 5061 retval = EAGAIN;
0a7de745 5062 }
316670eb 5063
fe8ab488
A
5064 for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) {
5065 /*
5066 * This is specifically done for pmap accounting purposes.
5067 * vm_pre_fault() will call vm_fault() to enter the page into
5068 * the pmap if there isn't _a_ physical page for that VA already.
5069 */
cb323159 5070 vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK), VM_PROT_READ);
fe8ab488
A
5071 }
5072
2d21ac55 5073 if (io_req_size && retval == 0) {
0a7de745 5074 /*
2d21ac55
A
5075 * we couldn't handle the tail of this request in DIRECT mode
5076 * so fire it through the copy path
5077 */
d9a64523
A
5078 if (flags & IO_ENCRYPTED) {
5079 /*
5080 * We cannot fall back to the copy path for encrypted I/O. If this
5081 * happens, there is something wrong with the user buffer passed
5082 * down.
5083 */
5084 retval = EFAULT;
5085 } else {
5086 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
5087 }
1c79356b 5088
2d21ac55
A
5089 *read_type = IO_UNKNOWN;
5090 }
1c79356b 5091 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
0a7de745 5092 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
1c79356b 5093
0a7de745 5094 return retval;
1c79356b
A
5095}
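/*
 * Illustrative sketch (not part of the original vfs_cluster.c source): a
 * minimal userspace rendering of the three alignment tests that
 * cluster_read_direct() above applies before it will bypass the buffer
 * cache.  The devblocksize and mem_alignment_mask values in main() are
 * hypothetical stand-ins for the mount's mnt_devblocksize and
 * mnt_alignmentmask.
 */
#include <stdint.h>
#include <stdio.h>

static int
direct_read_is_misaligned(uint64_t file_offset, uintptr_t iov_base,
    uint32_t devblocksize, uint32_t mem_alignment_mask)
{
	if (file_offset & (devblocksize - 1)) {
		return 1;       /* file offset not on a device block boundary */
	}
	if (iov_base & mem_alignment_mask) {
		return 1;       /* user buffer breaks the DMA alignment rule */
	}
	if (iov_base & (devblocksize - 1)) {
		return 1;       /* buffer not on a device block boundary either */
	}
	return 0;
}

int
main(void)
{
	/* hypothetical 4KB-sector device requiring 4-byte DMA alignment */
	printf("%d\n", direct_read_is_misaligned(8192, 0x10000, 4096, 3));  /* 0: fully aligned */
	printf("%d\n", direct_read_is_misaligned(8193, 0x10000, 4096, 3));  /* 1: bad file offset */
	printf("%d\n", direct_read_is_misaligned(8192, 0x10002, 4096, 3));  /* 1: bad user pointer */
	return 0;
}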
5096
5097
9bccf70c 5098static int
2d21ac55 5099cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
0a7de745 5100 int (*callback)(buf_t, void *), void *callback_arg, int flags)
0b4e3aa0 5101{
b4c24cb9 5102 upl_page_info_t *pl;
2d21ac55 5103 upl_t upl[MAX_VECTS];
0b4e3aa0 5104 vm_offset_t upl_offset;
0a7de745
A
5105 addr64_t dst_paddr = 0;
5106 user_addr_t iov_base;
2d21ac55 5107 off_t max_size;
0a7de745
A
5108 upl_size_t upl_size;
5109 vm_size_t upl_needed_size;
5110 mach_msg_type_number_t pages_in_pl;
3e170ce0 5111 upl_control_flags_t upl_flags;
0b4e3aa0 5112 kern_return_t kret;
b4c24cb9 5113 struct clios iostate;
0a7de745
A
5114 int error = 0;
5115 int cur_upl = 0;
5116 int num_upl = 0;
5117 int n;
5118 u_int32_t xsize;
5119 u_int32_t io_size;
5120 u_int32_t devblocksize;
5121 u_int32_t mem_alignment_mask;
5122 u_int32_t tail_size = 0;
2d21ac55
A
5123 int bflag;
5124
0a7de745 5125 if (flags & IO_PASSIVE) {
b0d623f7 5126 bflag = CL_PASSIVE;
0a7de745 5127 } else {
b0d623f7 5128 bflag = 0;
0a7de745
A
5129 }
5130
5131 if (flags & IO_NOCACHE) {
316670eb 5132 bflag |= CL_NOCACHE;
0a7de745
A
5133 }
5134
0b4e3aa0
A
5135 /*
5136 * When we enter this routine, we know
2d21ac55
A
5137 * -- the read_length will not exceed the current iov_len
5138 * -- the target address is physically contiguous for read_length
0b4e3aa0 5139 */
fe8ab488 5140 cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
0b4e3aa0 5141
2d21ac55
A
5142 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
5143 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
91447636 5144
2d21ac55
A
5145 iostate.io_completed = 0;
5146 iostate.io_issued = 0;
5147 iostate.io_error = 0;
5148 iostate.io_wanted = 0;
5149
6d2010ae
A
5150 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
5151
2d21ac55
A
5152next_cread:
5153 io_size = *read_length;
0b4e3aa0
A
5154
5155 max_size = filesize - uio->uio_offset;
5156
0a7de745
A
5157 if (io_size > max_size) {
5158 io_size = max_size;
5159 }
0b4e3aa0 5160
2d21ac55
A
5161 iov_base = uio_curriovbase(uio);
5162
5163 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
0b4e3aa0
A
5164 upl_needed_size = upl_offset + io_size;
5165
5166 pages_in_pl = 0;
5167 upl_size = upl_needed_size;
5ba3f43e 5168 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
0b4e3aa0 5169
2d21ac55
A
5170
5171 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
0a7de745 5172 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
2d21ac55 5173
3e170ce0
A
5174 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5175 kret = vm_map_get_upl(map,
0a7de745
A
5176 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5177 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
2d21ac55
A
5178
5179 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
0a7de745 5180 (int)upl_offset, upl_size, io_size, kret, 0);
0b4e3aa0 5181
b4c24cb9 5182 if (kret != KERN_SUCCESS) {
0a7de745 5183 /*
2d21ac55 5184 * failed to get pagelist
b4c24cb9 5185 */
0a7de745 5186 error = EINVAL;
2d21ac55 5187 goto wait_for_creads;
b4c24cb9 5188 }
2d21ac55
A
5189 num_upl++;
5190
b4c24cb9 5191 if (upl_size < upl_needed_size) {
0a7de745 5192 /*
b4c24cb9
A
5193 * The upl_size wasn't satisfied.
5194 */
0a7de745 5195 error = EINVAL;
2d21ac55 5196 goto wait_for_creads;
b4c24cb9 5197 }
2d21ac55 5198 pl = ubc_upl_pageinfo(upl[cur_upl]);
b4c24cb9 5199
fe8ab488 5200 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
0b4e3aa0 5201
b4c24cb9 5202 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
0a7de745 5203 u_int32_t head_size;
b4c24cb9 5204
2d21ac55 5205 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
b4c24cb9 5206
0a7de745
A
5207 if (head_size > io_size) {
5208 head_size = io_size;
5209 }
b4c24cb9 5210
2d21ac55 5211 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
b4c24cb9 5212
0a7de745 5213 if (error) {
2d21ac55 5214 goto wait_for_creads;
0a7de745 5215 }
b4c24cb9 5216
b4c24cb9
A
5217 upl_offset += head_size;
5218 dst_paddr += head_size;
5219 io_size -= head_size;
2d21ac55
A
5220
5221 iov_base += head_size;
5222 }
5223 if ((u_int32_t)iov_base & mem_alignment_mask) {
0a7de745 5224 /*
2d21ac55
A
5225 * request isn't set up on a memory boundary
5226 * the underlying DMA engine can handle...
5227 * return an error instead of going through
5228 * the slow copy path since the intent of this
5229 * path is direct I/O to device memory
5230 */
0a7de745 5231 error = EINVAL;
2d21ac55 5232 goto wait_for_creads;
b4c24cb9 5233 }
2d21ac55 5234
b4c24cb9 5235 tail_size = io_size & (devblocksize - 1);
b4c24cb9 5236
2d21ac55 5237 io_size -= tail_size;
b4c24cb9
A
5238
5239 while (io_size && error == 0) {
0a7de745
A
5240 if (io_size > MAX_IO_CONTIG_SIZE) {
5241 xsize = MAX_IO_CONTIG_SIZE;
5242 } else {
5243 xsize = io_size;
5244 }
b4c24cb9
A
5245 /*
5246 * request asynchronously so that we can overlap
5247 * the preparation of the next I/O... we'll do
5248 * the commit after all the I/O has completed
5249 * since it's all issued against the same UPL
5250 * if there are already too many outstanding reads
d7e50217 5251 * wait until some have completed before issuing the next
b4c24cb9 5252 */
fe8ab488 5253 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
cf7d32b8 5254
2d21ac55 5255 if (iostate.io_error) {
0a7de745 5256 /*
2d21ac55
A
5257 * one of the earlier reads we issued ran into a hard error
5258 * don't issue any more reads...
5259 * go wait for any other reads to complete before
5260 * returning the error to the caller
5261 */
0a7de745 5262 goto wait_for_creads;
2d21ac55 5263 }
0a7de745
A
5264 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
5265 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
5266 (buf_t)NULL, &iostate, callback, callback_arg);
5267 /*
b4c24cb9
A
5268 * The cluster_io read was issued successfully,
5269 * update the uio structure
5270 */
5271 if (error == 0) {
0a7de745 5272 uio_update(uio, (user_size_t)xsize);
cc9f6e38
A
5273
5274 dst_paddr += xsize;
5275 upl_offset += xsize;
5276 io_size -= xsize;
b4c24cb9
A
5277 }
5278 }
2d21ac55 5279 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
0a7de745 5280 error = cluster_io_type(uio, read_type, read_length, 0);
2d21ac55 5281
2d21ac55 5282 if (error == 0 && *read_type == IO_CONTIG) {
0a7de745 5283 cur_upl++;
2d21ac55
A
5284 goto next_cread;
5285 }
0a7de745
A
5286 } else {
5287 *read_type = IO_UNKNOWN;
5288 }
2d21ac55
A
5289
5290wait_for_creads:
0b4e3aa0 5291 /*
d7e50217
A
5292 * make sure all async reads that are part of this stream
5293 * have completed before we proceed
0b4e3aa0 5294 */
fe8ab488 5295 cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
91447636 5296
0a7de745
A
5297 if (iostate.io_error) {
5298 error = iostate.io_error;
5299 }
91447636 5300
6d2010ae
A
5301 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
5302
0a7de745
A
5303 if (error == 0 && tail_size) {
5304 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
5305 }
0b4e3aa0 5306
0a7de745
A
5307 for (n = 0; n < num_upl; n++) {
5308 /*
2d21ac55
A
5309 * just release our hold on each physically contiguous
5310 * region without changing any state
5311 */
0a7de745
A
5312 ubc_upl_abort(upl[n], 0);
5313 }
5314
5315 return error;
0b4e3aa0 5316}
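/*
 * Illustrative sketch (not part of the original vfs_cluster.c source): the
 * head/body/tail split that cluster_read_contig() above performs.  The
 * unaligned head, and later the tail, are copied through
 * cluster_align_phys_io(), while the device-block-aligned body goes to
 * cluster_io().  The devblocksize, offset and size below are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t devblocksize = 4096;   /* hypothetical 4KB device block */
	uint64_t uio_offset = 4100;     /* starts 4 bytes into a block */
	uint32_t io_size = 20000;       /* requested transfer length */
	uint32_t head_size = 0;
	uint32_t tail_size;

	if (uio_offset & (devblocksize - 1)) {
		head_size = devblocksize - (uint32_t)(uio_offset & (devblocksize - 1));
		if (head_size > io_size) {
			head_size = io_size;
		}
		io_size -= head_size;
	}
	tail_size = io_size & (devblocksize - 1);
	io_size -= tail_size;

	/* prints: head=4092 body=12288 tail=3620 */
	printf("head=%u body=%u tail=%u\n", head_size, io_size, tail_size);
	return 0;
}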
1c79356b 5317
b4c24cb9 5318
2d21ac55
A
5319static int
5320cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5321{
0a7de745
A
5322 user_size_t iov_len;
5323 user_addr_t iov_base = 0;
2d21ac55 5324 upl_t upl;
b0d623f7 5325 upl_size_t upl_size;
3e170ce0 5326 upl_control_flags_t upl_flags;
0a7de745 5327 int retval = 0;
2d21ac55 5328
0a7de745 5329 /*
2d21ac55
A
5330 * skip over any empty vectors
5331 */
0a7de745 5332 uio_update(uio, (user_size_t)0);
2d21ac55
A
5333
5334 iov_len = uio_curriovlen(uio);
5335
b0d623f7 5336 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
2d21ac55
A
5337
5338 if (iov_len) {
0a7de745
A
5339 iov_base = uio_curriovbase(uio);
5340 /*
2d21ac55
A
5341 * make sure the size of the vector isn't too big...
5342 * internally, we want to handle all of the I/O in
5343 * chunk sizes that fit in a 32 bit int
5344 */
0a7de745
A
5345 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5346 upl_size = MAX_IO_REQUEST_SIZE;
5347 } else {
5348 upl_size = (u_int32_t)iov_len;
5349 }
2d21ac55 5350
5ba3f43e 5351 upl_flags = UPL_QUERY_OBJECT_TYPE;
3e170ce0
A
5352
5353 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5354 if ((vm_map_get_upl(map,
0a7de745
A
5355 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5356 &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
5357 /*
2d21ac55
A
5358 * the user app must have passed in an invalid address
5359 */
0a7de745
A
5360 retval = EFAULT;
5361 }
5362 if (upl_size == 0) {
5363 retval = EFAULT;
2d21ac55 5364 }
2d21ac55
A
5365
5366 *io_length = upl_size;
5367
0a7de745
A
5368 if (upl_flags & UPL_PHYS_CONTIG) {
5369 *io_type = IO_CONTIG;
5370 } else if (iov_len >= min_length) {
5371 *io_type = IO_DIRECT;
5372 } else {
5373 *io_type = IO_COPY;
5374 }
2d21ac55 5375 } else {
0a7de745 5376 /*
2d21ac55
A
5377 * nothing left to do for this uio
5378 */
0a7de745 5379 *io_length = 0;
2d21ac55
A
5380 *io_type = IO_UNKNOWN;
5381 }
b0d623f7 5382 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
2d21ac55 5383
0a7de745 5384 return retval;
2d21ac55
A
5385}
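/*
 * Illustrative sketch (not part of the original vfs_cluster.c source): the
 * classification cluster_io_type() above makes for each uio vector.  In the
 * kernel, "physically contiguous" is discovered with vm_map_get_upl() and
 * UPL_QUERY_OBJECT_TYPE; here it is passed in as a hypothetical flag so the
 * decision logic can run in isolation.
 */
#include <stddef.h>
#include <stdio.h>

static const char *
classify_vector(size_t iov_len, int phys_contig, size_t min_length)
{
	if (iov_len == 0) {
		return "IO_UNKNOWN";    /* nothing left in this uio */
	}
	if (phys_contig) {
		return "IO_CONTIG";     /* physically contiguous path (for reads, cluster_read_contig() above) */
	}
	if (iov_len >= min_length) {
		return "IO_DIRECT";     /* large enough to bypass the cache */
	}
	return "IO_COPY";               /* small vector: use the UBC copy path */
}

int
main(void)
{
	printf("%s\n", classify_vector(0, 0, 16384));       /* IO_UNKNOWN */
	printf("%s\n", classify_vector(65536, 1, 16384));   /* IO_CONTIG  */
	printf("%s\n", classify_vector(65536, 0, 16384));   /* IO_DIRECT  */
	printf("%s\n", classify_vector(512, 0, 16384));     /* IO_COPY    */
	return 0;
}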
5386
5387
1c79356b
A
5388/*
5389 * generate advisory I/O's in the largest chunks possible
5390 * the completed pages will be released into the VM cache
5391 */
9bccf70c 5392int
91447636 5393advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
2d21ac55 5394{
0a7de745 5395 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
2d21ac55
A
5396}
5397
5398int
5399advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
1c79356b 5400{
1c79356b
A
5401 upl_page_info_t *pl;
5402 upl_t upl;
5403 vm_offset_t upl_offset;
0a7de745
A
5404 int upl_size;
5405 off_t upl_f_offset;
5406 int start_offset;
5407 int start_pg;
5408 int last_pg;
1c79356b
A
5409 int pages_in_upl;
5410 off_t max_size;
5411 int io_size;
5412 kern_return_t kret;
5413 int retval = 0;
9bccf70c 5414 int issued_io;
55e303ae 5415 int skip_range;
0a7de745 5416 uint32_t max_io_size;
b0d623f7
A
5417
5418
0a7de745
A
5419 if (!UBCINFOEXISTS(vp)) {
5420 return EINVAL;
5421 }
1c79356b 5422
0a7de745
A
5423 if (resid < 0) {
5424 return EINVAL;
5425 }
ca66cea6 5426
cf7d32b8 5427 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
b0d623f7 5428
5ba3f43e 5429#if CONFIG_EMBEDDED
0a7de745 5430 if (max_io_size > speculative_prefetch_max_iosize) {
5ba3f43e 5431 max_io_size = speculative_prefetch_max_iosize;
0a7de745 5432 }
5ba3f43e
A
5433#else
5434 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
0a7de745 5435 if (max_io_size > speculative_prefetch_max_iosize) {
316670eb 5436 max_io_size = speculative_prefetch_max_iosize;
0a7de745 5437 }
316670eb 5438 }
5ba3f43e 5439#endif
316670eb 5440
1c79356b 5441 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
0a7de745 5442 (int)f_offset, resid, (int)filesize, 0, 0);
1c79356b
A
5443
5444 while (resid && f_offset < filesize && retval == 0) {
5445 /*
5446 * compute the size of the upl needed to encompass
5447 * the requested read... limit each call to cluster_io
0b4e3aa0
A
5448 * to the maximum UPL size... cluster_io will clip if
5449 * this exceeds the maximum io_size for the device,
0a7de745 5450 * make sure to account for
1c79356b
A
5451 * a starting offset that's not page aligned
5452 */
5453 start_offset = (int)(f_offset & PAGE_MASK_64);
5454 upl_f_offset = f_offset - (off_t)start_offset;
5455 max_size = filesize - f_offset;
5456
0a7de745
A
5457 if (resid < max_size) {
5458 io_size = resid;
5459 } else {
5460 io_size = max_size;
5461 }
1c79356b
A
5462
5463 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
0a7de745
A
5464 if ((uint32_t)upl_size > max_io_size) {
5465 upl_size = max_io_size;
5466 }
55e303ae
A
5467
5468 skip_range = 0;
5469 /*
5470 * return the number of contiguously present pages in the cache
5471 * starting at upl_f_offset within the file
5472 */
5473 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
5474
5475 if (skip_range) {
0a7de745 5476 /*
55e303ae
A
5477 * skip over pages already present in the cache
5478 */
0a7de745 5479 io_size = skip_range - start_offset;
55e303ae 5480
0a7de745 5481 f_offset += io_size;
55e303ae
A
5482 resid -= io_size;
5483
0a7de745
A
5484 if (skip_range == upl_size) {
5485 continue;
5486 }
55e303ae
A
5487 /*
5488 * have to issue some real I/O
5489 * at this point, we know it's starting on a page boundary
5490 * because we've skipped over at least the first page in the request
5491 */
5492 start_offset = 0;
5493 upl_f_offset += skip_range;
5494 upl_size -= skip_range;
5495 }
1c79356b
A
5496 pages_in_upl = upl_size / PAGE_SIZE;
5497
55e303ae 5498 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
0a7de745 5499 upl, (int)upl_f_offset, upl_size, start_offset, 0);
55e303ae 5500
5ba3f43e 5501 kret = ubc_create_upl_kernel(vp,
0a7de745
A
5502 upl_f_offset,
5503 upl_size,
5504 &upl,
5505 &pl,
5506 UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
5507 VM_KERN_MEMORY_FILE);
5508 if (kret != KERN_SUCCESS) {
5509 return retval;
5510 }
9bccf70c 5511 issued_io = 0;
1c79356b
A
5512
5513 /*
0a7de745 5514 * before we start marching forward, we must make sure we end on
9bccf70c
A
5515 * a present page, otherwise we will be working with a freed
5516 * upl
1c79356b 5517 */
9bccf70c 5518 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
0a7de745
A
5519 if (upl_page_present(pl, last_pg)) {
5520 break;
5521 }
1c79356b 5522 }
9bccf70c 5523 pages_in_upl = last_pg + 1;
1c79356b 5524
1c79356b 5525
55e303ae 5526 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
0a7de745 5527 upl, (int)upl_f_offset, upl_size, start_offset, 0);
9bccf70c
A
5528
5529
0a7de745
A
5530 for (last_pg = 0; last_pg < pages_in_upl;) {
5531 /*
9bccf70c
A
5532 * scan from the beginning of the upl looking for the first
5533 * page that is present.... this will become the first page in
5534 * the request we're going to make to 'cluster_io'... if all
5535 * of the pages are absent, we won't call through to 'cluster_io'
1c79356b 5536 */
0a7de745
A
5537 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5538 if (upl_page_present(pl, start_pg)) {
5539 break;
5540 }
1c79356b 5541 }
1c79356b 5542
1c79356b 5543 /*
9bccf70c 5544 * scan from the starting present page looking for an absent
0a7de745 5545 * page before the end of the upl is reached, if we
9bccf70c
A
5546 * find one, then it will terminate the range of pages being
5547 * presented to 'cluster_io'
1c79356b 5548 */
9bccf70c 5549 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
0a7de745
A
5550 if (!upl_page_present(pl, last_pg)) {
5551 break;
5552 }
9bccf70c
A
5553 }
5554
0a7de745
A
5555 if (last_pg > start_pg) {
5556 /*
9bccf70c
A
5557 * we found a range of pages that must be filled
5558 * if the last page in this range is the last page of the file
5559 * we may have to clip the size of it to keep from reading past
5560 * the end of the last physical block associated with the file
5561 */
0a7de745 5562 upl_offset = start_pg * PAGE_SIZE;
9bccf70c
A
5563 io_size = (last_pg - start_pg) * PAGE_SIZE;
5564
0a7de745
A
5565 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
5566 io_size = filesize - (upl_f_offset + upl_offset);
5567 }
9bccf70c
A
5568
5569 /*
5570 * issue an asynchronous read to cluster_io
5571 */
91447636 5572 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
0a7de745 5573 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
1c79356b 5574
9bccf70c
A
5575 issued_io = 1;
5576 }
1c79356b 5577 }
0a7de745
A
5578 if (issued_io == 0) {
5579 ubc_upl_abort(upl, 0);
5580 }
9bccf70c
A
5581
5582 io_size = upl_size - start_offset;
0a7de745
A
5583
5584 if (io_size > resid) {
5585 io_size = resid;
5586 }
1c79356b
A
5587 f_offset += io_size;
5588 resid -= io_size;
5589 }
9bccf70c 5590
1c79356b 5591 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
0a7de745 5592 (int)f_offset, resid, retval, 0, 0);
1c79356b 5593
0a7de745 5594 return retval;
1c79356b
A
5595}
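/*
 * Illustrative sketch (not part of the original vfs_cluster.c source): the
 * run-finding scan advisory_read_ext() above performs over the UPL -- find
 * the next present page, extend the run until an absent page, and issue one
 * read per run.  A plain bool array stands in for upl_page_present(), so the
 * values here are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	bool present[] = { false, true, true, false, true, false, false, true };
	int pages_in_upl = (int)(sizeof(present) / sizeof(present[0]));
	int start_pg, last_pg;

	for (last_pg = 0; last_pg < pages_in_upl;) {
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
			if (present[start_pg]) {
				break;
			}
		}
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (!present[last_pg]) {
				break;
			}
		}
		if (last_pg > start_pg) {
			/* prints the runs [1,3), [4,5) and [7,8) */
			printf("issue read for pages [%d, %d)\n", start_pg, last_pg);
		}
	}
	return 0;
}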
5596
5597
9bccf70c 5598int
91447636 5599cluster_push(vnode_t vp, int flags)
2d21ac55 5600{
0a7de745 5601 return cluster_push_ext(vp, flags, NULL, NULL);
2d21ac55
A
5602}
5603
5604
5605int
5606cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
813fb2f6
A
5607{
5608 return cluster_push_err(vp, flags, callback, callback_arg, NULL);
5609}
5610
5611/* write errors via err, but return the number of clusters written */
5612int
5613cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
9bccf70c 5614{
0a7de745
A
5615 int retval;
5616 int my_sparse_wait = 0;
5617 struct cl_writebehind *wbp;
5618 int local_err = 0;
9bccf70c 5619
0a7de745 5620 if (err) {
813fb2f6 5621 *err = 0;
0a7de745 5622 }
813fb2f6 5623
0a7de745
A
5624 if (!UBCINFOEXISTS(vp)) {
5625 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
5626 return 0;
91447636
A
5627 }
5628 /* return if deferred write is set */
5629 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
0a7de745 5630 return 0;
91447636
A
5631 }
5632 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
0a7de745
A
5633 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
5634 return 0;
91447636 5635 }
fe8ab488 5636 if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
0a7de745 5637 lck_mtx_unlock(&wbp->cl_lockw);
9bccf70c 5638
0a7de745
A
5639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
5640 return 0;
91447636 5641 }
9bccf70c 5642 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
0a7de745 5643 wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
b0d623f7
A
5644
5645 /*
5646 * if we have an fsync in progress, we don't want to allow any additional
5647 * sync/fsync/close(s) to occur until it finishes.
5648 * note that it's possible for writes to continue to occur to this file
5649 * while we're waiting and also once the fsync starts to clean if we're
5650 * in the sparse map case
5651 */
5652 while (wbp->cl_sparse_wait) {
39037602 5653 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
b0d623f7
A
5654
5655 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5656
39037602 5657 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
b0d623f7
A
5658 }
5659 if (flags & IO_SYNC) {
5660 my_sparse_wait = 1;
5661 wbp->cl_sparse_wait = 1;
9bccf70c 5662
b0d623f7
A
5663 /*
5664 * this is an fsync (or equivalent)... we must wait for any existing async
5665 * cleaning operations to complete before we evaluate the current state
5666 * and finish cleaning... this ensures that all writes issued before this
5667 * fsync actually get cleaned to the disk before this fsync returns
5668 */
5669 while (wbp->cl_sparse_pushes) {
39037602 5670 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
b0d623f7
A
5671
5672 msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5673
39037602 5674 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
b0d623f7
A
5675 }
5676 }
91447636 5677 if (wbp->cl_scmap) {
0a7de745 5678 void *scmap;
b0d623f7
A
5679
5680 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
b0d623f7
A
5681 scmap = wbp->cl_scmap;
5682 wbp->cl_scmap = NULL;
5683
5684 wbp->cl_sparse_pushes++;
5685
5686 lck_mtx_unlock(&wbp->cl_lockw);
5687
d9a64523 5688 retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
b0d623f7
A
5689
5690 lck_mtx_lock(&wbp->cl_lockw);
9bccf70c 5691
b0d623f7 5692 wbp->cl_sparse_pushes--;
d9a64523
A
5693
5694 if (retval) {
5695 if (wbp->cl_scmap != NULL) {
5696 panic("cluster_push_err: Expected NULL cl_scmap\n");
5697 }
5698
5699 wbp->cl_scmap = scmap;
5700 }
0a7de745
A
5701
5702 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
b0d623f7 5703 wakeup((caddr_t)&wbp->cl_sparse_pushes);
0a7de745 5704 }
b0d623f7 5705 } else {
0a7de745 5706 retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
b0d623f7 5707 }
d9a64523
A
5708
5709 local_err = retval;
5710
0a7de745 5711 if (err) {
813fb2f6 5712 *err = retval;
0a7de745 5713 }
55e303ae 5714 retval = 1;
813fb2f6 5715 } else {
d9a64523 5716 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
0a7de745 5717 if (err) {
d9a64523 5718 *err = local_err;
0a7de745 5719 }
b0d623f7 5720 }
91447636
A
5721 lck_mtx_unlock(&wbp->cl_lockw);
5722
0a7de745
A
5723 if (flags & IO_SYNC) {
5724 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
5725 }
9bccf70c 5726
b0d623f7
A
5727 if (my_sparse_wait) {
5728 /*
5729 * I'm the owner of the serialization token
5730 * clear it and wakeup anyone that is waiting
5731 * for me to finish
5732 */
5733 lck_mtx_lock(&wbp->cl_lockw);
5734
5735 wbp->cl_sparse_wait = 0;
5736 wakeup((caddr_t)&wbp->cl_sparse_wait);
5737
5738 lck_mtx_unlock(&wbp->cl_lockw);
5739 }
55e303ae 5740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
0a7de745 5741 wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
9bccf70c 5742
0a7de745 5743 return retval;
55e303ae 5744}
9bccf70c 5745
9bccf70c 5746
91447636
A
5747__private_extern__ void
5748cluster_release(struct ubc_info *ubc)
55e303ae 5749{
0a7de745 5750 struct cl_writebehind *wbp;
91447636
A
5751 struct cl_readahead *rap;
5752
5753 if ((wbp = ubc->cl_wbehind)) {
0a7de745 5754 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
9bccf70c 5755
0a7de745
A
5756 if (wbp->cl_scmap) {
5757 vfs_drt_control(&(wbp->cl_scmap), 0);
5758 }
91447636 5759 } else {
0a7de745 5760 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
91447636 5761 }
9bccf70c 5762
91447636 5763 rap = ubc->cl_rahead;
55e303ae 5764
91447636 5765 if (wbp != NULL) {
0a7de745
A
5766 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
5767 FREE_ZONE(wbp, sizeof *wbp, M_CLWRBEHIND);
91447636
A
5768 }
5769 if ((rap = ubc->cl_rahead)) {
0a7de745
A
5770 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
5771 FREE_ZONE(rap, sizeof *rap, M_CLRDAHEAD);
55e303ae 5772 }
91447636
A
5773 ubc->cl_rahead = NULL;
5774 ubc->cl_wbehind = NULL;
5775
b0d623f7 5776 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
91447636
A
5777}
5778
5779
9bccf70c 5780static int
d9a64523 5781cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
9bccf70c 5782{
0a7de745 5783 int cl_index;
9bccf70c
A
5784 int cl_index1;
5785 int min_index;
0a7de745 5786 int cl_len;
55e303ae 5787 int cl_pushed = 0;
91447636 5788 struct cl_wextent l_clusters[MAX_CLUSTERS];
b0d623f7 5789 u_int max_cluster_pgcount;
813fb2f6 5790 int error = 0;
b0d623f7
A
5791
5792 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
9bccf70c 5793 /*
91447636
A
5794 * the write behind context exists and has
5795 * already been locked...
2d21ac55 5796 */
0a7de745
A
5797 if (wbp->cl_number == 0) {
5798 /*
2d21ac55
A
5799 * no clusters to push
5800 * return number of empty slots
5801 */
0a7de745
A
5802 return MAX_CLUSTERS;
5803 }
5804
2d21ac55 5805 /*
9bccf70c 5806 * make a local 'sorted' copy of the clusters
91447636 5807 * and clear wbp->cl_number so that new clusters can
9bccf70c
A
5808 * be developed
5809 */
91447636 5810 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
0a7de745
A
5811 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
5812 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
5813 continue;
5814 }
5815 if (min_index == -1) {
5816 min_index = cl_index1;
5817 } else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
5818 min_index = cl_index1;
5819 }
5820 }
5821 if (min_index == -1) {
5822 break;
5823 }
5824
5825 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
91447636 5826 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
2d21ac55 5827 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
9bccf70c 5828
0a7de745 5829 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
9bccf70c 5830 }
91447636
A
5831 wbp->cl_number = 0;
5832
5833 cl_len = cl_index;
9bccf70c 5834
39037602 5835 /* skip switching to the sparse cluster mechanism if on diskimage */
0a7de745
A
5836 if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
5837 !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
55e303ae 5838 int i;
0a7de745 5839
55e303ae
A
5840 /*
5841 * determine if we appear to be writing the file sequentially
5842 * if not, by returning without having pushed any clusters
5843 * we will cause this vnode to be pushed into the sparse cluster mechanism
5844 * used for managing more random I/O patterns
5845 *
5846 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
2d21ac55 5847 * that's why we're in try_push with PUSH_DELAY...
55e303ae
A
5848 *
5849 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
5850 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
91447636
A
5851 * so we can just make a simple pass through, up to, but not including the last one...
5852 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
55e303ae 5853 * are sequential
0a7de745 5854 *
55e303ae
A
5855 * we let the last one be partial as long as it was adjacent to the previous one...
5856 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
5857 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
5858 */
5859 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
0a7de745
A
5860 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
5861 goto dont_try;
5862 }
5863 if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
5864 goto dont_try;
5865 }
55e303ae
A
5866 }
5867 }
0a7de745 5868 if (vm_initiated == TRUE) {
d9a64523 5869 lck_mtx_unlock(&wbp->cl_lockw);
0a7de745 5870 }
d9a64523 5871
55e303ae 5872 for (cl_index = 0; cl_index < cl_len; cl_index++) {
0a7de745
A
5873 int flags;
5874 struct cl_extent cl;
813fb2f6 5875 int retval;
91447636 5876
0a7de745 5877 flags = io_flags & (IO_PASSIVE | IO_CLOSE);
6d2010ae 5878
0a7de745 5879 /*
91447636 5880 * try to push each cluster in turn...
9bccf70c 5881 */
0a7de745
A
5882 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
5883 flags |= IO_NOCACHE;
5884 }
2d21ac55 5885
0a7de745
A
5886 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
5887 flags |= IO_PASSIVE;
5888 }
2d21ac55 5889
0a7de745
A
5890 if (push_flag & PUSH_SYNC) {
5891 flags |= IO_SYNC;
5892 }
2d21ac55 5893
91447636
A
5894 cl.b_addr = l_clusters[cl_index].b_addr;
5895 cl.e_addr = l_clusters[cl_index].e_addr;
9bccf70c 5896
d9a64523 5897 retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);
9bccf70c 5898
d9a64523
A
5899 if (retval == 0) {
5900 cl_pushed++;
91447636 5901
d9a64523
A
5902 l_clusters[cl_index].b_addr = 0;
5903 l_clusters[cl_index].e_addr = 0;
5904 } else if (error == 0) {
5905 error = retval;
5906 }
91447636 5907
0a7de745
A
5908 if (!(push_flag & PUSH_ALL)) {
5909 break;
5910 }
9bccf70c 5911 }
0a7de745 5912 if (vm_initiated == TRUE) {
d9a64523 5913 lck_mtx_lock(&wbp->cl_lockw);
0a7de745 5914 }
d9a64523 5915
0a7de745 5916 if (err) {
813fb2f6 5917 *err = error;
0a7de745 5918 }
813fb2f6 5919
55e303ae 5920dont_try:
9bccf70c 5921 if (cl_len > cl_pushed) {
0a7de745
A
5922 /*
5923 * we didn't push all of the clusters, so
5924 * lets try to merge them back in to the vnode
5925 */
5926 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
5927 /*
9bccf70c 5928 * we picked up some new clusters while we were trying to
91447636
A
5929 * push the old ones... this can happen because I've dropped
5930 * the vnode lock... the sum of the
9bccf70c 5931 * leftovers plus the new cluster count exceeds our ability
55e303ae 5932 * to represent them, so switch to the sparse cluster mechanism
91447636
A
5933 *
5934 * collect the active public clusters...
9bccf70c 5935 */
0a7de745 5936 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
55e303ae 5937
0a7de745
A
5938 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
5939 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
5940 continue;
5941 }
5942 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
91447636 5943 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
2d21ac55 5944 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
9bccf70c 5945
55e303ae 5946 cl_index1++;
9bccf70c 5947 }
55e303ae
A
5948 /*
5949 * update the cluster count
5950 */
91447636 5951 wbp->cl_number = cl_index1;
55e303ae 5952
0a7de745
A
5953 /*
5954 * and collect the original clusters that were moved into the
55e303ae
A
5955 * local storage for sorting purposes
5956 */
0a7de745 5957 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
9bccf70c 5958 } else {
0a7de745 5959 /*
9bccf70c
A
5960 * we've got room to merge the leftovers back in
5961 * just append them starting at the next 'hole'
91447636 5962 * represented by wbp->cl_number
9bccf70c 5963 */
0a7de745
A
5964 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
5965 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
5966 continue;
5967 }
9bccf70c 5968
0a7de745 5969 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
91447636 5970 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
2d21ac55 5971 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
9bccf70c 5972
9bccf70c
A
5973 cl_index1++;
5974 }
5975 /*
5976 * update the cluster count
5977 */
91447636 5978 wbp->cl_number = cl_index1;
9bccf70c
A
5979 }
5980 }
0a7de745 5981 return MAX_CLUSTERS - wbp->cl_number;
9bccf70c
A
5982}
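/*
 * Illustrative sketch (not part of the original vfs_cluster.c source): the
 * sequential-write test cluster_try_push() above applies under PUSH_DELAY --
 * every cluster except the last must be full (max_cluster_pgcount pages) and
 * must butt up against the next one.  The extents below are hypothetical
 * page ranges, with e_addr exclusive.
 */
#include <stdint.h>
#include <stdio.h>

struct extent {
	int64_t b_addr;         /* first page of the cluster */
	int64_t e_addr;         /* one past the last page */
};

static int
looks_sequential(const struct extent *cl, int n, int64_t max_cluster_pgcount)
{
	int i;

	for (i = 0; i < n - 1; i++) {
		if (cl[i].e_addr - cl[i].b_addr != max_cluster_pgcount) {
			return 0;       /* an interior cluster is only partly full */
		}
		if (cl[i].e_addr != cl[i + 1].b_addr) {
			return 0;       /* gap between clusters: not sequential */
		}
	}
	return 1;
}

int
main(void)
{
	struct extent seq[] = { { 0, 8 }, { 8, 16 }, { 16, 20 } };      /* full, full, partial tail */
	struct extent rnd[] = { { 0, 8 }, { 32, 40 }, { 40, 44 } };     /* gap after the first cluster */

	/* prints: 1 0 */
	printf("%d %d\n", looks_sequential(seq, 3, 8), looks_sequential(rnd, 3, 8));
	return 0;
}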
5983
5984
5985
5986static int
d9a64523 5987cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
0a7de745 5988 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
1c79356b 5989{
1c79356b
A
5990 upl_page_info_t *pl;
5991 upl_t upl;
5992 vm_offset_t upl_offset;
5993 int upl_size;
0a7de745
A
5994 off_t upl_f_offset;
5995 int pages_in_upl;
1c79356b
A
5996 int start_pg;
5997 int last_pg;
5998 int io_size;
5999 int io_flags;
55e303ae 6000 int upl_flags;
2d21ac55 6001 int bflag;
1c79356b 6002 int size;
91447636
A
6003 int error = 0;
6004 int retval;
1c79356b
A
6005 kern_return_t kret;
6006
0a7de745 6007 if (flags & IO_PASSIVE) {
6d2010ae 6008 bflag = CL_PASSIVE;
0a7de745 6009 } else {
6d2010ae 6010 bflag = 0;
0a7de745 6011 }
1c79356b 6012
0a7de745 6013 if (flags & IO_SKIP_ENCRYPTION) {
fe8ab488 6014 bflag |= CL_ENCRYPTED;
0a7de745 6015 }
fe8ab488 6016
9bccf70c 6017 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
0a7de745 6018 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
9bccf70c 6019
91447636 6020 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
0a7de745 6021 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
1c79356b 6022
0a7de745 6023 return 0;
9bccf70c 6024 }
1c79356b 6025 upl_size = pages_in_upl * PAGE_SIZE;
91447636 6026 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
1c79356b 6027
9bccf70c 6028 if (upl_f_offset + upl_size >= EOF) {
0a7de745
A
6029 if (upl_f_offset >= EOF) {
6030 /*
6031 * must have truncated the file and missed
9bccf70c
A
6032 * clearing a dangling cluster (i.e. it's completely
6033 * beyond the new EOF)
6034 */
0a7de745 6035 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
9bccf70c 6036
0a7de745 6037 return 0;
9bccf70c
A
6038 }
6039 size = EOF - upl_f_offset;
1c79356b 6040
55e303ae 6041 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
9bccf70c 6042 pages_in_upl = upl_size / PAGE_SIZE;
0a7de745
A
6043 } else {
6044 size = upl_size;
6045 }
55e303ae 6046
d9a64523
A
6047
6048 if (vm_initiated) {
0a7de745
A
6049 vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
6050 UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);
d9a64523 6051
0a7de745 6052 return error;
d9a64523 6053 }
55e303ae
A
6054 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
6055
91447636
A
6056 /*
6057 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
0a7de745 6058 *
91447636
A
6059 * - only pages that are currently dirty are returned... these are the ones we need to clean
6060 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
6061 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
0a7de745 6062 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
91447636
A
6063 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
6064 *
6065 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
6066 */
6067
0a7de745
A
6068 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) {
6069 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
6070 } else {
6071 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
6072 }
55e303ae 6073
5ba3f43e 6074 kret = ubc_create_upl_kernel(vp,
0a7de745
A
6075 upl_f_offset,
6076 upl_size,
6077 &upl,
6078 &pl,
6079 upl_flags,
6080 VM_KERN_MEMORY_FILE);
6081 if (kret != KERN_SUCCESS) {
6082 panic("cluster_push: failed to get pagelist");
6083 }
1c79356b 6084
b0d623f7 6085 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
9bccf70c 6086
55e303ae
A
6087 /*
6088 * since we only asked for the dirty pages back
6089 * it's possible that we may only get a few or even none, so...
6090 * before we start marching forward, we must make sure we know
6091 * where the last present page is in the UPL, otherwise we could
6092 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
6093 * employed by commit_range and abort_range.
6094 */
6095 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
0a7de745
A
6096 if (upl_page_present(pl, last_pg)) {
6097 break;
6098 }
9bccf70c 6099 }
55e303ae 6100 pages_in_upl = last_pg + 1;
1c79356b 6101
55e303ae 6102 if (pages_in_upl == 0) {
0a7de745 6103 ubc_upl_abort(upl, 0);
1c79356b 6104
55e303ae 6105 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
0a7de745
A
6106 return 0;
6107 }
55e303ae 6108
0a7de745
A
6109 for (last_pg = 0; last_pg < pages_in_upl;) {
6110 /*
55e303ae 6111 * find the next dirty page in the UPL
0a7de745 6112 * this will become the first page in the
55e303ae
A
6113 * next I/O to generate
6114 */
1c79356b 6115 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
0a7de745 6116 if (upl_dirty_page(pl, start_pg)) {
1c79356b 6117 break;
0a7de745
A
6118 }
6119 if (upl_page_present(pl, start_pg)) {
6120 /*
55e303ae
A
6121 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
6122 * just release these unchanged since we're not going
6123 * to steal them or change their state
6124 */
0a7de745
A
6125 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
6126 }
1c79356b 6127 }
0a7de745
A
6128 if (start_pg >= pages_in_upl) {
6129 /*
55e303ae
A
6130 * done... no more dirty pages to push
6131 */
0a7de745
A
6132 break;
6133 }
6134 if (start_pg > last_pg) {
6135 /*
55e303ae
A
6136 * skipped over some non-dirty pages
6137 */
6138 size -= ((start_pg - last_pg) * PAGE_SIZE);
0a7de745 6139 }
1c79356b 6140
55e303ae
A
6141 /*
6142 * find a range of dirty pages to write
6143 */
1c79356b 6144 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
0a7de745 6145 if (!upl_dirty_page(pl, last_pg)) {
1c79356b 6146 break;
0a7de745 6147 }
1c79356b
A
6148 }
6149 upl_offset = start_pg * PAGE_SIZE;
6150
6151 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
6152
2d21ac55 6153 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
91447636 6154
0a7de745
A
6155 if (!(flags & IO_SYNC)) {
6156 io_flags |= CL_ASYNC;
6157 }
91447636 6158
0a7de745
A
6159 if (flags & IO_CLOSE) {
6160 io_flags |= CL_CLOSE;
6161 }
6d2010ae 6162
0a7de745 6163 if (flags & IO_NOCACHE) {
316670eb 6164 io_flags |= CL_NOCACHE;
0a7de745 6165 }
316670eb 6166
91447636 6167 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
0a7de745 6168 io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
1c79356b 6169
0a7de745
A
6170 if (error == 0 && retval) {
6171 error = retval;
6172 }
1c79356b
A
6173
6174 size -= io_size;
6175 }
d9a64523 6176 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);
9bccf70c 6177
0a7de745 6178 return error;
1c79356b 6179}
b4c24cb9
A
6180
6181
91447636
A
6182/*
6183 * sparse_cluster_switch is called with the write behind lock held
6184 */
d9a64523
A
6185static int
6186sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
b4c24cb9 6187{
0a7de745
A
6188 int cl_index;
6189 int error;
b4c24cb9 6190
d9a64523 6191 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
91447636
A
6192
6193 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
0a7de745 6194 int flags;
91447636
A
6195 struct cl_extent cl;
6196
0a7de745
A
6197 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
6198 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
6199 if (flags & UPL_POP_DIRTY) {
6200 cl.e_addr = cl.b_addr + 1;
b4c24cb9 6201
0a7de745 6202 error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
d9a64523
A
6203
6204 if (error) {
6205 break;
6206 }
91447636 6207 }
55e303ae
A
6208 }
6209 }
6210 }
d9a64523
A
6211 wbp->cl_number -= cl_index;
6212
6213 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
91447636 6214
d9a64523 6215 return error;
55e303ae
A
6216}
6217
6218
91447636 6219/*
b0d623f7
A
6220 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
6221 * still associated with the write-behind context... however, if the scmap has been disassociated
6222 * from the write-behind context (the cluster_push case), the wb lock is not held
91447636 6223 */
813fb2f6 6224static int
d9a64523 6225sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
0a7de745 6226 int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
55e303ae 6227{
0a7de745
A
6228 struct cl_extent cl;
6229 off_t offset;
6230 u_int length;
d9a64523 6231 void *l_scmap;
813fb2f6 6232 int error = 0;
55e303ae 6233
39037602 6234 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
55e303ae 6235
0a7de745
A
6236 if (push_flag & PUSH_ALL) {
6237 vfs_drt_control(scmap, 1);
6238 }
55e303ae 6239
d9a64523
A
6240 l_scmap = *scmap;
6241
55e303ae 6242 for (;;) {
813fb2f6 6243 int retval;
d9a64523 6244
0a7de745 6245 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
55e303ae 6246 break;
0a7de745 6247 }
55e303ae 6248
0a7de745
A
6249 if (vm_initiated == TRUE) {
6250 lck_mtx_unlock(&wbp->cl_lockw);
6251 }
d9a64523 6252
91447636
A
6253 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
6254 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
6255
d9a64523 6256 retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
0a7de745 6257 if (error == 0 && retval) {
813fb2f6 6258 error = retval;
0a7de745 6259 }
2d21ac55 6260
d9a64523 6261 if (vm_initiated == TRUE) {
0a7de745 6262 lck_mtx_lock(&wbp->cl_lockw);
d9a64523 6263
0a7de745
A
6264 if (*scmap != l_scmap) {
6265 break;
6266 }
d9a64523
A
6267 }
6268
6269 if (error) {
6270 if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
6271 panic("Failed to restore dirty state on failure\n");
6272 }
6273
6274 break;
6275 }
6276
0a7de745
A
6277 if (!(push_flag & PUSH_ALL)) {
6278 break;
d9a64523 6279 }
55e303ae 6280 }
d9a64523 6281 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
813fb2f6
A
6282
6283 return error;
55e303ae
A
6284}
6285
6286
91447636
A
6287/*
6288 * sparse_cluster_add is called with the write behind lock held
6289 */
d9a64523
A
6290static int
6291sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
0a7de745 6292 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
55e303ae 6293{
0a7de745
A
6294 u_int new_dirty;
6295 u_int length;
6296 off_t offset;
6297 int error;
6298 int push_flag = 0; /* Is this a valid value? */
55e303ae 6299
b0d623f7 6300 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
55e303ae 6301
91447636
A
6302 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6303 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
55e303ae 6304
b0d623f7 6305 while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
0a7de745 6306 /*
55e303ae
A
6307 * no room left in the map
6308 * only a partial update was done
6309 * push out some pages and try again
6310 */
0a7de745
A
6311
6312 if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
6313 push_flag = 0;
6314 }
6315
6316 error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);
d9a64523
A
6317
6318 if (error) {
6319 break;
6320 }
55e303ae
A
6321
6322 offset += (new_dirty * PAGE_SIZE_64);
6323 length -= (new_dirty * PAGE_SIZE);
6324 }
d9a64523
A
6325 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6326
6327 return error;
55e303ae
A
6328}
6329
6330
6331static int
2d21ac55 6332cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
55e303ae 6333{
0a7de745
A
6334 upl_page_info_t *pl;
6335 upl_t upl;
6336 addr64_t ubc_paddr;
6337 kern_return_t kret;
6338 int error = 0;
6339 int did_read = 0;
6340 int abort_flags;
6341 int upl_flags;
2d21ac55
A
6342 int bflag;
6343
0a7de745 6344 if (flags & IO_PASSIVE) {
6d2010ae 6345 bflag = CL_PASSIVE;
0a7de745 6346 } else {
6d2010ae 6347 bflag = 0;
0a7de745 6348 }
55e303ae 6349
0a7de745 6350 if (flags & IO_NOCACHE) {
316670eb 6351 bflag |= CL_NOCACHE;
0a7de745 6352 }
316670eb 6353
91447636 6354 upl_flags = UPL_SET_LITE;
2d21ac55 6355
0a7de745 6356 if (!(flags & CL_READ)) {
91447636
A
6357 /*
6358 * "write" operation: let the UPL subsystem know
6359 * that we intend to modify the buffer cache pages
6360 * we're gathering.
6361 */
6362 upl_flags |= UPL_WILL_MODIFY;
2d21ac55 6363 } else {
0a7de745 6364 /*
2d21ac55
A
6365 * indicate that there is no need to pull the
6366 * mapping for this page... we're only going
6367 * to read from it, not modify it.
6368 */
6369 upl_flags |= UPL_FILE_IO;
91447636 6370 }
0a7de745
A
6371 kret = ubc_create_upl_kernel(vp,
6372 uio->uio_offset & ~PAGE_MASK_64,
6373 PAGE_SIZE,
6374 &upl,
6375 &pl,
6376 upl_flags,
6377 VM_KERN_MEMORY_FILE);
6378
6379 if (kret != KERN_SUCCESS) {
6380 return EINVAL;
6381 }
6382
6383 if (!upl_valid_page(pl, 0)) {
6384 /*
6385 * issue a synchronous read to cluster_io
6386 */
6387 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6388 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6389 if (error) {
6390 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
6391
6392 return error;
6393 }
91447636 6394 did_read = 1;
0a7de745
A
6395 }
6396 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
b4c24cb9 6397
55e303ae
A
6398/*
6399 * NOTE: There is no prototype for the following in BSD. It, and the definitions
6400 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
6401 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
6402 * way to do so without exporting them to kexts as well.
6403 */
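/*
 * For reference: matching the hard-coded masks below against the
 * commented-out calls suggests cppvPsnk = 1, cppvPsrc = 2, cppvFsnk = 4
 * and cppvFsrc = 8; the authoritative values live in the osfmk header
 * referenced above.
 */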
0a7de745 6404 if (flags & CL_READ) {
55e303ae 6405// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
0a7de745
A
6406 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
6407 } else {
4a249263 6408// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
0a7de745
A
6409 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
6410 }
6411 if (!(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
6412 /*
55e303ae
A
6413 * issue a synchronous write to cluster_io
6414 */
91447636 6415 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
0a7de745
A
6416 bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6417 }
6418 if (error == 0) {
6419 uio_update(uio, (user_size_t)xsize);
de355530 6420 }
cc9f6e38 6421
0a7de745
A
6422 if (did_read) {
6423 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
6424 } else {
6425 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
6426 }
91447636
A
6427
6428 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
0a7de745
A
6429
6430 return error;
55e303ae
A
6431}
6432
55e303ae 6433int
2d21ac55 6434cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
55e303ae 6435{
0a7de745 6436 int pg_offset;
55e303ae 6437 int pg_index;
0a7de745 6438 int csize;
55e303ae
A
6439 int segflg;
6440 int retval = 0;
0a7de745 6441 int xsize;
55e303ae 6442 upl_page_info_t *pl;
0a7de745 6443 int dirty_count;
55e303ae 6444
2d21ac55
A
6445 xsize = *io_resid;
6446
55e303ae 6447 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
0a7de745 6448 (int)uio->uio_offset, upl_offset, xsize, 0, 0);
55e303ae
A
6449
6450 segflg = uio->uio_segflg;
6451
0a7de745
A
6452 switch (segflg) {
6453 case UIO_USERSPACE32:
6454 case UIO_USERISPACE32:
91447636
A
6455 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6456 break;
6457
0a7de745
A
6458 case UIO_USERSPACE:
6459 case UIO_USERISPACE:
55e303ae
A
6460 uio->uio_segflg = UIO_PHYS_USERSPACE;
6461 break;
6462
0a7de745
A
6463 case UIO_USERSPACE64:
6464 case UIO_USERISPACE64:
91447636
A
6465 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6466 break;
6467
0a7de745 6468 case UIO_SYSSPACE:
55e303ae
A
6469 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6470 break;
6471 }
6472 pl = ubc_upl_pageinfo(upl);
6473
6474 pg_index = upl_offset / PAGE_SIZE;
6475 pg_offset = upl_offset & PAGE_MASK;
6476 csize = min(PAGE_SIZE - pg_offset, xsize);
6477
4bd07ac2 6478 dirty_count = 0;
55e303ae 6479 while (xsize && retval == 0) {
0a7de745 6480 addr64_t paddr;
55e303ae 6481
fe8ab488 6482 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
0a7de745 6483 if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
4bd07ac2 6484 dirty_count++;
0a7de745 6485 }
de355530 6486
55e303ae
A
6487 retval = uiomove64(paddr, csize, uio);
6488
6489 pg_index += 1;
6490 pg_offset = 0;
6491 xsize -= csize;
6492 csize = min(PAGE_SIZE, xsize);
6493 }
2d21ac55
A
6494 *io_resid = xsize;
6495
55e303ae
A
6496 uio->uio_segflg = segflg;
6497
39037602 6498 task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
55e303ae 6499 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
0a7de745
A
6500 (int)uio->uio_offset, xsize, retval, segflg, 0);
6501
6502 return retval;
55e303ae
A
6503}
6504
6505
6506int
91447636 6507cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
2d21ac55 6508{
0a7de745 6509 return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
2d21ac55
A
6510}
6511
6512
6513static int
6514cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
55e303ae
A
6515{
6516 int segflg;
6517 int io_size;
6518 int xsize;
6519 int start_offset;
55e303ae 6520 int retval = 0;
0a7de745 6521 memory_object_control_t control;
55e303ae 6522
2d21ac55 6523 io_size = *io_resid;
55e303ae
A
6524
6525 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
0a7de745 6526 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
55e303ae
A
6527
6528 control = ubc_getobject(vp, UBC_FLAGS_NONE);
2d21ac55 6529
55e303ae
A
6530 if (control == MEMORY_OBJECT_CONTROL_NULL) {
6531 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
0a7de745 6532 (int)uio->uio_offset, io_size, retval, 3, 0);
55e303ae 6533
0a7de745 6534 return 0;
55e303ae 6535 }
55e303ae
A
6536 segflg = uio->uio_segflg;
6537
0a7de745
A
6538 switch (segflg) {
6539 case UIO_USERSPACE32:
6540 case UIO_USERISPACE32:
91447636
A
6541 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6542 break;
6543
0a7de745
A
6544 case UIO_USERSPACE64:
6545 case UIO_USERISPACE64:
91447636
A
6546 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6547 break;
6548
0a7de745
A
6549 case UIO_USERSPACE:
6550 case UIO_USERISPACE:
55e303ae
A
6551 uio->uio_segflg = UIO_PHYS_USERSPACE;
6552 break;
6553
0a7de745 6554 case UIO_SYSSPACE:
55e303ae
A
6555 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6556 break;
6557 }
55e303ae 6558
0a7de745
A
6559 if ((io_size = *io_resid)) {
6560 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
91447636 6561 xsize = uio_resid(uio);
55e303ae 6562
2d21ac55 6563 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
0a7de745 6564 start_offset, io_size, mark_dirty, take_reference);
91447636
A
6565 xsize -= uio_resid(uio);
6566 io_size -= xsize;
55e303ae
A
6567 }
6568 uio->uio_segflg = segflg;
6569 *io_resid = io_size;
6570
55e303ae 6571 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
0a7de745 6572 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
55e303ae 6573
0a7de745 6574 return retval;
55e303ae
A
6575}
6576
6577
6578int
91447636 6579is_file_clean(vnode_t vp, off_t filesize)
55e303ae 6580{
0a7de745 6581 off_t f_offset;
55e303ae
A
6582 int flags;
6583 int total_dirty = 0;
6584
6585 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
0a7de745
A
6586 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6587 if (flags & UPL_POP_DIRTY) {
6588 total_dirty++;
55e303ae
A
6589 }
6590 }
6591 }
0a7de745
A
6592 if (total_dirty) {
6593 return EINVAL;
6594 }
55e303ae 6595
0a7de745 6596 return 0;
55e303ae
A
6597}
6598
6599
6600
6601/*
6602 * Dirty region tracking/clustering mechanism.
6603 *
6604 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6605 * dirty regions within a larger space (file). It is primarily intended to
6606 * support clustering in large files with many dirty areas.
6607 *
6608 * The implementation assumes that the dirty regions are pages.
6609 *
6610 * To represent dirty pages within the file, we store bit vectors in a
6611 * variable-size circular hash.
6612 */
6613
6614/*
6615 * Bitvector size. This determines the number of pages we group in a
6616 * single hashtable entry. Each hashtable entry is aligned to this
6617 * size within the file.
6618 */
0a7de745 6619#define DRT_BITVECTOR_PAGES ((1024 * 256) / PAGE_SIZE)
55e303ae
A
6620
6621/*
6622 * File offset handling.
6623 *
0a7de745 6624 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
3e170ce0 6625 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
55e303ae 6626 */
0a7de745
A
6627#define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6628#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
55e303ae
A
6629
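/*
 * Worked example (assuming 4 KiB pages): DRT_BITVECTOR_PAGES is
 * (1024 * 256) / 4096 = 64, so each hashtable entry covers a 256 KiB
 * window of the file. A dirty page at byte offset 0x42000 belongs to the
 * entry whose address is DRT_ALIGN_ADDRESS(0x42000) = 0x40000 and is
 * recorded as bit (0x42000 - 0x40000) / PAGE_SIZE = 2 in that entry's
 * bitvector.
 */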
6630/*
6631 * Hashtable address field handling.
6632 *
6633 * The low-order bits of the hashtable address are used to conserve
6634 * space.
6635 *
6636 * DRT_HASH_COUNT_MASK must be large enough to store the range
6637 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
6638 * to indicate that the bucket is actually unoccupied.
6639 */
0a7de745
A
6640#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
6641#define DRT_HASH_SET_ADDRESS(scm, i, a) \
6642 do { \
6643 (scm)->scm_hashtable[(i)].dhe_control = \
6644 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
55e303ae 6645 } while (0)
0a7de745
A
6646#define DRT_HASH_COUNT_MASK 0x1ff
6647#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
6648#define DRT_HASH_SET_COUNT(scm, i, c) \
6649 do { \
6650 (scm)->scm_hashtable[(i)].dhe_control = \
6651 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
55e303ae
A
6652 } while (0)
6653#define DRT_HASH_CLEAR(scm, i) \
0a7de745
A
6654 do { \
6655 (scm)->scm_hashtable[(i)].dhe_control = 0; \
55e303ae 6656 } while (0)
0a7de745
A
6657#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
6658#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
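/*
 * Example: DRT_HASH_COUNT_MASK is 0x1ff (511), while a real per-entry page
 * count can never exceed DRT_BITVECTOR_PAGES (at most 64), so a stored
 * count of 511 unambiguously serves as the "bucket unoccupied" sentinel
 * used by DRT_HASH_VACATE() / DRT_HASH_VACANT() above.
 */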
6659#define DRT_HASH_COPY(oscm, oi, scm, i) \
6660 do { \
6661 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
6662 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
55e303ae
A
 6663 } while (0)
6664
6665
d9a64523 6666#if CONFIG_EMBEDDED
55e303ae
A
6667/*
6668 * Hash table moduli.
6669 *
6670 * Since the hashtable entry's size is dependent on the size of
6671 * the bitvector, and since the hashtable size is constrained to
6672 * both being prime and fitting within the desired allocation
6673 * size, these values need to be manually determined.
6674 *
d9a64523 6676 * With the 64-bit per-entry bitvector, the entry size is 16 bytes.
55e303ae 6676 *
d9a64523
A
6677 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
6678 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
0a7de745 6679 * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
55e303ae 6680 */
d9a64523 6681
0a7de745
A
6682#define DRT_HASH_SMALL_MODULUS 251
6683#define DRT_HASH_LARGE_MODULUS 2039
6684#define DRT_HASH_XLARGE_MODULUS 8179
55e303ae 6685
b7266188
A
6686/*
6687 * Physical memory required before the large hash modulus is permitted.
6688 *
 6689 * On small memory systems, the large hash modulus can lead to physical
6690 * memory starvation, so we avoid using it there.
6691 */
0a7de745
A
6692#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
6693#define DRT_HASH_XLARGE_MEMORY_REQUIRED (8 * 1024LL * 1024LL * 1024LL) /* 8GiB */
b7266188 6694
0a7de745
A
6695#define DRT_SMALL_ALLOCATION 4096 /* 80 bytes spare */
6696#define DRT_LARGE_ALLOCATION 32768 /* 144 bytes spare */
6697#define DRT_XLARGE_ALLOCATION 131072 /* 208 bytes spare */
d9a64523
A
6698
6699#else
6700/*
6701 * Hash table moduli.
6702 *
6703 * Since the hashtable entry's size is dependent on the size of
6704 * the bitvector, and since the hashtable size is constrained to
6705 * both being prime and fitting within the desired allocation
6706 * size, these values need to be manually determined.
6707 *
 6708 * With the 64-bit per-entry bitvector, the entry size is 16 bytes.
6709 *
6710 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
6711 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
0a7de745 6712 * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
d9a64523
A
6713 */
6714
0a7de745
A
6715#define DRT_HASH_SMALL_MODULUS 1019
6716#define DRT_HASH_LARGE_MODULUS 8179
6717#define DRT_HASH_XLARGE_MODULUS 32749
d9a64523
A
6718
6719/*
6720 * Physical memory required before the large hash modulus is permitted.
6721 *
 6722 * On small memory systems, the large hash modulus can lead to physical
6723 * memory starvation, so we avoid using it there.
6724 */
0a7de745
A
6725#define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */
6726#define DRT_HASH_XLARGE_MEMORY_REQUIRED (32 * 1024LL * 1024LL * 1024LL) /* 32GiB */
d9a64523 6727
0a7de745
A
6728#define DRT_SMALL_ALLOCATION 16384 /* 80 bytes spare */
6729#define DRT_LARGE_ALLOCATION 131072 /* 208 bytes spare */
6730#define DRT_XLARGE_ALLOCATION 524288 /* 304 bytes spare */
d9a64523
A
6731
6732#endif
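/*
 * Quick arithmetic check of the "spare" figures above, using the 16-byte
 * entry size (non-CONFIG_EMBEDDED sizes shown): 1019 * 16 = 16304 and
 * 16384 - 16304 = 80; 8179 * 16 = 130864 and 131072 - 130864 = 208;
 * 32749 * 16 = 523984 and 524288 - 523984 = 304. The spare bytes also
 * accommodate the vfs_drt_clustermap header declared below.
 */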
55e303ae
A
6733
6734/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
6735
d9a64523
A
6736/*
6737 * Hashtable entry.
6738 */
6739struct vfs_drt_hashentry {
0a7de745 6740 u_int64_t dhe_control;
d9a64523 6741/*
0a7de745
A
6742 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
6743 * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
6744 * Since PAGE_SIZE is only known at boot time,
6745 * -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
6746 * -declare dhe_bitvector array for largest possible length
6747 */
d9a64523 6748#define MAX_DRT_BITVECTOR_PAGES ((1024 * 256) / (4 * 1024))
0a7de745 6749 u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
d9a64523
A
6750};
6751
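#if 0
/*
 * Illustrative size check (not built): dhe_control is 8 bytes and the
 * bitvector is MAX_DRT_BITVECTOR_PAGES / 32 = 2 u_int32_t words (8 bytes),
 * giving the 16-byte entry size that the hash moduli above assume.
 */
_Static_assert(sizeof(struct vfs_drt_hashentry) == 16, "unexpected DRT hash entry size");
#endif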
55e303ae
A
6752/*
6753 * Hashtable bitvector handling.
6754 *
6755 * Bitvector fields are 32 bits long.
6756 */
6757
0a7de745 6758#define DRT_HASH_SET_BIT(scm, i, bit) \
55e303ae
A
6759 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
6760
0a7de745 6761#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
55e303ae 6762 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
0a7de745
A
6763
6764#define DRT_HASH_TEST_BIT(scm, i, bit) \
55e303ae 6765 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
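/*
 * Example (4 KiB pages): page 37 of an entry's 256 KiB window lives in
 * dhe_bitvector[37 / 32] = dhe_bitvector[1] under mask 1 << (37 % 32),
 * i.e. 1 << 5.
 */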
0a7de745
A
6766
6767#define DRT_BITVECTOR_CLEAR(scm, i) \
d9a64523 6768 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
55e303ae 6769
0a7de745
A
6770#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
6771 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
6772 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
d9a64523 6773 (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
55e303ae
A
6774
6775/*
6776 * Dirty Region Tracking structure.
6777 *
6778 * The hashtable is allocated entirely inside the DRT structure.
6779 *
 6780 * The hash is a simple circular prime-modulus arrangement; the structure
6781 * is resized from small to large if it overflows.
6782 */
6783
6784struct vfs_drt_clustermap {
0a7de745
A
6785 u_int32_t scm_magic; /* sanity/detection */
6786#define DRT_SCM_MAGIC 0x12020003
6787 u_int32_t scm_modulus; /* current ring size */
6788 u_int32_t scm_buckets; /* number of occupied buckets */
6789 u_int32_t scm_lastclean; /* last entry we cleaned */
6790 u_int32_t scm_iskips; /* number of slot skips */
55e303ae
A
6791
6792 struct vfs_drt_hashentry scm_hashtable[0];
6793};
6794
6795
0a7de745
A
6796#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
6797#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
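/*
 * Example (4 KiB pages, small non-embedded modulus 1019): the hash key is
 * the DRT_ALIGN_ADDRESS()-aligned byte offset, so consecutive 256 KiB
 * windows land (256 * 1024) % 1019 = 261 buckets apart; collisions are
 * resolved by linear probing with DRT_HASH_NEXT() in vfs_drt_search_index()
 * and vfs_drt_get_index() below.
 */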
55e303ae
A
6798
6799/*
6800 * Debugging codes and arguments.
6801 */
0a7de745
A
6802#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
6803#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
6804#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
6805#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
6806#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
6807 * dirty */
6808 /* 0, setcount */
6809 /* 1 (clean, no map) */
6810 /* 2 (map alloc fail) */
6811 /* 3, resid (partial) */
6812#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
6813#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
6814 * lastclean, iskips */
6815
6816
6817static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
6818static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
6819static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
6820 u_int64_t offset, int *indexp);
6821static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
6822 u_int64_t offset,
6823 int *indexp,
6824 int recursed);
6825static kern_return_t vfs_drt_do_mark_pages(
6826 void **cmapp,
6827 u_int64_t offset,
6828 u_int length,
6829 u_int *setcountp,
6830 int dirty);
6831static void vfs_drt_trace(
55e303ae
A
6832 struct vfs_drt_clustermap *cmap,
6833 int code,
6834 int arg1,
6835 int arg2,
6836 int arg3,
6837 int arg4);
6838
6839
6840/*
6841 * Allocate and initialise a sparse cluster map.
6842 *
6843 * Will allocate a new map, resize or compact an existing map.
6844 *
6845 * XXX we should probably have at least one intermediate map size,
6846 * as the 1:16 ratio seems a bit drastic.
6847 */
6848static kern_return_t
6849vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
6850{
0a7de745
A
6851 struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
6852 kern_return_t kret = KERN_SUCCESS;
6853 u_int64_t offset = 0;
6854 u_int32_t i = 0;
6855 int modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;
55e303ae
A
6856
6857 ocmap = NULL;
0a7de745 6858 if (cmapp != NULL) {
55e303ae 6859 ocmap = *cmapp;
0a7de745
A
6860 }
6861
55e303ae
A
6862 /*
6863 * Decide on the size of the new map.
6864 */
6865 if (ocmap == NULL) {
0a7de745
A
6866 modulus_size = DRT_HASH_SMALL_MODULUS;
6867 map_size = DRT_SMALL_ALLOCATION;
55e303ae
A
6868 } else {
6869 /* count the number of active buckets in the old map */
6870 active_buckets = 0;
6871 for (i = 0; i < ocmap->scm_modulus; i++) {
6872 if (!DRT_HASH_VACANT(ocmap, i) &&
0a7de745 6873 (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
55e303ae 6874 active_buckets++;
0a7de745 6875 }
55e303ae
A
6876 }
6877 /*
6878 * If we're currently using the small allocation, check to
6879 * see whether we should grow to the large one.
6880 */
6881 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
0a7de745 6882 /*
b7266188
A
6883 * If the ring is nearly full and we are allowed to
6884 * use the large modulus, upgrade.
6885 */
6886 if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
6887 (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
0a7de745
A
6888 modulus_size = DRT_HASH_LARGE_MODULUS;
6889 map_size = DRT_LARGE_ALLOCATION;
6890 } else {
6891 modulus_size = DRT_HASH_SMALL_MODULUS;
6892 map_size = DRT_SMALL_ALLOCATION;
6893 }
6894 } else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
6895 if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
6896 (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
6897 modulus_size = DRT_HASH_XLARGE_MODULUS;
6898 map_size = DRT_XLARGE_ALLOCATION;
55e303ae 6899 } else {
cb323159
A
6900 /*
6901 * If the ring is completely full and we can't
6902 * expand, there's nothing useful for us to do.
6903 * Behave as though we had compacted into the new
6904 * array and return.
6905 */
6906 return KERN_SUCCESS;
55e303ae
A
6907 }
6908 } else {
0a7de745
A
6909 /* already using the xlarge modulus */
6910 modulus_size = DRT_HASH_XLARGE_MODULUS;
6911 map_size = DRT_XLARGE_ALLOCATION;
6912
55e303ae
A
6913 /*
6914 * If the ring is completely full, there's
6915 * nothing useful for us to do. Behave as
6916 * though we had compacted into the new
6917 * array and return.
6918 */
0a7de745
A
6919 if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
6920 return KERN_SUCCESS;
6921 }
55e303ae
A
6922 }
6923 }
6924
6925 /*
6926 * Allocate and initialise the new map.
6927 */
6928
0a7de745
A
6929 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size, VM_KERN_MEMORY_FILE);
6930 if (kret != KERN_SUCCESS) {
6931 return kret;
6932 }
55e303ae 6933 cmap->scm_magic = DRT_SCM_MAGIC;
0a7de745 6934 cmap->scm_modulus = modulus_size;
55e303ae
A
6935 cmap->scm_buckets = 0;
6936 cmap->scm_lastclean = 0;
6937 cmap->scm_iskips = 0;
6938 for (i = 0; i < cmap->scm_modulus; i++) {
0a7de745 6939 DRT_HASH_CLEAR(cmap, i);
55e303ae
A
6940 DRT_HASH_VACATE(cmap, i);
6941 DRT_BITVECTOR_CLEAR(cmap, i);
6942 }
6943
6944 /*
6945 * If there's an old map, re-hash entries from it into the new map.
6946 */
6947 copycount = 0;
6948 if (ocmap != NULL) {
6949 for (i = 0; i < ocmap->scm_modulus; i++) {
6950 /* skip empty buckets */
6951 if (DRT_HASH_VACANT(ocmap, i) ||
0a7de745 6952 (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
55e303ae 6953 continue;
0a7de745 6954 }
55e303ae
A
6955 /* get new index */
6956 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
6957 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
6958 if (kret != KERN_SUCCESS) {
6959 /* XXX need to bail out gracefully here */
6960 panic("vfs_drt: new cluster map mysteriously too small");
2d21ac55 6961 index = 0;
55e303ae
A
6962 }
6963 /* copy */
6964 DRT_HASH_COPY(ocmap, i, cmap, index);
6965 copycount++;
6966 }
6967 }
6968
6969 /* log what we've done */
6970 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
0a7de745 6971
55e303ae 6972 /*
0a7de745 6973 * It's important to ensure that *cmapp always points to
55e303ae
A
6974 * a valid map, so we must overwrite it before freeing
6975 * the old map.
6976 */
6977 *cmapp = cmap;
6978 if (ocmap != NULL) {
6979 /* emit stats into trace buffer */
6980 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
0a7de745
A
6981 ocmap->scm_modulus,
6982 ocmap->scm_buckets,
6983 ocmap->scm_lastclean,
6984 ocmap->scm_iskips);
55e303ae
A
6985
6986 vfs_drt_free_map(ocmap);
6987 }
0a7de745 6988 return KERN_SUCCESS;
55e303ae
A
6989}
6990
6991
6992/*
6993 * Free a sparse cluster map.
6994 */
6995static kern_return_t
6996vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
6997{
0a7de745
A
6998 vm_size_t map_size = 0;
6999
7000 if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
7001 map_size = DRT_SMALL_ALLOCATION;
7002 } else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
7003 map_size = DRT_LARGE_ALLOCATION;
7004 } else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7005 map_size = DRT_XLARGE_ALLOCATION;
7006 } else {
7007 panic("vfs_drt_free_map: Invalid modulus %d\n", cmap->scm_modulus);
7008 }
7009
7010 kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
7011 return KERN_SUCCESS;
55e303ae
A
7012}
7013
7014
7015/*
7016 * Find the hashtable slot currently occupied by an entry for the supplied offset.
7017 */
7018static kern_return_t
7019vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
7020{
0a7de745
A
7021 int index;
7022 u_int32_t i;
55e303ae
A
7023
7024 offset = DRT_ALIGN_ADDRESS(offset);
7025 index = DRT_HASH(cmap, offset);
7026
7027 /* traverse the hashtable */
7028 for (i = 0; i < cmap->scm_modulus; i++) {
55e303ae
A
7029 /*
7030 * If the slot is vacant, we can stop.
7031 */
0a7de745 7032 if (DRT_HASH_VACANT(cmap, index)) {
55e303ae 7033 break;
0a7de745 7034 }
55e303ae
A
7035
7036 /*
7037 * If the address matches our offset, we have success.
7038 */
7039 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
7040 *indexp = index;
0a7de745 7041 return KERN_SUCCESS;
55e303ae
A
7042 }
7043
7044 /*
7045 * Move to the next slot, try again.
7046 */
7047 index = DRT_HASH_NEXT(cmap, index);
7048 }
7049 /*
7050 * It's not there.
7051 */
0a7de745 7052 return KERN_FAILURE;
55e303ae
A
7053}
7054
7055/*
7056 * Find the hashtable slot for the supplied offset. If we haven't allocated
7057 * one yet, allocate one and populate the address field. Note that it will
7058 * not have a nonzero page count and thus will still technically be free, so
7059 * in the case where we are called to clean pages, the slot will remain free.
7060 */
7061static kern_return_t
7062vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
7063{
7064 struct vfs_drt_clustermap *cmap;
0a7de745
A
7065 kern_return_t kret;
7066 u_int32_t index;
7067 u_int32_t i;
55e303ae
A
7068
7069 cmap = *cmapp;
7070
7071 /* look for an existing entry */
7072 kret = vfs_drt_search_index(cmap, offset, indexp);
0a7de745
A
7073 if (kret == KERN_SUCCESS) {
7074 return kret;
7075 }
55e303ae
A
7076
7077 /* need to allocate an entry */
7078 offset = DRT_ALIGN_ADDRESS(offset);
7079 index = DRT_HASH(cmap, offset);
7080
7081 /* scan from the index forwards looking for a vacant slot */
7082 for (i = 0; i < cmap->scm_modulus; i++) {
7083 /* slot vacant? */
0a7de745 7084 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
55e303ae 7085 cmap->scm_buckets++;
0a7de745 7086 if (index < cmap->scm_lastclean) {
55e303ae 7087 cmap->scm_lastclean = index;
0a7de745 7088 }
55e303ae
A
7089 DRT_HASH_SET_ADDRESS(cmap, index, offset);
7090 DRT_HASH_SET_COUNT(cmap, index, 0);
7091 DRT_BITVECTOR_CLEAR(cmap, index);
7092 *indexp = index;
7093 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
0a7de745 7094 return KERN_SUCCESS;
55e303ae
A
7095 }
7096 cmap->scm_iskips += i;
7097 index = DRT_HASH_NEXT(cmap, index);
7098 }
7099
7100 /*
7101 * We haven't found a vacant slot, so the map is full. If we're not
7102 * already recursed, try reallocating/compacting it.
7103 */
0a7de745
A
7104 if (recursed) {
7105 return KERN_FAILURE;
7106 }
55e303ae
A
7107 kret = vfs_drt_alloc_map(cmapp);
7108 if (kret == KERN_SUCCESS) {
7109 /* now try to insert again */
7110 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
7111 }
0a7de745 7112 return kret;
55e303ae
A
7113}
7114
7115/*
7116 * Implementation of set dirty/clean.
7117 *
7118 * In the 'clean' case, not finding a map is OK.
7119 */
7120static kern_return_t
7121vfs_drt_do_mark_pages(
0a7de745
A
7122 void **private,
7123 u_int64_t offset,
7124 u_int length,
7125 u_int *setcountp,
7126 int dirty)
55e303ae
A
7127{
7128 struct vfs_drt_clustermap *cmap, **cmapp;
0a7de745
A
7129 kern_return_t kret;
7130 int i, index, pgoff, pgcount, setcount, ecount;
55e303ae
A
7131
7132 cmapp = (struct vfs_drt_clustermap **)private;
7133 cmap = *cmapp;
7134
7135 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
7136
0a7de745
A
7137 if (setcountp != NULL) {
7138 *setcountp = 0;
7139 }
7140
55e303ae
A
7141 /* allocate a cluster map if we don't already have one */
7142 if (cmap == NULL) {
7143 /* no cluster map, nothing to clean */
7144 if (!dirty) {
7145 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
0a7de745 7146 return KERN_SUCCESS;
55e303ae
A
7147 }
7148 kret = vfs_drt_alloc_map(cmapp);
7149 if (kret != KERN_SUCCESS) {
7150 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
0a7de745 7151 return kret;
55e303ae
A
7152 }
7153 }
7154 setcount = 0;
7155
7156 /*
7157 * Iterate over the length of the region.
7158 */
7159 while (length > 0) {
7160 /*
7161 * Get the hashtable index for this offset.
7162 *
7163 * XXX this will add blank entries if we are clearing a range
7164 * that hasn't been dirtied.
7165 */
7166 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
0a7de745 7167 cmap = *cmapp; /* may have changed! */
55e303ae
A
7168 /* this may be a partial-success return */
7169 if (kret != KERN_SUCCESS) {
0a7de745
A
7170 if (setcountp != NULL) {
7171 *setcountp = setcount;
7172 }
55e303ae
A
7173 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
7174
0a7de745 7175 return kret;
55e303ae
A
7176 }
7177
7178 /*
7179 * Work out how many pages we're modifying in this
7180 * hashtable entry.
7181 */
7182 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
7183 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
7184
7185 /*
7186 * Iterate over pages, dirty/clearing as we go.
7187 */
7188 ecount = DRT_HASH_GET_COUNT(cmap, index);
7189 for (i = 0; i < pgcount; i++) {
7190 if (dirty) {
7191 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
0a7de745
A
7192 if (ecount >= DRT_BITVECTOR_PAGES) {
7193 panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
7194 }
55e303ae
A
7195 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
7196 ecount++;
7197 setcount++;
7198 }
7199 } else {
7200 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
0a7de745
A
7201 if (ecount <= 0) {
7202 panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
7203 }
7204 assert(ecount > 0);
55e303ae
A
7205 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
7206 ecount--;
7207 setcount++;
7208 }
7209 }
7210 }
7211 DRT_HASH_SET_COUNT(cmap, index, ecount);
91447636 7212
55e303ae
A
7213 offset += pgcount * PAGE_SIZE;
7214 length -= pgcount * PAGE_SIZE;
7215 }
0a7de745 7216 if (setcountp != NULL) {
55e303ae 7217 *setcountp = setcount;
0a7de745 7218 }
55e303ae
A
7219
7220 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
7221
0a7de745 7222 return KERN_SUCCESS;
55e303ae
A
7223}
7224
7225/*
7226 * Mark a set of pages as dirty/clean.
7227 *
7228 * This is a public interface.
7229 *
7230 * cmapp
7231 * Pointer to storage suitable for holding a pointer. Note that
7232 * this must either be NULL or a value set by this function.
7233 *
7234 * size
7235 * Current file size in bytes.
7236 *
7237 * offset
7238 * Offset of the first page to be marked as dirty, in bytes. Must be
7239 * page-aligned.
7240 *
7241 * length
7242 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
7243 *
7244 * setcountp
7245 * Number of pages newly marked dirty by this call (optional).
7246 *
7247 * Returns KERN_SUCCESS if all the pages were successfully marked.
7248 */
7249static kern_return_t
2d21ac55 7250vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
55e303ae
A
7251{
7252 /* XXX size unused, drop from interface */
0a7de745 7253 return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
55e303ae
A
7254}
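#if 0
/*
 * Usage sketch (hypothetical helper; the real caller is
 * sparse_cluster_add() above): record one page as dirty and note how much
 * of the request was absorbed before the map filled up.
 */
static void
vfs_drt_mark_page_example(void **cmapp, off_t file_offset)
{
	u_int new_dirty = 0;

	if (vfs_drt_mark_pages(cmapp, file_offset & ~PAGE_MASK_64, PAGE_SIZE, &new_dirty) != KERN_SUCCESS) {
		/*
		 * The map is full: only new_dirty pages were newly recorded.
		 * The caller pushes some clusters out and then retries.
		 */
	}
}
#endif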
7255
91447636 7256#if 0
55e303ae
A
7257static kern_return_t
7258vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
7259{
0a7de745 7260 return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
55e303ae 7261}
91447636 7262#endif
55e303ae
A
7263
7264/*
7265 * Get a cluster of dirty pages.
7266 *
7267 * This is a public interface.
7268 *
7269 * cmapp
7270 * Pointer to storage managed by drt_mark_pages. Note that this must
7271 * be NULL or a value set by drt_mark_pages.
7272 *
7273 * offsetp
7274 * Returns the byte offset into the file of the first page in the cluster.
7275 *
7276 * lengthp
7277 * Returns the length in bytes of the cluster of dirty pages.
7278 *
7279 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
 7280 * are no dirty pages meeting the minimum size criteria. Private storage will
 7281 * be released if there are no more dirty pages left in the map.
7282 *
7283 */
7284static kern_return_t
7285vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
7286{
7287 struct vfs_drt_clustermap *cmap;
0a7de745
A
7288 u_int64_t offset;
7289 u_int length;
7290 u_int32_t j;
7291 int index, i, fs, ls;
55e303ae
A
7292
7293 /* sanity */
0a7de745
A
7294 if ((cmapp == NULL) || (*cmapp == NULL)) {
7295 return KERN_FAILURE;
7296 }
55e303ae
A
7297 cmap = *cmapp;
7298
7299 /* walk the hashtable */
7300 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
0a7de745 7301 index = DRT_HASH(cmap, offset);
55e303ae 7302
0a7de745 7303 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
55e303ae 7304 continue;
0a7de745 7305 }
55e303ae
A
7306
7307 /* scan the bitfield for a string of bits */
7308 fs = -1;
7309
7310 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
0a7de745
A
7311 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
7312 fs = i;
55e303ae
A
7313 break;
7314 }
7315 }
7316 if (fs == -1) {
0a7de745
A
7317 /* didn't find any bits set */
7318 panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
7319 cmap, index, DRT_HASH_GET_COUNT(cmap, index));
55e303ae
A
7320 }
7321 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
0a7de745
A
7322 if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
7323 break;
7324 }
55e303ae 7325 }
0a7de745 7326
55e303ae
A
7327 /* compute offset and length, mark pages clean */
7328 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
7329 length = ls * PAGE_SIZE;
7330 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
7331 cmap->scm_lastclean = index;
7332
7333 /* return successful */
7334 *offsetp = (off_t)offset;
7335 *lengthp = length;
7336
7337 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
0a7de745 7338 return KERN_SUCCESS;
55e303ae
A
7339 }
7340 /*
7341 * We didn't find anything... hashtable is empty
7342 * emit stats into trace buffer and
7343 * then free it
7344 */
7345 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
0a7de745
A
7346 cmap->scm_modulus,
7347 cmap->scm_buckets,
7348 cmap->scm_lastclean,
7349 cmap->scm_iskips);
7350
55e303ae
A
7351 vfs_drt_free_map(cmap);
7352 *cmapp = NULL;
7353
0a7de745 7354 return KERN_FAILURE;
55e303ae
A
7355}
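#if 0
/*
 * Usage sketch (hypothetical helper; the real loop lives in
 * sparse_cluster_push() above): drain every dirty cluster from the map.
 * Once the map is empty, vfs_drt_get_cluster() frees it, NULLs *cmapp and
 * returns KERN_FAILURE.
 */
static void
vfs_drt_drain_example(void **cmapp)
{
	off_t offset;
	u_int length;

	while (vfs_drt_get_cluster(cmapp, &offset, &length) == KERN_SUCCESS) {
		/* write out the byte range [offset, offset + length) here */
	}
}
#endif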
7356
7357
7358static kern_return_t
7359vfs_drt_control(void **cmapp, int op_type)
7360{
7361 struct vfs_drt_clustermap *cmap;
7362
7363 /* sanity */
0a7de745
A
7364 if ((cmapp == NULL) || (*cmapp == NULL)) {
7365 return KERN_FAILURE;
7366 }
55e303ae
A
7367 cmap = *cmapp;
7368
7369 switch (op_type) {
7370 case 0:
7371 /* emit stats into trace buffer */
7372 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
0a7de745
A
7373 cmap->scm_modulus,
7374 cmap->scm_buckets,
7375 cmap->scm_lastclean,
7376 cmap->scm_iskips);
55e303ae
A
7377
7378 vfs_drt_free_map(cmap);
7379 *cmapp = NULL;
0a7de745 7380 break;
55e303ae
A
7381
7382 case 1:
0a7de745
A
7383 cmap->scm_lastclean = 0;
7384 break;
55e303ae 7385 }
0a7de745 7386 return KERN_SUCCESS;
55e303ae
A
7387}
7388
7389
7390
7391/*
7392 * Emit a summary of the state of the clustermap into the trace buffer
7393 * along with some caller-provided data.
7394 */
91447636 7395#if KDEBUG
55e303ae 7396static void
91447636 7397vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
55e303ae
A
7398{
7399 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
7400}
91447636
A
7401#else
7402static void
0a7de745
A
7403vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
7404 __unused int arg1, __unused int arg2, __unused int arg3,
7405 __unused int arg4)
91447636
A
7406{
7407}
0a7de745 7408#endif
55e303ae 7409
91447636 7410#if 0
55e303ae
A
7411/*
7412 * Perform basic sanity check on the hash entry summary count
7413 * vs. the actual bits set in the entry.
7414 */
7415static void
7416vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
7417{
0a7de745 7418 int index, i;
55e303ae 7419 int bits_on;
0a7de745 7420
55e303ae 7421 for (index = 0; index < cmap->scm_modulus; index++) {
0a7de745
A
7422 if (DRT_HASH_VACANT(cmap, index)) {
7423 continue;
7424 }
55e303ae
A
7425
7426 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
0a7de745
A
7427 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
7428 bits_on++;
7429 }
55e303ae 7430 }
0a7de745
A
7431 if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
7432 panic("bits_on = %d, index = %d\n", bits_on, index);
7433 }
7434 }
b4c24cb9 7435}
91447636 7436#endif
0a7de745
A
7437
7438/*
7439 * Internal interface only.
7440 */
7441static kern_return_t
7442vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
7443{
7444 struct vfs_drt_clustermap *cmap;
7445
7446 /* sanity */
7447 if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
7448 return KERN_FAILURE;
7449 }
7450 cmap = *cmapp;
7451
7452 if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7453 /*
7454 * If we have a full xlarge sparse cluster,
7455 * we push it out all at once so the cluster
7456 * map can be available to absorb more I/Os.
7457 * This is done on large memory configs so
7458 * the small I/Os don't interfere with the
7459 * pro workloads.
7460 */
7461 *push_flag = PUSH_ALL;
7462 }
7463 return KERN_SUCCESS;
7464}